class PRSearch(FacetedSearch):
    """Faceted search over ``PullRequest`` documents in the ``test-prs`` index.

    Declares a single facet, ``comments``: a date histogram on
    ``comments.created_at`` with a ``month`` interval, wrapped in a
    ``NestedFacet`` whose path is ``comments``.
    """

    doc_types = [PullRequest]
    index = "test-prs"

    facets = {
        # The histogram field sits under the ``comments`` path, so the
        # facet is declared through NestedFacet with that path.
        "comments": NestedFacet(
            "comments",
            DateHistogramFacet(
                field="comments.created_at",
                interval="month",
            ),
        ),
    }
class PRSearch(FacetedSearch):
    """Search definition for pull requests stored in the ``test-prs`` index.

    The only facet, ``comments``, buckets ``comments.created_at`` with a
    ``month`` interval through a ``NestedFacet`` on the ``comments`` path.
    """

    # Index searched and the document class declared for it.
    index = 'test-prs'
    doc_types = [PullRequest]

    facets = {
        'comments': NestedFacet(
            'comments',
            DateHistogramFacet(field='comments.created_at', interval='month'),
        ),
    }
def _maybe_get_nested_facet(elasticsearch_field_name, es_facet): """ Returns a NestedFacet for the Elasticsearch field, if the field is nested. Note there can be multiple levels of NestedFacet, eg NestedFacet(outer, NestedFacet(inner, es_facet)) """ parts = elasticsearch_field_name.rsplit('.', 1) # Traverse up the nesting levels from the leaf field, until we reach the root. # Need to traverse until the root, because the root can be a nested field, # for example "samples". All the sub fields can be non-nested, like # "samples.verily-public-data.human_genome_variants.1000_genomes_sample_info.Main_project_LC_platform" # This field needs to be a NestedFacet because an ancestor("samples") is nested. while len(parts) > 1: parent = parts[0] if parent in current_app.config['NESTED_PATHS']: es_facet = NestedFacet(parent, es_facet) parts = parent.rsplit('.', 1) return es_facet
class PageSearchBase(RTDFacetedSearch):
    """Faceted search over ``PageDocument`` that also looks inside nested data.

    The page title is matched directly, while ``sections`` and ``domains``
    are matched through nested queries whose inner hits carry highlights.
    """

    facets = {
        'project': TermsFacet(field='project'),
        'version': TermsFacet(field='version'),
        # ``role_name`` lives under the ``domains`` path.
        'role_name': NestedFacet(
            'domains',
            TermsFacet(field='domains.role_name'),
        ),
    }
    doc_types = [PageDocument]
    index = PageDocument._doc_type.index

    # Searched fields; a ``^N`` suffix is the boost for that field.
    _outer_fields = ['title^4']
    _section_fields = ['sections.title^3', 'sections.content']
    _domain_fields = [
        'domains.name^2',
        'domains.docstrings',
    ]

    # Highlight settings shared by the page query and the nested inner hits.
    _common_highlight_options = {
        'encoder': 'html',
        'number_of_fragments': 1,
        'pre_tags': ['<span>'],
        'post_tags': ['</span>'],
    }
    fields = _outer_fields

    # Every text query is issued once per operator. A document matching
    # with 'and' also matches with 'or', so it should end up scoring higher.
    operators = ['and', 'or']

    def count(self):
        """Return the number of results after ``post_filter``, overriding ``count``."""
        # size=0: no hits are needed, only the total.
        response = self.build_search().extra(size=0).execute()
        return response.hits.total

    def query(self, search, query):
        """Extend ``search`` with the title query and the nested queries.

        Returns the search with the shared highlight options applied and a
        ``Bool(should=...)`` query made of the per-operator title queries,
        followed by the ``sections`` and the ``domains`` nested queries.
        """
        highlighted = search.highlight_options(**self._common_highlight_options)

        # Match queries for the title (of the page) field.
        should_clauses = [
            SimpleQueryString(
                query=query,
                fields=self.fields,
                default_operator=operator,
            )
            for operator in self.operators
        ]

        # (nested path, fields to search, fields to highlight in inner hits)
        nested_targets = (
            (
                'sections',
                self._section_fields,
                ('sections.title', 'sections.content'),
            ),
            (
                'domains',
                self._domain_fields,
                ('domains.name', 'domains.docstrings'),
            ),
        )
        for path, search_fields, highlight_fields in nested_targets:
            highlight = dict(
                self._common_highlight_options,
                fields={name: {} for name in highlight_fields},
            )
            should_clauses.append(
                self.generate_nested_query(
                    query=query,
                    path=path,
                    fields=search_fields,
                    inner_hits={'highlight': highlight},
                )
            )

        return highlighted.query(Bool(should=should_clauses))

    def generate_nested_query(self, query, path, fields, inner_hits):
        """Build a ``Nested`` query on ``path`` with one text query per operator."""
        per_operator = [
            SimpleQueryString(
                query=query,
                fields=fields,
                default_operator=operator,
            )
            for operator in self.operators
        ]
        return Nested(
            path=path,
            inner_hits=inner_hits,
            query=Bool(should=per_operator),
        )
class PageSearchBase(RTDFacetedSearch):
    """Faceted page search with nested section/domain queries and rank scoring.

    The combined text query is wrapped in a ``FunctionScore`` whose script
    multiplies the score by a weight derived from each page's ``rank``.
    """

    facets = {
        'project': TermsFacet(field='project'),
        'version': TermsFacet(field='version'),
        # ``role_name`` lives under the ``domains`` path.
        'role_name': NestedFacet(
            'domains',
            TermsFacet(field='domains.role_name'),
        ),
    }
    doc_types = [PageDocument]
    index = PageDocument._index._name

    # The boosts of these fields need to stay close to each other
    # so the page rank is able to re-boost them.
    _outer_fields = ['title^1.5']
    _section_fields = ['sections.title^2', 'sections.content']
    _domain_fields = [
        'domains.name^1.5',
        'domains.docstrings',
    ]
    fields = _outer_fields

    # Both operators are searched. A document matching with 'and' also
    # matches with 'or', so it should end up scoring higher.
    operators = ['and', 'or']

    # Passed to ``search.source(excludes=...)`` in ``query``.
    excludes = ['rank', 'sections', 'domains', 'commit', 'build']

    def total_count(self):
        """Return the total number of results of the current query."""
        # size=0: no hits are needed, only the total.
        response = self.build_search().extra(size=0).execute()
        return response.hits.total

    def query(self, search, query):
        """
        Build the full query: text, nested and rank-scored.

        The text queries for the page fields and the nested queries for
        ``sections`` and ``domains`` are combined as alternatives. If
        ``self.projects`` was given, results are additionally required to
        match one of its project/version pairs. The whole query is then
        scored through ``_get_script_score``.
        """
        search = (
            search
            .highlight_options(**self._highlight_options)
            .source(excludes=self.excludes)
        )

        clauses = self._get_queries(
            query=query,
            fields=self.fields,
        )
        nested_targets = (
            ('sections', self._section_fields),
            ('domains', self._domain_fields),
        )
        for path, nested_fields in nested_targets:
            clauses.append(
                self._get_nested_query(
                    query=query,
                    path=path,
                    fields=nested_fields,
                )
            )
        combined = Bool(should=clauses)

        if self.projects:
            # One clause per (project, version) pair; any of them may match.
            project_clauses = [
                Bool(must=[
                    Term(project={'value': project}),
                    Term(version={'value': version}),
                ])
                for project, version in self.projects.items()
            ]
            combined = Bool(must=[combined, Bool(should=project_clauses)])

        scored = FunctionScore(
            query=combined,
            script_score=self._get_script_score(),
        )
        return search.query(scored)

    def _get_nested_query(self, *, query, path, fields):
        """Build a ``Nested`` query on ``path`` with highlighted inner hits."""
        text_queries = self._get_queries(
            query=query,
            fields=fields,
        )
        # Highlight field names are the search fields without their boost.
        highlight_fields = {
            re.sub(r'\^.*$', '', field): {}
            for field in fields
        }
        highlight = dict(self._highlight_options, fields=highlight_fields)
        return Nested(
            path=path,
            inner_hits={'highlight': highlight},
            query=Bool(should=text_queries),
        )

    def _get_script_score(self):
        """
        Return the ES script that turns the page rank into a score weight.

        ES expects the rank to be a number greater than 0, but users can
        set it between [-10, +10]. That range is mapped to [0.01, 2]
        (21 possible values).

        The first lower weight (0.8) needs to bring the score of the highest
        boost (sections.title^2) close to the lowest boost (title^1.5),
        that way exact results take priority:

        - 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it)
        - 1.5 * 0.8 = 1.2 (score lower than 1.5)

        The first higher weight (1.3) needs to bring the score of the lowest
        boost (title^1.5) close to the highest boost (sections.title^2),
        that way exact results take priority:

        - 2.0 * 1.3 = 2.6 (score higher than 2.0)
        - 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)

        The following lower and higher weights decrease/increase both scores.

        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
        """
        # One weight per rank: index 0 is rank -10 (-10 + 10 = 0),
        # index 10 is rank 0 and index 20 is rank +10.
        weights = [
            0.01, 0.05, 0.1, 0.2, 0.3,
            0.4, 0.5, 0.6, 0.7, 0.8,
            1,
            1.3, 1.4, 1.5, 1.6, 1.7,
            1.8, 1.9, 1.93, 1.96, 2,
        ]
        # Documents without a rank value are treated as rank 0.
        script_source = """ int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value; return params.ranking[rank + 10] * _score; """
        return {
            "script": {
                "source": script_source,
                "params": {"ranking": weights},
            },
        }
def get_samples_overview_facet(es_field_names):
    """Build the nested filters facet over the ``samples`` path.

    Args:
        es_field_names: Mapping of facet name to Elasticsearch field name.

    Returns:
        A ``NestedFacet`` on ``samples`` wrapping a ``FiltersFacet`` that has
        one filter per facet name; each filter is a ``Match`` query requiring
        the corresponding field to be ``True``.
    """
    # ``items()`` instead of the Python 2-only ``iteritems()``: it works on
    # both Python 2 and Python 3, where ``iteritems`` raises AttributeError.
    filters = {
        facet: Match(**{field: True})
        for facet, field in es_field_names.items()
    }
    return NestedFacet('samples', FiltersFacet(filters))
class PageSearch(RTDFacetedSearch):
    """Faceted page search with project filtering and rank-based scoring.

    Text is matched on the page title and, through nested queries, on
    ``sections`` and ``domains``. The final query is wrapped in a
    ``FunctionScore`` that weights each hit by the page's ``rank``.
    """

    facets = {
        'project': TermsFacet(field='project'),
        'version': TermsFacet(field='version'),
        # ``role_name`` lives under the ``domains`` path.
        'role_name': NestedFacet(
            'domains',
            TermsFacet(field='domains.role_name'),
        ),
    }
    doc_types = [PageDocument]
    index = PageDocument._index._name

    # The boosts of these fields need to stay close to each other
    # so the page rank is able to re-boost them.
    _outer_fields = ['title^1.5']
    _section_fields = ['sections.title^2', 'sections.content']
    _domain_fields = [
        'domains.name^1.5',
        'domains.docstrings',
    ]
    fields = _outer_fields

    # Passed to ``search.source(excludes=...)`` in ``query``.
    excludes = ['rank', 'sections', 'domains', 'commit', 'build']

    def _get_projects_query(self):
        """
        Return the query that restricts results to ``self.projects``.

        - Falsy ``self.projects``: ``None`` (no restriction).
        - ``dict``: filter by each project/version pair.
        - ``list``: filter by project only.

        Any other type raises ``ValueError``.
        """
        projects = self.projects
        if not projects:
            return None

        if isinstance(projects, dict):
            pair_clauses = [
                Bool(filter=[Term(project=project), Term(version=version)])
                for project, version in projects.items()
            ]
            return Bool(should=pair_clauses)

        if isinstance(projects, list):
            return Bool(filter=Terms(project=projects))

        raise ValueError('projects must be a list or a dict!')

    def query(self, search, query):
        """
        Build the full query: text, nested, project-filtered and rank-scored.

        The text queries for the page fields and the nested queries for
        ``sections`` and ``domains`` are combined as alternatives. If
        ``self.projects`` was given, results must also satisfy the query
        from ``_get_projects_query``. The whole query is then scored
        through ``_get_script_score``.
        """
        search = (
            search
            .highlight_options(**self._highlight_options)
            .source(excludes=self.excludes)
        )

        clauses = self._get_queries(
            query=query,
            fields=self.fields,
        )
        nested_targets = (
            ('sections', self._section_fields),
            ('domains', self._domain_fields),
        )
        for path, nested_fields in nested_targets:
            clauses.append(
                self._get_nested_query(
                    query=query,
                    path=path,
                    fields=nested_fields,
                )
            )
        combined = Bool(should=clauses)

        projects_query = self._get_projects_query()
        if projects_query:
            combined = Bool(must=[combined, projects_query])

        scored = FunctionScore(
            query=combined,
            script_score=self._get_script_score(),
        )
        return search.query(scored)

    def _get_nested_query(self, *, query, path, fields):
        """Build a ``Nested`` query on ``path`` with highlighted inner hits."""
        inner_query = Bool(
            should=self._get_queries(
                query=query,
                fields=fields,
            )
        )

        # The ``post_filter`` filter only filters documents at the parent
        # level (domains is a nested document), which leaves results whose
        # domains don't match the role_name being filtered. The role_name
        # filter is therefore forced on the ``domains`` document here.
        # See #8268.
        # TODO: We should use a flattened document instead
        # to avoid this kind of problems and have faster queries.
        role_name = self.filter_values.get('role_name')
        if path == 'domains' and role_name:
            role_filter = Bool(
                must=Terms(**{'domains.role_name': role_name})
            )
            inner_query = Bool(must=[role_filter, inner_query])

        # Highlight field names are the search fields without their boost.
        highlight_fields = {
            re.sub(r'\^.*$', '', field): {}
            for field in fields
        }
        highlight = dict(self._highlight_options, fields=highlight_fields)
        return Nested(
            path=path,
            inner_hits={'highlight': highlight},
            query=inner_query,
        )

    def _get_script_score(self):
        """
        Return the ES script that turns the page rank into a score weight.

        ES expects the rank to be a number greater than 0, but users can
        set it between [-10, +10]. That range is mapped to [0.01, 2]
        (21 possible values).

        The first lower weight (0.8) needs to bring the score of the highest
        boost (sections.title^2) close to the lowest boost (title^1.5),
        that way exact results take priority:

        - 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it)
        - 1.5 * 0.8 = 1.2 (score lower than 1.5)

        The first higher weight (1.3) needs to bring the score of the lowest
        boost (title^1.5) close to the highest boost (sections.title^2),
        that way exact results take priority:

        - 2.0 * 1.3 = 2.6 (score higher than 2.0)
        - 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)

        The following lower and higher weights decrease/increase both scores.

        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
        """
        # One weight per rank: index 0 is rank -10 (-10 + 10 = 0),
        # index 10 is rank 0 and index 20 is rank +10.
        weights = [
            0.01, 0.05, 0.1, 0.2, 0.3,
            0.4, 0.5, 0.6, 0.7, 0.8,
            1,
            1.3, 1.4, 1.5, 1.6, 1.7,
            1.8, 1.9, 1.93, 1.96, 2,
        ]
        # Documents without a rank value are treated as rank 0.
        script_source = """ int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value; return params.ranking[rank + 10] * _score; """
        return {
            "script": {
                "source": script_source,
                "params": {"ranking": weights},
            },
        }
class PageSearchBase(RTDFacetedSearch):
    """Faceted page search with nested section/domain queries and rank scoring.

    The page title is matched directly, ``sections`` and ``domains`` are
    matched through nested queries with highlighted inner hits, and the
    combined query is scored by a script based on each page's ``rank``.
    """

    facets = {
        'project': TermsFacet(field='project'),
        'version': TermsFacet(field='version'),
        # ``role_name`` lives under the ``domains`` path.
        'role_name': NestedFacet(
            'domains',
            TermsFacet(field='domains.role_name'),
        ),
    }
    doc_types = [PageDocument]
    index = PageDocument._doc_type.index

    # The boosts of these fields need to stay close to each other
    # so the page rank is able to re-boost them.
    _outer_fields = ['title^1.5']
    _section_fields = ['sections.title^2', 'sections.content']
    _domain_fields = [
        'domains.name^1.5',
        'domains.docstrings',
    ]
    fields = _outer_fields

    # Every text query is issued once per operator. A document matching
    # with 'and' also matches with 'or', so it should end up scoring higher.
    operators = ['and', 'or']

    def total_count(self):
        """Return the total number of results of the current query."""
        # size=0: no hits are needed, only the total.
        response = self.build_search().extra(size=0).execute()
        return response.hits.total

    def query(self, search, query):
        """Build the rank-scored query over the title and the nested fields."""
        search = search.highlight_options(**self._highlight_options)

        # Match queries for the title (of the page) field.
        should_clauses = [
            self._get_text_query(
                query=query,
                fields=self.fields,
                operator=operator,
            )
            for operator in self.operators
        ]

        # (nested path, fields to search, fields to highlight in inner hits)
        nested_targets = (
            (
                'sections',
                self._section_fields,
                ('sections.title', 'sections.content'),
            ),
            (
                'domains',
                self._domain_fields,
                ('domains.name', 'domains.docstrings'),
            ),
        )
        for path, search_fields, highlight_fields in nested_targets:
            highlight = dict(
                self._highlight_options,
                fields={name: {} for name in highlight_fields},
            )
            should_clauses.append(
                self.generate_nested_query(
                    query=query,
                    path=path,
                    fields=search_fields,
                    inner_hits={'highlight': highlight},
                )
            )

        scored = FunctionScore(
            query=Bool(should=should_clauses),
            script_score=self._get_script_score(),
        )
        return search.query(scored)

    def _get_script_score(self):
        """
        Return the ES script that turns the page rank into a score weight.

        ES expects the rank to be a number greater than 0, but users can
        set it between [-10, +10]. That range is mapped to [0.01, 2]
        (21 possible values).

        The first lower weight (0.8) needs to bring the score of the highest
        boost (sections.title^2) close to the lowest boost (title^1.5),
        that way exact results take priority:

        - 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it)
        - 1.5 * 0.8 = 1.2 (score lower than 1.5)

        The first higher weight (1.3) needs to bring the score of the lowest
        boost (title^1.5) close to the highest boost (sections.title^2),
        that way exact results take priority:

        - 2.0 * 1.3 = 2.6 (score higher than 2.0)
        - 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)

        The following lower and higher weights decrease/increase both scores.

        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
        """
        # One weight per rank: index 0 is rank -10 (-10 + 10 = 0),
        # index 10 is rank 0 and index 20 is rank +10.
        weights = [
            0.01, 0.05, 0.1, 0.2, 0.3,
            0.4, 0.5, 0.6, 0.7, 0.8,
            1,
            1.3, 1.4, 1.5, 1.6, 1.7,
            1.8, 1.9, 1.93, 1.96, 2,
        ]
        # Documents without a rank value are treated as rank 0.
        script_source = """ int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value; return params.ranking[rank + 10] * _score; """
        return {
            "script": {
                "source": script_source,
                "params": {"ranking": weights},
            },
        }

    def generate_nested_query(self, query, path, fields, inner_hits):
        """Build a ``Nested`` query on ``path`` with one text query per operator."""
        per_operator = [
            self._get_text_query(
                query=query,
                fields=fields,
                operator=operator,
            )
            for operator in self.operators
        ]
        return Nested(
            path=path,
            inner_hits=inner_hits,
            query=Bool(should=per_operator),
        )