class PRSearch(FacetedSearch):
    """Faceted search over ``PullRequest`` documents in the ``test-prs`` index.

    Declares a single ``comments`` facet: a date histogram on
    ``comments.created_at`` using a ``month`` interval, wrapped in a
    ``NestedFacet`` whose path is ``comments``.
    """

    doc_types = [PullRequest]
    index = 'test-prs'

    # The histogram field lives under the ``comments`` path, so the facet is
    # declared through NestedFacet rather than directly.
    facets = {
        'comments': NestedFacet(
            'comments',
            DateHistogramFacet(
                field='comments.created_at',
                interval='month',
            ),
        ),
    }
Beispiel #2
0
class PRSearch(FacetedSearch):
    """Faceted search definition for pull requests (index ``test-prs``).

    The only facet, ``comments``, buckets ``comments.created_at`` by
    ``month`` and is wrapped in a ``NestedFacet`` on the ``comments`` path.
    """

    index = "test-prs"
    doc_types = [PullRequest]
    facets = dict(
        comments=NestedFacet(
            "comments",
            DateHistogramFacet(
                field="comments.created_at",
                interval="month",
            ),
        ),
    )
def _maybe_get_nested_facet(elasticsearch_field_name, es_facet):
    """
    Wrap ``es_facet`` in a NestedFacet for each nested ancestor of the field.

    Every dotted prefix of ``elasticsearch_field_name`` (longest first) is
    checked against ``current_app.config['NESTED_PATHS']``; each prefix found
    there adds one NestedFacet layer around the facet built so far. If no
    prefix matches, ``es_facet`` is returned unchanged.

    Several layers are possible,
    eg NestedFacet(outer, NestedFacet(inner, es_facet))
    """
    # Walk from the leaf towards the root, and go all the way up: the root
    # itself can be the nested field (for example "samples") while none of
    # the levels below it are, as in
    # "samples.verily-public-data.human_genome_variants.1000_genomes_sample_info.Main_project_LC_platform"
    # That field still needs a NestedFacet because its ancestor "samples" is
    # nested.
    ancestor, dot, _ = elasticsearch_field_name.rpartition('.')
    while dot:
        if ancestor in current_app.config['NESTED_PATHS']:
            es_facet = NestedFacet(ancestor, es_facet)
        ancestor, dot, _ = ancestor.rpartition('.')

    return es_facet
Beispiel #4
0
class PageSearchBase(RTDFacetedSearch):
    """
    Faceted search over ``PageDocument`` pages.

    The search text is matched against the page title and, through nested
    queries, against the ``sections`` and ``domains`` sub-documents.
    """

    facets = {
        'project': TermsFacet(field='project'),
        'version': TermsFacet(field='version'),
        'role_name': NestedFacet(
            'domains',
            TermsFacet(field='domains.role_name'),
        ),
    }
    doc_types = [PageDocument]
    index = PageDocument._doc_type.index

    # Fields searched at each level of the document.
    _outer_fields = ['title^4']
    _section_fields = ['sections.title^3', 'sections.content']
    _domain_fields = ['domains.name^2', 'domains.docstrings']

    # Highlight settings shared by the top-level search and the inner hits
    # of both nested queries.
    _common_highlight_options = {
        'encoder': 'html',
        'number_of_fragments': 1,
        'pre_tags': ['<span>'],
        'post_tags': ['</span>'],
    }
    fields = _outer_fields

    # Every text query is issued once per operator: a document matching the
    # 'and' form also matches the 'or' form, so it ends up scoring higher.
    operators = ['and', 'or']

    def count(self):
        """
        Return the total number of hits for the built search.

        Overrides ``count`` so that the number reflects the results after
        post_filter.
        """
        # size=0: no documents are wanted back, only the total.
        response = self.build_search().extra(size=0).execute()
        return response.hits.total

    def query(self, search, query):
        """
        Attach the title, sections and domains queries to ``search``.

        All the generated queries are combined as ``should`` clauses of a
        single ``Bool`` query; the updated search object is returned.
        """
        search = search.highlight_options(**self._common_highlight_options)

        # Title-level queries, one per operator.
        should_clauses = [
            SimpleQueryString(
                query=query,
                fields=self.fields,
                default_operator=operator,
            )
            for operator in self.operators
        ]

        # Nested query over the page sections, highlighting both fields.
        sections_highlight = dict(
            self._common_highlight_options,
            fields={
                'sections.title': {},
                'sections.content': {},
            },
        )
        sections_nested_query = self.generate_nested_query(
            query=query,
            path='sections',
            fields=self._section_fields,
            inner_hits={'highlight': sections_highlight},
        )

        # Nested query over the page domains, highlighting both fields.
        domains_highlight = dict(
            self._common_highlight_options,
            fields={
                'domains.name': {},
                'domains.docstrings': {},
            },
        )
        domains_nested_query = self.generate_nested_query(
            query=query,
            path='domains',
            fields=self._domain_fields,
            inner_hits={'highlight': domains_highlight},
        )

        should_clauses.extend((sections_nested_query, domains_nested_query))
        return search.query(Bool(should=should_clauses))

    def generate_nested_query(self, query, path, fields, inner_hits):
        """
        Build a ``Nested`` query on ``path``.

        The inner query is a ``Bool`` whose ``should`` clauses are one
        ``SimpleQueryString`` over ``fields`` per entry in ``operators``;
        ``inner_hits`` is passed through to the nested query unchanged.
        """
        per_operator = [
            SimpleQueryString(
                query=query,
                fields=fields,
                default_operator=operator,
            )
            for operator in self.operators
        ]
        return Nested(
            path=path,
            inner_hits=inner_hits,
            query=Bool(should=per_operator),
        )
Beispiel #5
0
class PageSearchBase(RTDFacetedSearch):
    """
    Faceted search over ``PageDocument`` pages, re-scored by page rank.

    The search text is matched against the page title and, through nested
    queries, against the ``sections`` and ``domains`` sub-documents. The
    combined query is wrapped in a ``FunctionScore`` that multiplies the
    score by a weight derived from the document's ``rank``.
    """

    facets = {
        'project': TermsFacet(field='project'),
        'version': TermsFacet(field='version'),
        'role_name': NestedFacet(
            'domains',
            TermsFacet(field='domains.role_name'),
        ),
    }
    doc_types = [PageDocument]
    index = PageDocument._index._name

    # The boosts below are kept close to each other on purpose, so the page
    # rank weight is able to re-order them.
    _outer_fields = ['title^1.5']
    _section_fields = ['sections.title^2', 'sections.content']
    _domain_fields = ['domains.name^1.5', 'domains.docstrings']
    fields = _outer_fields

    # Both operators are searched: a document matching the 'and' form also
    # matches the 'or' form, so it ends up scoring higher.
    operators = ['and', 'or']

    # Source fields left out of the returned documents.
    excludes = ['rank', 'sections', 'domains', 'commit', 'build']

    def total_count(self):
        """Return the total number of hits for the current query."""
        # size=0: no documents are wanted back, only the total.
        response = self.build_search().extra(size=0).execute()
        return response.hits.total

    def query(self, search, query):
        """
        Attach the text, nested and rank-scoring queries to ``search``.

        When ``self.projects`` is set, results are additionally restricted
        to documents matching one of its project/version pairs.
        """
        search = (
            search
            .highlight_options(**self._highlight_options)
            .source(excludes=self.excludes)
        )

        should_clauses = self._get_queries(query=query, fields=self.fields)
        sections_nested_query = self._get_nested_query(
            query=query, path='sections', fields=self._section_fields,
        )
        domains_nested_query = self._get_nested_query(
            query=query, path='domains', fields=self._domain_fields,
        )
        should_clauses.extend((sections_nested_query, domains_nested_query))
        bool_query = Bool(should=should_clauses)

        if self.projects:
            # One clause per project/version pair; matching any is enough.
            per_project = []
            for project, version in self.projects.items():
                pair_query = Bool(must=[
                    Term(project={'value': project}),
                    Term(version={'value': version}),
                ])
                per_project.append(pair_query)
            bool_query = Bool(must=[bool_query, Bool(should=per_project)])

        scored_query = FunctionScore(
            query=bool_query,
            script_score=self._get_script_score(),
        )
        return search.query(scored_query)

    def _get_nested_query(self, *, query, path, fields):
        """
        Build a ``Nested`` query on ``path`` with highlighted inner hits.

        The inner query is a ``Bool`` over the queries returned by
        ``_get_queries`` for ``fields``. Each entry of ``fields``, with any
        ``^boost`` suffix stripped, is highlighted in the inner hits.
        """
        text_queries = self._get_queries(query=query, fields=fields)

        highlight = dict(
            self._highlight_options,
            # The highlighter needs the bare field name, without the boost.
            fields={re.sub(r'\^.*$', '', name): {} for name in fields},
        )

        return Nested(
            path=path,
            inner_hits={'highlight': highlight},
            query=Bool(should=text_queries),
        )

    def _get_script_score(self):
        """
        Build the ``script_score`` definition that weights ``_score`` by rank.

        The script reads the document's ``rank`` (0 when the field has no
        value), uses ``rank + 10`` as an index into ``ranking`` and
        multiplies the selected weight by ``_score``.

        ES expects the weight to be a number greater than 0,
        but users can set the rank between [-10, +10].
        That range is mapped to [0.01, 2] (21 possible values).

        The first weight below neutral (0.8, rank -1) needs to bring the
        score from the highest boost (sections.title^2) close to the lowest
        boost (title^1.5), that way exact results take priority:

        - 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it)
        - 1.5 * 0.8 = 1.2 (score lower than 1.5)

        The first weight above neutral (1.3, rank +1) needs to bring the
        score from the lowest boost (title^1.5) close to the highest boost
        (sections.title^2), that way exact results take priority:

        - 2.0 * 1.3 = 2.6 (score higher than 2.0)
        - 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)

        The remaining lower and higher weights decrease/increase both scores.

        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
        """
        ranking = [
            # ranks -10 .. -1
            0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
            # rank 0
            1,
            # ranks +1 .. +10
            1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 1.93, 1.96, 2,
        ]
        # Each rank maps to an element of the ranking list:
        # -10 maps to the first element (-10 + 10 = 0) and so on.
        script_source = """
            int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
            return params.ranking[rank + 10] * _score;
        """
        script = {
            "source": script_source,
            "params": {"ranking": ranking},
        }
        return {"script": script}
def get_samples_overview_facet(es_field_names):
    """
    Build the nested filters facet for the samples overview.

    Args:
        es_field_names: mapping of facet name to field name. Each entry
            becomes one named filter, a ``Match`` query requiring that
            field to be ``True``.

    Returns:
        A ``NestedFacet`` on the ``samples`` path wrapping a ``FiltersFacet``
        built from those filters.
    """
    # ``items()`` instead of ``iteritems()``: the latter exists only on
    # Python 2 dicts and raises AttributeError on Python 3.
    filters = {
        facet: Match(**{field: True})
        for facet, field in es_field_names.items()
    }
    return NestedFacet('samples', FiltersFacet(filters))
Beispiel #7
0
class PageSearch(RTDFacetedSearch):
    """
    Faceted search over ``PageDocument`` pages, re-scored by page rank.

    The search text is matched against the page title and, through nested
    queries, against the ``sections`` and ``domains`` sub-documents. The
    combined query is wrapped in a ``FunctionScore`` that multiplies the
    score by a weight derived from the document's ``rank``.
    """

    facets = {
        'project': TermsFacet(field='project'),
        'version': TermsFacet(field='version'),
        'role_name': NestedFacet(
            'domains',
            TermsFacet(field='domains.role_name'),
        ),
    }
    doc_types = [PageDocument]
    index = PageDocument._index._name

    # The boosts below are kept close to each other on purpose, so the page
    # rank weight is able to re-order them.
    _outer_fields = ['title^1.5']
    _section_fields = ['sections.title^2', 'sections.content']
    _domain_fields = ['domains.name^1.5', 'domains.docstrings']
    fields = _outer_fields

    # Source fields left out of the returned documents.
    excludes = ['rank', 'sections', 'domains', 'commit', 'build']

    def _get_projects_query(self):
        """
        Build the query restricting results to ``self.projects``.

        Returns ``None`` when ``self.projects`` is empty. A dict filters by
        each project/version pair; a list filters by project only. Any other
        non-empty value raises ``ValueError``.
        """
        projects = self.projects
        if not projects:
            return None

        if isinstance(projects, dict):
            # One clause per project/version pair; matching any is enough.
            per_version = []
            for project, version in projects.items():
                pair_filter = [Term(project=project), Term(version=version)]
                per_version.append(Bool(filter=pair_filter))
            return Bool(should=per_version)

        if isinstance(projects, list):
            return Bool(filter=Terms(project=projects))

        raise ValueError('projects must be a list or a dict!')

    def query(self, search, query):
        """
        Attach the text, nested and rank-scoring queries to ``search``.

        When ``self.projects`` is given, results are additionally restricted
        by the query returned from ``_get_projects_query``.
        """
        search = (
            search
            .highlight_options(**self._highlight_options)
            .source(excludes=self.excludes)
        )

        should_clauses = self._get_queries(query=query, fields=self.fields)
        sections_nested_query = self._get_nested_query(
            query=query, path='sections', fields=self._section_fields,
        )
        domains_nested_query = self._get_nested_query(
            query=query, path='domains', fields=self._domain_fields,
        )
        should_clauses.extend((sections_nested_query, domains_nested_query))
        bool_query = Bool(should=should_clauses)

        projects_query = self._get_projects_query()
        if projects_query:
            bool_query = Bool(must=[bool_query, projects_query])

        scored_query = FunctionScore(
            query=bool_query,
            script_score=self._get_script_score(),
        )
        return search.query(scored_query)

    def _get_nested_query(self, *, query, path, fields):
        """
        Build a ``Nested`` query on ``path`` with highlighted inner hits.

        The inner query is a ``Bool`` over the queries returned by
        ``_get_queries`` for ``fields``; for the ``domains`` path it is also
        constrained by the ``role_name`` filter value when one is set. Each
        entry of ``fields``, with any ``^boost`` suffix stripped, is
        highlighted in the inner hits.
        """
        text_queries = self._get_queries(query=query, fields=fields)
        nested_body = Bool(should=text_queries)

        # The highlighter needs the bare field names, without the boost.
        highlight_fields = {
            re.sub(r'\^.*$', '', name): {} for name in fields
        }

        # ``post_filter`` only filters at the parent document level, and
        # ``domains`` is a nested document: without an extra constraint the
        # inner hits would include domains whose role_name differs from the
        # one being filtered. Force the role_name filter onto the nested
        # query itself. See #8268.
        # TODO: We should use a flattened document instead
        # to avoid this kind of problems and have faster queries.
        role_name = self.filter_values.get('role_name')
        if path == 'domains' and role_name:
            role_name_query = Bool(
                must=Terms(**{'domains.role_name': role_name}),
            )
            nested_body = Bool(must=[role_name_query, nested_body])

        highlight = dict(self._highlight_options, fields=highlight_fields)

        return Nested(
            path=path,
            inner_hits={'highlight': highlight},
            query=nested_body,
        )

    def _get_script_score(self):
        """
        Build the ``script_score`` definition that weights ``_score`` by rank.

        The script reads the document's ``rank`` (0 when the field has no
        value), uses ``rank + 10`` as an index into ``ranking`` and
        multiplies the selected weight by ``_score``.

        ES expects the weight to be a number greater than 0,
        but users can set the rank between [-10, +10].
        That range is mapped to [0.01, 2] (21 possible values).

        The first weight below neutral (0.8, rank -1) needs to bring the
        score from the highest boost (sections.title^2) close to the lowest
        boost (title^1.5), that way exact results take priority:

        - 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it)
        - 1.5 * 0.8 = 1.2 (score lower than 1.5)

        The first weight above neutral (1.3, rank +1) needs to bring the
        score from the lowest boost (title^1.5) close to the highest boost
        (sections.title^2), that way exact results take priority:

        - 2.0 * 1.3 = 2.6 (score higher than 2.0)
        - 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)

        The remaining lower and higher weights decrease/increase both scores.

        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
        """
        ranking = [
            # ranks -10 .. -1
            0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
            # rank 0
            1,
            # ranks +1 .. +10
            1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 1.93, 1.96, 2,
        ]
        # Each rank maps to an element of the ranking list:
        # -10 maps to the first element (-10 + 10 = 0) and so on.
        script_source = """
            int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
            return params.ranking[rank + 10] * _score;
        """
        script = {
            "source": script_source,
            "params": {"ranking": ranking},
        }
        return {"script": script}
class PageSearchBase(RTDFacetedSearch):
    """
    Faceted search over ``PageDocument`` pages, re-scored by page rank.

    The search text is matched against the page title and, through nested
    queries, against the ``sections`` and ``domains`` sub-documents. The
    combined query is wrapped in a ``FunctionScore`` that multiplies the
    score by a weight derived from the document's ``rank``.
    """

    facets = {
        'project': TermsFacet(field='project'),
        'version': TermsFacet(field='version'),
        'role_name': NestedFacet('domains',
                                 TermsFacet(field='domains.role_name')),
    }
    doc_types = [PageDocument]
    index = PageDocument._doc_type.index

    # The boosts below are kept close to each other on purpose, so the page
    # rank weight is able to re-order them.
    _outer_fields = ['title^1.5']
    _section_fields = ['sections.title^2', 'sections.content']
    _domain_fields = ['domains.name^1.5', 'domains.docstrings']
    fields = _outer_fields

    # Both operators are searched: a document matching the 'and' form also
    # matches the 'or' form, so it ends up scoring higher.
    operators = ['and', 'or']

    def total_count(self):
        """Return the total number of hits for the current query."""
        # size=0: no documents are wanted back, only the total.
        response = self.build_search().extra(size=0).execute()
        return response.hits.total

    def query(self, search, query):
        """
        Attach the text, nested and rank-scoring queries to ``search``.

        The title, sections and domains queries are combined as ``should``
        clauses of one ``Bool`` query, which is then wrapped in a
        ``FunctionScore`` using the page-rank script.
        """
        search = search.highlight_options(**self._highlight_options)

        # Title-level queries, one per operator.
        should_clauses = [
            self._get_text_query(
                query=query,
                fields=self.fields,
                operator=operator,
            )
            for operator in self.operators
        ]

        # Nested query over the page sections, highlighting both fields.
        sections_highlight = dict(
            self._highlight_options,
            fields={
                'sections.title': {},
                'sections.content': {},
            },
        )
        sections_nested_query = self.generate_nested_query(
            query=query,
            path='sections',
            fields=self._section_fields,
            inner_hits={'highlight': sections_highlight},
        )

        # Nested query over the page domains, highlighting both fields.
        domains_highlight = dict(
            self._highlight_options,
            fields={
                'domains.name': {},
                'domains.docstrings': {},
            },
        )
        domains_nested_query = self.generate_nested_query(
            query=query,
            path='domains',
            fields=self._domain_fields,
            inner_hits={'highlight': domains_highlight},
        )

        should_clauses.extend((sections_nested_query, domains_nested_query))

        scored_query = FunctionScore(
            query=Bool(should=should_clauses),
            script_score=self._get_script_score(),
        )
        return search.query(scored_query)

    def _get_script_score(self):
        """
        Build the ``script_score`` definition that weights ``_score`` by rank.

        The script reads the document's ``rank`` (0 when the field has no
        value), uses ``rank + 10`` as an index into ``ranking`` and
        multiplies the selected weight by ``_score``.

        ES expects the weight to be a number greater than 0,
        but users can set the rank between [-10, +10].
        That range is mapped to [0.01, 2] (21 possible values).

        The first weight below neutral (0.8, rank -1) needs to bring the
        score from the highest boost (sections.title^2) close to the lowest
        boost (title^1.5), that way exact results take priority:

        - 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it)
        - 1.5 * 0.8 = 1.2 (score lower than 1.5)

        The first weight above neutral (1.3, rank +1) needs to bring the
        score from the lowest boost (title^1.5) close to the highest boost
        (sections.title^2), that way exact results take priority:

        - 2.0 * 1.3 = 2.6 (score higher than 2.0)
        - 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)

        The remaining lower and higher weights decrease/increase both scores.

        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
        """
        ranking = [
            # ranks -10 .. -1
            0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
            # rank 0
            1,
            # ranks +1 .. +10
            1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 1.93, 1.96, 2,
        ]
        # Each rank maps to an element of the ranking list:
        # -10 maps to the first element (-10 + 10 = 0) and so on.
        script_source = """
            int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
            return params.ranking[rank + 10] * _score;
        """
        script = {
            "source": script_source,
            "params": {"ranking": ranking},
        }
        return {"script": script}

    def generate_nested_query(self, query, path, fields, inner_hits):
        """
        Build a ``Nested`` query on ``path``.

        The inner query is a ``Bool`` whose ``should`` clauses are one
        ``_get_text_query`` result over ``fields`` per entry in
        ``operators``; ``inner_hits`` is passed through unchanged.
        """
        per_operator = [
            self._get_text_query(
                query=query,
                fields=fields,
                operator=operator,
            )
            for operator in self.operators
        ]
        return Nested(
            path=path,
            inner_hits=inner_hits,
            query=Bool(should=per_operator),
        )