def _generate_type_code_query(self, value):
        """Build the ElasticSearch query for a type-code search.

        Args:
            value: The type-code value given in the query.

        Notes:
            When ``value`` has a truthy entry in ``TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING``, that entry is
            unpacked as the positional arguments of a single match query.
            See: https://github.com/inspirehep/inspire-query-parser/issues/79
            Otherwise ``value`` is matched against both ``document_type`` and ``publication_type``, and at least
            one of the two has to match.
        """
        field_and_value = self.TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING.get(value)
        if field_and_value:
            return generate_match_query(*field_and_value, with_operator_and=True)

        # No dedicated mapping: accept a hit on either of the two type fields.
        either_type_field = [
            generate_match_query(type_field, value, with_operator_and=True)
            for type_field in ('document_type', 'publication_type')
        ]
        return {'bool': {'minimum_should_match': 1, 'should': either_type_field}}
Exemple #2
0
 def visit_nested_keyword_op(self, node):
     """Translate a nested keyword query; only ``refersto:recid:<recid>`` is handled.

     For ``refersto`` wrapping a ``control_number`` keyword query, returns a bool query that must match the
     records citing ``<recid>`` within the ``Literature`` collection, and must not match superseded records
     or the record itself. Any other nested keyword query logs a warning and yields ``None``.
     """
     # TODO Cannot be completed as of yet.
     # FIXME: quick and dirty implementation of refersto:recid:<recid>
     if node.left.value == 'refersto':
         nested = node.right
         is_keyword_pair = hasattr(nested, 'left') and hasattr(nested, 'right')
         if is_keyword_pair and nested.left.value == 'control_number':
             recid = nested.right.value

             def loose_match(field, query_value):
                 return generate_match_query(field, query_value, with_operator_and=False)

             required = [
                 loose_match(self.KEYWORD_TO_ES_FIELDNAME['refersto'], recid),
                 loose_match('_collections', 'Literature'),
             ]
             excluded = [
                 loose_match(self.RECORD_RELATION_FIELD, 'successor'),
                 loose_match("control_number", recid),
             ]
             return {'bool': {'must': required, 'must_not': excluded}}

     logger.warning(
         "Nested keyword queries aren't implemented yet, except refersto:recid:<recid>")
Exemple #3
0
    def visit_nested_keyword_op(self, node):
        """Translate a nested keyword query; only ``refersto:recid:<recid>`` is handled.

        For ``refersto`` wrapping a ``control_number`` keyword query, returns a bool query that must match the
        records citing ``<recid>`` and must not match superseded records.

        Raises:
            NotImplementedError: For every other nested keyword query.
        """
        # TODO Cannot be completed as of yet.
        # FIXME: quick and dirty implementation of refersto:recid:<recid>
        if node.left.value == 'refersto':
            nested = node.right
            is_keyword_pair = hasattr(nested, 'left') and hasattr(nested, 'right')
            if is_keyword_pair and nested.left.value == 'control_number':
                recid = nested.right.value
                citing = generate_match_query(
                    ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['refersto'], recid, with_operator_and=False)
                superseded = generate_match_query(
                    ElasticSearchVisitor.RECORD_RELATION_FIELD, 'successor', with_operator_and=False)
                return {'bool': {'must': [citing], 'must_not': [superseded]}}

        raise NotImplementedError(
            "Nested keyword queries aren't implemented yet, except refersto:recid:<recid>")
    def _generate_journal_nested_queries(self, value):
        """Build the nested ElasticSearch query for a journal search value.

        Args:
            value (string): The journal title, journal volume and artid or start page, separated by commas.

        Notes:
            ``value`` is handed to ``_preprocess_journal_query_value``, which returns the items keyed by journal
            field. The title is always queried; the volume and the third item are queried only when present.
            The third item is matched against both the page-start and the artid field, and either may match.
            All per-field queries are then combined and wrapped in a nested query on the journal fields prefix.
        """
        es = ElasticSearchVisitor

        # Which of artid/page start the third item really is does not matter here; only whether it exists.
        third_key = es.JOURNAL_PAGE_START
        publication_info = es._preprocess_journal_query_value(third_key, value)

        # A journal title is always expected: without one the query is malformed and this method is not called.
        per_field_queries = [
            generate_match_query(
                es.JOURNAL_FIELDS_MAPPING[es.JOURNAL_TITLE],
                publication_info[es.JOURNAL_TITLE],
                with_operator_and=False)
        ]

        if es.JOURNAL_VOLUME in publication_info:
            volume_query = generate_match_query(
                es.JOURNAL_FIELDS_MAPPING[es.JOURNAL_VOLUME],
                publication_info[es.JOURNAL_VOLUME],
                with_operator_and=False)
            per_field_queries.append(volume_query)

        if third_key in publication_info:
            third_value = publication_info[third_key]
            page_or_artid_queries = []
            for candidate_key in (es.JOURNAL_PAGE_START, es.JOURNAL_ART_ID):
                page_or_artid_queries.append(
                    generate_match_query(
                        es.JOURNAL_FIELDS_MAPPING[candidate_key],
                        third_value,
                        with_operator_and=False))

            per_field_queries.append(
                wrap_queries_in_bool_clauses_if_more_than_one(
                    page_or_artid_queries, use_must_clause=False))

        return generate_nested_query(
            es.JOURNAL_FIELDS_PREFIX,
            wrap_queries_in_bool_clauses_if_more_than_one(
                per_field_queries, use_must_clause=True))
    def _generate_queries_for_title_symbols(title_field, query_value):
        """Query symbol-like title tokens against the whitespace tokenized field of titles.

        Args:
            title_field: Name of the title field; its ``FieldVariations.search`` sub-field is the one queried.
            query_value: The title query string.

        Returns:
            The query or queries for the symbol-like tokens, as combined by
            ``wrap_queries_in_bool_clauses_if_more_than_one``. When no token qualifies the result is expected to
            be empty (``_generate_title_queries`` filters on its truthiness).

        Notes:
            The value is split on whitespace. A token counts as a symbol when it contains any of the
            symbol-indicating characters (examples of such tokens are "g-2", "SU(2)").
        """
        def looks_like_symbol(token):
            # Heuristic: one symbol-indicating character is enough to send the token to the whitespace-tokenized
            # title field.
            return any(
                character in token
                for character in ElasticSearchVisitor.TITLE_SYMBOL_INDICATING_CHARACTER)

        symbol_queries = [
            generate_match_query(
                '.'.join([title_field, FieldVariations.search]), token, with_operator_and=False)
            for token in query_value.split()
            if looks_like_symbol(token)
        ]
        return wrap_queries_in_bool_clauses_if_more_than_one(symbol_queries, use_must_clause=True)
    def _generate_author_query(self, author_name):
        """Build the nested query used for author searches.

        Args:
            author_name: The author name as given in the query.

        Notes:
            A plain ``match`` on the author field is generic enough to return many results, so a filter clause
            narrows them down to imitate legacy's more "exact" behaviour. E.g. searching for `Smith, John`
            shouldn't return papers of 'Smith, Bob'.

            When the name contains full names, each filter alternative pairs a ``term`` on the name-variations
            field with a ``match`` using ``"operator": "and"`` on the author field, over every ordered pair of
            lower-cased name variations. Otherwise the filter consists of the ``term`` clauses alone.
        """
        es = ElasticSearchVisitor
        lowered_variations = [
            variation.lower() for variation in generate_minimal_name_variations(author_name)
        ]

        if author_name_contains_fullnames(author_name):
            # With sufficient data, i.e. full names such as ``Mele, Salvatore`` (and not ``Mele, S`` or ``Mele``),
            # the filter can also rule out authors sharing the same non-lastname prefix, e.g. 'Mele, Samuele'.
            author_filters = []
            for term_variation, match_variation in product(lowered_variations, lowered_variations):
                term_clause = {'term': {es.AUTHORS_NAME_VARIATIONS_FIELD: term_variation}}
                match_clause = generate_match_query(
                    es.KEYWORD_TO_ES_FIELDNAME['author'], match_variation, with_operator_and=True)
                author_filters.append({'bool': {'must': [term_clause, match_clause]}})
        else:
            # Initials or a lone lastname: filter on the name variations only.
            author_filters = [
                {'term': {es.AUTHORS_NAME_VARIATIONS_FIELD: variation}}
                for variation in lowered_variations
            ]

        variations_filter = {'bool': {'should': author_filters}}
        full_name_match = {'match': {es.KEYWORD_TO_ES_FIELDNAME['author']: author_name}}
        return generate_nested_query(
            es.AUTHORS_NESTED_QUERY_PATH,
            {'bool': {'filter': variations_filter, 'must': full_name_match}})
def test_generate_match_query_with_operator_and_false():
    """Without the ``and`` operator the match query maps the field straight to the value."""
    expected = {'match': {'document_type': 'book'}}

    actual = generate_match_query('document_type', 'book', with_operator_and=False)

    assert actual == expected
def test_generate_match_query_with_bool_value():
    """A boolean value yields the plain match form even when the ``and`` operator is requested."""
    expected = {'match': {'core': True}}

    actual = generate_match_query('core', True, with_operator_and=True)

    assert actual == expected
    def _generate_title_queries(self, value):
        """Build the title query: a match on the title field plus queries for any symbol-like tokens.

        The match on the title field uses the ``and`` operator. Empty (falsy) sub-queries are dropped before the
        remaining ones are combined with ``use_must_clause=True``.
        """
        title_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['title']
        candidate_queries = (
            generate_match_query(title_field, value, with_operator_and=True),
            ElasticSearchVisitor._generate_queries_for_title_symbols(title_field, value),
        )
        non_empty_queries = list(filter(None, candidate_queries))
        return wrap_queries_in_bool_clauses_if_more_than_one(non_empty_queries, use_must_clause=True)
def test_generate_match_query_with_operator_and():
    """With the ``and`` operator the value is wrapped in a ``query``/``operator`` object."""
    expected = {'match': {'author': {'query': 'Ellis, John', 'operator': 'and'}}}

    actual = generate_match_query('author', 'Ellis, John', with_operator_and=True)

    assert actual == expected
def test_boolean_string_argument_in_query_case_insensitive():
    """Boolean-looking strings are lower-cased and matched plainly, whatever their casing."""
    spellings_by_normalized_value = (
        ('true', ("true", "True", "TRUE")),
        ('false', ("false", "False", "FALSE")),
    )

    for normalized_value, spellings in spellings_by_normalized_value:
        expected = {"match": {"citeable": normalized_value}}
        for spelling in spellings:
            query = generate_match_query('citeable', spelling, with_operator_and=True)
            assert expected == query
    def visit_value(self, node, fieldnames=None):
        """Translate a value node into an ElasticSearch query for the given field(s).

        Args:
            node: The value node; ``node.value`` and ``node.contains_wildcard`` are read.
            fieldnames: The ES field name, or a list of field names, to query. Any falsy value falls back to
                ``'_all'``.

        Returns:
            The query built by the branch that applies: wildcard, multi-field, author, exact-author, irn, title,
            type-code, unknown field, or a plain match query with the ``and`` operator requested.
        """
        # No explicit field given: search the catch-all field.
        if not fieldnames:
            fieldnames = '_all'

        if node.contains_wildcard:
            # Wildcard dates have their own dedicated query.
            if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                    'date'] == fieldnames:
                return self._generate_date_with_wildcard_query(node.value)

            bai_fieldnames = self._generate_fieldnames_if_bai_query(
                node.value,
                bai_field_variation=FieldVariations.search,
                query_bai_field_if_dots_in_name=True)

            # The BAI fields take precedence over the given field(s) whenever the helper returned any.
            query = self._generate_query_string_query(node.value,
                                                      fieldnames=bai_fieldnames
                                                      or fieldnames,
                                                      analyze_wildcard=True)

            # Author queries are wrapped in a nested query on the authors path.
            if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                    'author'] == fieldnames:
                return generate_nested_query(
                    ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, query)
            return query
        else:
            # Several fields at once: date and journal get special handling, the rest a multi_match.
            if isinstance(fieldnames, list):
                if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                        'date'] == fieldnames:
                    # Date queries with simple values are transformed into range queries, among the given and the exact
                    # next date, according to the granularity of the given date.
                    return self._generate_range_queries(
                        force_list(fieldnames),
                        {ES_RANGE_EQ_OPERATOR: node.value})

                if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                        'journal'] == fieldnames:
                    return self._generate_journal_nested_queries(node.value)

                return {
                    'multi_match': {
                        'fields': fieldnames,
                        'query': node.value,
                    }
                }
            else:
                if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                        'author'] == fieldnames:
                    bai_fieldnames = self._generate_fieldnames_if_bai_query(
                        node.value,
                        bai_field_variation=FieldVariations.search,
                        query_bai_field_if_dots_in_name=True)
                    if bai_fieldnames:
                        # A single BAI field is matched directly, inside a nested query on the authors path.
                        if len(bai_fieldnames) == 1:
                            query = {"match": {bai_fieldnames[0]: node.value}}
                            return generate_nested_query(
                                ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH,
                                query)
                        else:
                            # Not an exact BAI pattern match, but node's value looks like BAI (no spaces and dots),
                            # e.g. `S.Mele`. In this case generate a partial match query.
                            return self.visit_partial_match_value(
                                node, bai_fieldnames)

                    # No BAI fields returned: fall back to the regular author query.
                    return self._generate_author_query(node.value)

                elif ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                        'exact-author'] == fieldnames:
                    return self._generate_exact_author_query(node.value)

                elif ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                        'irn'] == fieldnames:
                    # IRN values are looked up as an exact term with the 'SPIRES-' prefix prepended.
                    return {
                        'term': {
                            fieldnames: ''.join(('SPIRES-', node.value))
                        }
                    }

                elif ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                        'title'] == fieldnames:
                    return self._generate_title_queries(node.value)

                elif ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                        'type-code'] == fieldnames:
                    return self._generate_type_code_query(node.value)

                elif fieldnames not in ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME.values(
                ):
                    # The field is not one of the known ES field names. Besides matching the field itself, also try
                    # the literal `<field>:<value>` text as a boosted texkey term and against `_all`. The three are
                    # combined with use_must_clause=False (presumably alternatives; confirm in the helper).
                    colon_value = ':'.join([fieldnames, node.value])
                    given_field_query = generate_match_query(
                        fieldnames, node.value, with_operator_and=True)
                    texkey_query = self._generate_term_query('texkeys.raw',
                                                             colon_value,
                                                             boost=2.0)
                    _all_field_query = generate_match_query(
                        '_all', colon_value, with_operator_and=True)
                    return wrap_queries_in_bool_clauses_if_more_than_one(
                        [given_field_query, texkey_query, _all_field_query],
                        use_must_clause=False)

                # A known field without special handling: plain match query on it.
                return generate_match_query(fieldnames,
                                            node.value,
                                            with_operator_and=True)
 def visit_value_op(self, node):
     """Match the value of the node's operator against ``_all``, requesting the ``and`` operator."""
     operator_value = node.op.value
     return generate_match_query('_all', operator_value, with_operator_and=True)