Example #1
0
 def test_out_of_range(self):
     """Four-digit term resembles a date partial but is not a valid date."""
     fragment, _ = util.parse_date('0699')
     partial = util.parse_date_partial(fragment)
     # The fragment is extracted, but it must not resolve to a year-month.
     self.assertIsNone(partial)
Example #2
0
 def test_last_millenium(self):
     """A date partial for a pre-2000 paper resolves to a 19xx year-month."""
     fragment, leftover = util.parse_date('old paper 9505')
     # The partial is resolved first, then both halves of the split checked.
     self.assertEqual(util.parse_date_partial(fragment), '1995-05')
     self.assertEqual(leftover, 'old paper', 'Should have a remainder')
Example #3
0
 def test_near_words(self):
     """A date partial sitting between other words is split out from them."""
     fragment, leftover = util.parse_date('foo 1902 bar')
     year_month = util.parse_date_partial(fragment)
     self.assertEqual(year_month, '2019-02')
     # The words on either side of the partial make up the remainder.
     self.assertEqual(leftover, "foo bar", "Should have remainder")
Example #4
0
 def test_in_word(self):
     """Digits embedded inside a longer word are a false positive."""
     embedded = 'notasearch1902foradatepartial'
     # Parsing is expected to fail rather than report a date fragment.
     with self.assertRaises(ValueError):
         _fragment, _leftover = util.parse_date(embedded)
Example #5
0
 def test_date_partial_only(self):
     """The whole term is a single four-digit date partial."""
     fragment, leftover = util.parse_date('1902')
     year_month = util.parse_date_partial(fragment)
     self.assertEqual(year_month, '2019-02')
     # Nothing besides the partial was supplied, so nothing is left over.
     self.assertEqual(leftover, '', "Should have no remainder")
Example #6
0
 def test_last_millenium(self):
     """A date partial for a pre-2000 paper resolves to a 19xx year-month."""
     fragment, leftover = util.parse_date("old paper 9505")
     # The partial is resolved first, then both halves of the split checked.
     self.assertEqual(util.parse_date_partial(fragment), "1995-05")
     self.assertEqual(leftover, "old paper", "Should have a remainder")
Example #7
0
def _query_all_fields(term: str) -> Q:
    """
    Build the query used for an all-fields search.

    At its core this is a `query_string` search against a "combined" field
    that holds tokens from every searchable metadata field of a paper. All
    tokens in the query must match in that combined field.

    A single combined field is used, rather than merging per-field queries,
    because:

    - Term-centric querying across fields (e.g. the `cross_fields` type of
      `query_string` or `multi_match` searches) requires every field to share
      one analyzer. Constraining analyzer choice per field is a drag, so the
      combined field lets individual fields be configured freely while the
      all-fields search still behaves the way users expect.
    - A disjunctive search across all fields cannot guarantee that every term
      matches (with the disjunctive operator inside each field), and cannot
      handle queries that span fields (with the conjunctive operator inside
      each field).

    Alongside the combined query, disjunctive queries are run against the
    individual fields to produce field-specific hits and to give control over
    scoring.

    Weights are applied with :class:`.SF` (score functions). Currently the
    per-field queries receive monotonically decreasing weights in the order
    they are listed below. More elaborate score functions, if introduced,
    belong here.

    Parameters
    ----------
    term : str
        A query string.

    Returns
    -------
    :class:`.Q`
        A search-ready query part, including score functions.

    """
    # TeX queries are only ever run against the title and the abstract.
    if is_tex_query(term):
        tex_in_title = _tex_query("title", term)
        tex_in_abstract = _tex_query("abstract", term)
        return tex_in_title | tex_in_abstract

    def _whole_query_in_one_field(text: str) -> Q:
        # Disjunction of the per-field queries, each built with the
        # conjunctive operator. The operator spellings are kept exactly as
        # each builder has always been called.
        return reduce(
            ior,
            [
                _query_paper_id(text, operator="AND"),
                author_query(text, operator="AND"),
                _query_title(text, default_operator="and"),
                _query_abstract(text, default_operator="and"),
                _query_comments(text, default_operator="and"),
                orcid_query(text, operator="and"),
                author_id_query(text, operator="and"),
                _query_doi(text, operator="and"),
                _query_journal_ref(text, operator="and"),
                _query_report_num(text, operator="and"),
                _query_acm_class(text, operator="and"),
                _query_msc_class(text, operator="and"),
                _query_primary(text, operator="and"),
                _query_secondary(text, operator="and"),
            ],
        )

    match_all_fields = _query_combined(term)

    # Matches of any term in any field are included so that results can be
    # highlighted and scored appropriately. The list order drives the
    # weighting at the bottom of this function.
    queries = [
        _query_paper_id(term, operator="or"),
        author_query(term, operator="or"),
        _query_title(term, default_operator="or"),
        _query_abstract(term, default_operator="or"),
        _query_comments(term, default_operator="or"),
        orcid_query(term, operator="or"),
        author_id_query(term, operator="or"),
        _query_doi(term, operator="or"),
        _query_journal_ref(term, operator="or"),
        _query_report_num(term, operator="or"),
        _query_acm_class(term, operator="or"),
        _query_msc_class(term, operator="or"),
        _query_primary(term, operator="or"),
        _query_secondary(term, operator="or"),
    ]

    # A paper whose single field satisfies the entire query counts as
    # responsive even when the combined-field query does not respond.
    match_individual_field = _whole_query_in_one_field(term)

    # The query may carry a date-related term, read as the announcement date
    # of v1 of the paper. Supported spellings are the "standard" ``yyyy`` and
    # ``yyyy-MM`` formats plus the legacy ``yyMM`` format.
    #
    # Strategy: try to peel a date fragment off the query. When one is found,
    # the remainder is searched across all fields while the fragment is
    # turned into queries on the announcement date of the original version.
    date_fragment: Optional[str]
    remainder: Optional[str]
    try:
        date_fragment, remainder = parse_date(term)
    except ValueError:
        # Nothing date-like was found; carry on with the full term only.
        date_fragment, remainder = None, None

    if date_fragment:
        logger.debug("date: %s; remainder: %s", date_fragment, remainder)
        logger.debug("date_fragment: %s", date_fragment)
        date_clauses: List[Q] = []

        # First interpretation: the legacy yyMM date partial.
        legacy_partial = parse_date_partial(date_fragment)
        logger.debug("date_partial: %s", legacy_partial)
        if legacy_partial is not None:
            date_clauses.append(
                Q("term", announced_date_first=legacy_partial)
            )

        # Second interpretation: the yyyy-MM and yyyy formats.
        announced = _query_announcement_date(date_fragment)
        if announced:
            date_clauses.append(announced)

        if date_clauses:
            # Wrapping the sub-queries in one named top-level query is the
            # only way to learn afterwards whether the announcement date
            # matched: the ``_name`` then appears in the
            # ``.meta.matched_queries`` property of the search result.
            match_date = Q(
                "bool",
                should=date_clauses,
                minimum_should_match=1,
                _name="announced_date_first",
            )
            logger.debug("match date: %s", match_date)
            # Placed first, so it receives the largest weight below.
            queries.insert(0, match_date)

            # Combine the announcement-date query with the all-fields
            # queries.
            if remainder:
                match_remainder = _query_combined(remainder)
                match_all_fields |= match_remainder & match_date

                match_sans_date = _whole_query_in_one_field(remainder)
                match_individual_field |= match_sans_date & match_date
            else:
                match_all_fields |= match_date

    query = match_all_fields | match_individual_field
    query &= Q("bool", should=queries)  # Partial matches across fields.

    # Walk the list back to front so the last query gets weight 1 and the
    # first one gets the highest weight.
    scores = [
        SF({"weight": rank, "filter": clause})
        for rank, clause in enumerate(reversed(queries), start=1)
    ]
    return Q(
        "function_score",
        query=query,
        score_mode="sum",
        functions=scores,
        boost_mode="multiply",
    )