Esempio n. 1
0
def _query_paper_id(term: str, operator: str = "and") -> Q:
    """
    Build a query that matches ``term`` against the paper identifier fields.

    Parameters
    ----------
    term : str
        Raw search term; it is escaped before being handed to :func:`Q_`.
    operator : str
        Match operator forwarded to :func:`Q_`; lower-cased first.

    Returns
    -------
    :class:`.Q`
        ``paper_id`` OR ``paper_id_v`` match, plus a ``*/<term>`` wildcard on
        ``paper_id`` when :func:`is_old_papernum` accepts the term.

    """
    operator = operator.lower()
    logger.debug(f"query paper ID with: {term}")

    by_id = Q_("match", "paper_id", escape(term), operator=operator)
    by_id_v = Q_("match", "paper_id_v", escape(term), operator=operator)
    query = by_id | by_id_v

    if not is_old_papernum(term):
        return query
    # The wildcard allows any prefix ending in "/" before the term.
    return query | Q("wildcard", paper_id=f"*/{term}")
Esempio n. 2
0
def _query_journal_ref(term: str, boost: int = 1, operator: str = "and") -> Q:
    """
    Build a ``query_string`` query restricted to the ``journal_ref`` field.

    Parameters
    ----------
    term : str
        Raw search term; escaped before use.
    boost : int
        Part of the signature but not used by this function.
    operator : str
        Passed through as the query's ``default_operator``.

    Returns
    -------
    :class:`.Q`

    """
    params = {
        "fields": ["journal_ref"],
        "default_operator": operator,
        "allow_leading_wildcard": False,
        "query": escape(term),
    }
    return Q("query_string", **params)
Esempio n. 3
0
def _query_comments(term: str, default_operator: str = "AND") -> Q:
    """
    Build a ``query_string`` query restricted to the ``comments`` field.

    Parameters
    ----------
    term : str
        Raw search term; escaped before use.
    default_operator : str
        Passed through as the query's ``default_operator``.

    Returns
    -------
    :class:`.Q`

    """
    target_fields = ["comments"]
    escaped_term = escape(term)
    return Q(
        "query_string",
        fields=target_fields,
        default_operator=default_operator,
        allow_leading_wildcard=False,
        query=escaped_term,
    )
Esempio n. 4
0
def string_query(term: str, path: str = "authors", operator: str = "AND") -> Q:
    """
    Build a nested query that applies a query string within a single author.

    Parameters
    ----------
    term : str
        Raw search term; escaped before use.
    path : str
        Nested document path; the query targets ``<path>.full_name``.
    operator : str
        Passed through as the inner query's ``default_operator``.

    Returns
    -------
    :class:`.Q`
        A ``nested`` query on ``path`` with ``score_mode="sum"``.

    """
    inner_params = {
        "fields": [f"{path}.full_name"],
        "default_operator": operator,
        "allow_leading_wildcard": False,
        "type": "cross_fields",
        "query": escape(term),
    }
    inner = Q("query_string", **inner_params)
    return Q("nested", path=path, query=inner, score_mode="sum")
Esempio n. 5
0
def _query_abstract(term: str, default_operator: str = "AND") -> Q:
    """
    Build a named ``query_string`` query over the abstract fields.

    ``abstract.english`` is always searched; the plain ``abstract`` field is
    added when :func:`is_literal_query` accepts the term.

    Parameters
    ----------
    term : str
        Raw search term; escaped before use.
    default_operator : str
        Passed through as the query's ``default_operator``.

    Returns
    -------
    :class:`.Q`
        A query tagged with ``_name="abstract"``.

    """
    if is_literal_query(term):
        search_fields = ["abstract.english", "abstract"]
    else:
        search_fields = ["abstract.english"]

    return Q(
        "query_string",
        fields=search_fields,
        default_operator=default_operator,
        allow_leading_wildcard=False,
        query=escape(term),
        _name="abstract",
    )
Esempio n. 6
0
def _query_combined(term: str) -> Q:
    """
    Build a ``query_string`` query against the ``combined`` field.

    The term is passed through :func:`wildcard_escape`; if that reports a
    wildcard its output is used, otherwise the term goes through
    :func:`escape`. Either way the result is lower-cased.

    Parameters
    ----------
    term : str
        Raw search term.

    Returns
    -------
    :class:`.Q`
        A query whose default operator is ``AND``.

    """
    # Only wildcards in literals should be escaped.
    escaped_literal, found_wildcard = wildcard_escape(term)
    if found_wildcard:
        query_term = escaped_literal
    else:
        query_term = escape(term)

    # All terms must match in the combined field.
    return Q(
        "query_string",
        fields=["combined"],
        default_operator="AND",
        allow_leading_wildcard=False,
        query=query_term.lower(),
    )
Esempio n. 7
0
def _query_title(term: str, default_operator: str = "AND") -> Q:
    """
    Build a query over the title fields.

    Terms accepted by :func:`is_tex_query` become an unescaped ``match`` on
    ``title.tex``. Everything else becomes a ``query_string`` on
    ``title.english``, plus the plain ``title`` field when
    :func:`is_literal_query` accepts the term.

    Parameters
    ----------
    term : str
        Raw search term.
    default_operator : str
        ``default_operator`` for the ``query_string`` form; unused for TeX.

    Returns
    -------
    :class:`.Q`

    """
    if is_tex_query(term):
        tex_clause = {"title.tex": {"query": term}}
        return Q("match", **tex_clause)

    search_fields = (
        ["title.english", "title"]
        if is_literal_query(term)
        else ["title.english"]
    )
    return Q(
        "query_string",
        fields=search_fields,
        default_operator=default_operator,
        allow_leading_wildcard=False,
        query=escape(term),
    )
Esempio n. 8
0
def part_query(term: str, path: str = "authors") -> Q:
    """
    Build a nested query that matches name parts within a single author.

    Without a comma, the whole term is matched across the full-name,
    last-name and initialized-full-name fields of one nested document.

    With a comma, the text before the first comma is the surname. The
    remaining comma-separated pieces are stripped and joined with single
    spaces to form the forename (or initials).

    Parameters
    ----------
    term : str
        Search term for a single author.
    path : str
        Nested document path.

    Returns
    -------
    :class:`.Q`
        A ``nested`` query on ``path`` with ``score_mode="sum"``.

    """
    term = term.strip()
    logger.debug(f"{path} part_query for {term}")

    if "," not in term:
        # We cannot tell which bits of the term belong to which name part, so
        # match across all name fields; query_string also handles wildcards
        # and literals.
        across_fields = Q(
            "query_string",
            fields=[
                f"{path}.full_name",
                f"{path}.last_name",
                f"{path}.full_name_initialized",
            ],
            default_operator="AND",
            allow_leading_wildcard=False,
            type="cross_fields",
            query=escape(term),
        )
        return Q("nested", path=path, query=across_fields, score_mode="sum")

    # A comma separates surname from forename: the whole term is one author.
    surname, *given = [piece.strip() for piece in term.split(",")]
    forename = " ".join(given).strip()

    # query_string so that wildcards and literals in the surname just work.
    within_author = Q(
        "query_string",
        fields=[f"{path}.last_name"],
        query=escape(surname),
        default_operator="AND",
        allow_leading_wildcard=False,
    )

    if forename:
        logger.debug(f"Forename: {forename}")
        if has_wildcard(forename):
            # A wildcard forces a query_string query: term order is lost,
            # but the wildcard behaves as expected.
            by_forename = Q(
                "query_string",
                fields=[f"{path}.first_name"],
                query=escape(forename),
                auto_generate_phrase_queries=True,
                default_operator="AND",
                allow_leading_wildcard=False,
            )
        else:
            # Phrase match; the last word may match as a prefix.
            by_forename = Q("match_phrase_prefix",
                            **{f"{path}__first_name": forename})

        if path == "authors":
            # The forename may really be initials or a partial forename, so
            # the initials field is tried as an alternative.
            logger.debug("Consider initials: %s", forename)
            by_forename |= Q("match_phrase_prefix",
                             **{f"{path}__initials": forename})

        # Surname and forename must match in the same (nested) author.
        within_author = within_author & by_forename

    return Q("nested", path=path, query=within_author, score_mode="sum")
Esempio n. 9
0
def Q_(qtype: str, field: str, value: str, operator: str = "") -> Q:
    """
    Generate an appropriate :class:`Q` based on wildcard presence.

    Parameters
    ----------
    qtype : str
        Query type to use when ``value`` has no wildcard (e.g. ``"match"``).
    field : str
        Name of the field to query.
    value : str
        Search value; it is escaped here before being placed in the query.
    operator : str
        Optional operator (``"and"``/``"or"``) for ``match`` queries. When
        empty (the default) the short ``{field: value}`` form is emitted,
        as before. Ignored for wildcard queries and for other query types,
        which do not all accept an ``operator`` option.

    Returns
    -------
    :class:`.Q`
        A ``wildcard`` query if ``value`` contains a wildcard, otherwise a
        query of type ``qtype``.

    """
    if has_wildcard(value):
        return Q("wildcard", **{field: {"value": escape(value)}})
    if operator and qtype == "match":
        # The expanded form is required to carry the operator option.
        return Q(
            qtype, **{field: {"query": escape(value), "operator": operator}}
        )
    return Q(qtype, **{field: escape(value)})
Esempio n. 10
0
def author_query(term: str, operator: str = "and") -> Q:
    """
    Construct a query based on author (and owner) names.

    Substrings delimited by semicolons should only match if the terms in that
    substring match within a single author.

    If a substring (delimited or not) contains a comma, everything before the
    first comma will be treated as a surname, and the remainder treated as
    either the forename or initials. In this scenario, all terms must match
    within a single author.

    Otherwise, we will simply match all of the parts of the query across all
    of the available author/owner fields. Each part of the query must match in
    at least one field in at least one author/owner.

    Parameters
    ----------
    term: str
        Raw querystring. Should not be escaped or normalized in any way.
    operator : str
        Default: 'and' (compared case-insensitively); anything else is
        treated as 'OR'. If 'OR', relaxes the requirement that all parts of
        the query match. This is useful for "all fields" searches, in which
        only part of the query may be expected to match on an author/owner
        name.

    Returns
    -------
    :class:`.Q`
        An Elasticsearch DSL query part.

    """
    logger.debug(f"Author query for {term}")
    term = term.lower()
    combine = iand if operator.upper() == "AND" else ior

    # Check for balanced double-quotes.
    if '"' in term and term.count('"') % 2 == 0:  # Probably a literal search.
        logger.debug(f"Contains literal: {term}")

        # Apply literal parts of the query separately.
        literal_queries = [
            (string_query(part, operator=operator)
             | string_query(part, path="owners", operator=operator))
            for part in re.split(STRING_LITERAL, term)
            if part and part.strip()
        ]
        # reduce() without an initial value raises TypeError on an empty
        # list (e.g. the term '""'), so only reduce when there are parts;
        # otherwise fall through to the handling below.
        if literal_queries:
            return reduce(combine, literal_queries)

    term = term.replace('"', "")  # Just ignore unbalanced quotes.

    if ";" in term:  # Authors are individuated.
        logger.debug(f"Authors are individuated: {term}")
        logger.debug(f"Operator: {operator}")
        # Skip whitespace-only parts: a trailing "; " would otherwise send a
        # blank term to part_query and, under AND, constrain the whole query.
        author_queries = [
            (part_query(author_part) | part_query(author_part, "owners"))
            for author_part in term.split(";")
            if author_part.strip()
        ]
        # Same empty-sequence guard as above (e.g. the term ";").
        if author_queries:
            return reduce(combine, author_queries)

    if "," in term:  # Forename is individuated.
        logger.debug(f"Forename is individuated: {term}")
        return part_query(term) | part_query(term, "owners")

    logger.debug(f"General author search: {term}")
    escaped = escape(term, quotes=True)

    # We include both w/in author and among author matches, so that more
    # precise matches get more weight.
    #
    # A query_string query on the combined field will yield matches among
    # authors.
    q = Q(
        "query_string",
        fields=["authors_combined"],
        query=escaped,
        default_operator="and",
    )

    # A nested query_string query on full name will match within individual
    # authors (and, likewise, within individual owners).
    q |= Q(
        "nested",
        path="authors",
        score_mode="sum",
        query=Q(
            "query_string",
            fields=["authors.full_name"],
            default_operator=operator,
            allow_leading_wildcard=False,
            query=escaped,
        ),
    ) | Q(
        "nested",
        path="owners",
        score_mode="sum",
        query=Q(
            "query_string",
            fields=["owners.full_name"],
            default_operator=operator,
            allow_leading_wildcard=False,
            query=escaped,
        ),
    )
    return q