Example #1
 def test_intersection_returns_other(self):
     cs = CompleteSet({"bar"})
     assert cs.intersection({"foo"}) == {"foo"}
Example #2
 def test_union_returns_self(self):
     cs = CompleteSet({"bar"})
     assert cs.union({"foo"}) == {"bar"}
Example #3
 def test_always_contains_other_element(self):
     assert "foo" in CompleteSet()
Example #4
    def query(self, query=None, callback=None):
        """Performs a query against the index using the passed lunr.Query
        object.

        If performing programmatic queries against the index, this method is
        preferred over `lunr.Index.search` so as to avoid the additional query
        parsing overhead.

        Args:
            query (lunr.Query): A configured Query to perform the search
                against; use `create_query` to get a preconfigured object,
                or use `callback` for convenience.
            callback (callable): An optional function taking a single Query
                object, the result of `create_query`, for further
                configuration.
        """
        if query is None:
            query = self.create_query()

        if callback is not None:
            callback(query)

        if len(query.clauses) == 0:
            logger.warning(
                "Attempting a query with no clauses. Please add clauses by "
                "either using the `callback` argument or using `create_query` "
                "to create a preconfigured Query, manually adding clauses and "
                "passing it as the `query` argument.")
            return []

        # for each query clause
        # * process terms
        # * expand terms from token set
        # * find matching documents and metadata
        # * get document vectors
        # * score documents

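        # Accumulators used while processing clauses:
        # - matching_fields maps FieldRef strings to MatchData
        # - query_vectors holds one query Vector per indexed field
        # - term_field_cache records term/field combos already processed so
        #   their metadata is not collected twice
        # - required_matches / prohibited_matches track, per field, which
        #   documents must or must not appear in the results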
        matching_fields = {}
        query_vectors = {field: Vector() for field in self.fields}
        term_field_cache = {}
        required_matches = {}
        prohibited_matches = defaultdict(set)

        for clause in query.clauses:
            # Unless the pipeline has been disabled for this term, which is
            # the case for terms with wildcards, we need to pass the clause
            # term through the search pipeline. A pipeline returns an array
            # of processed terms. Pipeline functions may expand the passed
            # term, which means we may end up performing multiple index lookups
            # for a single query term.
            if clause.use_pipeline:
                terms = self.pipeline.run_string(clause.term,
                                                 {"fields": clause.fields})
            else:
                terms = [clause.term]

            clause_matches = set()

            for term in terms:
                # Each term returned from the pipeline needs to use the same
                # query clause object, e.g. the same boost and/or edit
                # distance. The simplest way to do this is to reuse the
                # clause object but mutate its term property.
                clause.term = term

                # From the term in the clause we create a token set which will
                # then be used to intersect the index's token set to get a
                # list of terms to look up in the inverted index.
                term_token_set = TokenSet.from_clause(clause)
                expanded_terms = self.token_set.intersect(
                    term_token_set).to_list()

                # If a term marked as required does not exist in the TokenSet,
                # it is impossible for the search to return any matches.
                # We set all the field-scoped required match sets to empty
                # and stop examining further clauses.
                if (len(expanded_terms) == 0
                        and clause.presence == QueryPresence.REQUIRED):
                    for field in clause.fields:
                        required_matches[field] = CompleteSet()
                    break

                for expanded_term in expanded_terms:
                    posting = self.inverted_index[expanded_term]
                    term_index = posting["_index"]

                    for field in clause.fields:
                        # For each field that this query term is scoped by
                        # (by default all fields are in scope) we need to get
                        # all the document refs that have this term in that
                        # field.
                        #
                        # The posting is the entry in the invertedIndex for the
                        # matching term from above.
                        field_posting = posting[field]
                        matching_document_refs = field_posting.keys()
                        term_field = expanded_term + "/" + field
                        matching_documents_set = set(matching_document_refs)

                        # If the presence of this term is required, ensure that
                        # the matching documents are added to the set of
                        # required matches for this clause.
                        if clause.presence == QueryPresence.REQUIRED:
                            clause_matches = clause_matches.union(
                                matching_documents_set)

                            if field not in required_matches:
                                required_matches[field] = CompleteSet()

                        # If the presence of this term is prohibited,
                        # ensure that the matching documents are added to the
                        # set of prohibited matches for this field, creating
                        # that set if it does not exist yet.
                        elif clause.presence == QueryPresence.PROHIBITED:
                            prohibited_matches[field] = prohibited_matches[
                                field].union(matching_documents_set)

                            # Prohibited matches should not be part of the
                            # query vector used for similarity scoring, and
                            # no metadata should be extracted, so we continue
                            # to the next field.
                            continue

                        # The query field vector is populated using the
                        # term_index found for the term and a unit value with
                        # the appropriate boost.
                        # Using upsert because there could already be an entry
                        # in the vector for the term we are working with.
                        # In that case we just add the scores together.
                        query_vectors[field].upsert(term_index, clause.boost,
                                                    lambda a, b: a + b)

                        # If we've already seen this term/field combo then
                        # we've already collected the matching documents and
                        # metadata, so there is no need to do it again.
                        if term_field in term_field_cache:
                            continue

                        for matching_document_ref in matching_document_refs:
                            # All metadata for this term/field/document triple
                            # are then extracted and collected into an instance
                            # of lunr.MatchData ready to be returned in the
                            # query results
                            matching_field_ref = FieldRef(
                                matching_document_ref, field)
                            metadata = field_posting[str(
                                matching_document_ref)]

                            if str(matching_field_ref) not in matching_fields:
                                matching_fields[str(
                                    matching_field_ref)] = MatchData(
                                        expanded_term, field, metadata)
                            else:
                                matching_fields[str(matching_field_ref)].add(
                                    expanded_term, field, metadata)

                        term_field_cache[term_field] = True

            # If the presence was required we need to update the required
            # matches field sets. We do this after all fields for the term
            # have collected their matches because the clause term's presence
            # is required in _any_ of the fields, not _all_ of the fields.
            if clause.presence == QueryPresence.REQUIRED:
                for field in clause.fields:
                    required_matches[field] = required_matches[
                        field].intersection(clause_matches)

        # We need to combine the field-scoped required and prohibited
        # matching documents into a global set of required and prohibited
        # matches.
        all_required_matches = CompleteSet()
        all_prohibited_matches = set()
        for field in self.fields:
            if field in required_matches:
                all_required_matches = all_required_matches.intersection(
                    required_matches[field])
            if field in prohibited_matches:
                all_prohibited_matches = all_prohibited_matches.union(
                    prohibited_matches[field])

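        # `matches` indexes results by document ref so that scores and match
        # data from multiple fields of the same document can be combined into
        # a single result entry.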
        matching_field_refs = matching_fields.keys()
        results = []
        matches = {}

        # If the query is negated (only contains prohibited terms) we need to
        # get _all_ field refs currently existing in the index. This is done
        # only for negated queries, to avoid the cost of fetching every field
        # ref unnecessarily. Additionally, blank match data must be created
        # to correctly populate the results.
        if query.is_negated():
            matching_field_refs = list(self.field_vectors.keys())

            for matching_field_ref in matching_field_refs:
                matching_fields[matching_field_ref] = MatchData()

        for matching_field_ref in matching_field_refs:
            # Currently we have document fields that match the query, but we
            # need to return documents. The match data and scores are combined
            # from multiple fields belonging to the same document.
            #
            # Scores are calculated by field, using the query vectors created
            # above, and combined into a final document score using addition.
            field_ref = FieldRef.from_string(matching_field_ref)
            doc_ref = field_ref.doc_ref

            if (doc_ref not in all_required_matches
                    or doc_ref in all_prohibited_matches):
                continue

            field_vector = self.field_vectors[matching_field_ref]
            score = query_vectors[field_ref.field_name].similarity(
                field_vector)

            try:
                doc_match = matches[doc_ref]
                doc_match["score"] += score
                doc_match["match_data"].combine(
                    matching_fields[matching_field_ref])
            except KeyError:
                match = {
                    "ref": doc_ref,
                    "score": score,
                    "match_data": matching_fields[matching_field_ref],
                }
                matches[doc_ref] = match
                results.append(match)

        return sorted(results, key=lambda a: a["score"], reverse=True)
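As the docstring notes, `query` is the preferred entry point for programmatic searches. A minimal usage sketch, assuming an already-built lunr `Index` named `idx` and the `Query.term` method for adding clauses (the term "foo" is illustrative):

# Option 1: configure the auto-created Query through `callback`.
results = idx.query(callback=lambda q: q.term("foo"))

# Option 2: build the Query up front and pass it as `query`.
q = idx.create_query()
q.term("foo")  # assumed clause-adding API; boosts etc. would be set here too
results = idx.query(q)

Both paths avoid the query-string parsing overhead of `Index.search`, which is exactly the trade-off the docstring describes.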