Example #1
    def evaluate(self, query: str, options: dict, callback: Callable[[dict], Any]) -> None:
        """
        Evaluates the given query, doing a "phrase prefix search".  E.g., for a supplied query phrase like
        "to the be", we return documents that contain phrases like "to the bearnaise", "to the best",
        "to the behemoth", and so on. I.e., we require that the query phrase starts on a token boundary in the
        document, but it doesn't necessarily have to end on one.

        The matching documents are ranked according to how many times the query substring occurs in the document,
        and only the "best" matches are returned to the client via the supplied callback function. Ties are
        resolved arbitrarily.

        The client can supply a dictionary of options that controls this query evaluation process: The maximum
        number of documents to return to the client is controlled via the "hit_count" (int) option.

        The callback function supplied by the client will receive a dictionary having the keys "score" (int) and
        "document" (Document).
        """
        if not query:
            return
        # Normalize the query the same way the haystack was normalized when it was indexed.
        pattern = self._normalizer.normalize(query).replace('  ', ' ').lower()
        # Fall back to a default of 10 hits if the option is absent.
        sieve = Sieve(max(1, options.get('hit_count', 10)))

        # Locate the matching suffixes. The search is assumed to populate
        # self._counter with occurrence counts keyed on (term, document_id).
        self.binarySearch(self._suffixes, 0, len(self._suffixes), pattern)
        for term, doc_id in self._counter:
            freq = self._counter[(term, doc_id)]
            sieve.sift(freq, doc_id)

        # Emit the best matches, ranked by occurrence count, then reset the counter.
        for win in sieve.winners():
            doc = self._corpus.get_document(win[1])
            callback({'score': win[0], 'document': doc})
        self._counter.clear()
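
The examples above lean on a lower-bound binary search over the suffix array. That helper is not shown in this listing; the sketch below is a minimal stand-in, assuming (as Example #2 makes visible) that the haystack holds (document_id, text) pairs and each suffix entry is a (haystack_index, offset) pair.

def binary_search_suffixes(haystack, suffixes, needle: str) -> int:
    # Lower-bound search: return the first position in the sorted suffix
    # array whose suffix compares greater than or equal to the needle.
    low, high = 0, len(suffixes)
    while low < high:
        mid = (low + high) // 2
        index, offset = suffixes[mid]
        # Compare only as much of the suffix as the needle is long, so that
        # the comparison doubles as a starts-with check at the match point.
        if haystack[index][1][offset:offset + len(needle)] < needle:
            low = mid + 1
        else:
            high = mid
    return low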
Example #2
    def evaluate(self, query: str, options: dict,
                 callback: Callable[[dict], Any]) -> None:
        """
        Evaluates the given query, doing a "phrase prefix search".  E.g., for a supplied query phrase like
        "to the be", we return documents that contain phrases like "to the bearnaise", "to the best",
        "to the behemoth", and so on. I.e., we require that the query phrase starts on a token boundary in the
        document, but it doesn't necessarily have to end on one.

        The matching documents are ranked according to how many times the query substring occurs in the document,
        and only the "best" matches are returned to the client via the supplied callback function. Ties are
        resolved arbitrarily.

        The client can supply a dictionary of options that controls this query evaluation process: The maximum
        number of documents to return to the client is controlled via the "hit_count" (int) option.

        The callback function supplied by the client will receive a dictionary having the keys "score" (int) and
        "document" (Document).
        """

        # Search for the needle in the haystack, using binary search. Define that the empty query matches
        # nothing, not everything.
        needle = self._normalize(query)
        if not needle:
            return
        where_start = self._binary_search(needle)

        # Helper predicate. Checks if the identified suffix starts with the needle. Since slicing implies copying,
        # cap the length of the slice to the length of the needle. The starts-with relation then becomes the same
        # as equality, which is quick to check.
        def _is_match(i: int) -> bool:
            (index, offset) = self._suffixes[i]
            return self._haystack[index][1][offset:offset + len(needle)] == needle

        # Suffixes sharing a prefix are consecutive in the suffix array. Scan ahead from the located index until
        # we no longer get a match. We expect a low number of matches for typical queries, and we process all the
        # matches below anyway. If we just wanted to count the number of matches without processing them, we
        # could instead of a linear scan do another binary search to locate where the range ends.
        matches = itertools.takewhile(_is_match,
                                      range(where_start, len(self._suffixes)))

        # Deduplicate. A document in the haystack might contain multiple occurrences of the needle.
        # Rank according to occurrence count, and emit in ranked order. Note that the takewhile
        # iterator is always truthy, so materialize the matches before testing for emptiness.
        pairs = [self._suffixes[i] for i in matches]
        if pairs:
            debug = options.get("debug", False)
            if debug:
                for pair in pairs:
                    print("*** MATCH", pair, self._get_suffix2(pair))
            counter = Counter(index for (index, _) in pairs)
            sieve = Sieve(max(1, min(100, options.get("hit_count", 10))))
            for (document_id, count) in counter.items():
                sieve.sift(count, document_id)
            for (score, document_id) in sieve.winners():
                self._emit_match(document_id, score, callback)
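
A hypothetical usage sketch of the callback contract described in the docstring; engine stands in for a fully constructed search object and is not part of the listing above.

matches = []  # the callback just collects the emitted {"score", "document"} dictionaries
engine.evaluate("to the be", {"hit_count": 5}, matches.append)
for match in matches:
    print(match["score"], match["document"])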
Example #3
    def evaluate(self, query: str, options: dict, ranker: Ranker,
                 callback: Callable[[dict], Any]) -> None:
        """
        Evaluates the given query, doing N-out-of-M ranked retrieval. I.e., for a supplied query having M terms,
        a document is considered to be a match if it contains at least N <= M of those terms.

        The matching documents are ranked by the supplied ranker, and only the "best" matches are returned to the
        client via the supplied callback function.

        The client can supply a dictionary of options that controls this query evaluation process: The value of
        N is inferred from the query via the "recall_threshold" (float) option, and the maximum number of documents
        to return to the client is controlled via the "hit_count" (int) option.
        """
        # Create one posting-list iterator per unique term in the query.
        terms = self._inverted_index.get_terms(query)
        qcounter = Counter(terms)
        unique_terms = tuple(qcounter.keys())
        multi = tuple(qcounter.values())
        sieve = Sieve(options["hit_count"])
        iter_list = [self._inverted_index.get_postings_iterator(t) for t in unique_terms]
        # Compute how many of the query terms must occur in a document for it to match.
        recall_threshold = options["recall_threshold"]
        min_treff = max(1, math.floor(recall_threshold * len(iter_list)))
        # Prime a list of cursors with the first posting from each iterator, and count
        # how many iterators are non-exhausted to begin with.
        peek = [next(it, None) for it in iter_list]
        antall_iter = sum(1 for p in peek if p is not None)

        def finn_minste():
            # Find the cursor that currently has the lowest document identifier,
            # ignoring exhausted iterators. Returns None if all are exhausted.
            candidates = [p for p in peek if p is not None]
            if not candidates:
                return None
            return min(candidates, key=lambda p: p.document_id)

        while antall_iter >= min_treff:
            min_doc = finn_minste()
            if min_doc is None:
                break
            # Check whether the lowest document identifier satisfies the recall threshold.
            ranker.reset(min_doc.document_id)
            antall_treff = 0
            for n in range(len(peek)):
                if peek[n] is not None and peek[n].document_id == min_doc.document_id:
                    antall_treff += 1
                    ranker.update(unique_terms[n], multi[n], peek[n])
            # Evaluate the ranking and toss the result into the sieve.
            if antall_treff >= min_treff:
                sieve.sift(ranker.evaluate(), min_doc.document_id)
            # Advance the cursors that mention this document.
            for n in range(len(iter_list)):
                if peek[n] is not None and peek[n].document_id == min_doc.document_id:
                    peek[n] = next(iter_list[n], None)
                    if peek[n] is None:
                        antall_iter -= 1
            # Rinse and repeat.
        # Alert the client about the best-matching documents, in ranked order. Note that the
        # ranker's scores are floats; truncating them to int would distort the ranking
        # reported to the client.
        for (score, document_id) in sieve.winners():
            callback({
                "score": score,
                "document": self._corpus.get_document(document_id)
            })
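
Stripped of ranking, the document-at-a-time traversal above reduces to counting, per document, how many of the sorted posting lists mention it. A minimal sketch over plain sorted lists of document identifiers (not the course's posting objects):

import heapq
from itertools import groupby

def n_of_m(posting_lists: list, n: int) -> list:
    # Merge the sorted document-id lists into one sorted stream, then keep
    # each document identifier that occurs in at least n of the lists.
    merged = heapq.merge(*posting_lists)
    return [doc_id for doc_id, group in groupby(merged)
            if sum(1 for _ in group) >= n]

assert n_of_m([[1, 2, 5], [2, 5, 7], [2, 6]], 2) == [2, 5]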
Example #4
    def evaluate(self, query: str, options: dict, ranker: Ranker,
                 callback: Callable[[dict], Any]) -> None:
        """
        Evaluates the given query, doing N-out-of-M ranked retrieval. I.e., for a supplied query having M terms,
        a document is considered to be a match if it contains at least N <= M of those terms.

        The matching documents are ranked by the supplied ranker, and only the "best" matches are returned to the
        client via the supplied callback function.

        The client can supply a dictionary of options that controls this query evaluation process: The value of
        N is inferred from the query via the "match_threshold" (float) option, and the maximum number of documents
        to return to the client is controlled via the "hit_count" (int) option.

        The callback function supplied by the client will receive a dictionary having the keys "score" (float) and
        "document" (Document).
        """

        # Print verbose debug information?
        debug = options.get("debug", False)

        # Produce the query terms. We must use the same string processing here as we used when
        # building up the inverted index. Some terms might be duplicated (e.g., as in the query
        # "to be or not to be").
        query_terms = self._inverted_index.get_terms(query)
        unique_query_terms = list(Counter(query_terms).items())

        # Get the posting lists for the unique query terms.
        posting_lists = [
            self._inverted_index[term] for (term, _) in unique_query_terms
        ]

        # We require that at least N of the M query terms are present in the document,
        # for the document to be considered part of the result set. What should the minimum
        # value of N be?
        # TODO: Take multiplicity into account, and not just uniqueness.
        match_threshold = max(0.0, min(1.0,
                                       options.get("match_threshold", 0.5)))
        required_minimum = max(
            1,
            min(len(unique_query_terms),
                int(match_threshold * len(unique_query_terms))))

        # When traversing the posting lists using document-at-a-time traversal, we need to keep track
        # of where we are in each of the posting lists. Initially, all the cursors "point to" the first entry
        # in each posting list. Keep track of which posting lists that remain to be fully traversed.
        all_cursors = [next(p, None) for p in posting_lists]
        remaining_cursor_ids = [
            i for i in range(len(all_cursors)) if all_cursors[i] is not None
        ]

        # We're doing ranked retrieval. Assess relevance scores per document as we go along, as we're doing
        # document-at-a-time traversal. Keep track of the K highest-scoring documents.
        sieve = Sieve(max(1, min(100, options.get("hit_count", 10))))

        # We're doing at least N-of-M matching. As we reach the end of the posting lists, we can abort when
        # the number of non-exhausted lists drops below the required minimum N.
        while len(remaining_cursor_ids) >= required_minimum:

            # The posting lists are sorted by the document identifiers in ascending order. Define the
            # "frontier" as the subset of non-exhausted posting lists that mention the lowest document
            # identifier. In a sense, if we imagine scanning the posting lists from left to right, the
            # frontier is the subset that has the "leftmost" cursors.
            # TODO: This can easily be done in a single pass over the remaining lists.
            document_id = min(
                [all_cursors[i].document_id for i in remaining_cursor_ids])
            frontier_cursor_ids = [
                i for i in remaining_cursor_ids
                if all_cursors[i].document_id == document_id
            ]

            # The number of elements on the "frontier" needs to be at least N. Otherwise, these documents
            # don't contain enough of the query terms, and aren't part of the result set.
            if len(frontier_cursor_ids) >= required_minimum:
                ranker.reset(document_id)
                for i in frontier_cursor_ids:
                    ranker.update(unique_query_terms[i][0],
                                  unique_query_terms[i][1], all_cursors[i])
                score = ranker.evaluate()
                sieve.sift(score, document_id)
                if debug:
                    print("*** MATCH")
                    print("document =", self._corpus[document_id])
                    print(
                        "matches  =", {
                            unique_query_terms[i][0]: all_cursors[i]
                            for i in frontier_cursor_ids
                        })
                    print("score    =", score)

            # Move along the cursors on the frontier. The cursors not on the frontier remain where they
            # are. We may or may not reach the end of some posting lists when we advance, so the set of
            # remaining non-exhausted lists might shrink.
            for i in frontier_cursor_ids:
                all_cursors[i] = next(posting_lists[i], None)
            remaining_cursor_ids = [
                i for i in range(len(all_cursors)) if all_cursors[i] is not None
            ]

        # Alert the client about the best-matching documents, using the supplied callback function.
        # Emit documents sorted according to their relevancy scores.
        for (score, document_id) in sieve.winners():
            callback({"score": score, "document": self._corpus[document_id]})
Example #5
    def evaluate(self, query: str, options: dict, callback: Callable[[dict], Any]) -> None:

        """
        Evaluates the given query, doing a "phrase prefix search".  E.g., for a supplied query phrase like
        "to the be", we return documents that contain phrases like "to the bearnaise", "to the best",
        "to the behemoth", and so on. I.e., we require that the query phrase starts on a token boundary in the
        document, but it doesn't necessarily have to end on one.

        The matching documents are ranked according to how many times the query substring occurs in the document,
        and only the "best" matches are returned to the client via the supplied callback function. Ties are
        resolved arbitrarily.

        The client can supply a dictionary of options that controls this query evaluation process: The maximum
        number of documents to return to the client is controlled via the "hit_count" (int) option.
        """
        if query == "":
                return
        treff_id = []
        treff_antall = []
        query = self._normalize(query)
        # lager pekere
        suffix_peker = self._binary_search(query)  # peker på hvor vi er i suffix_array
        haystack_peker = self._suffixes[suffix_peker]  # peker på hvor vi er i teksten
        hit_count = options['hit_count']
        #suffix_peker er nå på første forekomst av første tegn
        tekst = self._haystack[haystack_peker[0]][1]
        query_peker = 1  # peker på hvor vi er i query
        tekst_peker = haystack_peker[1] + 1  # HVA FAEN ER DET HER?
        while query[0] == tekst[haystack_peker[1]]:
            while tekst_peker == len(tekst):
                # No more characters in this text: reset the cursors.
                query_peker = 1
                suffix_peker += 1
                if suffix_peker >= len(self._suffixes):
                    break
                # Fetch the next occurrence from the suffix array.
                haystack_peker = self._suffixes[suffix_peker]
                tekst = self._haystack[haystack_peker[0]][1]
                tekst_peker = haystack_peker[1] + 1
            if suffix_peker >= len(self._suffixes):
                # The suffix array is exhausted.
                break
            if query[query_peker] == tekst[tekst_peker]:
                # Correct character in the correct position: advance both cursors.
                tekst_peker += 1
                query_peker += 1
            else:
                # Mismatch: reset the cursors and fetch the next entry from the suffix array.
                query_peker = 1
                suffix_peker += 1
                if suffix_peker < len(self._suffixes):
                    haystack_peker = self._suffixes[suffix_peker]
                    tekst = self._haystack[haystack_peker[0]][1]
                    tekst_peker = haystack_peker[1] + 1

            if query_peker >= len(query):
                # The whole query matched: register a hit for this document.
                registrert = False
                for n in range(len(treff_id)):
                    # Check whether a hit has already been registered for this document id.
                    if treff_id[n] == self._haystack[haystack_peker[0]][0]:
                        registrert = True
                        treff_antall[n] += 1
                if not registrert:
                    # First hit for this document.
                    treff_id.append(self._haystack[haystack_peker[0]][0])
                    treff_antall.append(1)
                # Reset the cursors and fetch the next entry from the suffix array.
                query_peker = 1
                suffix_peker += 1
                if suffix_peker < len(self._suffixes):
                    haystack_peker = self._suffixes[suffix_peker]
                    tekst = self._haystack[haystack_peker[0]][1]
                    tekst_peker = haystack_peker[1] + 1
            if tekst_peker >= len(tekst):
                # The text is exhausted: reset the cursors and fetch the next entry.
                query_peker = 1
                suffix_peker += 1
                if suffix_peker < len(self._suffixes):
                    haystack_peker = self._suffixes[suffix_peker]
                    tekst = self._haystack[haystack_peker[0]][1]
                    tekst_peker = haystack_peker[1] + 1

            if suffix_peker >= len(self._suffixes):
                # No more entries in the suffix array.
                break
        # End of scan: rank the registered documents by occurrence count and emit the winners.
        sieve = Sieve(hit_count)
        for n in range(len(treff_id)):
            sieve.sift(treff_antall[n], treff_id[n])
        for (score, document_id) in sieve.winners():
            callback({"score": int(score), "document": self._corpus.get_document(document_id)})
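
A naive reference that a suffix-array implementation like the one above can be checked against: scan every normalized document text directly and count needle occurrences that start on a token boundary. Quadratic, but easy to trust. The (document_id, text) haystack layout is assumed from the examples above, and a token boundary is approximated here by a preceding space.

def count_matches_naive(haystack, needle: str) -> dict:
    counts = {}
    for document_id, text in haystack:
        # Count occurrences of the needle that begin a token.
        n = sum(1 for i in range(len(text))
                if text.startswith(needle, i) and (i == 0 or text[i - 1] == " "))
        if n:
            counts[document_id] = n
    return counts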
Example #6
    def evaluate(self, query: str, options: dict, ranker: Ranker, callback: Callable[[dict], Any]) -> None:
        """
        Evaluates the given query, doing N-out-of-M ranked retrieval. I.e., for a supplied query having M terms,
        a document is considered to be a match if it contains at least N <= M of those terms.

        The matching documents are ranked by the supplied ranker, and only the "best" matches are returned to the
        client via the supplied callback function.

        The client can supply a dictionary of options that controls this query evaluation process: The value of
        N is inferred from the query via the "match_threshold" (float) option, and the maximum number of documents
        to return to the client is controlled via the "hit_count" (int) option.

        The callback function supplied by the client will receive a dictionary having the keys "score" (float) and
        "document" (Document).

        If the query contains M unique query terms, each document in the result set should
        contain at least N of those M terms.
        """
        terms = list(self._inverted_index.get_terms(query))
        # Clamp the threshold and fall back to sensible defaults if the options are absent.
        threshold = max(0.0, min(1.0, options.get("match_threshold", 0.5)))
        counter_terms = Counter(terms)
        sieve = Sieve(max(1, options.get('hit_count', 10)))
        # N is inferred from the number of unique query terms, as the docstring specifies.
        m = len(counter_terms)
        n = max(1, min(m, int(threshold * m)))

        class Aktiv:
            """Wraps a term's posting-list iterator together with its current posting."""

            def __init__(self, invertedindex, term, multiplicity):
                self.term = term
                self.iterator = invertedindex.get_postings_iterator(term)
                self.posting = next(self.iterator, None)
                self.multiplicity = multiplicity

            @property
            def document_id(self):
                return self.posting.document_id

            def neste_posting(self):
                self.posting = next(self.iterator, None)

        # The active (non-exhausted) posting-list cursors, one per unique query term.
        aktive = []
        for term, multiplicity in counter_terms.items():
            aktiv = Aktiv(self._inverted_index, term, multiplicity)
            if aktiv.posting is not None:
                aktive.append(aktiv)
        forrige_minste = None
        while len(aktive) > 0:
            # Pick the cursor with the lowest current document identifier.
            (minste, index) = min((v.document_id, i) for i, v in enumerate(aktive))
            current = aktive[index]
            if minste != forrige_minste:
                aktive_docids = [a for a in aktive if a.document_id == minste]
                # A document matches if at least n of the unique query terms occur in it.
                if len(aktive_docids) >= n:
                    ranker.reset(minste)
                    for a in aktive_docids:
                        ranker.update(a.term, a.multiplicity, a.posting)
                    sieve.sift(ranker.evaluate(), minste)
            forrige_minste = minste
            current.neste_posting()
            if current.posting is None:
                aktive.pop(index)

        # Emit the winners in ranked order.
        for win in sieve.winners():
            doc = self._corpus.get_document(win[1])
            callback({'score': win[0], 'document': doc})
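
The tuple-min trick used in the loop above picks both the smallest document identifier and the index of a cursor holding it in a single pass, with ties resolving to the lowest index. A quick standalone illustration:

cursors = [7, 3, 9, 3]  # stand-ins for the cursors' document_id values
smallest, index = min((doc_id, i) for i, doc_id in enumerate(cursors))
assert (smallest, index) == (3, 1)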
Example #7
    def test_sifting(self):
        from utilities import Sieve
        sieve = Sieve(3)
        sieve.sift(1.0, "one")
        sieve.sift(10.0, "ten")
        sieve.sift(9.0, "nine")
        sieve.sift(2.0, "two")
        sieve.sift(5.0, "five")
        sieve.sift(8.0, "eight")
        sieve.sift(7.0, "seven")
        sieve.sift(6.0, "six")
        sieve.sift(3.0, "three")
        sieve.sift(4.0, "four")
        self.assertListEqual(list(sieve.winners()), [(10.0, "ten"),
                                                     (9.0, "nine"),
                                                     (8.0, "eight")])