def evaluate(self, query: str, options: dict, callback: Callable[[dict], Any]) -> None:
    """
    Evaluates the given query, doing a "phrase prefix search". E.g., for a supplied query phrase like
    "to the be", we return documents that contain phrases like "to the bearnaise", "to the best",
    "to the behemoth", and so on. I.e., we require that the query phrase starts on a token boundary
    in the document, but it doesn't necessarily have to end on one.

    The matching documents are ranked according to how many times the query substring occurs in the
    document, and only the "best" matches are returned to the client via the supplied callback function.
    Ties are resolved arbitrarily.

    The client can supply a dictionary of options that controls this query evaluation process: The maximum
    number of documents to return to the client is controlled via the "hit_count" (int) option.

    The callback function supplied by the client will receive a dictionary having the keys "score" (int)
    and "document" (Document).
    """
    if not query:
        return

    # Normalize the query the same way the indexed text was normalized: collapse double spaces and lowercase.
    pattern = self._normalizer.normalize(query).replace('  ', ' ').lower()

    # Keep track of the best-scoring documents only.
    hit = options.get('hit_count')
    sieve = Sieve(hit)

    # The binary search is assumed to locate the matching range and populate self._counter with
    # (term, document id) occurrence counts as a side effect.
    self.binarySearch(self._suffixes, 0, len(self._suffixes), pattern)

    # Rank the matching documents by occurrence count.
    for (term, doc_id) in self._counter:
        freq = self._counter[(term, doc_id)]
        sieve.sift(freq, doc_id)

    # Emit the best matches to the client, in ranked order.
    for win in sieve.winners():
        doc = self._corpus.get_document(win[1])
        callback({'score': win[0], 'document': doc})

    self._counter.clear()
def evaluate(self, query: str, options: dict, callback: Callable[[dict], Any]) -> None:
    """
    Evaluates the given query, doing a "phrase prefix search". E.g., for a supplied query phrase like
    "to the be", we return documents that contain phrases like "to the bearnaise", "to the best",
    "to the behemoth", and so on. I.e., we require that the query phrase starts on a token boundary
    in the document, but it doesn't necessarily have to end on one.

    The matching documents are ranked according to how many times the query substring occurs in the
    document, and only the "best" matches are returned to the client via the supplied callback function.
    Ties are resolved arbitrarily.

    The client can supply a dictionary of options that controls this query evaluation process: The maximum
    number of documents to return to the client is controlled via the "hit_count" (int) option.

    The callback function supplied by the client will receive a dictionary having the keys "score" (int)
    and "document" (Document).
    """
    # Search for the needle in the haystack, using binary search. Define that the empty query matches
    # nothing, not everything.
    needle = self._normalize(query)
    if not needle:
        return
    where_start = self._binary_search(needle)

    # Helper predicate. Checks if the identified suffix starts with the needle. Since slicing implies
    # copying, cap the length of the slice to the length of the needle. The starts-with relation then
    # becomes the same as equality, which is quick to check.
    def _is_match(i: int) -> bool:
        (index, offset) = self._suffixes[i]
        return self._haystack[index][1][offset:(offset + len(needle))] == needle

    # Suffixes sharing a prefix are consecutive in the suffix array. Scan ahead from the located index
    # until we no longer get a match. We expect a low number of matches for typical queries, and we
    # process all the matches below anyway. If we just wanted to count the number of matches without
    # processing them, we could, instead of a linear scan, do another binary search to locate where the
    # range ends.
    matches = list(itertools.takewhile(_is_match, range(where_start, len(self._suffixes))))

    # Deduplicate. A document in the haystack might contain multiple occurrences of the needle.
    # Rank according to occurrence count, and emit in ranked order.
    if matches:
        debug = options.get("debug", False)
        pairs = [self._suffixes[i] for i in matches]
        if debug:
            for pair in pairs:
                print("*** MATCH", pair, self._get_suffix2(pair))
        counter = Counter([i for (i, _) in pairs])
        sieve = Sieve(max(1, min(100, options.get("hit_count", 10))))
        for (document_index, count) in counter.items():
            sieve.sift(count, document_index)
        for (score, document_index) in sieve.winners():
            self._emit_match(document_index, score, callback)
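# A minimal usage sketch for the phrase prefix search above. The `engine` object is a hypothetical
# instance of the suffix-array-backed class that owns evaluate(); the "hit_count" option and the
# "score"/"document" callback keys are the ones documented in the docstring.
results = []
engine.evaluate("to the be", {"hit_count": 5}, results.append)
for match in results:
    print(match["score"], match["document"])  # Occurrence count and the matching Document.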
def evaluate(self, query: str, options: dict, ranker: Ranker, callback: Callable[[dict], Any]) -> None:
    """
    Evaluates the given query, doing N-out-of-M ranked retrieval. I.e., for a supplied query having M terms,
    a document is considered to be a match if it contains at least N <= M of those terms.

    The matching documents are ranked by the supplied ranker, and only the "best" matches are returned to
    the client via the supplied callback function.

    The client can supply a dictionary of options that controls this query evaluation process: The value of
    N is inferred from the query via the "recall_threshold" (float) option, and the maximum number of
    documents to return to the client is controlled via the "hit_count" (int) option.
    """
    # Create one postings iterator per unique term in the query.
    query = self._inverted_index.get_terms(query)
    qcounter = Counter(query)
    uniqe_terms = tuple(qcounter.keys())
    multi = tuple(qcounter.values())
    sieve = Sieve(options["hit_count"])
    iter_list = []
    for t in uniqe_terms:
        list_iter = self._inverted_index.get_postings_iterator(t)
        iter_list.append(list_iter)

    # Compute how many of the query terms must occur in a document for it to count as a match.
    recall_threshold = options["recall_threshold"]
    min_treff = max(1, math.floor(recall_threshold * len(iter_list)))

    # Build a list holding the first posting from each iterator.
    peek = []
    for n in range(len(iter_list)):
        peek.append(next(iter_list[n], None))
    antall_iter = len(iter_list)

    def finn_minste():
        # Find the posting in peek with the lowest document id.
        minste = peek[0]
        i = 1
        while minste is None:
            minste = peek[i]
            i += 1
            if i >= len(peek):
                break
        for i in range(len(peek)):
            if peek[i] is not None:
                if minste.document_id > peek[i].document_id:
                    minste = peek[i]
        return minste

    while antall_iter >= min_treff:
        min_doc = finn_minste()
        if min_doc is None:
            break

        # Check whether the recall threshold is satisfied for the lowest document id.
        ranker.reset(min_doc.document_id)
        antall_treff = 0
        for n in range(len(peek)):
            if peek[n] is not None:
                if peek[n].document_id == min_doc.document_id:
                    antall_treff += 1
                    ranker.update(uniqe_terms[n], multi[n], peek[n])

        # Evaluate the ranking and push the document into the sieve.
        if antall_treff >= min_treff:
            sieve.sift(ranker.evaluate(), min_doc.document_id)

        # Advance past this document in the affected posting lists.
        for n in range(len(iter_list)):
            if peek[n] is not None:
                if peek[n].document_id == min_doc.document_id:
                    peek[n] = next(iter_list[n], None)
                    if peek[n] is None:
                        antall_iter -= 1

        # Rinse and repeat.

    # Emit the best matches to the client, in ranked order.
    for f in sieve.winners():
        callback({"score": int(f[0]), "document": self._corpus.get_document(f[1])})
def evaluate(self, query: str, options: dict, ranker: Ranker, callback: Callable[[dict], Any]) -> None:
    """
    Evaluates the given query, doing N-out-of-M ranked retrieval. I.e., for a supplied query having M terms,
    a document is considered to be a match if it contains at least N <= M of those terms.

    The matching documents are ranked by the supplied ranker, and only the "best" matches are returned to
    the client via the supplied callback function.

    The client can supply a dictionary of options that controls this query evaluation process: The value of
    N is inferred from the query via the "match_threshold" (float) option, and the maximum number of
    documents to return to the client is controlled via the "hit_count" (int) option.

    The callback function supplied by the client will receive a dictionary having the keys "score" (float)
    and "document" (Document).
    """
    # Print verbose debug information?
    debug = options.get("debug", False)

    # Produce the query terms. We must use the same string processing here as we used when
    # building up the inverted index. Some terms might be duplicated (e.g., as in the query
    # "to be or not to be").
    query_terms = self._inverted_index.get_terms(query)
    unique_query_terms = [(term, count) for (term, count) in Counter(query_terms).items()]

    # Get the posting lists for the unique query terms.
    posting_lists = [self._inverted_index[term] for (term, _) in unique_query_terms]

    # We require that at least N of the M query terms are present in the document,
    # for the document to be considered part of the result set. What should the minimum
    # value of N be?
    # TODO: Take multiplicity into account, and not just uniqueness.
    match_threshold = max(0.0, min(1.0, options.get("match_threshold", 0.5)))
    required_minimum = max(1, min(len(unique_query_terms), int(match_threshold * len(unique_query_terms))))

    # When traversing the posting lists using document-at-a-time traversal, we need to keep track
    # of where we are in each of the posting lists. Initially, all the cursors "point to" the first entry
    # in each posting list. Keep track of which posting lists remain to be fully traversed.
    all_cursors = [next(p, None) for p in posting_lists]
    remaining_cursor_ids = [i for i in range(len(all_cursors)) if all_cursors[i]]

    # We're doing ranked retrieval. Assess relevance scores per document as we go along, as we're doing
    # document-at-a-time traversal. Keep track of the K highest-scoring documents.
    sieve = Sieve(max(1, min(100, options.get("hit_count", 10))))

    # We're doing at least N-of-M matching. As we reach the end of the posting lists, we can abort when
    # the number of non-exhausted lists drops below the required minimum N.
    while len(remaining_cursor_ids) >= required_minimum:

        # The posting lists are sorted by the document identifiers in ascending order. Define the
        # "frontier" as the subset of non-exhausted posting lists that mention the lowest document
        # identifier. In a sense, if we imagine scanning the posting lists from left to right, the
        # frontier is the subset that has the "leftmost" cursors.
        # TODO: This can easily be done in a single pass over the remaining lists.
        document_id = min([all_cursors[i].document_id for i in remaining_cursor_ids])
        frontier_cursor_ids = [i for i in remaining_cursor_ids if all_cursors[i].document_id == document_id]

        # The number of elements on the "frontier" needs to be at least N. Otherwise, these documents
        # don't contain enough of the query terms, and aren't part of the result set.
        if len(frontier_cursor_ids) >= required_minimum:
            ranker.reset(document_id)
            for i in frontier_cursor_ids:
                ranker.update(unique_query_terms[i][0], unique_query_terms[i][1], all_cursors[i])
            score = ranker.evaluate()
            sieve.sift(score, document_id)
            if debug:
                print("*** MATCH")
                print("document =", self._corpus[document_id])
                print("matches =", {unique_query_terms[i][0]: all_cursors[i] for i in frontier_cursor_ids})
                print("score =", score)

        # Move along the cursors on the frontier. The cursors not on the frontier remain where they
        # are. We may or may not reach the end of some posting lists when we advance, so the set of
        # remaining non-exhausted lists might shrink.
        for i in frontier_cursor_ids:
            all_cursors[i] = next(posting_lists[i], None)
        remaining_cursor_ids = [i for i in range(len(all_cursors)) if all_cursors[i]]

    # Alert the client about the best-matching documents, using the supplied callback function.
    # Emit documents sorted according to their relevance scores.
    for (score, document_id) in sieve.winners():
        callback({"score": score, "document": self._corpus[document_id]})
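# How N is derived in the implementation above: with the default match_threshold of 0.5 and a query
# containing four unique terms, required_minimum = max(1, min(4, int(0.5 * 4))) = 2, so a matching
# document must contain at least two of the four terms. Below is a minimal usage sketch; `engine`
# and `ranker` are hypothetical stand-ins for the concrete engine and Ranker classes the project provides.
matches = []
engine.evaluate("to be or not to be",
                {"match_threshold": 0.5, "hit_count": 10},
                ranker,
                matches.append)
for match in matches:
    print(match["score"], match["document"])  # Relevance score (float) and the matching Document.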
def evaluate(self, query: str, options: dict, callback: Callable[[dict], Any]) -> None:
    """
    Evaluates the given query, doing a "phrase prefix search". E.g., for a supplied query phrase like
    "to the be", we return documents that contain phrases like "to the bearnaise", "to the best",
    "to the behemoth", and so on. I.e., we require that the query phrase starts on a token boundary
    in the document, but it doesn't necessarily have to end on one.

    The matching documents are ranked according to how many times the query substring occurs in the
    document, and only the "best" matches are returned to the client via the supplied callback function.
    Ties are resolved arbitrarily.

    The client can supply a dictionary of options that controls this query evaluation process: The maximum
    number of documents to return to the client is controlled via the "hit_count" (int) option.
    """
    if query == "":
        return
    treff_id = []
    treff_antall = []
    query = self._normalize(query)

    # Set up the cursors.
    suffix_peker = self._binary_search(query)      # Points to where we are in the suffix array.
    haystack_peker = self._suffixes[suffix_peker]  # Points to where we are in the text.
    hit_count = options['hit_count']

    # suffix_peker now sits at the first occurrence of the first character.
    tekst = self._haystack[haystack_peker[0]][1]
    query_peker = 1                                # Points to where we are in the query.
    tekst_peker = haystack_peker[1] + 1

    # WHAT THE HELL IS THIS?
    while query[0] == tekst[haystack_peker[1]]:
        while tekst_peker == len(tekst):
            # No more characters in this text. Reset the cursors.
            query_peker = 1
            suffix_peker += 1
            if suffix_peker < len(self._suffixes):
                # Fetch the next occurrence from the suffix array.
                haystack_peker = self._suffixes[suffix_peker]
                tekst = self._haystack[haystack_peker[0]][1]
                tekst_peker = haystack_peker[1] + 1
        if query[query_peker] == tekst[tekst_peker]:
            # Correct character in the correct position. Advance both cursors by one.
            tekst_peker += 1
            query_peker += 1
        else:
            # Wrong character. Reset the cursors and fetch the next entry from the suffix array.
            query_peker = 1
            suffix_peker += 1
            if suffix_peker < len(self._suffixes):
                haystack_peker = self._suffixes[suffix_peker]
                tekst = self._haystack[haystack_peker[0]][1]
                tekst_peker = haystack_peker[1] + 1
        if query_peker >= len(query):
            # Found a match for the query.
            registrert = False
            for n in range(len(treff_id)):
                # Check whether we have already registered a hit for this document id.
                if treff_id[n] == self._haystack[haystack_peker[0]][0]:
                    registrert = True
                    treff_antall[n] += 1
            if not registrert:
                # No hit registered for this document yet.
                treff_id.append(self._haystack[haystack_peker[0]][0])
                treff_antall.append(1)
            # Reset the cursors and fetch the next entry from the suffix array.
            query_peker = 1
            suffix_peker += 1
            if suffix_peker < len(self._suffixes):
                haystack_peker = self._suffixes[suffix_peker]
                tekst = self._haystack[haystack_peker[0]][1]
                tekst_peker = haystack_peker[1] + 1
        if tekst_peker >= len(tekst):
            # Reset the cursors and fetch the next entry from the suffix array.
            query_peker = 1
            suffix_peker += 1
            if suffix_peker < len(self._suffixes):
                haystack_peker = self._suffixes[suffix_peker]
                tekst = self._haystack[haystack_peker[0]][1]
                tekst_peker = haystack_peker[1] + 1
        if suffix_peker >= len(self._suffixes):
            # No more hits in the list.
            break
    # end while

    # Rank the matching documents by occurrence count, and emit the best ones to the client.
    sieve = Sieve(hit_count)
    for n in range(len(treff_id)):
        sieve.sift(treff_antall[n], treff_id[n])
    for f in sieve.winners():
        callback({"score": int(f[0]), "document": self._corpus.get_document(f[1])})
def evaluate(self, query: str, options: dict, ranker: Ranker, callback: Callable[[dict], Any]) -> None:
    """
    Evaluates the given query, doing N-out-of-M ranked retrieval. I.e., for a supplied query having M terms,
    a document is considered to be a match if it contains at least N <= M of those terms.

    The matching documents are ranked by the supplied ranker, and only the "best" matches are returned to
    the client via the supplied callback function.

    The client can supply a dictionary of options that controls this query evaluation process: The value of
    N is inferred from the query via the "match_threshold" (float) option, and the maximum number of
    documents to return to the client is controlled via the "hit_count" (int) option.

    The callback function supplied by the client will receive a dictionary having the keys "score" (float)
    and "document" (Document).

    If the query contains M unique query terms, each document in the result set should contain at least
    N of these M terms.
    """
    terms = list(self._inverted_index.get_terms(query))
    threshhold = options.get("match_threshold")
    debug = options.get("debug", False)
    counter_terms = Counter(terms)
    hit = options.get('hit_count')
    sieve = Sieve(hit)
    m = len(terms)
    n = max(1, min(m, int(threshhold * m)))

    class Aktiv(object):
        def __init__(self, invertedindex, term, multiplicity):
            self.term = term
            self.iterator = invertedindex.get_postings_iterator(term)
            self.posting = next(self.iterator, None)
            self.multiplicity = multiplicity
            self.hasBeenRanked = False

        @property
        def document_id(self):
            return self.posting.document_id

        def neste_posting(self):
            self.posting = next(self.iterator, None)

    # List of active posting-list iterators.
    aktive = []
    for term in terms:
        aktiv = Aktiv(self._inverted_index, term, counter_terms[term])
        if aktiv.posting is not None:
            aktive.append(aktiv)

    forrige_minste = None
    while len(aktive) > 0:
        (minste, index) = min((v.document_id, i) for i, v in enumerate(aktive))
        current = aktive[index]
        if minste != forrige_minste:
            aktive_docids = [a for a in aktive if a.document_id == minste]
            ranker.reset(current.document_id)
            evaluated_terms = []
            # Walk through aktive_docids to check each term and its frequency.
            for a in aktive_docids:
                if a.term not in evaluated_terms:
                    ranker.update(a.term, a.multiplicity, a.posting)
                    evaluated_terms.append(a.term)
            score = ranker.evaluate()
            if threshhold == 1:
                if not len(aktive_docids) < n and score >= n:
                    sieve.sift(score, minste)
            else:
                if score >= n and len(aktive_docids) >= n:
                    sieve.sift(score, minste)
        forrige_minste = minste
        current.neste_posting()
        post = current.posting
        if post is None:
            aktive.pop(index)

    # Emit the winners to the client.
    for win in sieve.winners():
        doc = self._corpus.get_document(win[1])
        callback({'score': win[0], 'document': doc})
def test_sifting(self):
    from utilities import Sieve
    sieve = Sieve(3)
    sieve.sift(1.0, "one")
    sieve.sift(10.0, "ten")
    sieve.sift(9.0, "nine")
    sieve.sift(2.0, "two")
    sieve.sift(5.0, "five")
    sieve.sift(8.0, "eight")
    sieve.sift(7.0, "seven")
    sieve.sift(6.0, "six")
    sieve.sift(3.0, "three")
    sieve.sift(4.0, "four")
    self.assertListEqual(list(sieve.winners()), [(10.0, "ten"), (9.0, "nine"), (8.0, "eight")])
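# The test above pins down the Sieve contract: sift(score, item) offers a scored item, and winners()
# yields the K best (score, item) pairs in descending score order, keeping at most K items overall.
# Below is an illustrative heap-based sketch that satisfies that contract; it is a stand-in, not the
# project's actual utilities.Sieve implementation.
import heapq
from typing import Any, Iterator, List, Tuple


class Sieve:
    """Keeps the K highest-scoring items seen so far. Illustrative sketch only."""

    def __init__(self, size: int):
        self._size = size
        self._heap: List[Tuple[float, int, Any]] = []  # Min-heap of (score, tiebreak, item).
        self._counter = 0  # Monotonically increasing tiebreak, so items are never compared directly.

    def sift(self, score: float, item: Any) -> None:
        self._counter += 1
        entry = (score, self._counter, item)
        if len(self._heap) < self._size:
            heapq.heappush(self._heap, entry)
        elif score > self._heap[0][0]:
            heapq.heapreplace(self._heap, entry)  # Evict the current lowest-scoring item.

    def winners(self) -> Iterator[Tuple[float, Any]]:
        # Highest scores first, as the test expects. Ties are resolved arbitrarily.
        for (score, _, item) in sorted(self._heap, reverse=True):
            yield (score, item)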