Beispiel #1
0
 def find_docs(self, q: Query,
               merge: Callable[[List[int], List[int]], List[int]]
               ) -> np.ndarray:
     """Return the ids of the documents matching query ``q``.

     One posting list is looked up in ``self.index`` for every term that
     ``get_terms(q)`` yields; a term missing from the index contributes an
     empty list. The lists are folded pairwise with ``merge`` and each
     resulting numeric id is translated through ``self.docs_by_id``.

     Args:
         q: The query to evaluate.
         merge: Binary function combining two posting lists into one.

     Returns:
         A NumPy array with the ``self.docs_by_id`` entry of every match.
         The array is empty when the query yields no terms at all.
     """
     postings = [self.index.get(term, []) for term in get_terms(q)]
     if not postings:
         # reduce() without an initial value raises TypeError on an empty
         # sequence, so a query without terms is answered with "no matches".
         return np.array([])
     matching_ids = reduce(merge, postings)
     return np.array([self.docs_by_id[doc_id] for doc_id in matching_ids])
 def find_docs(self, q: Query) -> Iterator[Tuple[str, float]]:
     """Rank the documents matching query ``q``, highest score first.

     The sorted query terms are searched in every entry of
     ``self.dictionaries`` via ``self.search_terms``. The per-dictionary
     result streams are merged in ``get_fst`` order, passed through
     ``groupbyfst`` (helper defined elsewhere; presumably it combines
     entries sharing the same first component -- confirm) and handed to
     ``self.merge``, which yields the scored pairs.

     Returns:
         The pairs whose second component is greater than zero, ordered by
         ``get_snd`` in descending order.
         NOTE(review): the annotation says ``Iterator`` but a list is
         returned.
     """
     def combine(xs, ys):
         # Concatenate two partial lists and restore get_fst order.
         return sorted(xs + ys, key=get_fst)

     query_terms = sorted(get_terms(q))
     per_dictionary_hits = (self.search_terms(path, query_terms)
                            for path in self.dictionaries)
     merged_hits = heapq.merge(*per_dictionary_hits, key=get_fst)
     grouped_hits = groupbyfst(merged_hits, combine, [])
     postings = map(get_snd, grouped_hits)
     ranked = [pair for pair in self.merge(postings) if pair[1] > 0]
     ranked.sort(key=get_snd, reverse=True)
     return ranked
Beispiel #3
0
 def __init__(self, docs: List[Document]):
     """Build ``self.index`` and ``self.docs_by_id`` from ``docs``.

     Each document is paired with its sorted terms and passed to
     ``self.associate_doc``; the resulting pair streams are merged in
     ``get_fst`` order and grouped by that same key. For every group the
     ``get_snd`` values are sorted and de-duplicated, giving the list stored
     in ``self.index`` under the group key.

     ``self.docs_by_id`` is a list with ``len(docs) + 1`` slots, pre-filled
     with the placeholder ``"-d1"`` and then overwritten at position
     ``doc.numeric_id`` with ``doc.id`` (presumably numeric ids are 1-based,
     leaving slot 0 as placeholder -- confirm against ``Document``).
     """
     pair_streams = [
         self.associate_doc((doc, sorted(get_terms(doc)))) for doc in docs
     ]
     merged_pairs = heapq.merge(*pair_streams, key=get_fst)

     index: Dict[str, List[int]] = {}
     for term, pairs in groupby(merged_pairs, get_fst):
         ordered_ids = sorted(map(get_snd, pairs))
         # Grouping the already sorted ids collapses repeats to one entry.
         index[term] = [get_fst(run) for run in groupby(ordered_ids)]
     self.index = index

     self.docs_by_id = ["-d1"] * (len(docs) + 1)
     for doc in docs:
         self.docs_by_id[doc.numeric_id] = doc.id
Beispiel #4
0
 def find_docs_by(self, q: Query) -> np.ndarray:
     """Return the sorted ids of the documents containing every term of ``q``.

     ``self.matrix`` is read as an incidence table whose row labels are
     terms and whose column labels are document ids. The row of every query
     term is combined with a logical AND; a term that is not a row label
     contributes an all-False vector, so it excludes every document.

     Args:
         q: The query whose terms (from ``get_terms``) must all be present.

     Returns:
         The matching column labels as a sorted NumPy array. Because the
         fold starts from an all-True vector, a query without terms matches
         every document.
     """
     binmat = self.matrix
     docs = binmat.columns.to_numpy()
     rows = binmat.index
     ones = np.ones(len(docs), dtype=np.bool_)
     zeroes = np.zeros(len(docs), dtype=np.bool_)
     vectors = [
         binmat.loc[term] if term in rows else zeroes
         for term in get_terms(q)
     ]
     intersection = reduce(np.logical_and, vectors, ones)
     # The explicit comparison is kept so the selector is a boolean mask
     # whatever dtype the stored rows have.
     mask = intersection == True  # noqa: E712
     # np.msort was deprecated in NumPy 1.24 and removed in 2.0;
     # np.sort(a, axis=0) is its documented equivalent.
     return np.sort(docs[mask], axis=0)
Beispiel #5
0
 def __init__(self, docs: List[Document]):
     """Build the term/document incidence table ``self.matrix``.

     The table is a DataFrame whose row labels are the vocabulary returned
     by ``chain_unique`` over the per-document term lists and whose column
     labels are the document ids. Each row is the vector produced by
     ``self.build_incidence_vec`` for that term and the per-document term
     sets.

     Args:
         docs: The documents to index. An empty collection yields an empty
             DataFrame.
     """
     if not docs:
         self.matrix = pd.DataFrame()
         return
     terms_by_doc = [list(get_terms(doc)) for doc in docs]
     term_sets = [set(terms) for terms in terms_by_doc]
     doc_ids = [doc.id for doc in docs]
     vocabulary = list(chain_unique(terms_by_doc))
     vectors = [
         self.build_incidence_vec(term, term_sets) for term in vocabulary
     ]
     # np.vstack raises ValueError on an empty sequence; when the documents
     # contain no terms, build a zero-row table that still carries the
     # document ids as columns.
     data = np.vstack(vectors) if vectors else None
     self.matrix = pd.DataFrame(index=vocabulary,
                                columns=doc_ids,
                                data=data)
Beispiel #6
0
 def find_docs(
         self, q: Query, fetch: Callable[[str, List[str]],
                                         Iterator[Tuple[str, List[int]]]],
         merge: Callable[[List[int], List[int]], List[int]]) -> np.ndarray:
     """Return the ids of the documents matching query ``q``.

     ``fetch`` is called once per entry of ``self.dictionaries`` with the
     sorted query terms. Its result streams are merged in ``get_fst``
     order and passed through ``groupbyfst`` (helper defined elsewhere;
     presumably it unions the posting lists of equal terms -- confirm).
     The resulting lists are then folded into a single list of numeric ids.

     Args:
         q: The query to evaluate.
         fetch: Reads ``(term, posting list)`` pairs for the given terms
             from one dictionary.
         merge: Binary function combining two posting lists. For a
             single-term query ``merge_or`` is used instead.

     Returns:
         A NumPy array of ``Document.make_id(doc_id, 3)`` values, empty when
         no posting list was found.
     """
     terms = sorted(get_terms(q))
     results = heapq.merge(*(fetch(filepath, terms)
                             for filepath in self.dictionaries),
                           key=get_fst)
     unified = map(get_snd,
                   groupbyfst(results, lambda xs, ys: sorted(xs + ys), []))

     # reduce() without an initial value raises TypeError on an empty
     # iterator. Pull the first posting list by hand so "nothing found"
     # becomes an empty result while the rest is still streamed.
     missing = object()
     first = next(unified, missing)
     if first is missing:
         return np.array([])

     combine = merge_or if len(terms) == 1 else merge
     matching_ids = reduce(combine, unified, first)
     return np.array(
         [Document.make_id(doc_id, 3) for doc_id in matching_ids])
Beispiel #7
0
 def zip_with_doc(doc: Document):
     """Pair ``doc`` with the sorted list of its terms.

     Returns:
         A ``(doc, terms)`` tuple where ``terms`` is ``get_terms(doc)``
         in sorted order.
     """
     ordered_terms = sorted(get_terms(doc))
     return (doc, ordered_terms)