def find_docs(self, q: Query, merge: Callable[[List[int], List[int]], List[int]]) -> np.ndarray:
    """Resolve *q* against the in-memory inverted index.

    Looks up the posting list of every query term (unknown terms yield an
    empty list), folds the lists together with *merge* (e.g. a sorted-list
    intersection or union), and maps the surviving numeric ids back to
    external document ids.

    Args:
        q: parsed query; its terms are extracted via ``get_terms``.
        merge: binary combiner for two posting lists.

    Returns:
        ndarray of external document ids; empty when the query has no terms.
    """
    # dict.get avoids the double lookup of `term in self.index` + indexing.
    vectors = [self.index.get(term, []) for term in get_terms(q)]
    if not vectors:
        # reduce() with no initial value raises TypeError on an empty
        # sequence; a term-less query simply matches nothing.
        return np.array([])
    return np.array([self.docs_by_id[doc_id] for doc_id in reduce(merge, vectors)])
def find_docs(self, q: Query) -> Iterator[Tuple[str, float]]:
    """Score *q* against every on-disk dictionary and rank the hits.

    Streams each dictionary's term-sorted partial results, lazily merges
    the streams, coalesces postings that share a term, folds them into
    per-document scores, and returns the positively scored documents in
    descending score order.
    """
    query_terms = sorted(get_terms(q))
    # One sorted stream per dictionary file, merged lazily by term.
    per_file = (self.search_terms(path, query_terms) for path in self.dictionaries)
    merged = heapq.merge(*per_file, key=get_fst)
    # Collapse entries sharing a term; keep each combined posting list sorted.
    combine = lambda xs, ys: sorted(xs + ys, key=get_fst)
    unified = (get_snd(group) for group in groupbyfst(merged, combine, []))
    positive = (pair for pair in self.merge(unified) if pair[1] > 0)
    return sorted(positive, key=get_snd, reverse=True)
def __init__(self, docs: List[Document]):
    """Build the inverted index from *docs*.

    Produces ``self.index`` mapping each term to its sorted, de-duplicated
    posting list of numeric document ids, and ``self.docs_by_id`` mapping a
    numeric id back to the document's external id.
    """
    with_terms = zip(docs, (sorted(get_terms(doc)) for doc in docs))
    per_doc_streams = map(self.associate_doc, with_terms)
    # Globally term-sorted stream of (term, numeric doc id) pairs.
    by_term = heapq.merge(*per_doc_streams, key=get_fst)
    grouped = ((term, sorted(map(get_snd, pairs)))
               for term, pairs in groupby(by_term, get_fst))
    # groupby over an already-sorted id list de-duplicates it: keep only keys.
    deduped = ((term, [get_fst(grp) for grp in groupby(ids)])
               for term, ids in grouped)
    self.index: Dict[str, List[int]] = dict(deduped)
    # Sentinel "-d1" marks numeric ids that no document occupies.
    self.docs_by_id = ["-d1"] * (len(docs) + 1)
    for doc in docs:
        self.docs_by_id[doc.numeric_id] = doc.id
def find_docs_by(self, q: Query) -> np.ndarray:
    """Boolean-AND retrieval over the term/document incidence matrix.

    Each query term selects its incidence row (an all-zeros vector for
    terms absent from the vocabulary); the rows are AND-ed together and
    the ids of the matching columns (documents) are returned sorted.

    Returns:
        Sorted ndarray of the ids of documents containing every term.
    """
    binmat = self.matrix
    docs = binmat.columns.to_numpy()
    known_terms = binmat.index
    ones = np.ones(len(docs), dtype=np.bool_)
    zeroes = np.zeros(len(docs), dtype=np.bool_)
    vectors = [
        binmat.loc[term] if term in known_terms else zeroes
        for term in get_terms(q)
    ]
    intersection = reduce(np.logical_and, vectors, ones)
    # np.msort was deprecated in NumPy 1.24 and removed in 2.0;
    # np.sort(..., axis=0) is the documented replacement. Direct boolean
    # masking replaces the redundant `intersection == True` comparison.
    return np.sort(docs[np.asarray(intersection, dtype=bool)], axis=0)
def __init__(self, docs: List[Document]):
    """Construct the term/document incidence matrix for *docs*.

    Rows are vocabulary terms (in first-seen order), columns are document
    ids, and each cell records whether the term occurs in that document.
    An empty corpus yields an empty DataFrame.
    """
    if not docs:
        self.matrix = pd.DataFrame()
        return
    doc_terms = [list(get_terms(d)) for d in docs]
    # Per-document term sets give O(1) membership tests when building rows.
    membership = [set(ts) for ts in doc_terms]
    ids = [d.id for d in docs]
    vocab = list(chain_unique(doc_terms))
    rows = [self.build_incidence_vec(t, membership) for t in vocab]
    self.matrix = pd.DataFrame(index=vocab, columns=ids, data=np.vstack(rows))
def find_docs(
        self, q: Query,
        fetch: Callable[[str, List[str]], Iterator[Tuple[str, List[int]]]],
        merge: Callable[[List[int], List[int]], List[int]]) -> np.ndarray:
    """Resolve *q* against the on-disk dictionaries.

    Fetches each dictionary's term-sorted postings via *fetch*, merges the
    streams, coalesces postings that share a term, folds the per-term
    posting lists together, and renders the surviving numeric ids as
    zero-padded document ids.

    Args:
        q: parsed query; its terms are extracted via ``get_terms``.
        fetch: reads ``(term, posting list)`` pairs from one dictionary file.
        merge: binary combiner applied across per-term posting lists.

    Returns:
        ndarray of document ids built with ``Document.make_id(..., 3)``.
    """
    terms = sorted(get_terms(q))
    results = heapq.merge(*(fetch(filepath, terms) for filepath in self.dictionaries),
                          key=get_fst)
    unified = map(get_snd,
                  groupbyfst(results, lambda xs, ys: sorted(xs + ys), []))
    # The original duplicated the whole return expression in both branches,
    # differing only in the combiner: a single-term query has nothing to
    # combine across terms, so its shards are simply unioned.
    combine = merge_or if len(terms) == 1 else merge
    return np.array([Document.make_id(doc_id, 3)
                     for doc_id in reduce(combine, unified)])
def zip_with_doc(doc: Document):
    """Pair *doc* with its lexicographically ordered list of terms."""
    ordered_terms = sorted(get_terms(doc))
    return doc, ordered_terms