def get_posting_lists(self, filepath: str) -> Iterator[Tuple[str, List[int]]]:
    docs = read_docs(filepath)

    def zip_with_doc(doc: Document):
        # pair each document with its sorted list of terms
        return doc, sorted(get_terms(doc))

    doc_term_pairs = map(zip_with_doc, docs)
    # one sorted stream of (term, doc_id) pairs per document
    term_doc_pairs = map(self.make_term_doc_pair, doc_term_pairs)
    # k-way merge of the per-document streams, ordered by term
    sorted_pairs = heapq.merge(*term_doc_pairs, key=get_fst)
    docs_by_term = groupby(sorted_pairs, get_fst)

    def sort_docs(pair: Tuple[str, List[Tuple[str, int]]]):
        term, docs = pair
        # collect the document ids for this term, sorted and de-duplicated
        docs_sorted = sorted(map(get_snd, docs))
        return term, list(unique(docs_sorted))

    return map(sort_docs, docs_by_term)
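This method leans on a handful of helpers that are not shown in this excerpt: get_fst and get_snd (tuple accessors), unique (duplicate removal over a sorted sequence), and the make_term_doc_pair method on the index class. groupby appears to be itertools.groupby (iterable first, key second). The sketch below shows what these helpers might look like; the names come from the calls above, but the bodies are inferred, and make_term_doc_pair is written as a free function for brevity.

from itertools import groupby
from typing import Iterable, Iterator

def get_fst(t):
    # first element of a tuple, used as the merge/group key
    return t[0]

def get_snd(t):
    # second element of a tuple
    return t[1]

def unique(sorted_items: Iterable) -> Iterator:
    # drop consecutive duplicates from an already-sorted iterable
    return (key for key, _ in groupby(sorted_items))

def make_term_doc_pair(doc_terms):
    # expand one (document, sorted_terms) pair into a stream of
    # (term, doc_id) pairs; the stream stays sorted by term
    doc, terms = doc_terms
    return ((term, doc.id) for term in terms)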
def get_posting_lists(
        self, filepath: str) -> Iterator[Tuple[str, List[Tuple[int, int]]]]:
    docs = read_docs(filepath)

    def zip_with_doc(doc: Document):
        # pair each document with its (term, frequency) list, sorted by term
        return doc, sorted(get_terms_freqs(doc), key=get_fst)

    doc_term_pairs = map(zip_with_doc, docs)
    term_doc_pairs = map(self.make_term_doc_pair, doc_term_pairs)
    # k-way merge of the per-document (term, doc_id, tf) streams, ordered by term
    sorted_pairs = heapq.merge(*term_doc_pairs, key=get_fst)
    docs_by_term = groupby(sorted_pairs, get_fst)

    def sort_docs(tup: Tuple[str, List[Tuple[str, int, int]]]):
        term, docs_tfs = tup
        # keep (doc_id, tf) pairs, ordered by document id
        docs_sorted = sorted(((doc, tf) for term, doc, tf in docs_tfs),
                             key=get_fst)
        # groupbyfst presumably folds the tf values per document id with op.add
        grouped = groupbyfst(docs_sorted, op.add, 0)
        return term, list(grouped)

    return map(sort_docs, docs_by_term)
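groupbyfst is also not defined in this excerpt. Judging from the call groupbyfst(docs_sorted, op.add, 0), it groups consecutive pairs by their first element and folds the second elements with the supplied operator and start value. A hedged sketch under that assumption (name and signature inferred, not confirmed by the source):

import operator as op
from functools import reduce
from itertools import groupby

def groupbyfst(pairs, combine, initial):
    # group consecutive (key, value) pairs sharing the same key and
    # fold their values with `combine`, starting from `initial`
    for key, group in groupby(pairs, key=lambda p: p[0]):
        yield key, reduce(combine, (v for _, v in group), initial)

# e.g. list(groupbyfst([(1, 2), (1, 3), (2, 1)], op.add, 0)) == [(1, 5), (2, 1)]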
def get_docs_text(filepath: str):
    # yield each document's terms, with documents ordered by id
    return (get_terms(doc.raw)
            for doc in sorted(read_docs(filepath), key=lambda d: d.id))
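The code above assumes a Document record exposing at least an id and a raw text field (both are used here), plus read_docs, read_queries, get_terms, and get_terms_freqs defined elsewhere. A minimal stand-in for the record type, just to make the shapes explicit:

from dataclasses import dataclass

@dataclass
class Document:
    id: int    # used for ordering and as the posting-list entry
    raw: str   # raw text handed to the tokenizer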
    merged = []
    i, j = 0, 0
    # two-pointer scan over the two sorted posting lists
    while i < len(xs) and j < len(ys):
        x, y = xs[i], ys[j]
        if x == y:
            # present in both lists: advance past it in both
            i += 1
            j += 1
        elif x > y:
            merged.append(y)
            j += 1
        else:
            merged.append(x)
            i += 1
    # append whatever remains of the list that was not exhausted
    extra = xs[i:] if i < len(xs) else ys[j:] if j < len(ys) else []
    return merged + extra


docs = list(read_docs(docs_path))
inv_index = InvertedIndex(docs)

queries = read_queries(queries_path)
sorted_queries = sorted(queries, key=lambda q: q.id)

print("merge with conjunction ---------------")
for q in sorted_queries:
    result = inv_index.find_docs(q, merge_and)
    print(q.id, result)

print("merge with disjunction --------------")
for q in sorted_queries:
    result = inv_index.find_docs(q, merge_or)
    print(q.id, result)
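merge_and and merge_or, which the driver passes to find_docs, are not shown in this excerpt. Following the same two-pointer pattern over sorted posting lists, they could look like the sketch below; this is an assumption about their shape, not the original code.

from typing import List

def merge_and(xs: List[int], ys: List[int]) -> List[int]:
    # intersection of two sorted posting lists
    merged, i, j = [], 0, 0
    while i < len(xs) and j < len(ys):
        if xs[i] == ys[j]:
            merged.append(xs[i])
            i += 1
            j += 1
        elif xs[i] < ys[j]:
            i += 1
        else:
            j += 1
    return merged

def merge_or(xs: List[int], ys: List[int]) -> List[int]:
    # union of two sorted posting lists
    merged, i, j = [], 0, 0
    while i < len(xs) and j < len(ys):
        if xs[i] == ys[j]:
            merged.append(xs[i])
            i += 1
            j += 1
        elif xs[i] < ys[j]:
            merged.append(xs[i])
            i += 1
        else:
            merged.append(ys[j])
            j += 1
    # at most one of the tails is non-empty at this point
    return merged + xs[i:] + ys[j:]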