def load_documents(use_tfidf=False, doc_frequencies=None):
    """Map every ``.txt`` file under ``documents`` to its term vector.

    The ``documents`` directory (relative to the current working
    directory) is walked recursively. Each file whose name ends in
    ``.txt`` is passed through ``listwords`` and then ``countwords`` to
    obtain its term vector.

    Args:
        use_tfidf: When true, each term vector is passed through
            ``make_tfidf`` together with ``doc_frequencies`` and the
            result of that call is stored instead of the raw vector.
        doc_frequencies: Forwarded unchanged to ``make_tfidf``; only
            consulted when ``use_tfidf`` is true.

    Returns:
        A dict keyed by the joined path of each file (directory as
        reported by ``os.walk`` plus file name) whose values are the
        corresponding vectors. Empty when no ``.txt`` file is found.
    """
    vectors_by_path = {}
    for folder, _subfolders, filenames in os.walk("documents"):
        for filename in filenames:
            # Only plain-text documents are indexed.
            if not filename.endswith(".txt"):
                continue
            path = os.path.join(folder, filename)
            counts = countwords(listwords(path))
            vectors_by_path[path] = (
                make_tfidf(counts, doc_frequencies) if use_tfidf else counts
            )
    return vectors_by_path
def load_documents(use_tfidf=False, doc_frequencies=None):
    """Return a dict from document path to term vector.

    Documents are the files ending in ``.txt`` found anywhere below the
    ``documents`` directory (resolved against the current working
    directory).

    Args:
        use_tfidf: If true, store ``make_tfidf(vector, doc_frequencies)``
            for each document rather than the vector produced by
            ``countwords``.
        doc_frequencies: Passed straight through to ``make_tfidf``; not
            used when ``use_tfidf`` is false.

    Returns:
        A dict whose keys are the file paths built with ``os.path.join``
        and whose values are the per-document vectors. The dict is empty
        if the walk yields no ``.txt`` files.
    """

    def _text_file_paths():
        # Lazily yield paths so discovery and loading stay interleaved.
        for directory, _children, entries in os.walk("documents"):
            for entry in entries:
                if entry.endswith(".txt"):
                    yield os.path.join(directory, entry)

    result = {}
    for doc_path in _text_file_paths():
        raw_vector = countwords(listwords(doc_path))
        if use_tfidf:
            result[doc_path] = make_tfidf(raw_vector, doc_frequencies)
        else:
            result[doc_path] = raw_vector
    return result
def main():
    """Run an interactive search prompt over the loaded documents.

    Loads the documents once, then repeatedly reads a query from standard
    input, lower-cases and whitespace-splits it, and prints every result
    from ``search`` whose second element is truthy (presumably a
    non-zero score -- confirm against ``search``). Blank queries are
    skipped. The loop ends when ``input`` raises ``EOFError`` (ctrl-D).
    """
    # Computed up front but currently unused: the documents below are
    # loaded with raw counts. To use TF-IDF weights instead, load with
    # load_documents(use_tfidf=True, doc_frequencies=frequencies).
    frequencies = document_frequencies()
    documents = load_documents()

    print("press ctrl-D to exit!")
    try:
        while True:
            text = input("query> ").strip().lower()
            if text:
                query_vector = countwords(text.split())
                for result in search(query_vector, documents):
                    if result[1]:
                        print(result)
    except EOFError:
        # Finish the prompt line that ctrl-D left hanging.
        print()
    print("OK thanks!")
def testCountWords(self):
    """Check ``countwords`` against the ``words`` fixture.

    ``words`` is defined outside this method. The expectations are that
    'the' is counted twice, 'fox' once, and that looking up a term that
    should not occur ('mango') yields 0 rather than raising.
    """
    tally = countwords(words)
    expectations = (("the", 2), ("fox", 1), ("mango", 0))
    for term, expected in expectations:
        assert tally[term] == expected