def test_at_least_one_of_the_tokens(self):
    """Check the ranking returned for a multi-word query over two documents.

    Queries "New York Britain" against a two-document collection and asserts
    that the ranked result lists "doc1" followed by "doc2".
    """
    documents = [
        TextDocument("there is a city named New York", "doc1"),
        TextDocument("In Britain there is a city named York", "doc2"),
    ]
    engine = SearchEngine(DocumentCollection.from_document_list(documents))

    ranked = engine.ranked_documents("New York Britain")

    # Each ranked entry unpacks into a pair; only the document's id is compared.
    ranked_ids = [document.id for document, _similarity in ranked]
    self.assertEqual(ranked_ids, ['doc1', 'doc2'])
def setUp(self):
    """Build a two-document collection and store it on self.small_collection."""
    texts_with_ids = [
        ("the cat sat on a mat", "doc1"),
        ("a rose is a rose", "doc2"),
    ]
    documents = [
        TextDocument(text, doc_id) for text, doc_id in texts_with_ids
    ]
    self.small_collection = DocumentCollection.from_document_list(documents)
def test_unknown_word_cosine(self):
    """Cosine similarity must be 0 for a query of out-of-vocabulary words only."""
    # Intended to contain only words that never occur in the collection.
    unseen_query = TextDocument(text="unknownwords", id=None)
    # Any document that is part of the collection serves as the counterpart.
    known_doc = self.small_collection.docid_to_doc["doc1"]

    similarity = self.small_collection.cosine_similarity(
        unseen_query, known_doc)

    # The expected result is exactly zero rather than an undefined value.
    self.assertEqual(similarity, 0.)
def setUp(self):
    """Build a one-document collection and a SearchEngine over it.

    The single document's text contains a newline character. The collection
    is stored on self.small_collection and the engine on self.searcher.
    """
    multiline_doc = TextDocument("the cat sat\non a mat", "doc1")
    collection = DocumentCollection.from_document_list([multiline_doc])
    self.small_collection = collection
    self.searcher = SearchEngine(self.small_collection)
def setUp(self):
    """Create a document whose text starts with ">>>>" as self.test_reply_email."""
    # NOTE(review): the ">>>>" prefix presumably mimics a quoted reply line.
    reply_text = ">>>>forword"
    reply_id = "doc1"
    self.test_reply_email = TextDocument(reply_text, reply_id)