class TestTFIDF(unittest.TestCase):
    """Exercise TFIDF against a tiny hand-built corpus of four documents.

    Each document is a two-element tuple of strings.  The expected vectors
    below count words from both elements, and the stopword "stop" never
    shows up in them.
    """

    def setUp(self):
        stopwords = ["stop"]
        keywords = ["aaa", "bbb", "ccc", "ddd", "eee", "fff"]
        documents = [
            ("document 1 ccc", "aaa aaa aaa ccc"),
            ("document 2 stop", "stop aaa bbb ccc"),
            ("document 3 stop", "aaa"),
            ("document 4 ddd", "aaa bbb ccc ddd eee"),
        ]
        self.s = TFIDF(keywords, documents, Cleaner(stopwords))

    def test_keyword_setup(self):
        """Keywords map to consecutive indexes, in the given order."""
        expected = [("aaa", 0), ("bbb", 1), ("ccc", 2),
                    ("ddd", 3), ("eee", 4), ("fff", 5)]
        # list() keeps the ordered comparison valid when items() returns a
        # view object (Python 3) rather than a list (Python 2).
        actual = list(self.s.keywords.items())
        self.assertEqual(actual, expected)

    def test_documents_setup(self):
        """Every document becomes a vector of per-keyword counts."""
        expected = {
            0: [3, 0, 2, 0, 0, 0],
            1: [1, 1, 1, 0, 0, 0],
            2: [1, 0, 0, 0, 0, 0],
            3: [1, 1, 1, 2, 1, 0],
        }
        self.assertEqual(self.s.document_vectors, expected)

    def test_search_with_no_results(self):
        """A keyword that occurs in no document yields no hits."""
        self.assertEqual(self.s.search("fff"), [])

    def test_search_with_only_popular_terms(self):
        """A keyword present in every document yields no hits."""
        expected = []  # because idf=0
        self.assertEqual(self.s.search("aaa"), expected)

    def test_tf(self):
        """Check term frequencies for selected (document, term) pairs.

        The expected values equal the term's count divided by the largest
        count in that document's vector.
        """
        cases = [
            (0, "ccc", 2.0 / 3.0),
            (0, "aaa", 1.0),
            (1, "aaa", 1.0),
            (2, "aaa", 1.0),
            (3, "aaa", 0.5),
        ]
        for doc_index, term, expected in cases:
            document = self.s.document_vectors[doc_index]
            actual = self.s.tf(document, term)
            self.assertAlmostEqual(actual, expected)

    def test_idf(self):
        """Check inverse document frequencies for every keyword.

        The expected values are log10(4 / n), where n is the number of the
        four documents containing the term; an unused term gives 0.0.
        """
        expected_results = [
            ("aaa", math.log(1.0, 10)),
            ("bbb", math.log(2.0, 10)),
            ("ccc", math.log(4.0 / 3.0, 10)),
            ("ddd", math.log(4.0, 10)),
            ("eee", math.log(4.0, 10)),
            ("fff", 0.0),
        ]
        for term, expected in expected_results:
            self.assertAlmostEqual(self.s.idf(term), expected)
# Remaining command-line options; `parser` is created before this point.
parser.add_argument('-s', '--stopwords', help="Stopwords file path",
                    default="data/stopwords.txt")
parser.add_argument('-d', '--documents', help="Documents file path",
                    default="data/documents-2.txt")
# type=int makes argparse reject a non-numeric value with a usage error
# instead of failing later with a ValueError traceback.
parser.add_argument('-n', '--noresults', help="Number of displayed results",
                    type=int, default=5)
parser.add_argument('-v', '--version', action='version',
                    version='%(prog)s 0.3')
args = parser.parse_args()

# Load the corpus and build the search index.
# `args.keywords` comes from an option declared before this point.
keywords = Loader.load_keywords(args.keywords)
stopwords = Loader.load_stopwords(args.stopwords)
documents = Loader.load_documents(args.documents)
n = args.noresults

cleaner = Cleaner(stopwords)
tfidf = TFIDF(keywords, documents, cleaner)

# Interactive loop: answer queries until the user types "exit()" or closes
# standard input.
prompt = "Enter search string or \"exit()\" and press enter: "
while True:
    try:
        question = raw_input(prompt)
    except EOFError:
        # Closed/piped stdin ends the session instead of raising.
        break
    if question == "exit()":
        break

    # Print at most n hits, one "similarity<TAB>title" line each.
    found = tfidf.search(question)
    for title, similarity, _index in found[:n]:
        # NOTE(review): "4f" is a minimum width of 4, which never pads a
        # float; ".4f" (four decimals) may have been intended - confirm.
        print("{0:4f}\t{1}".format(similarity, title))

    # Clustering is redone after every query and does not use the query.
    # NOTE(review): 9 and 10 are presumably the cluster count and the
    # iteration limit - confirm against TFIDF.group_kmeans.
    groups = tfidf.group_kmeans(9, 10)
    for i, group in enumerate(groups):
        print("\nGroup {0}:\n".format(i))
        for doc_id in group:
            print("\t{0}\n".format(documents[doc_id][0]))

    # Later prompts start on a fresh line.
    prompt = "\nEnter search string or \"exit()\" and press enter: "
class TestTFIDF_InfoRetrieval(unittest.TestCase):
    """Exercise TFIDF on the four-document fixture loaded from disk.

    The expected keyword names ("agenc", "inform", "retriev") differ from
    the raw keywords passed in, so the keywords are normalized somewhere
    before they are stored.
    """

    def setUp(self):
        stopwords = ["stop"]
        keywords = ["information", "agency", "retrieval"]
        # The fixture file presumably holds these four documents (verify
        # against data/documents-lab1.txt):
        #   ("Document 1", "information retrieval information retrieval")
        #   ("Document 2", "retrieval retrieval retrieval retrieval")
        #   ("Document 3", "agency information retrieval agency")
        #   ("Document 4", "retrieval agency retrieval agency")
        documents = Loader.load_documents("data/documents-lab1.txt")
        self.s = TFIDF(keywords, documents, Cleaner(stopwords))

    def test_keyword_setup(self):
        """Keywords are stored normalized and indexed in sorted order."""
        expected = [("agenc", 0), ("inform", 1), ("retriev", 2)]
        # list() keeps the ordered comparison valid when items() returns a
        # view object (Python 3) rather than a list (Python 2).
        actual = list(self.s.keywords.items())
        self.assertEqual(actual, expected)

    def test_documents_setup(self):
        """Every document becomes a vector of per-keyword counts."""
        expected = {0: [0, 2, 2], 1: [0, 0, 4], 2: [2, 1, 1], 3: [2, 0, 2]}
        self.assertEqual(self.s.document_vectors, expected)

    def test_tf(self):
        """Check the term frequency of every keyword in every document."""
        expected_results = [
            (0, [0, 1, 1]),
            (1, [0, 0, 1]),
            (2, [1, 0.5, 0.5]),
            (3, [1, 0, 1]),
        ]
        for index, expected_vector in expected_results:
            document = self.s.document_vectors[index]
            for word, position in self.s.keywords.items():
                actual = self.s.tf(document, word)
                # Computed floats: compare with a tolerance, not exactly.
                self.assertAlmostEqual(actual, expected_vector[position])

    def test_idf(self):
        """Check the inverse document frequency of each keyword."""
        expected_results = [
            ("inform", math.log(2, 10)),
            ("retriev", 0.0),
            ("agenc", math.log(2, 10)),
        ]
        for term, expected in expected_results:
            self.assertAlmostEqual(self.s.idf(term), expected, places=6)

    def test_tfidf(self):
        """Check the tf-idf vector computed for each document."""
        log2 = math.log(2, 10)
        expected_results = [
            (0, [0, log2, 0]),
            (1, [0, 0, 0]),
            (2, [log2, 0.5 * log2, 0]),
            (3, [log2, 0, 0]),
        ]
        for index, expected_vector in expected_results:
            document = self.s.document_vectors[index]
            actual_vector = self.s.tfidf(document)
            for actual, expected in zip(actual_vector, expected_vector):
                self.assertAlmostEqual(actual, expected, places=6)

    def test_similarity(self):
        """Check each document's similarity to "information retrieval"."""
        expected_results = [(0, 1), (1, 0), (2, math.sqrt(0.2)), (3, 0)]
        question_vector = self.s.phrase_to_vector("information retrieval")
        question_tfidfs = self.s.tfidf(question_vector)
        for index, expected in expected_results:
            actual = self.s.doc_question_similarity(index, question_tfidfs)
            # The similarity is a computed float; exact equality is brittle.
            self.assertAlmostEqual(actual, expected)

    def test_search(self):
        """Search returns the matching documents as (title, score, index)."""
        expected = [("Document 1", 1.0, 0),
                    ("Document 3", math.sqrt(0.2), 2)]
        actual = self.s.search("information retrieval")
        self.assertEqual(len(actual), len(expected))
        for found, wanted in zip(actual, expected):
            title, similarity, index = found
            wanted_title, wanted_similarity, wanted_index = wanted
            # Title and index must match exactly; the score only closely.
            self.assertEqual((title, index), (wanted_title, wanted_index))
            self.assertAlmostEqual(similarity, wanted_similarity)