def main(args):
    """Load a stored index and print search results for each query line.

    Reads the index from ``args.index`` plus ``".json.gz"`` (opened with
    ``gzip.open``) when ``args.zip`` is truthy, otherwise plus ``".json"``
    (opened with ``io.open``). Every non-blank line of the UTF-8 file
    ``args.texts`` is then used as a query.

    Args:
        args: Object exposing ``zip``, ``index``, ``texts``, ``limit`` and
            ``show`` attributes (presumably an argparse namespace --
            confirm against the caller).

    Returns:
        Always ``0``.
    """
    indexer = Indexer()

    # Pick the opener and the file suffix together so they cannot disagree.
    if args.zip:
        opener, suffix = gzip.open, ".json.gz"
    else:
        opener, suffix = io.open, ".json"

    with opener(args.index + suffix) as index_file:
        indexer.load(index_file)

    query_rule = ">" * 80
    results_rule = "<" * 80

    with io.open(args.texts, encoding="utf-8") as queries:
        for raw_line in queries:
            query = raw_line.strip()
            if not query:
                # Blank lines are not queries.
                continue

            bag = BagOfWords(query, filter_stopwords=False)
            print(query_rule)
            print("Query:\n {}".format(bag.text))
            print(query_rule)

            for result, score in indexer.search(bag, args.limit):
                # Only the first ``args.show`` characters of a hit are shown.
                # NOTE(review): .encode() yields bytes on Python 3, so this
                # prints the b'...' repr there -- confirm that is intended.
                snippet = result[0:args.show].encode("utf-8")
                print("{}:\n {}".format(score, snippet))

            print(results_rule)
            print("\n\n")

    return 0
class TestSearch(unittest.TestCase):
    """
    This test uses the following example as a model:
    https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
    """

    # Corpus indexed before every test; expectations refer to it by position.
    texts = [
        "The game of life is a game of everlasting learning",
        "The unexamined life is not worth living",
        "Never stop learning"
    ]
    indexer = None

    def setUp(self):
        """Build a fresh index holding every entry of ``texts``."""
        self.indexer = Indexer()
        for raw in self.texts:
            document = BagOfWords(raw.strip(),
                                  enable_stemming=False,
                                  filter_stopwords=False)
            self.indexer.index(document)

    def _ranked_texts(self, query):
        """Return the texts matched by ``query``, highest score first.

        Asks the indexer for up to 10 hits, rounds each score to 5 decimal
        places and orders the hits by that rounded score, descending.
        """
        bag = BagOfWords(query,
                         enable_stemming=False,
                         filter_stopwords=False)
        scored = [(text, round(score, 5))
                  for text, score in self.indexer.search(bag, 10)]
        ordered = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return [text for text, _ in ordered]

    def _assert_ranking(self, query, expected_positions):
        """Check the hit count, then each hit in order against ``texts``."""
        ranked = self._ranked_texts(query)
        self.assertEqual(len(ranked), len(expected_positions))
        for rank, position in enumerate(expected_positions):
            self.assertEqual(ranked[rank], self.texts[position])

    def test_search_1(self):
        """Two query terms: all three texts are returned, in corpus order."""
        self._assert_ranking("life learning", [0, 1, 2])

    def test_search_2(self):
        """Single term: the first and the third text are returned."""
        self._assert_ranking("learning", [0, 2])

    def test_search_3(self):
        """A term missing from the corpus followed by a known term."""
        self._assert_ranking("ñu life", [1, 0])

    def test_search_4(self):
        """Same terms as test_search_3 in the opposite order."""
        self._assert_ranking("life ñu", [1, 0])

    def test_search_5(self):
        """No query term occurs in the corpus: nothing is returned."""
        self._assert_ranking("foo bar", [])