def test_iter(self):
    """Check that iterating a bag yields its (word, count) pairs."""
    empty_pairs = sorted(iter(BagOfWords("")))
    self.assertSequenceEqual(empty_pairs, [])

    distinct_pairs = sorted(iter(BagOfWords("cat cow dog")))
    self.assertSequenceEqual(
        distinct_pairs, [("cat", 1), ("cow", 1), ("dog", 1)])

    # Same check, this time passing the text by keyword.
    repeated_pairs = sorted(iter(BagOfWords(text="cat dog cat")))
    self.assertSequenceEqual(repeated_pairs, [("cat", 2), ("dog", 1)])
def test_init(self):
    """Check the word counts built from plain text."""
    distinct = BagOfWords("cat dog cow")
    self.assertDictEqual(distinct.values, {"cat": 1, "dog": 1, "cow": 1})

    # "Cat" and "cat" are expected to be counted under the same key.
    mixed_case = BagOfWords("Cat dog cat")
    self.assertDictEqual(mixed_case.values, {"cat": 2, "dog": 1})
def test_init_with_symbols_in_str(self):
    """Check that punctuation in the text does not end up in the counts."""
    punctuated = BagOfWords(" cat, dog! cow.")
    self.assertDictEqual(punctuated.values, {"cat": 1, "dog": 1, "cow": 1})

    # Repeated punctuation marks and a trailing space, text given by keyword.
    noisy = BagOfWords(text="cat dog?? cat!!! ")
    self.assertDictEqual(noisy.values, {"cat": 2, "dog": 1})
def test_union(self):
    """Check that the union of two bags adds up the counts of each word."""
    animals = BagOfWords("cat dog cow fish cat cat fish")
    mixed = BagOfWords("dog grape banana peach")

    merged = animals.union(mixed)

    expected_counts = {
        "banana": 1,
        "cat": 3,
        "cow": 1,
        "dog": 2,
        "fish": 2,
        "grape": 1,
        "peach": 1,
    }
    self.assertDictEqual(merged.values, expected_counts)
def test_document_len(self):
    """Check document_len() on plain bags, their intersection and union."""
    animals = BagOfWords("cat dog cow fish cat cat fish")
    mixed = BagOfWords("dog grape banana peach")

    self.assertEqual(animals.document_len(), 7)
    self.assertEqual(mixed.document_len(), 4)

    shared = animals.intersection(mixed)
    self.assertEqual(shared.document_len(), 1)

    combined = animals.union(mixed)
    self.assertEqual(combined.document_len(), 11)
def test_init_with_symbols_in_str(self):
    """Test initialization with strings that contain punctuation symbols."""
    punctuated = BagOfWords(" cat, dog! cow.")
    self.assertDictEqual(punctuated.values, {"cat": 1, "dog": 1, "cow": 1})

    # Repeated punctuation marks and a trailing space, text given by keyword.
    noisy = BagOfWords(text="cat dog?? cat!!! ")
    self.assertDictEqual(noisy.values, {"cat": 2, "dog": 1})
def test_init_with_str(self):
    """Test initialization with strings."""
    distinct = BagOfWords("cat dog cow")
    self.assertDictEqual(distinct.values, {"cat": 1, "dog": 1, "cow": 1})

    # "Cat" and "cat" are expected to be counted under the same key.
    mixed_case = BagOfWords(text="Cat dog cat")
    self.assertDictEqual(mixed_case.values, {"cat": 2, "dog": 1})
def setUp(self):
    """Build a fresh indexer holding every text in ``self.texts``.

    Each text is stripped and indexed with stemming and stopword
    filtering switched off.
    """
    self.indexer = Indexer()
    for raw_text in self.texts:
        document = BagOfWords(
            raw_text.strip(),
            enable_stemming=False,
            filter_stopwords=False,
        )
        self.indexer.index(document)
def test_str(self):
    """Check that str() of a bag looks like a dict of word counts."""
    rendered = str(BagOfWords("cat dog cow"))

    self.assertTrue(rendered.startswith("{"))
    # The order of the entries is not asserted, only their presence.
    for fragment in ("'cat': 1", "'dog': 1", "'cow': 1"):
        self.assertIn(fragment, rendered)
    self.assertTrue(rendered.endswith("}"))
def test_document_len(self):
    """Test the size of the document, not of the vector."""
    animals = BagOfWords("cat dog cow fish cat cat fish")
    mixed = BagOfWords("dog grape banana peach")

    self.assertEqual(animals.document_len(), 7)
    self.assertEqual(mixed.document_len(), 4)

    shared = animals.intersection(mixed)
    self.assertEqual(shared.document_len(), 1)

    combined = animals.union(mixed)
    self.assertEqual(combined.document_len(), 11)
def test_search_4(self):
    """Search for "life ñu" and check which texts come back, best first."""
    query = BagOfWords(
        "life ñu", enable_stemming=False, filter_stopwords=False)

    # Round the scores, then order the hits from highest to lowest score.
    ranked = []
    for text, score in self.indexer.search(query, 10):
        ranked.append((text, round(score, 5)))
    ranked.sort(key=lambda hit: hit[1], reverse=True)

    self.assertEqual(len(ranked), 2)
    best_text = ranked[0][0]
    self.assertEqual(best_text, self.texts[1])
    second_text = ranked[1][0]
    self.assertEqual(second_text, self.texts[0])
def test_str(self):
    """Test the conversion to string."""
    rendered = str(BagOfWords("cat dog cow"))

    self.assertTrue(rendered.startswith("{"))
    # The order of the entries is not asserted, only their presence.
    for fragment in ("'cat': 1", "'dog': 1", "'cow': 1"):
        self.assertIn(fragment, rendered)
    self.assertTrue(rendered.endswith("}"))
def test_to_dict(self):
    """Check that to_dict() exposes the original text and the word counts."""
    source_text = " cat, dog! cow."
    exported = BagOfWords(source_text).to_dict()

    expected = {
        "text": " cat, dog! cow.",
        "values": {"cat": 1, "dog": 1, "cow": 1},
    }
    self.assertDictEqual(exported, expected)
def test_from_values_dict(self):
    """Check that from_values_dict() keeps the given counts unchanged."""
    cases = (
        {"cat": 1, "dog": 1, "cow": 1},
        {"cat": 2, "dog": 1},
    )
    for expected_counts in cases:
        # Hand over a separate dict so the expected value is never the
        # very same object the bag was built from.
        bag = BagOfWords.from_values_dict(dict(expected_counts))
        self.assertDictEqual(bag.values, expected_counts)
def test_init_with_dict(self):
    """Test initialization with dictionaries."""
    cases = (
        {"cat": 1, "dog": 1, "cow": 1},
        {"cat": 2, "dog": 1},
    )
    for expected_counts in cases:
        # Hand over a separate dict so the expected value is never the
        # very same object the bag was built from.
        bag = BagOfWords(values=dict(expected_counts))
        self.assertDictEqual(bag.values, expected_counts)
def main(args):
    """Index every line of ``args.texts`` and dump the index to disk.

    Reads ``args.texts`` as UTF-8, indexes each stripped line (stopwords
    are kept) and writes the result to ``args.index`` plus a ``.json`` or
    ``.json.gz`` extension, depending on ``args.zip``.

    Note that every line is indexed, including lines that are empty
    after stripping.

    Returns 0.
    """
    indexer = Indexer()
    with io.open(args.texts, encoding="utf-8") as texts_file:
        for raw_line in texts_file:
            document = BagOfWords(raw_line.strip(), filter_stopwords=False)
            indexer.index(document)

    # Pick the opener and the file extension together.
    if args.zip:
        open_func, index_ext = gzip.open, ".json.gz"
    else:
        open_func, index_ext = io.open, ".json"

    with open_func(args.index + index_ext, mode="wb") as index_file:
        indexer.dump(index_file)
    return 0
def test_index_creation(self):
    """Check the index built from ``self.texts`` against the expected one."""
    self.maxDiff = None  # show the complete diff if the comparison fails

    indexer = Indexer()
    for raw_text in self.texts:
        document = BagOfWords(
            raw_text.strip(),
            enable_stemming=False,
            filter_stopwords=False,
        )
        indexer.index(document)

    actual = indexer.to_dict()
    expected = self.expected
    self.assertSequenceEqual(expected["docs_index"], actual["docs_index"])
    self.assertDictEqual(expected["terms_index"], actual["terms_index"])
def test_dump(self):
    """Check that a dumped index can be read back as the expected JSON."""
    indexer = Indexer()
    for raw_text in self.texts:
        document = BagOfWords(
            raw_text.strip(),
            enable_stemming=False,
            filter_stopwords=False,
        )
        indexer.index(document)

    # Dump into an in-memory buffer and rewind it before parsing.
    buffer = StringIO()
    indexer.dump(buffer)
    buffer.seek(0)
    loaded = json.load(buffer)

    expected = self.expected
    self.assertSequenceEqual(expected["docs_index"], loaded["docs_index"])
    self.assertDictEqual(expected["terms_index"], loaded["terms_index"])
def test_dump(self):
    """Test that the generated JSON file is correct."""
    indexer = Indexer()
    for raw_text in self.texts:
        document = BagOfWords(
            raw_text.strip(),
            enable_stemming=False,
            filter_stopwords=False,
        )
        indexer.index(document)

    # Dump into an in-memory buffer and rewind it before parsing.
    buffer = StringIO()
    indexer.dump(buffer)
    buffer.seek(0)
    loaded = json.load(buffer)

    expected = self.expected
    self.assertSequenceEqual(expected["docs_index"], loaded["docs_index"])
    self.assertDictEqual(expected["words_index"], loaded["words_index"])
def test_from_dict(self):
    """Check from_dict() with a complete payload and with incomplete ones."""
    payload = {
        "text": "cat dog cow",
        "values": {"cat": 1, "dog": 1, "cow": 1},
    }
    bag = BagOfWords.from_dict(payload)
    self.assertEqual(bag.text, "cat dog cow")
    self.assertDictEqual(bag.values, {"cat": 1, "dog": 1, "cow": 1})

    # An empty payload, or one with only "text" or only "values",
    # must be rejected with ValueError.
    incomplete_payloads = (
        {},
        {"text": "blablabla"},
        {"values": {"a": 1, "b": 1}},
    )
    for incomplete in incomplete_payloads:
        with self.assertRaises(ValueError):
            BagOfWords.from_dict(incomplete)
def test_index_creation(self):
    """Test the creation of the index.

    This test uses the following example as its model:
    https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Example_of_tf%E2%80%93idf
    """
    self.maxDiff = None  # show the complete diff if the comparison fails

    indexer = Indexer()
    for raw_text in self.texts:
        document = BagOfWords(
            raw_text.strip(),
            enable_stemming=False,
            filter_stopwords=False,
        )
        indexer.index(document)

    expected = self.expected
    self.assertSequenceEqual(expected["docs_index"], indexer.docs_index)
    self.assertDictEqual(expected["words_index"], indexer.words_index)
def main(args):
    """Load a stored index and run every line of ``args.texts`` as a query.

    The index is read from ``args.index`` plus a ``.json`` or ``.json.gz``
    extension, depending on ``args.zip``. Each non-empty stripped line of
    ``args.texts`` is searched (stopwords are kept), asking for
    ``args.limit`` results; each result is printed with its score, cut to
    its first ``args.show`` characters.

    Returns 0.
    """
    indexer = Indexer()

    # Pick the opener and the file extension together.
    if args.zip:
        open_func, index_ext = gzip.open, ".json.gz"
    else:
        open_func, index_ext = io.open, ".json"
    with open_func(args.index + index_ext) as index_file:
        indexer.load(index_file)

    opening_rule = ">" * 80
    closing_rule = "<" * 80
    with io.open(args.texts, encoding="utf-8") as queries_file:
        for raw_line in queries_file:
            query_text = raw_line.strip()
            if not query_text:
                # Nothing to search for on a blank line.
                continue
            query = BagOfWords(query_text, filter_stopwords=False)
            print(opening_rule)
            print("Query:\n {}".format(query.text))
            print(opening_rule)
            for result, score in indexer.search(query, args.limit):
                excerpt = result[0:args.show].encode("utf-8")
                print("{}:\n {}".format(score, excerpt))
            print(closing_rule)
            print("\n\n")
    return 0
def test_score(self):
    """Test the scores of a word for each document.

    This test uses the following example as its model:
    https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Example_of_tf%E2%80%93idf
    """
    indexer = Indexer()
    for raw_text in self.texts:
        document = BagOfWords(
            raw_text.strip(),
            enable_stemming=False,
            filter_stopwords=False,
        )
        indexer.index(document)

    def ranked_scores(term):
        # Score entries for ``term``, sorted in descending order.
        return sorted(
            indexer.score(
                term, enable_stemming=False, filter_stopwords=False),
            reverse=True)

    # Tests the score of "example"
    example_scores = ranked_scores("example")
    self.assertAlmostEqual(example_scores[0][0], 0.129, places=3)
    self.assertEqual(example_scores[0][1], 1)

    # Tests the score of "this"
    this_scores = ranked_scores("this")
    self.assertAlmostEqual(this_scores[0][0], 0, places=3)
    self.assertEqual(this_scores[0][1], 1)
    self.assertAlmostEqual(this_scores[1][0], 0, places=3)
    self.assertEqual(this_scores[1][1], 0)

    # Tests the score of "sample"
    sample_scores = ranked_scores("sample")
    self.assertAlmostEqual(sample_scores[0][0], 0.060, places=3)
    self.assertEqual(sample_scores[0][1], 0)
def test_len(self):
    """Check that len() of a bag is its number of distinct words."""
    empty = BagOfWords("")
    self.assertEqual(len(empty), 0)

    distinct = BagOfWords("cat dog cow")
    self.assertEqual(len(distinct), 3)

    # "cat" appears twice but is counted once.
    repeated = BagOfWords(text="cat dog cat")
    self.assertEqual(len(repeated), 2)
def test_search_5(self):
    """Search for "foo bar" and check that nothing is returned."""
    query = BagOfWords(
        "foo bar", enable_stemming=False, filter_stopwords=False)

    # Round the scores, then order the hits from highest to lowest score.
    ranked = []
    for text, score in self.indexer.search(query, 10):
        ranked.append((text, round(score, 5)))
    ranked.sort(key=lambda hit: hit[1], reverse=True)

    self.assertEqual(len(ranked), 0)
def test_len(self):
    """Test the size of the vector."""
    # A bag built with no arguments at all.
    empty = BagOfWords()
    self.assertEqual(len(empty), 0)

    distinct = BagOfWords("cat dog cow")
    self.assertEqual(len(distinct), 3)

    # "cat" appears twice but is counted once.
    repeated = BagOfWords(text="cat dog cat")
    self.assertEqual(len(repeated), 2)
def test_intersection(self):
    """Test the intersection of two bags of words."""
    animals = BagOfWords("cat dog cow fish cat cat fish")
    mixed = BagOfWords("dog grape banana peach")

    # "dog" is the only word present in both texts.
    shared = animals.intersection(mixed)
    self.assertDictEqual(shared.values, {"dog": 1})
def test_intersection(self):
    """Check that the intersection keeps only the words both bags share."""
    animals = BagOfWords("cat dog cow fish cat cat fish")
    mixed = BagOfWords("dog grape banana peach")

    # "dog" is the only word present in both texts.
    shared = animals.intersection(mixed)
    self.assertDictEqual(shared.values, {"dog": 1})