def test_init(self):
    """Check the term counts produced from plain whitespace-separated text.

    The second case mixes "Cat" and "cat" and expects them to be counted
    under the single key "cat".
    """
    plain = BagOfWords("cat dog cow")
    self.assertDictEqual(plain.values, {"cat": 1, "dog": 1, "cow": 1})

    mixed_case = BagOfWords("Cat dog cat")
    self.assertDictEqual(mixed_case.values, {"cat": 2, "dog": 1})
def test_init_with_symbols_in_str(self):
    """Check that punctuation and stray spaces do not end up in the counts."""
    punctuated = BagOfWords(" cat, dog! cow.")
    self.assertDictEqual(punctuated.values, {"cat": 1, "dog": 1, "cow": 1})

    # Same check through the ``text`` keyword, with repeated punctuation.
    repeated = BagOfWords(text="cat dog?? cat!!! ")
    self.assertDictEqual(repeated.values, {"cat": 2, "dog": 1})
def test_union(self):
    """Check that ``union`` sums the per-term counts of both bags.

    "dog" occurs once in each input, so the expected merged count is 2.
    """
    left = BagOfWords("cat dog cow fish cat cat fish")
    right = BagOfWords("dog grape banana peach")

    merged = left.union(right)

    expected = {
        "banana": 1,
        "cat": 3,
        "cow": 1,
        "dog": 2,
        "fish": 2,
        "grape": 1,
        "peach": 1,
    }
    self.assertDictEqual(merged.values, expected)
def test_document_len(self):
    """Check ``document_len`` on two bags, their intersection and their union.

    Expected values: 7 and 4 for the source bags, 1 for the intersection
    and 11 for the union.
    """
    left = BagOfWords("cat dog cow fish cat cat fish")
    right = BagOfWords("dog grape banana peach")

    self.assertEqual(left.document_len(), 7)
    self.assertEqual(right.document_len(), 4)

    common = left.intersection(right)
    self.assertEqual(common.document_len(), 1)

    combined = left.union(right)
    self.assertEqual(combined.document_len(), 11)
def test_str(self):
    """Check that ``str(bag)`` looks like a dict literal of the term counts.

    Only the braces and the presence of each ``'term': count`` fragment are
    checked, so the order of the entries does not matter.
    """
    rendered = str(BagOfWords("cat dog cow"))

    self.assertTrue(rendered.startswith("{"))
    for fragment in ("'cat': 1", "'dog': 1", "'cow': 1"):
        self.assertIn(fragment, rendered)
    self.assertTrue(rendered.endswith("}"))
def setUp(self):
    """Create ``self.indexer`` and index every entry of ``self.texts``.

    Each text is stripped and turned into a bag with stemming and stopword
    filtering switched off before being indexed.
    """
    self.indexer = Indexer()
    for raw in self.texts:
        self.indexer.index(
            BagOfWords(
                raw.strip(),
                enable_stemming=False,
                filter_stopwords=False,
            )
        )
def test_from_values_dict(self):
    """Check that ``from_values_dict`` exposes the given counts as ``values``."""
    # Input and expectation are separate literals on purpose, so the
    # comparison never degenerates into comparing a dict with itself.
    cases = (
        ({"cat": 1, "dog": 1, "cow": 1}, {"cat": 1, "dog": 1, "cow": 1}),
        ({"cat": 2, "dog": 1}, {"cat": 2, "dog": 1}),
    )
    for source, expected in cases:
        bag = BagOfWords.from_values_dict(source)
        self.assertDictEqual(bag.values, expected)
def test_to_dict(self):
    """Check that ``to_dict`` returns the original text plus the term counts."""
    raw = " cat, dog! cow."

    serialized = BagOfWords(raw).to_dict()

    expected = {
        "text": raw,
        "values": {"cat": 1, "dog": 1, "cow": 1},
    }
    self.assertDictEqual(serialized, expected)
def test_search_5(self):
    """Check that the query "foo bar" yields no search results."""
    query = BagOfWords("foo bar", enable_stemming=False, filter_stopwords=False)

    hits = [
        (text, round(score, 5))
        for text, score in self.indexer.search(query, 10)
    ]
    # Highest score first.
    hits.sort(key=lambda hit: hit[1], reverse=True)

    self.assertEqual(len(hits), 0)
def main(args):
    """Index every line of ``args.texts`` and dump the index to a file.

    The output path is ``args.index`` plus ".json.gz" (written through
    ``gzip.open``) when ``args.zip`` is truthy, otherwise ``args.index``
    plus ".json" (written through ``io.open``).

    Returns:
        0 on completion.
    """
    indexer = Indexer()
    with io.open(args.texts, encoding="utf-8") as texts_file:
        for raw_line in texts_file:
            indexer.index(BagOfWords(raw_line.strip(), filter_stopwords=False))

    if args.zip:
        open_func, index_ext = gzip.open, ".json.gz"
    else:
        open_func, index_ext = io.open, ".json"

    with open_func(args.index + index_ext, mode="wb") as index_file:
        indexer.dump(index_file)
    return 0
def test_search_4(self):
    """Check the ranking returned for the query "life ñu".

    Exactly two results are expected, with ``self.texts[1]`` scoring above
    ``self.texts[0]``.
    """
    query = BagOfWords("life ñu", enable_stemming=False, filter_stopwords=False)

    hits = [
        (text, round(score, 5))
        for text, score in self.indexer.search(query, 10)
    ]
    # Highest score first.
    hits.sort(key=lambda hit: hit[1], reverse=True)

    self.assertEqual(len(hits), 2)

    best_text = hits[0][0]
    self.assertEqual(best_text, self.texts[1])

    second_text = hits[1][0]
    self.assertEqual(second_text, self.texts[0])
def test_index_creation(self):
    """Index every text and compare ``to_dict()`` against ``self.expected``.

    Both the "docs_index" and the "terms_index" entries are checked.
    """
    self.maxDiff = None  # do not truncate the diff on failure

    indexer = Indexer()
    for raw in self.texts:
        indexer.index(
            BagOfWords(
                raw.strip(),
                enable_stemming=False,
                filter_stopwords=False,
            )
        )

    snapshot = indexer.to_dict()
    self.assertSequenceEqual(self.expected["docs_index"], snapshot["docs_index"])
    self.assertDictEqual(self.expected["terms_index"], snapshot["terms_index"])
def test_dump(self):
    """Dump a populated index to a buffer and check the JSON read back.

    The decoded "docs_index" and "terms_index" entries must match
    ``self.expected``.
    """
    indexer = Indexer()
    for raw in self.texts:
        indexer.index(
            BagOfWords(
                raw.strip(),
                enable_stemming=False,
                filter_stopwords=False,
            )
        )

    buffer = StringIO()
    indexer.dump(buffer)
    buffer.seek(0)  # rewind so json.load reads from the start
    loaded = json.load(buffer)

    self.assertSequenceEqual(self.expected["docs_index"], loaded["docs_index"])
    self.assertDictEqual(self.expected["terms_index"], loaded["terms_index"])
def test_from_dict(self):
    """Check ``from_dict`` on a complete payload and on incomplete ones.

    A payload with both "text" and "values" must round-trip into the bag;
    an empty payload or one missing either key must raise ``ValueError``.
    """
    bag = BagOfWords.from_dict(
        {
            "text": "cat dog cow",
            "values": {"cat": 1, "dog": 1, "cow": 1},
        }
    )
    self.assertEqual(bag.text, "cat dog cow")
    self.assertDictEqual(bag.values, {"cat": 1, "dog": 1, "cow": 1})

    incomplete_payloads = (
        {},
        {"text": "blablabla"},
        {"values": {"a": 1, "b": 1}},
    )
    for payload in incomplete_payloads:
        self.assertRaises(ValueError, BagOfWords.from_dict, payload)
def main(args):
    """Load a dumped index and print search results for each query line.

    The index is read from ``args.index`` plus ".json.gz" (through
    ``gzip.open``) when ``args.zip`` is truthy, otherwise from
    ``args.index`` plus ".json" (through ``io.open``). Every non-empty
    line of ``args.texts`` is used as a query; up to ``args.limit``
    results are requested and the first ``args.show`` characters of each
    result are printed next to its score.

    Returns:
        0 on completion.
    """
    indexer = Indexer()

    if args.zip:
        open_func, index_ext = gzip.open, ".json.gz"
    else:
        open_func, index_ext = io.open, ".json"
    with open_func(args.index + index_ext) as index_file:
        indexer.load(index_file)

    header_rule = ">" * 80
    footer_rule = "<" * 80

    with io.open(args.texts, encoding="utf-8") as queries:
        for raw_line in queries:
            query_text = raw_line.strip()
            if not query_text:
                # Blank lines are not queries.
                continue

            bag = BagOfWords(query_text, filter_stopwords=False)
            print(header_rule)
            print("Query:\n {}".format(bag.text))
            print(header_rule)

            for result, score in indexer.search(bag, args.limit):
                excerpt = result[0:args.show].encode("utf-8")
                print("{}:\n {}".format(score, excerpt))

            print(footer_rule)
            print("\n\n")
    return 0
def test_len(self):
    """Check ``len(bag)`` for empty, all-distinct and repeated-word texts.

    "cat dog cat" is expected to have length 2, so repeated words count once.
    """
    empty = BagOfWords("")
    self.assertEqual(len(empty), 0)

    distinct = BagOfWords("cat dog cow")
    self.assertEqual(len(distinct), 3)

    repeated = BagOfWords(text="cat dog cat")
    self.assertEqual(len(repeated), 2)
def test_intersection(self):
    """Check that ``intersection`` keeps only the term shared by both bags."""
    left = BagOfWords("cat dog cow fish cat cat fish")
    right = BagOfWords("dog grape banana peach")

    shared = left.intersection(right)

    self.assertDictEqual(shared.values, {"dog": 1})
def test_iter(self):
    """Check that iterating a bag yields ``(term, count)`` pairs.

    The pairs are sorted before comparison, so iteration order is not
    part of what is tested.
    """
    empty_items = sorted(iter(BagOfWords("")))
    self.assertSequenceEqual(empty_items, [])

    distinct_items = sorted(iter(BagOfWords("cat cow dog")))
    self.assertSequenceEqual(
        distinct_items,
        [("cat", 1), ("cow", 1), ("dog", 1)],
    )

    repeated_items = sorted(iter(BagOfWords(text="cat dog cat")))
    self.assertSequenceEqual(repeated_items, [("cat", 2), ("dog", 1)])