def test_invalid_settings(num_bands, default_hasher, default_cache):
    # constructing and using a Cache with an invalid num_bands setting
    # is expected to trip an internal AssertionError
    with pytest.raises(AssertionError):
        lsh = Cache(default_hasher, num_bands=num_bands)
        lsh.add_doc('Hi', 1)
        lsh.get_duplicates_of('Hello')

    # querying duplicates of an unknown doc_id raises ValueError
    default_cache.add_doc('Hi', 0)
    with pytest.raises(ValueError):
        default_cache.get_duplicates_of(doc_id=123)
def test_cache_json_serialisation(tmpdir, default_cache):
    path = str(tmpdir.join("cache.json"))

    # easy case - the bins array is still empty
    default_cache.to_json(path)
    loaded_cache = Cache.from_json(path)

    # now add some data
    doc = "This is a document"
    default_cache.add_doc(doc, 0)
    loaded_cache.add_doc(doc, 0)
    assert (default_cache.get_duplicates_of(doc) ==
            loaded_cache.get_duplicates_of(doc))
    assert (default_cache.get_duplicates_of(doc_id=0) ==
            loaded_cache.get_duplicates_of(doc_id=0))

    # serialise the populated cache and reload it
    default_cache.to_json(path)
    loaded_cache = Cache.from_json(path)

    default_cache.add_doc("The king of Denmark", 1)
    loaded_cache.add_doc("The king of Denmark", 1)
    default_cache.add_doc("The queen of Zerg", 2)
    loaded_cache.add_doc("The queen of Zerg", 2)

    default_cache.to_json(path)
    loaded_cache = Cache.from_json(path)

    # both caches must report the same duplicates after the round trip
    assert (default_cache.get_duplicates_of(doc) ==
            loaded_cache.get_duplicates_of(doc))
    for doc_id in [0, 1, 2]:
        assert (default_cache.get_duplicates_of(doc_id=doc_id) ==
                loaded_cache.get_duplicates_of(doc_id=doc_id))
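# A sketch of the persistence pattern the test above exercises: populate a
# cache, write it out with to_json and restore it later with Cache.from_json.
# The helper below is illustrative only (name and path are assumptions) and is
# not collected by pytest.
def _json_round_trip_example(default_hasher, path='lsh_cache.json'):
    cache = Cache(default_hasher)
    cache.add_doc('first document', 0)
    cache.to_json(path)
    # later, possibly in another process
    restored = Cache.from_json(path)
    return restored.get_duplicates_of(doc_id=0)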
def test_num_bands(doc):
    """
    Add near-duplicate documents to caches built with different num_bands
    settings and check that caches with a lower band width (more bands)
    find at least as many matches, aggregated over 10 random seeds.
    """
    suffixes = ['teamless', 'retired', 'awesome', 'overweight']
    duplicates = []
    divisors_of_200 = [4, 10, 20, 25, 40, 50, 100]
    for seed in range(10):
        hasher = MinHasher(seeds=200, char_ngram=5, random_state=seed)
        caches = [Cache(hasher, num_bands=n) for n in divisors_of_200]

        # index the reference document in every cache, then query with
        # near-duplicates that differ only in the final word
        for c in caches:
            c.add_doc(doc + suffixes[0], 0)
        for s in suffixes[1:]:
            duplicates.append([c.is_duplicate(doc + s) for c in caches])

    # match counts per cache, ordered from widest to narrowest bands
    sums = np.array(duplicates).sum(axis=0)
    print(sums)
    assert is_nondecreasing(sums)
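# `is_nondecreasing` is used above but not defined in this section; the helper
# below is a minimal sketch of what it could look like (assumed implementation,
# not necessarily the one in the test suite).
def is_nondecreasing(values):
    """Return True if every element is >= the element before it."""
    return all(a <= b for a, b in zip(values, values[1:]))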
def test_cache(char_ngram, hashbytes, num_bands, seed):
    hasher = MinHasher(seeds=200, char_ngram=char_ngram,
                       hashbytes=hashbytes, random_state=seed)
    lsh = Cache(hasher, num_bands=num_bands)
    # very small band width => always find duplicates

    short_doc = 'This is a simple document'
    another_doc = 'Some text about animals.'
    long_doc = ('A much longer document that contains lots of information and '
                'different words. The document produces many more shingles.')

    assert not lsh.is_duplicate(short_doc)
    lsh.add_doc(short_doc, 0)
    assert lsh.get_duplicates_of(short_doc) == {0}
    assert lsh.is_duplicate(short_doc, doc_id=0)
    assert lsh.is_duplicate(short_doc)

    assert not lsh.is_duplicate(long_doc)
    lsh.add_doc(long_doc, 1)
    lsh.add_doc(another_doc, 2)
    assert lsh.is_duplicate(another_doc)
    assert lsh.is_duplicate(long_doc, doc_id=1)
    assert lsh.is_duplicate(long_doc)

    # dropping a single word from the long document should still match it
    words = long_doc.split()
    long_doc_missing_word = ' '.join([words[0]] + words[2:])
    assert lsh.get_duplicates_of(long_doc_missing_word) == {1}
    assert lsh.is_duplicate(long_doc_missing_word)
    assert lsh.is_duplicate(long_doc + ' Word.')

    # no two indexed documents are duplicates of each other yet
    assert lsh.get_all_duplicates() == set()
    lsh.add_doc(long_doc_missing_word, 3)
    assert lsh.get_all_duplicates() == {(1, 3)}

    lsh.add_doc(long_doc_missing_word, 4)
    assert lsh.get_all_duplicates() == {(1, 3), (1, 4), (3, 4)}
def dedup(self):
    # index every document text (the first element of each record in
    # self.data) under its position in the sequence
    deduper = Cache(MinHasher(100))
    for x, doc in enumerate(self.data):
        deduper.add_doc(doc[0], x)
    # pairs of doc ids whose estimated Jaccard similarity is at least 0.80
    dups = deduper.get_all_duplicates(min_jaccard=0.80)
    return dups
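# A minimal, hypothetical context for the `dedup` method above: it is assumed
# to live on a class whose `self.data` is a sequence of records with the raw
# document text as the first element. The class and record layout below are
# illustrative only, not taken from the source.
class _Corpus:
    def __init__(self, records):
        # e.g. records = [('document text', metadata), ...]
        self.data = records

# attach the function defined above as a method; it already takes `self`
_Corpus.dedup = dedup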
@pytest.fixture
def default_cache(default_hasher):
    return Cache(default_hasher)
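# The `default_hasher` fixture is used throughout this section but not defined
# in it; a plausible sketch is given below. The parameter values are assumed,
# not taken from the source.
@pytest.fixture
def default_hasher():
    return MinHasher(seeds=100, char_ngram=5, random_state=0)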