def test_dedup_with_np_dump(tmp_path: Path):
    """Check that hashes dumped by ``HashesCollector`` can be read by ``load_np``.

    Two documents sharing some lines (up to letter case) are run through the
    collector, which is given ``hashes.bin`` as its output. The file is then
    loaded into a fresh ``FlatHashSet`` and its keys must be exactly the
    hashes of the four distinct lower-cased lines.
    """
    dump_file = tmp_path / "hashes.bin"
    docs = [
        {"text": text("_Hello", "_World", "I'm so original")},
        {"text": text("_world", "I'm originaler", "_Hello")},
    ]

    with dedup.HashesCollector(field="text", output=dump_file) as collector:
        # Exhaust the mapping; the mapped documents themselves are not needed.
        for _ in collector.map(docs):
            pass

    results = FlatHashSet()
    results.load_np(dump_file)

    distinct_lines = ("_hello", "_world", "i'm so original", "i'm originaler")
    expected = {str_hash(line) for line in distinct_lines}
    assert expected == set(results.keys())
def test_dedup_with_np_dump(self):
    """Check that hashes dumped by ``HashesCollector`` can be read by ``load_np``.

    Two documents sharing some lines (up to letter case) are run through the
    collector, which is given ``hashes.bin`` in the test's temporary
    directory as its output. The file is then loaded into a fresh
    ``FlatHashSet`` and its keys must be exactly the hashes of the four
    distinct lower-cased lines.
    """
    # ``tmp`` is called with a file name to obtain the location to use.
    tmp = self.get_tmpdir()
    docs = [
        {"text": text("_Hello", "_World", "I'm so original")},
        {"text": text("_world", "I'm originaler", "_Hello")},
    ]

    with dedup.HashesCollector(field="text", output=tmp("hashes.bin")) as collector:
        # Exhaust the mapping; the mapped documents themselves are not needed.
        for _ in collector.map(docs):
            pass

    results = FlatHashSet()
    results.load_np(tmp("hashes.bin"))

    distinct_lines = ("_hello", "_world", "i'm so original", "i'm originaler")
    expected = {str_hash(line) for line in distinct_lines}
    self.assertEqual(expected, set(results.keys()))
def _load_hashes(hashes):
    """Return *hashes* as a hash set, loading it from disk when given a path.

    A ``str`` or ``os.PathLike`` argument is forwarded unchanged to
    ``FlatHashSet.load`` on a fresh set; anything else is assumed to already
    be a hash set and is returned as is.
    """
    import os

    if isinstance(hashes, (str, os.PathLike)):
        loaded = FlatHashSet()
        loaded.load(hashes)
        return loaded
    return hashes


def merge(hashes_1, hashes_2, output):
    """Merge the keys of ``hashes_2`` into ``hashes_1``, flagging shared keys.

    Args:
        hashes_1: a hash set, or a path (``str`` / ``os.PathLike``) to load
            one from. When a set is passed it is modified in place.
        hashes_2: a hash set, or a path to load one from. Only its keys are
            read; its values are ignored.
        output: where to dump the merged set; nothing is written when this
            is falsy (e.g. ``None`` or ``""``).

    Returns:
        The merged hash set (the same object as ``hashes_1`` when a set was
        passed in).
    """
    h1 = _load_hashes(hashes_1)
    h2 = _load_hashes(hashes_2)

    h2_np = np.fromiter(h2.keys(), dtype=FlatHashSet.dtype, count=len(h2))
    # Call __contains__ directly: the ``in`` operator would collapse the
    # result to a single bool, whereas the result is needed as the values
    # assigned below (presumably one membership flag per key -- confirm
    # against FlatHashSet).
    dup = h1.__contains__(h2_np)

    # Every key of h2 is written into h1 with its membership flag as value:
    # keys already present in h1 are marked as duplicates, keys unique to h2
    # are added with a "not duplicated" flag. h2's own values are not used.
    h1[h2_np] = dup
    if output:
        h1.dump(output)
    return h1