def test_dedup_with_np_dump(tmp_path: Path):
    """Collect hashes into a dump file, reload it, and compare the keys.

    Two documents are run through ``dedup.HashesCollector`` (field ``"text"``)
    with its output pointed at ``hashes.bin`` under ``tmp_path``. The file is
    then read back with ``FlatHashSet.load_np`` and its keys must equal the
    ``str_hash`` of the four expected strings.
    """
    dump_file = tmp_path / "hashes.bin"
    first_doc = {"text": text("_Hello", "_World", "I'm so original")}
    second_doc = {"text": text("_world", "I'm originaler", "_Hello")}
    documents = [first_doc, second_doc]

    with dedup.HashesCollector(field="text", output=dump_file) as collector:
        # Drain the mapped stream so every document goes through the collector.
        for _ in collector.map(documents):
            pass

    results = FlatHashSet()
    results.load_np(dump_file)

    # NOTE(review): the expected strings are lower-cased, presumably because
    # the collector normalizes lines before hashing -- confirm in dedup.
    unique_lines = ("_hello", "_world", "i'm so original", "i'm originaler")
    expected = {str_hash(line) for line in unique_lines}
    assert expected == set(results.keys())
def test_dedup_with_np_dump(self):
    """Collect hashes into a dump file, reload it, and compare the keys.

    Two documents are run through ``dedup.HashesCollector`` (field ``"text"``)
    writing to ``hashes.bin`` as resolved by the ``self.get_tmpdir()`` helper.
    The file is then read back with ``FlatHashSet.load_np`` and its keys must
    equal the ``str_hash`` of the four expected strings.
    """
    tmp = self.get_tmpdir()
    first_doc = {"text": text("_Hello", "_World", "I'm so original")}
    second_doc = {"text": text("_world", "I'm originaler", "_Hello")}
    documents = [first_doc, second_doc]

    collector_cm = dedup.HashesCollector(field="text", output=tmp("hashes.bin"))
    with collector_cm as collector:
        # Drain the mapped stream so every document goes through the collector.
        for _ in collector.map(documents):
            pass

    results = FlatHashSet()
    results.load_np(tmp("hashes.bin"))

    # NOTE(review): the expected strings are lower-cased, presumably because
    # the collector normalizes lines before hashing -- confirm in dedup.
    unique_lines = ("_hello", "_world", "i'm so original", "i'm originaler")
    expected = {str_hash(line) for line in unique_lines}
    self.assertEqual(expected, set(results.keys()))