def __init__(self, field: str, output: Path = None, hashes: AbstractDedupHashSet = None):
    super().__init__()
    self.n_lines = 0
    self.field = field
    self.output = output
    self.hashes = FlatHashSet() if hashes is None else hashes
    self.num_hashes_start = len(self.hashes)

def test_dedup_with_np_dump(tmp_path: Path):
    hashes = tmp_path / "hashes.bin"
    documents = [
        dict(text=text("_Hello", "_World", "I'm so original")),
        dict(text=text("_world", "I'm originaler", "_Hello")),
    ]
    with dedup.HashesCollector(field="text", output=hashes) as d:
        list(d.map(documents))

    results = FlatHashSet()
    results.load_np(hashes)
    expected = set(
        str_hash(l) for l in ["_hello", "_world", "i'm so original", "i'm originaler"]
    )
    assert expected == set(results.keys())

def test_dedup_with_np_dump(self):
    tmp = self.get_tmpdir()
    documents = [
        dict(text=text("_Hello", "_World", "I'm so original")),
        dict(text=text("_world", "I'm originaler", "_Hello")),
    ]
    with dedup.HashesCollector(field="text", output=tmp("hashes.bin")) as d:
        list(d.map(documents))

    results = FlatHashSet()
    results.load_np(tmp("hashes.bin"))
    expected = set(
        str_hash(l) for l in ["_hello", "_world", "i'm so original", "i'm originaler"]
    )
    self.assertEqual(expected, set(results.keys()))

class HashesCollector(jsonql.Transformer):
    """
    Collect all hashes of the lines found in the `field` of the source documents.
    """

    parallelisable = False

    def __init__(
        self, field: str, output: Path = None, hashes: AbstractDedupHashSet = None
    ):
        super().__init__()
        self.n_lines = 0
        self.field = field
        self.output = output
        self.hashes = FlatHashSet() if hashes is None else hashes
        self.num_hashes_end = 0
        self.num_hashes_start = len(self.hashes)

    def summary(self) -> List[str]:
        summ = super().summary()
        h = self.num_hashes_end if self.hashes is None else len(self.hashes)
        h = (h - self.num_hashes_start) // 1000
        max_mem = mem_footprint_gb()
        n = self.n_lines // 1000
        summ.append(
            f"Found {h:_}k unique hashes over {n:_}k lines. Using {max_mem:.1f}GB of RAM."
        )
        return summ

    def do(self, doc: dict) -> None:
        doc_hashes = compute_hashes(doc.get(self.field))
        if doc_hashes is None:
            return
        self.hashes.add(doc_hashes)
        self.n_lines += doc_hashes.size

    def close(self):
        if self.output and self.hashes:
            self.hashes.dump(self.output)
            self.log(f"Saved {len(self.hashes)} hashes to {self.output}")
        # Save the number of hashes.
        self.num_hashes_end = len(self.hashes)
        # Free up mem even if the transformer is kept somewhere else.
        self.hashes = None  # type: ignore

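# Illustrative usage sketch (not part of the original source): collect per-line
# hashes from in-memory documents with HashesCollector, mirroring the np-dump
# tests above. The file name is hypothetical; HashesCollector and FlatHashSet
# are assumed to be in scope in this module.
def example_collect_hashes(hashes_file: Path) -> set:
    docs = [
        dict(text="Hello\nWorld"),
        dict(text="world\nGood morning"),
    ]
    # Used as a context manager so that close() runs and dumps the hashes.
    with HashesCollector(field="text", output=hashes_file) as collector:
        list(collector.map(docs))

    # Reload what was dumped, as the np-dump tests do.
    reloaded = FlatHashSet()
    reloaded.load_np(hashes_file)
    return set(reloaded.keys())
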
def _prepare(self):
    if self.duplicates is not None:
        return
    self.duplicates = FlatHashSet()

    start = time.time()
    for h in self.hashes_files:
        shard_start = time.time()
        self.duplicates.load(str(h))
        delay = time.time() - shard_start
        self.log(
            f"Loaded hashes from {h} ({mem_footprint_gb():.3f}GB total, took {delay / 60:.1f}m)"
        )

    delay = time.time() - start
    self.log(
        f"Loaded {len(self.duplicates):_d} hashes from {len(self.hashes_files)} files. ({mem_footprint_gb():.1f}GB total, took {delay / 60:.1f}m)"
    )

def deduplicate_concatenated(files, outputs, field, output_hashes, finalize=True):
    """Deduplicate several files at once, using the same set of hashes for all."""
    hashes = FlatHashSet()
    dedup_kwargs = dict(
        field=field,
        hashes=hashes,
        add_hashes=True,
        output_hashes=None,
        finalize=finalize,
    )
    assert len(files) == len(outputs)
    for f, o in zip(files, outputs):
        jsonql.run_pipe(deduplicate, dedup_kwargs, file=f, output=o)
        log(f"Saw {len(hashes)} hashes.")

    if output_hashes:
        log(f"Dumping {len(hashes)} hashes to {output_hashes}.")
        hashes.dump(output_hashes)

def merge_shard(hash_files, output):
    h = FlatHashSet()
    h.load(hash_files[0])
    for hash_file in hash_files[1:]:
        h = merge(h, hash_file, output=None)
        print(f"Merged {hash_file}. We now have {len(h)} hashes.")

    h.dump(output)
    print(f"Saved {len(h)} hashes to {output}.")

def merge(hashes_1, hashes_2, output):
    if isinstance(hashes_1, str):
        h1 = FlatHashSet()
        h1.load(hashes_1)
    else:
        h1 = hashes_1

    if isinstance(hashes_2, str):
        h2 = FlatHashSet()
        h2.load(hashes_2)
    else:
        h2 = hashes_2

    h2_np = np.fromiter(h2.keys(), dtype=FlatHashSet.dtype, count=len(h2))
    dup = h1.__contains__(h2_np)

    # Dups between h1 and h2 will be set to 1, keys unique to h2 are copied to
    # h1 with their value.
    h1[h2_np] = dup
    if output:
        h1.dump(output)
    return h1

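# Illustrative sketch (not part of the original source) of what merge() does to
# the hash-set values: keys present in both inputs end up marked as duplicates
# (True) in h1, while keys unique to h2 are copied into h1. The line strings are
# hypothetical; str_hash and FlatHashSet are assumed to be in scope.
def example_merge_semantics() -> FlatHashSet:
    h1 = FlatHashSet()
    h1.add([str_hash("shared line"), str_hash("only in h1")])

    h2 = FlatHashSet()
    h2.add([str_hash("shared line"), str_hash("only in h2")])

    merged = merge(h1, h2, output=None)
    # In `merged`, the entry for "shared line" is now flagged as a duplicate,
    # while the keys unique to either input keep their original value.
    return merged
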
class DuplicatesRemover(jsonql.Transformer):
    """DuplicatesRemover"""

    # The hashes can't be pickled so they will have to be read back from disk.
    warn_when_pickling = True

    def __init__(self, field: str, hashes_files: List[Path], collect: bool = False):
        """
        Remove duplicates
        """
        super().__init__()
        self.field = field
        self.collect = collect

        self.hashes_files = hashes_files
        self.duplicates: Optional[AbstractDedupHashSet] = None

        self.n_lines, self.n_lines_kept = 0, 0
        self.n_chars, self.n_chars_kept = 0, 0

    def _prepare(self):
        if self.duplicates is not None:
            return
        self.duplicates = FlatHashSet()

        start = time.time()
        for h in self.hashes_files:
            shard_start = time.time()
            self.duplicates.load(str(h))
            delay = time.time() - shard_start
            self.log(
                f"Loaded hashes from {h} ({mem_footprint_gb():.3f}GB total, took {delay / 60:.1f}m)"
            )

        delay = time.time() - start
        self.log(
            f"Loaded {len(self.duplicates):_d} hashes from {len(self.hashes_files)} files. ({mem_footprint_gb():.1f}GB total, took {delay / 60:.1f}m)"
        )

    def do(self, doc: dict) -> Optional[dict]:
        content = doc.get(self.field)
        if not content:
            return None
        doc_hashes = compute_hashes(content)

        assert self.duplicates is not None
        seen = (
            self.duplicates.add(doc_hashes)
            if self.collect
            else self.duplicates[doc_hashes]
        )
        keep = seen < True
        kept = keep.sum()
        if kept == 0:
            return None

        doc_hashes = doc_hashes * keep
        self.n_lines += keep.size
        self.n_lines_kept += kept
        chars, kept_chars = finalize_doc(doc, self.field, hashes=doc_hashes)
        self.n_chars += chars
        self.n_chars_kept += kept_chars
        return doc

    def summary(self) -> List[str]:
        summ = super().summary()
        end_time = time.time()
        n_lines_kept, n_lines, n_docs = self.n_lines_kept, self.n_lines, self.processed
        speed = n_docs / (end_time - self.start_time)
        summ.append(
            f"Processed {self.n_lines} lines in {n_docs} docs. [{speed:.1f} doc/s]"
        )
        selectivity = self.n_lines_kept / self.n_lines if n_lines else 0
        summ.append(f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")

        n_chars_kept, n_chars = self.n_chars_kept, self.n_chars
        selectivity = n_chars_kept / n_chars if n_chars else 0
        summ.append(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")
        return summ

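# Illustrative usage sketch (not part of the original source): run
# DuplicatesRemover over a JSON-lines file through jsonql.run_pipes, the same
# entry point used by the single-pass path of the sharded dedup below. The
# file names are hypothetical.
def example_remove_duplicates(
    input_file: Path, output_file: Path, hashes_files: List[Path]
) -> None:
    remover = DuplicatesRemover(field="text", hashes_files=hashes_files)
    # _prepare() loads all hash shards into one FlatHashSet before processing.
    remover._prepare()
    jsonql.run_pipes(remover, file=input_file, output=output_file)
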
def remove_duplicates_sharded(
    files: List[Path],
    outputs: List[Path],
    hashes_dir: FilesOrDir,
    field: str,
    group_hashes: int = 1,
    tmp_dir: Path = None,
    min_len: int = 0,
):
    """Remove duplicates in several passes, when all hashes don't fit in RAM.

    Note: The current implementation is not doing a 'perfect' deduplication.
    If a hash appears exactly once in each shard of hashes it won't be detected
    as a duplicate. This can be fixed if the hashes are fully deduplicated beforehand.
    """
    assert len(files) == len(outputs)

    if isinstance(hashes_dir, list):
        hashes_files = hashes_dir
    else:
        hashes_files = sorted(
            h for h in Path(hashes_dir).iterdir() if h.suffix == ".bin"
        )
    assert len(hashes_files) > 0, f"no hashes files found in: {hashes_dir}"

    if len(hashes_files) <= group_hashes:
        log(f"All hashes can be done in one pass, using DuplicatesRemover on {files}")
        rm_dups = DuplicatesRemover(field, hashes_files)
        rm_dups._prepare()
        run_par(
            (jsonql.run_pipes, (rm_dups,), dict(file=f, output=o))
            for f, o in zip(files, outputs)
        )
        return

    log(f"Starting deduplicate_sharded on {files}.")
    tmp_directory = tempfile.TemporaryDirectory(dir=str(tmp_dir) if tmp_dir else None)

    def tmp_files(i):
        return [
            Path(tmp_directory.name) / (f.name.split(".")[0] + f".{i}.bin")
            for f in files
        ]

    last = tmp_files(0)
    run_par((_dump_sentence_hashes, (f, tmp, field), {}) for f, tmp in zip(files, last))

    if isinstance(hashes_dir, list):
        hashes_files = hashes_dir
    else:
        hashes_files = sorted(
            h for h in Path(hashes_dir).iterdir() if h.suffix == ".bin"
        )

    for i, group in enumerate(jsonql.grouper(hashes_files, group_hashes)):
        hashes = FlatHashSet()
        for h in group:
            hashes.load(h)
            log(f"Loaded {h}, up to {len(hashes)} hashes ({mem_footprint_gb()}GB)")

        intermediates = tmp_files(i + 1)
        # Remove hashes in parallel. Since modern OS have "copy-on-write" and
        # `hashes` is read-only, we will only have one version of it in RAM.
        run_par(
            (_remove_duplicate_hashes, (hashes, f, tmp), {})
            for f, tmp in zip(last, intermediates)
        )
        # Force hashes to be freed, before we start allocating a new one.
        del hashes
        gc.collect()

        for tmp in last:
            os.remove(tmp)
        last = intermediates

    def finalize(source, dedup_hashes, min_len):
        n_chars, n_chars_kept = 0, 0
        with open(dedup_hashes, "rb") as hashes:
            for doc in jsonql.read_jsons(source):
                content = doc.get(field)
                if not content or len(content) < min_len:
                    continue
                sentences = content.split("\n")
                doc_hashes = np.fromfile(hashes, dtype=HASH_TYPE, count=len(sentences))
                chars, kept_chars = finalize_doc(doc, field, doc_hashes)
                n_chars += chars
                n_chars_kept += kept_chars
                yield doc

        selectivity = n_chars_kept / n_chars if n_chars else 0
        log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")

    dedup_hashes = last
    run_par(
        [
            (
                jsonql.run_pipe,
                (finalize,),
                dict(kwargs=dict(dedup_hashes=h, min_len=min_len), file=f, output=o),
            )
            for h, f, o in zip(dedup_hashes, files, outputs)
        ]
    )

    tmp_directory.cleanup()

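# Illustrative usage sketch (not part of the original source): a minimal call to
# remove_duplicates_sharded, mirroring the sharded tests below. The paths are
# hypothetical; the hashes directory is expected to hold one ".bin" FlatHashSet
# dump per shard, and the output directory is assumed to already exist.
def example_remove_duplicates_sharded(workdir: Path) -> None:
    remove_duplicates_sharded(
        files=[workdir / "data" / "part_0.json", workdir / "data" / "part_1.json"],
        outputs=[workdir / "res" / "part_0.json", workdir / "res" / "part_1.json"],
        field="text",
        hashes_dir=workdir / "hashes",
        group_hashes=1,  # load one hash shard per pass to bound RAM usage
    )
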
def test_remove_duplicates_sharded(self):
    data = self.get_tmpdir()
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data("part_0.json"), part_0)
    part_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data("part_1.json"), part_1)

    h = self.get_tmpdir()
    h0 = FlatHashSet()
    h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
    h0.add([str_hash("_world")])
    h0.dump(h("part_0.bin"))
    self.assertEqual(
        {
            str_hash("hello"): False,
            str_hash("_world"): True,
            str_hash("i'm so original"): False,
        },
        as_dict(h0),
    )

    h1 = FlatHashSet()
    h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
    h1.add([str_hash("_good morning")])
    h1.dump(h("part_1.bin"))
    self.assertEqual(
        {
            str_hash("_good morning"): True,
            str_hash("_world"): False,
            str_hash("i'm originaler"): False,
        },
        as_dict(h1),
    )

    res = self.get_tmpdir()
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data("part_0.json"), data("part_1.json")],
        outputs=[res("part_0.json"), res("part_1.json")],
        field="text",
        hashes_dir=h(),
    )

    with open(res("part_0.json")) as o:
        lines = o.readlines()
        print(lines)
        results_0 = list(jsonql.read_jsons(lines))
    expected_0 = [
        dict(text=text("Hello", "I'm so original"), original_nlines=3, nlines=2)
    ]
    assert_documents_equal(expected_0, results_0, ignoring=CUMBERSOME)

    with open(res("part_1.json")) as o:
        results_1 = [json.loads(l) for l in o.readlines()]
    # First pass removes "_world", second "_good morning".
    expected_1 = [dict(text=text("I'm originaler"), original_nlines=3, nlines=1)]
    assert_documents_equal(expected_1, results_1, ignoring=CUMBERSOME)

def deduplicate(
    source, field, hashes=None, output_hashes=None, add_hashes=True, finalize=True
):
    """
    DOES TOO MANY THINGS
    Removes duplicate lines found in the field `field` of the source documents.

    Finds duplicate lines based on the hashes. Either the hashes can be computed
    when reading the documents or they can be loaded from a binary file.

    If `add_hashes` is set to False only the given hashes will be considered.
    This gives better control over the memory footprint.
    """
    hash_field = field + "_hash"
    if isinstance(hashes, str) or isinstance(hashes, Path):
        seen = FlatHashSet()
        seen.load(hashes)
    elif hashes is not None:
        seen = hashes
    else:
        seen = FlatHashSet()
    log(f"Loaded {len(seen)} unique hashes.")
    n_doc = 0
    batch_size = 100_000
    n_lines, n_lines_kept = 0, 0
    n_chars, n_chars_kept = 0, 0
    t = time.time()

    def log_stats(start_time):
        end_time = time.time()
        speed = batch_size / (end_time - start_time)
        if add_hashes:
            log(
                f"Saw {len(seen)} unique hashes over {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
            )
        else:
            log(f"Processed {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]")
        max_mem = mem_footprint_gb()
        log(f"Used up to {max_mem:.1f}GB of RAM.")
        selectivity = n_lines_kept / n_lines if n_lines else 0
        log(f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")
        if finalize:
            selectivity = n_chars_kept / n_chars if n_chars else 0
            log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")

    for doc in jsonql.read_jsons(source):
        n_doc += 1
        if n_doc % batch_size == 0:
            log_stats(t)
            t = time.time()

        hashes = doc.get(hash_field) or compute_hashes(doc.get(field))
        if hashes is None:
            continue
        if isinstance(hashes, list):
            hashes = np.array(hashes, dtype=HASH_TYPE)

        duplicate = seen.__contains__(hashes)
        if add_hashes:
            seen.add(hashes, duplicate)

        keep = duplicate < 1
        kept = keep.sum()
        hashes = hashes * keep
        doc[hash_field] = list(int(x) for x in hashes)
        n_lines += keep.size
        n_lines_kept += kept
        if finalize:
            chars, kept_chars = finalize_doc(doc, field)
            n_chars += chars
            n_chars_kept += kept_chars
        if kept > 0:
            yield doc

    log_stats(t)

    if output_hashes:
        log(f"Dumping {len(seen)} hashes to {output_hashes}.")
        seen.dump(output_hashes)

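# Illustrative usage sketch (not part of the original source): deduplicate() is
# a generator, so it is meant to be driven by jsonql.run_pipe, as
# deduplicate_concatenated does above. The file names are hypothetical.
def example_deduplicate(input_file: Path, output_file: Path, hashes_file: Path) -> None:
    jsonql.run_pipe(
        deduplicate,
        dict(field="text", add_hashes=True, output_hashes=hashes_file),
        file=input_file,
        output=output_file,
    )
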
def close(self):
    if self.output and self.hashes:
        self.hashes.dump(self.output)
    # Free up mem even if the transformer is kept somewhere else.
    self.hashes = FlatHashSet()

def test_remove_duplicates_sharded(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)

    h = tmp_path / "hashes"
    h.mkdir()
    h0 = FlatHashSet()
    h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
    h0.add([str_hash("_world")])
    h0.dump(h / "part_0.bin")
    assert {
        str_hash("hello"): False,
        str_hash("_world"): True,
        str_hash("i'm so original"): False,
    } == as_dict(h0)

    h1 = FlatHashSet()
    h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
    h1.add([str_hash("_good morning")])
    h1.dump(h / "part_1.bin")
    assert {
        str_hash("_good morning"): True,
        str_hash("_world"): False,
        str_hash("i'm originaler"): False,
    } == as_dict(h1)

    res = tmp_path / "res"
    res.mkdir()
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data / "part_0.json", data / "part_1.json"],
        outputs=[res / "part_0.json", res / "part_1.json"],
        field="text",
        hashes_dir=h,
    )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    # First pass removes "_world", second "_good morning".
    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(text=text("I'm originaler"), original_nlines=3, nlines=1, line_ids=[2])
    ]
    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)
