def test_remove_duplicates_sharded(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)

    h = tmp_path / "hashes"
    h.mkdir()
    h0 = FlatHashSet()
    h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
    h0.add([str_hash("_world")])
    h0.dump(h / "part_0.bin")
    assert {
        str_hash("hello"): False,
        str_hash("_world"): True,
        str_hash("i'm so original"): False,
    } == as_dict(h0)

    h1 = FlatHashSet()
    h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
    h1.add([str_hash("_good morning")])
    h1.dump(h / "part_1.bin")
    assert {
        str_hash("_good morning"): True,
        str_hash("_world"): False,
        str_hash("i'm originaler"): False,
    } == as_dict(h1)

    res = tmp_path / "res"
    res.mkdir()
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data / "part_0.json", data / "part_1.json"],
        outputs=[res / "part_0.json", res / "part_1.json"],
        field="text",
        hashes_dir=h,
    )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    # First pass removes "_world", second "_good morning".
    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(text=text("I'm originaler"), original_nlines=3, nlines=1, line_ids=[2])
    ]
    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)

class HashesCollector(jsonql.Transformer):
    """
    Collect the hashes of all lines found in the `field` of the source documents.
    """

    parallelisable = False

    def __init__(
        self, field: str, output: Path = None, hashes: AbstractDedupHashSet = None
    ):
        super().__init__()
        self.n_lines = 0
        self.field = field
        self.output = output
        self.hashes = FlatHashSet() if hashes is None else hashes
        self.num_hashes_end = 0
        self.num_hashes_start = len(self.hashes)

    def summary(self) -> List[str]:
        summ = super().summary()
        h = self.num_hashes_end if self.hashes is None else len(self.hashes)
        h = (h - self.num_hashes_start) // 1000
        max_mem = mem_footprint_gb()
        n = self.n_lines // 1000
        summ.append(
            f"Found {h:_}k unique hashes over {n:_}k lines. Using {max_mem:.1f}GB of RAM."
        )
        return summ

    def do(self, doc: dict) -> None:
        doc_hashes = compute_hashes(doc.get(self.field))
        if doc_hashes is None:
            return
        self.hashes.add(doc_hashes)
        self.n_lines += doc_hashes.size

    def close(self):
        if self.output and self.hashes:
            self.hashes.dump(self.output)
            self.log(f"Saved {len(self.hashes)} hashes to {self.output}")
            # Save the number of hashes.
            self.num_hashes_end = len(self.hashes)
            # Free up mem even if the transformer is kept somewhere else.
            self.hashes = None  # type: ignore

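# Illustrative sketch (not part of the original module): collecting line hashes from an
# in-memory iterable of documents using only the HashesCollector methods defined above.
# The helper name, the `docs` iterable and the output path are hypothetical; in a real
# pipeline the transformer would normally be driven by the surrounding jsonql machinery.
def _example_collect_hashes(docs, output: Path) -> None:
    collector = HashesCollector(field="text", output=output)
    for doc in docs:
        collector.do(doc)  # hashes every line of doc["text"] into the FlatHashSet
    collector.close()  # dumps the accumulated hashes to `output` and frees the memory
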
class DuplicatesRemover(jsonql.Transformer):
    """Removes duplicate lines found in the `field` of the source documents, based on
    pre-computed hash files."""

    # The hashes can't be pickled so they will have to be read back from disk.
    warn_when_pickling = True

    def __init__(self, field: str, hashes_files: List[Path], collect: bool = False):
        """
        Remove duplicates
        """
        super().__init__()
        self.field = field
        self.collect = collect

        self.hashes_files = hashes_files
        self.duplicates: Optional[AbstractDedupHashSet] = None

        self.n_lines, self.n_lines_kept = 0, 0
        self.n_chars, self.n_chars_kept = 0, 0

    def _prepare(self):
        if self.duplicates is not None:
            return
        self.duplicates = FlatHashSet()

        start = time.time()
        for h in self.hashes_files:
            shard_start = time.time()
            self.duplicates.load(str(h))
            delay = time.time() - shard_start
            self.log(
                f"Loaded hashes from {h} ({mem_footprint_gb():.3f}GB total, took {delay / 60:.1f}m)"
            )

        delay = time.time() - start
        self.log(
            f"Loaded {len(self.duplicates):_d} hashes from {len(self.hashes_files)} files. "
            f"({mem_footprint_gb():.1f}GB total, took {delay / 60:.1f}m)"
        )

    def do(self, doc: dict) -> Optional[dict]:
        content = doc.get(self.field)
        if not content:
            return None
        doc_hashes = compute_hashes(content)

        assert self.duplicates is not None
        seen = (
            self.duplicates.add(doc_hashes)
            if self.collect
            else self.duplicates[doc_hashes]
        )
        keep = seen < True
        kept = keep.sum()
        if kept == 0:
            return None

        doc_hashes = doc_hashes * keep
        self.n_lines += keep.size
        self.n_lines_kept += kept
        chars, kept_chars = finalize_doc(doc, self.field, hashes=doc_hashes)
        self.n_chars += chars
        self.n_chars_kept += kept_chars
        return doc

    def summary(self) -> List[str]:
        summ = super().summary()
        end_time = time.time()
        n_lines_kept, n_lines, n_docs = self.n_lines_kept, self.n_lines, self.processed
        speed = n_docs / (end_time - self.start_time)
        summ.append(
            f"Processed {self.n_lines} lines in {n_docs} docs. [{speed:.1f} doc/s]"
        )
        selectivity = self.n_lines_kept / self.n_lines if n_lines else 0
        summ.append(f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")

        n_chars_kept, n_chars = self.n_chars_kept, self.n_chars
        selectivity = n_chars_kept / n_chars if n_chars else 0
        summ.append(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")
        return summ

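# Illustrative sketch (not part of the original module): filtering an in-memory iterable
# of documents against hash files previously dumped by HashesCollector, using only the
# DuplicatesRemover methods defined above. The helper name and arguments are hypothetical;
# `_prepare` is called explicitly here, whereas the surrounding framework would normally
# trigger it before the first document is processed.
def _example_remove_duplicates(docs, hashes_files: List[Path]):
    remover = DuplicatesRemover(field="text", hashes_files=hashes_files)
    remover._prepare()  # loads every hash shard into a single in-memory FlatHashSet
    for doc in docs:
        deduped = remover.do(doc)  # None if every line of the doc was already seen
        if deduped is not None:
            yield deduped
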
def test_remove_duplicates_sharded(self):
    data = self.get_tmpdir()
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data("part_0.json"), part_0)
    part_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data("part_1.json"), part_1)

    h = self.get_tmpdir()
    h0 = FlatHashSet()
    h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
    h0.add([str_hash("_world")])
    h0.dump(h("part_0.bin"))
    self.assertEqual(
        {
            str_hash("hello"): False,
            str_hash("_world"): True,
            str_hash("i'm so original"): False,
        },
        as_dict(h0),
    )

    h1 = FlatHashSet()
    h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
    h1.add([str_hash("_good morning")])
    h1.dump(h("part_1.bin"))
    self.assertEqual(
        {
            str_hash("_good morning"): True,
            str_hash("_world"): False,
            str_hash("i'm originaler"): False,
        },
        as_dict(h1),
    )

    res = self.get_tmpdir()
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data("part_0.json"), data("part_1.json")],
        outputs=[res("part_0.json"), res("part_1.json")],
        field="text",
        hashes_dir=h(),
    )

    with open(res("part_0.json")) as o:
        lines = o.readlines()
    results_0 = list(jsonql.read_jsons(lines))
    expected_0 = [
        dict(text=text("Hello", "I'm so original"), original_nlines=3, nlines=2)
    ]
    assert_documents_equal(expected_0, results_0, ignoring=CUMBERSOME)

    with open(res("part_1.json")) as o:
        results_1 = [json.loads(l) for l in o.readlines()]
    # First pass removes "_world", second "_good morning".
    expected_1 = [
        dict(text=text("I'm originaler"), original_nlines=3, nlines=1)
    ]
    assert_documents_equal(expected_1, results_1, ignoring=CUMBERSOME)

def deduplicate(
    source, field, hashes=None, output_hashes=None, add_hashes=True, finalize=True
):
    """
    DOES TOO MANY THINGS.
    Removes duplicate lines found in the field `field` of the source documents.

    Duplicate lines are detected with hashes, which can either be computed while
    reading the documents or loaded from a binary file.

    If `add_hashes` is set to False, only the given hashes are considered, which
    gives better control over the memory footprint.
    """
    hash_field = field + "_hash"
    if isinstance(hashes, (str, Path)):
        seen = FlatHashSet()
        seen.load(hashes)
    elif hashes is not None:
        seen = hashes
    else:
        seen = FlatHashSet()
    log(f"Loaded {len(seen)} unique hashes.")

    n_doc = 0
    batch_size = 100_000
    n_lines, n_lines_kept = 0, 0
    n_chars, n_chars_kept = 0, 0
    t = time.time()

    def log_stats(start_time):
        end_time = time.time()
        speed = batch_size / (end_time - start_time)
        if add_hashes:
            log(
                f"Saw {len(seen)} unique hashes over {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
            )
        else:
            log(f"Processed {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]")
        max_mem = mem_footprint_gb()
        log(f"Used up to {max_mem:.1f}GB of RAM.")
        selectivity = n_lines_kept / n_lines if n_lines else 0
        log(f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")
        if finalize:
            selectivity = n_chars_kept / n_chars if n_chars else 0
            log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")

    for doc in jsonql.read_jsons(source):
        n_doc += 1
        if n_doc % batch_size == 0:
            log_stats(t)
            t = time.time()

        hashes = doc.get(hash_field) or compute_hashes(doc.get(field))
        if hashes is None:
            continue
        if isinstance(hashes, list):
            hashes = np.array(hashes, dtype=HASH_TYPE)

        duplicate = seen.__contains__(hashes)
        if add_hashes:
            seen.add(hashes, duplicate)

        keep = duplicate < 1
        kept = keep.sum()
        hashes = hashes * keep
        doc[hash_field] = list(int(x) for x in hashes)
        n_lines += keep.size
        n_lines_kept += kept
        if finalize:
            chars, kept_chars = finalize_doc(doc, field)
            n_chars += chars
            n_chars_kept += kept_chars
        if kept > 0:
            yield doc

    log_stats(t)

    if output_hashes:
        log(f"Dumping {len(seen)} hashes to {output_hashes}.")
        seen.dump(output_hashes)

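# Illustrative sketch (not part of the original module): a single-pass run of
# `deduplicate` over one shard, computing hashes on the fly (`add_hashes=True`) and
# saving them so they can be reused on another shard later. The helper name, file
# paths and the "text" field are hypothetical placeholders.
def _example_deduplicate_shard(input_file: Path, output_hashes: Path) -> List[dict]:
    # `deduplicate` is a generator; draining it into a list triggers the whole pass
    # and the final hash dump to `output_hashes`.
    return list(
        deduplicate(input_file, "text", add_hashes=True, output_hashes=output_hashes)
    )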