def test_blocked_gzip(tmp_path):
    file = tmp_path / "test.gz"
    # Each object is 10/11 bytes long. We have 2 of them per block.
    content = [f'{{"xx": {i}}}' for i in range(80)]
    with jsonql.BlockedGzipWriter(file, "wt", block_size="20B") as o:
        for line in content:
            print(line, file=o)

    with jsonql.JsonReader(strict=True) as jr:
        with jsonql.smart_open(file) as f:
            read_as_one_file = list(jr.map(f))
        expected = list(jr.map(content))
        assert expected == read_as_one_file

        with jsonql.smart_open(str(file) + "[0/40]") as f:
            reader = list(f)
        assert expected[:2] == list(jr.map(l for l in reader))

        with jsonql.smart_open(str(file) + "[39/40]") as f:
            reader = list(f)
        assert expected[-2:] == list(jr.map(l for l in reader))

        readers = jsonql.get_block_readers(file, 9)
        read_as_several_files = [list(jr.map(r)) for r in readers]
        # 40 splits of 2 docs, 9 readers -> 5 splits, 10 docs per reader
        assert list(jsonql.grouper(expected, 10)) == read_as_several_files
def determine_groups(
    inputs: List[Path], target_size: int = 4 * 1024 ** 3
) -> List[List[Path]]:
    if len(inputs) == 0:
        return []

    sample = inputs[:10]
    typical_size = sum(s.stat().st_size for s in sample) / len(sample)

    group_size = min(target_size // typical_size, len(inputs))
    group_size = max(group_size, 1)

    return jsonql.grouper(inputs, group_size)
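
# Usage sketch for `determine_groups` (the directory layout and glob pattern are
# illustrative assumptions, not part of the pipeline): group mined shards so that
# each group weighs roughly the 4GB target before regrouping them into bigger files.
def _example_determine_groups(mined_dir: Path) -> None:
    inputs = sorted(mined_dir.glob("*.json.gz"))
    for i, group in enumerate(determine_groups(inputs)):
        total_gb = sum(f.stat().st_size for f in group) / 1024 ** 3
        print(f"group {i}: {len(group)} files, {total_gb:.1f}GB")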
def mine(conf: Config) -> List[Path]:
    """Remove dups, run LID and LMs, and split by lang and quality."""
    mined_dir = conf.output_dir / "mined" / conf.dump
    if conf.will_split:
        # Give a directory per shard when splitting
        outputs = [mined_dir / f"{shard:04d}" for shard in range(conf.num_shards)]
    else:
        # Files otherwise
        outputs = [
            mined_dir / f"{shard:04d}.json.gz" for shard in range(conf.num_shards)
        ]

    # TODO: try to reduce this / make it a function of "hash_in_mem" / num_langs
    mem_gb = 60 + 1 * conf.hash_in_mem
    timeout_hour = 5
    if "hashes" in conf.experiments:
        # HACK: used for generating paper figures
        outputs = [
            conf.output_dir / f"hashes_exp/{conf.dump}_0000_dedup{h:03d}.json.gz"
            for h in HASHES_IN_MEM
        ]
        mem_gb = int(max(HASHES_IN_MEM) * 1.2)
        timeout_hour = 8

    missing_outputs = [(shard, o) for shard, o in enumerate(outputs) if not o.exists()]
    if not missing_outputs:
        return outputs

    # Compute hashes first.
    hashes_groups = list(jsonql.grouper(hashes(conf), conf.hash_in_mem))

    mined_dir.mkdir(parents=True, exist_ok=True)
    ex = conf.get_executor(
        f"mine_{conf.dump}",
        mem_gb=mem_gb,
        timeout_hour=timeout_hour,
        cpus=conf.mine_num_processes + 1,
    )

    hashes_files = [
        hashes_groups[shard // conf.hash_in_mem] for shard, o in missing_outputs
    ]
    ex(_mine_shard, repeat(conf), hashes_files, *_transpose(missing_outputs))

    assert all(o.exists() for o in outputs)
    return outputs
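
# `_transpose` above turns the list of (shard, output) pairs back into two parallel
# argument lists for the executor. A minimal sketch of the assumed behaviour (the
# actual helper may differ; assumes `Tuple` is imported from `typing` alongside `List`):
def _transpose_sketch(pairs: List[Tuple[int, Path]]) -> Tuple[List[int], List[Path]]:
    shards = [shard for shard, _ in pairs]
    outputs = [output for _, output in pairs]
    return shards, outputs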
def test_blocked_gzip(tmp_path: Path):
    file = tmp_path / "test.gz"
    f = str(file)
    # Each object is 10/11 bytes long. We have 2 of them per block.
    content = ['{"xx": %d}' % i for i in range(80)]
    with jsonql.BlockedGzipWriter(file, "wt", block_size="20B") as o:
        for line in content:
            print(line, file=o)

    jr = jsonql.JsonReader(strict=True)
    expected = list(jr.map(content))
    # read as one file
    assert expected == list(jsonql.read_jsons(file))
    # read first block
    assert expected[:2] == list(jsonql.read_jsons(f + "[0/40]"))
    # read last block
    assert expected[-2:] == list(jsonql.read_jsons(f + "[39/40]"))

    readers = jsonql.get_block_readers(file, 9)
    read_as_several_files = [list(jsonql.read_jsons(r)) for r in readers]
    # 40 splits of 2 docs, 9 readers -> 5 splits, 10 docs per reader
    assert list(jsonql.grouper(expected, 10)) == read_as_several_files
def remove_duplicates_sharded(
    files: List[Path],
    outputs: List[Path],
    hashes_dir: FilesOrDir,
    field: str,
    group_hashes: int = 1,
    tmp_dir: Path = None,
    min_len: int = 0,
):
    """Remove duplicates in several passes, when all hashes don't fit in RAM.

    Note: The current implementation is not doing a 'perfect' deduplication.
    If a hash appears exactly once in each shard of hashes it won't be detected
    as a duplicate. This can be fixed if hashes are fully deduplicated beforehand.
    """
    assert len(files) == len(outputs)

    if isinstance(hashes_dir, list):
        hashes_files = hashes_dir
    else:
        hashes_files = sorted(
            h for h in Path(hashes_dir).iterdir() if h.suffix == ".bin"
        )
    assert len(hashes_files) > 0, f"no hashes files found in: {hashes_dir}"

    if len(hashes_files) <= group_hashes:
        log(f"All hashes can be done in one pass, using DuplicatesRemover on {files}")
        rm_dups = DuplicatesRemover(field, hashes_files)
        rm_dups._prepare()
        run_par(
            (jsonql.run_pipes, (rm_dups,), dict(file=f, output=o))
            for f, o in zip(files, outputs)
        )
        return

    log(f"Starting deduplicate_sharded on {files}.")
    tmp_directory = tempfile.TemporaryDirectory(dir=str(tmp_dir) if tmp_dir else None)

    def tmp_files(i):
        return [
            Path(tmp_directory.name) / (f.name.split(".")[0] + f".{i}.bin")
            for f in files
        ]

    last = tmp_files(0)
    run_par((_dump_sentence_hashes, (f, tmp, field), {}) for f, tmp in zip(files, last))

    if isinstance(hashes_dir, list):
        hashes_files = hashes_dir
    else:
        hashes_files = sorted(
            h for h in Path(hashes_dir).iterdir() if h.suffix == ".bin"
        )

    for i, group in enumerate(jsonql.grouper(hashes_files, group_hashes)):
        hashes = FlatHashSet()
        for h in group:
            hashes.load(h)
            log(f"Loaded {h}, up to {len(hashes)} hashes ({mem_footprint_gb()}GB)")

        intermediates = tmp_files(i + 1)
        # Remove hashes in parallel. Since modern OS have "copy-on-write" and
        # `hashes` is read-only, we will only have one version of it in RAM.
        run_par(
            (_remove_duplicate_hashes, (hashes, f, tmp), {})
            for f, tmp in zip(last, intermediates)
        )
        # Force hashes to be freed, before we start allocating a new one.
        del hashes
        gc.collect()

        for tmp in last:
            os.remove(tmp)
        last = intermediates

    def finalize(source, dedup_hashes, min_len):
        n_chars, n_chars_kept = 0, 0
        with open(dedup_hashes, "rb") as hashes:
            for doc in jsonql.read_jsons(source):
                content = doc.get(field)
                if not content or len(content) < min_len:
                    continue
                sentences = content.split("\n")
                doc_hashes = np.fromfile(hashes, dtype=HASH_TYPE, count=len(sentences))
                chars, kept_chars = finalize_doc(doc, field, doc_hashes)
                n_chars += chars
                n_chars_kept += kept_chars
                yield doc
        selectivity = n_chars_kept / n_chars if n_chars else 0
        log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")

    dedup_hashes = last
    run_par(
        [
            (
                jsonql.run_pipe,
                (finalize,),
                dict(kwargs=dict(dedup_hashes=h, min_len=min_len), file=f, output=o),
            )
            for h, f, o in zip(dedup_hashes, files, outputs)
        ]
    )

    tmp_directory.cleanup()
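
# Usage sketch for `remove_duplicates_sharded` (paths, field name and settings are
# illustrative assumptions, not pipeline defaults): deduplicate each shard on its
# "raw_content" field, loading two hash shards in RAM at a time.
def _example_remove_duplicates(shard_dir: Path, hashes_dir: Path, out_dir: Path) -> None:
    files = sorted(shard_dir.glob("*.json.gz"))
    out_dir.mkdir(parents=True, exist_ok=True)
    outputs = [out_dir / f.name for f in files]
    remove_duplicates_sharded(
        files,
        outputs,
        hashes_dir,
        field="raw_content",
        group_hashes=2,
        min_len=300,
    )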