Example 1
def test_blocked_gzip(tmp_path):
    file = tmp_path / "test.gz"
    # Each object is 10/11 bytes long. We have 2 of them per block.
    content = [f'{{"xx": {i}}}' for i in range(80)]
    with jsonql.BlockedGzipWriter(file, "wt", block_size="20B") as o:
        for line in content:
            print(line, file=o)

    with jsonql.JsonReader(strict=True) as jr:
        with jsonql.smart_open(file) as f:
            read_as_one_file = list(jr.map(f))

        expected = list(jr.map(content))
        assert expected == read_as_one_file

        with jsonql.smart_open(str(file) + "[0/40]") as f:
            reader = list(f)
        assert expected[:2] == list(jr.map(l for l in reader))

        with jsonql.smart_open(str(file) + "[39/40]") as f:
            reader = list(f)
        assert expected[-2:] == list(jr.map(l for l in reader))

        readers = jsonql.get_block_readers(file, 9)
        read_as_several_files = [list(jr.map(r)) for r in readers]
        # 40 splits of 2 docs, 9 readers -> 5 splits, 10 docs per reader
        assert list(jsonql.grouper(expected, 10)) == read_as_several_files
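
The final assertion relies on jsonql.grouper chunking the 80 expected documents into consecutive lists of 10. A minimal pure-Python sketch of that chunking behaviour (the chunk helper below is illustrative, not the jsonql.grouper implementation):

import itertools
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunk(items: Iterable[T], size: int) -> Iterator[List[T]]:
    """Yield consecutive lists of `size` items; the last list may be shorter."""
    it = iter(items)
    while True:
        group = list(itertools.islice(it, size))
        if not group:
            return
        yield group

# 80 documents in groups of 10 -> 8 groups, mirroring the grouping asserted above.
assert len(list(chunk(range(80), 10))) == 8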
Example 2
def determine_groups(inputs: List[Path],
                     target_size: int = 4 * 1024**3) -> List[List[Path]]:
    if len(inputs) == 0:
        return []

    sample = inputs[:10]
    typical_size = sum(s.stat().st_size for s in sample) / len(sample)
    group_size = min(target_size // typical_size, len(inputs))
    group_size = max(int(group_size), 1)

    return jsonql.grouper(inputs, group_size)
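
For illustration, the sizing arithmetic with made-up numbers: files averaging roughly 200 MiB and a 4 GiB target size give groups of about 20 files, clamped to the number of inputs and floored at 1.

# Hypothetical numbers, purely to illustrate the arithmetic in determine_groups.
target_size = 4 * 1024 ** 3     # 4 GiB per group
typical_size = 200 * 1024 ** 2  # sampled files average ~200 MiB
n_inputs = 100

group_size = min(target_size // typical_size, n_inputs)  # 20 files per group
group_size = max(int(group_size), 1)
assert group_size == 20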
Example 3
def mine(conf: Config) -> List[Path]:
    """Remove dups, run LID and LMs, and split by lang and quality."""
    mined_dir = conf.output_dir / "mined" / conf.dump
    if conf.will_split:
        # Use directories when splitting
        outputs = [
            mined_dir / f"{shard:04d}" for shard in range(conf.num_shards)
        ]
    else:
        # Files otherwise
        outputs = [
            mined_dir / f"{shard:04d}.json.gz"
            for shard in range(conf.num_shards)
        ]

    # TODO: try to reduce this / make it a function of "hash_in_mem" / num_langs
    mem_gb = 60 + 1 * conf.hash_in_mem
    timeout_hour = 5
    if "hashes" in conf.experiments:
        # HACK: used for generating paper figures
        outputs = [
            conf.output_dir /
            f"hashes_exp/{conf.dump}_0000_dedup{h:03d}.json.gz"
            for h in HASHES_IN_MEM
        ]
        mem_gb = int(max(HASHES_IN_MEM) * 1.2)
        timeout_hour = 8

    missing_outputs = [(shard, o) for shard, o in enumerate(outputs)
                       if not o.exists()]

    if not missing_outputs:
        return outputs

    # Compute hashes first.
    hashes_groups = list(jsonql.grouper(hashes(conf), conf.hash_in_mem))

    mined_dir.mkdir(parents=True, exist_ok=True)

    ex = conf.get_executor(
        f"mine_{conf.dump}",
        mem_gb=mem_gb,
        timeout_hour=timeout_hour,
        cpus=conf.mine_num_processes + 1,
    )

    hashes_files = [
        hashes_groups[shard // conf.hash_in_mem]
        for shard, o in missing_outputs
    ]
    ex(_mine_shard, repeat(conf), hashes_files, *_transpose(missing_outputs))

    assert all(o.exists() for o in outputs)
    return outputs
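
The indexing hashes_groups[shard // conf.hash_in_mem] assigns consecutive shards to the same group of hash files. A small illustration with hypothetical values:

# Hypothetical: 6 shards, hash_in_mem = 2 -> shards map onto 3 groups of hash files.
hash_in_mem = 2
hashes_groups = [["h0", "h1"], ["h2", "h3"], ["h4", "h5"]]

mapping = {shard: hashes_groups[shard // hash_in_mem] for shard in range(6)}
# Shards 0-1 share group 0, shards 2-3 share group 1, shards 4-5 share group 2.
assert mapping[0] == mapping[1] == ["h0", "h1"]
assert mapping[4] == mapping[5] == ["h4", "h5"]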
Example 4
def test_blocked_gzip(tmp_path: Path):
    file = tmp_path / "test.gz"
    f = str(file)
    # Each object is 10/11 bytes long. We have 2 of them per block.
    content = ['{"xx": %d}' % i for i in range(80)]
    with jsonql.BlockedGzipWriter(file, "wt", block_size="20B") as o:
        for line in content:
            print(line, file=o)

    jr = jsonql.JsonReader(strict=True)
    expected = list(jr.map(content))
    # read as one file
    assert expected == list(jsonql.read_jsons(file))
    # read first block
    assert expected[:2] == list(jsonql.read_jsons(f + "[0/40]"))
    # read last block
    assert expected[-2:] == list(jsonql.read_jsons(f + "[39/40]"))

    readers = jsonql.get_block_readers(file, 9)
    read_as_several_files = [list(jsonql.read_jsons(r)) for r in readers]
    # 40 splits of 2 docs, 9 readers -> 5 splits, 10 docs per reader
    assert list(jsonql.grouper(expected, 10)) == read_as_several_files
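
Assuming the "[i/n]" suffix accepted by smart_open and read_jsons selects block i out of n, as the first-block and last-block assertions above exercise, reading every block in order should reproduce the whole file. A sketch reusing file, expected and the 40-block layout from the test:

# Sketch only: file, expected and the 40-block layout come from the test above.
n_blocks = 40
docs_by_block = [
    list(jsonql.read_jsons(str(file) + f"[{i}/{n_blocks}]"))
    for i in range(n_blocks)
]
flattened = [doc for block in docs_by_block for doc in block]
assert flattened == expected  # same documents as reading the file in one pass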
Example 5
def remove_duplicates_sharded(
    files: List[Path],
    outputs: List[Path],
    hashes_dir: FilesOrDir,
    field: str,
    group_hashes: int = 1,
    tmp_dir: Path = None,
    min_len: int = 0,
):
    """Remove duplicates in several passes, when all hashes don't fit in RAM.

    Note: The current implementation does not do a 'perfect' deduplication.
    If a hash appears exactly once in each shard of hashes, it won't be
    detected as a duplicate. This can be fixed by fully deduplicating the
    hashes beforehand.
    """
    assert len(files) == len(outputs)

    if isinstance(hashes_dir, list):
        hashes_files = hashes_dir
    else:
        hashes_files = sorted(h for h in Path(hashes_dir).iterdir()
                              if h.suffix == ".bin")

    assert len(hashes_files) > 0, f"no hashes files found in: {hashes_dir}"

    if len(hashes_files) <= group_hashes:
        log(f"All hashes can be done in one pass, using DuplicatesRemover on {files}"
            )
        rm_dups = DuplicatesRemover(field, hashes_files)
        rm_dups._prepare()
        run_par((jsonql.run_pipes, (rm_dups, ), dict(file=f, output=o))
                for f, o in zip(files, outputs))
        return

    log(f"Starting deduplicate_sharded on {files}.")
    tmp_directory = tempfile.TemporaryDirectory(
        dir=str(tmp_dir) if tmp_dir else None)

    def tmp_files(i):
        return [
            Path(tmp_directory.name) / (f.name.split(".")[0] + f".{i}.bin")
            for f in files
        ]

    last = tmp_files(0)
    run_par((_dump_sentence_hashes, (f, tmp, field), {})
            for f, tmp in zip(files, last))

    for i, group in enumerate(jsonql.grouper(hashes_files, group_hashes)):
        hashes = FlatHashSet()
        for h in group:
            hashes.load(h)
            log(f"Loaded {h}, up to {len(hashes)} hashes ({mem_footprint_gb()}GB)"
                )

        intermediates = tmp_files(i + 1)
        # Remove hashes in parallel. Since modern OS have "copy-on-write" and
        # `hashes` is read-only, we will only have one version of it in RAM.
        run_par((_remove_duplicate_hashes, (hashes, f, tmp), {})
                for f, tmp in zip(last, intermediates))
        # Force hashes to be freed, before we start allocating a new one.
        del hashes
        gc.collect()

        for tmp in last:
            os.remove(tmp)
        last = intermediates

    def finalize(source, dedup_hashes, min_len):
        n_chars, n_chars_kept = 0, 0
        with open(dedup_hashes, "rb") as hashes:
            for doc in jsonql.read_jsons(source):
                content = doc.get(field)
                if not content or len(content) < min_len:
                    continue
                sentences = content.split("\n")
                doc_hashes = np.fromfile(hashes,
                                         dtype=HASH_TYPE,
                                         count=len(sentences))
                chars, kept_chars = finalize_doc(doc, field, doc_hashes)
                n_chars += chars
                n_chars_kept += kept_chars
                yield doc
        selectivity = n_chars_kept / n_chars if n_chars else 0
        log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")

    dedup_hashes = last
    run_par([(
        jsonql.run_pipe,
        (finalize, ),
        dict(kwargs=dict(dedup_hashes=h, min_len=min_len), file=f, output=o),
    ) for h, f, o in zip(dedup_hashes, files, outputs)])

    tmp_directory.cleanup()
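
A hedged usage sketch for remove_duplicates_sharded: the paths, the dump name and the "raw_content" field below are placeholders, and group_hashes=2 simply bounds how many hash shards are loaded into memory per pass.

from pathlib import Path

# Hypothetical invocation; adjust paths and field names to your own layout.
files = [Path(f"mined/2019-09/{i:04d}.json.gz") for i in range(4)]
outputs = [Path(f"deduped/2019-09/{i:04d}.json.gz") for i in range(4)]

remove_duplicates_sharded(
    files=files,
    outputs=outputs,
    hashes_dir=Path("hashes/2019-09"),  # directory containing *.bin hash shards
    field="raw_content",                # document field holding the text to dedup
    group_hashes=2,                     # load at most 2 hash shards per pass
    min_len=300,                        # skip documents shorter than 300 characters
)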