Code example #1
def reshard(
    inputs: List[Path],
    output: Path,
    tmp: Path = None,
    free_original: bool = False,
    rm_original: bool = False,
) -> Path:
    """Read the given files and concatenate them to the output file.

    Can remove original files on completion, or just write dummy content into them to free disk.
    """
    if tmp is None:
        tmp = _get_tmp(output)
    logging.info(f"Resharding {inputs} to {tmp}, will move later to {output}")
    jsonql.run_pipes(file=inputs, output=tmp)
    tmp.replace(output)
    tmp_index = get_index(tmp)
    if tmp_index.exists():
        tmp_index.replace(get_index(output))

    if not (free_original or rm_original):
        return output

    for _input in inputs:
        if rm_original:
            _input.unlink()
        elif free_original:
            # Overwrite the previous file.
            # This frees up disk space and allows doit to properly track the success.
            _input.write_text(f"Resharded into {output}")
        if get_index(_input).is_file():
            get_index(_input).unlink()

    return output
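
For illustration, a minimal sketch of how reshard could be called to merge several shard files into one; the paths are hypothetical and not taken from the repository:

# Hypothetical shard paths; reshard concatenates them into a single file and,
# with rm_original=True, deletes the inputs (and their index files) afterwards.
shards = [Path(f"mined/{i:04d}.json.gz") for i in range(4)]
merged = reshard(shards, Path("mined/merged.json.gz"), rm_original=True)
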
Code example #2
def check_regroup(tmp_path, regroup_fn, check_blocks_boundaries=False):
    n_shards = 4
    n_docs = 20
    shards = [[
        dict(id=i, shard=s, raw_content="hello world") for i in range(n_docs)
    ] for s in range(n_shards)]
    shards_files = [tmp_path / f"{s:04d}.json.gz" for s in range(n_shards)]
    for shard, shard_file in zip(shards, shards_files):
        jsonql.run_pipes(inputs=shard, output=shard_file)
    regroup_file = tmp_path / "regroup.json.gz"
    start = time.time()
    regroup_fn(shards_files, regroup_file)
    duration = time.time() - start
    print(f"{regroup_fn.__module__}.{regroup_fn.__name__} took {duration}s")

    regrouped = list(jsonql.read_jsons(regroup_file))
    assert [doc for shard in shards for doc in shard] == regrouped

    readers = jsonql.get_block_readers(regroup_file, n_shards)
    if not check_blocks_boundaries:
        assert [doc for shard in shards for doc in shard] == [
            doc for reader in readers for doc in jsonql.read_jsons(reader)
        ]
        return

    for shard, reader in zip(shards, readers):
        block = [doc for doc in jsonql.read_jsons(reader)]
        assert shard == block
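
A sketch of driving this test helper with a hypothetical regroup function; the only requirement visible above is that the function accepts a list of input files and an output path:

def concat_regroup(inputs, output):
    # Hypothetical regroup implementation: plain concatenation of the shards.
    jsonql.run_pipes(file=inputs, output=output)

def test_concat_regroup(tmp_path):
    check_regroup(tmp_path, concat_regroup)
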
Code example #3
File: mine.py Project: jsedoc/cc_net
def _hashes_shard(conf: Config, shard: int, output: Path):
    tmp_output = tmp(output)
    jsonql.run_pipes(
        dedup.HashesCollector(field="raw_content", output=tmp_output),
        inputs=conf.get_cc_shard(shard),
    )
    finalize(tmp_output, output)
    return f"Hashed {output}"
Code example #4
File: expand_corpus.py Project: jsedoc/cc_net
def _mine(file: Path, output: Path, sp: Path, lm: Path,
          threshold: float) -> Tuple[Path, Path]:
    extractor = ExtractSentences(sp,
                                 lm,
                                 field="raw_content",
                                 threshold=threshold)
    jsonql.run_pipes(extractor, file=file, output=output, processes=PROCESSES)
    return (file, output)
Code example #5
def fetch_metadata_file(
    file: Union[Path, str],
    metadata_dir: Union[Path, str],
    output: Path,
    cache_dir: Path = None,
):
    unminifier = MetadataFetcher(metadata_dir)
    tmp = output.with_name("tmp." + output.name)
    jsonql.run_pipes(unminifier, file=file, output=tmp)
    tmp.rename(output)
    return f"Fetched metadata for {file}. Results at {output}."
Code example #6
File: expand_corpus.py Project: jsedoc/cc_net
def normalize(corpus: Path, output_dir: Path) -> Path:
    normalized = output_dir / (corpus.stem + ".normalized")
    if normalized.exists():
        return normalized

    print("Will normalize", corpus, "to", normalized)
    jsonql.run_pipes(
        jsonql.Mapper(text_normalizer.normalize),
        file=corpus,
        output=normalized,
        processes=PROCESSES,
    )
    return normalized
Code example #7
File: minify.py Project: torshie/cc_net
def unminify_file(file: Union[Path, str], output: Path, cache_dir: Path = None):
    unminifier = Unminifier(cache_dir)
    with jsonql.smart_open(file) as f:
        mini = [m for m in jsonql.read_jsons(f)]
    unminifier.look_for(mini)

    tmp = output.with_name("tmp." + output.name)
    jsonql.run_pipes(unminifier, file=iter(mini), output=tmp)
    shutil.move(tmp, output)
    f_size = Path(file).stat().st_size if Path(file).exists() else 0
    o_size = output.stat().st_size
    mb = 1024 ** 2
    return f"Unminified {output} ({f_size // mb:_}Mb -> {o_size // mb:_}Mb)"
Code example #8
File: expand_corpus.py Project: jsedoc/cc_net
def tokenize(corpus: Path, output_dir: Path, lang: str) -> Path:
    tokenized = output_dir / (corpus.stem + ".tokenized")
    if tokenized.exists():
        return tokenized

    print("Will SentencePiece", corpus, "to", tokenized)
    jsonql.run_pipes(
        SentencePiece(sp_model(lang)),
        file=normalize(corpus, output_dir),
        output=tokenized,
        processes=PROCESSES,
    )

    return tokenized
Code example #9
def test_enter_exit(capsys):
    class MyTransformer(jsonql.Transformer):
        def __enter__(self):
            print("trans: started")
            self.ready = True
            return self

        def __exit__(self, *args):
            print("trans: done")

        def do(self, x):
            return (x, x)

    def acc(values):
        print("acc: started")
        res = 0
        for (x, _) in values:
            res += int(x)
        print("acc: done")
        yield f"acc: result={res}"

    t = MyTransformer()
    data = (str(x) for x in range(10))
    print("pipeline: started")
    # Print to stdout.
    jsonql.run_pipes(t, acc, file=data)
    print("pipeline: done")
    out = capsys.readouterr().out
    assert (
        "\n".join(
            [
                "pipeline: started",
                "trans: started",
                "acc: started",
                "acc: done",
                f"acc: result=45",
                # Transformers are closed at the very end.
                "trans: done",
                "pipeline: done\n",
            ]
        )
        == out
    )
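
The test above illustrates the two kinds of stages run_pipes accepts: jsonql.Transformer subclasses (optional __enter__/__exit__ for setup and teardown, plus a per-item do method) and plain generator functions over the whole stream. As a further, hypothetical transformer written in the same style:

class AddLength(jsonql.Transformer):
    """Hypothetical transformer that annotates each document with its content length."""

    def do(self, doc: dict) -> dict:
        doc["length"] = len(doc.get("raw_content", ""))
        return doc

# jsonql.run_pipes(AddLength(), file="docs.json.gz", output="docs_with_length.json.gz")
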
Code example #10
File: dedup.py Project: jsedoc/cc_net
def deduplicate_two_pass(file: jsonql.FileDescriptor,
                         field: str = "raw_content") -> Iterable[dict]:
    """Remove duplicates of the given file (even removing the first occurence).

    This is what is done in the paper, and in mine.py
    """
    try:
        if isinstance(file, Path):
            hash_file: Path = file.with_suffix(".bin")
        else:
            hash_file = jsonql._tmp(Path("hashes.bin"))
        jsonql.run_pipes(jsonql.JsonReader(),
                         HashesCollector(field, output=hash_file),
                         file=file)
        dup_remover = DuplicatesRemover(field, [hash_file])
        return dup_remover.map(jsonql.read_jsons(file))
    finally:
        if hash_file.exists():
            hash_file.unlink()
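
A sketch of consuming the generator returned above; the input path is hypothetical:

# Hypothetical input file: iterate over the deduplicated documents.
for doc in deduplicate_two_pass(Path("mined/0000.json.gz"), field="raw_content"):
    print(doc["raw_content"])
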
Code example #11
def dl(
    dump: str,
    shard: int,
    num_shards: int,
    output: Path = None,
    num_segments_per_shard: int = 0,
):
    """Download a shard of the common crawl, and export it to json.

    Arguments:
        output: filename of the output file
        dump: CC dump id
        shard: id of the shard
        num_shards: total number of shards
        num_segments_per_shard: manual control of the number of segments per shard.
    """
    reader = CCShardReader(dump, shard, num_shards, num_segments_per_shard)
    jsonql.run_pipes(inputs=reader, output=output)
    logger.info(f"Done. {output} is ready.")
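
A sketch of a call to this entry point; the dump id and shard counts are example values only:

# Download shard 0 of 1600 from a hypothetical dump id and export it as JSON.
dl(dump="2019-09", shard=0, num_shards=1600, output=Path("cc/2019-09/0000.json.gz"))
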
Code example #12
File: get_wiki_cirrus.py Project: jsedoc/cc_net
def opening(file: Path, output: Path = None, n_docs: int = 1_000_000):
    """Will dump the tokenized opening text of the given Wikipedia.

    Args:
        - file: File containing the Wikipedia dump.
        - output: Output file.
        - n_docs: How many docs to parse
    """
    assert file.exists()
    jsonql.run_pipes(
        functools.partial(extract_opening_text, n_docs=n_docs),
        file=file,
        output=tmp(output) if output else None,
    )
    if output:
        # Move the temporary output into place once the pipeline has finished.
        tmp(output).replace(output)
Code example #13
def test_dedup_fast(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)
    parts = [data / "part_0.json", data / "part_1.json"]

    res = tmp_path / "res"
    res.mkdir()
    h = tmp_path / "hashes.bin"
    field = "text"
    jsonql.run_pipes(dedup.HashesCollector(field, output=h), file=parts)
    for part in parts:
        jsonql.run_pipes(
            dedup.DuplicatesRemover(field, [h]), file=part, output=res / part.name
        )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(
            text=text("Good morning", "I'm originaler"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]

    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)

    words = [w for part in [part_0, part_1] for doc in part for w in doc]
    expected = {str_hash(s.lower()): s.startswith("_") for s in words}
    assert expected == load_hashes(h)
Code example #14
File: mine.py Project: jsedoc/cc_net
def _mine_shard(conf: Config, hashes: List[Path], shard: int,
                output: Path) -> str:
    assert conf.pipeline
    tmp_output = tmp(output)
    if "hashes" in conf.experiments:
        # HACK: used for generating paper figures
        hashes_in_mem = shard
        hashes = hashes[:HASHES_IN_MEM[hashes_in_mem]]
        shard = 0
    cc_shard = conf.get_cc_shard(shard)

    steps: Dict[str, Optional[jsonql.Transformer]] = {}
    lang_id = Path("bin") / "lid.bin"
    steps["lid_before_dedup"] = split_by_lang.Classifier(
        model=lang_id,
        field="raw_content",
        out_field="lid_before_dedup",
        top=5)
    steps["dedup"] = dedup.DuplicatesRemover(field="raw_content",
                                             hashes_files=hashes)

    steps["lid"] = split_by_lang.Classifier(
        model=lang_id,
        field="raw_content",
        out_field="language",
        top=1,
        threshold=conf.lang_threshold,
    )
    steps["lid_after_dedup"] = split_by_lang.Classifier(
        model=lang_id, field="raw_content", out_field="lid_after_dedup", top=5)

    if conf.lang_blacklist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") not in set(conf.lang_blacklist)])
    elif conf.lang_whitelist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") in set(conf.lang_whitelist)])
    else:
        steps["keep_lang"] = None

    tok_field = "tokenized"
    steps["sp"] = perplexity.MultiSentencePiece(
        {l: conf.lm_dir / f"{l}.sp.model"
         for l in conf.get_lm_languages()},
        field="raw_content",
        output_field=tok_field,
        normalize=True,
    )
    steps["lm"] = perplexity.DocLM(
        {l: conf.lm_dir / f"{l}.arpa.bin"
         for l in conf.get_lm_languages()},
        field=tok_field,
        output_field="perplexity",
        normalize=False,  # Normalization is done before SentencePiece
        # load_method=kenlm.LoadMethod.PARALLEL_READ,
    )
    steps["pp_bucket"] = perplexity.PerplexityBucket(CUTOFF_CSV)
    steps["drop"] = perplexity.DropKeys(tok_field)

    steps["keep_bucket"] = None
    if conf.keep_bucket:
        steps["keep_bucket"] = jsonql.where(
            [lambda doc: doc.get("bucket", "all") in conf.keep_bucket])

    if "fetch_metadata" in conf.pipeline:
        # TODO: better default
        assert conf.metadata is not None
        steps["fetch_metadata"] = minify.MetadataFetcher(
            f"{conf.metadata}/{conf.dump}/")

    steps["minify"] = minify.Minifier()

    pattern = str(tmp_output / "{language}_{bucket}.json.gz")
    steps["split_by_lang"] = jsonql.split(pattern=str(pattern), mkdir=True)

    steps["split_by_segment"] = jsonql.split(
        split_fn=lambda doc: _get_segment(tmp_output, doc), mkdir=True)

    pipeline = filter(None, (steps[s] for s in conf.pipeline))

    jsonql.run_pipes(
        *pipeline,
        inputs=cc_shard,
        processes=conf.mine_num_processes,
        chunksize=100,
        # The splitter takes care of writing to files.
        output=tmp_output if not conf.will_split else None,
    )
    finalize(tmp_output, output)
    return f"Mined {output}"
Code example #15
File: minify.py Project: torshie/cc_net
def minify_file(file: Path, output: Path) -> str:
    """Minify the given file."""
    jsonql.run_pipes(Minifier(), file=file, output=output)
    return f"Minified {output}"
Code example #16
def classify_and_split(file, output, pattern, **kwargs):
    classifier = Classifier(**kwargs)
    splitter = jsonql.split(pattern)
    jsonql.run_pipes(classifier, splitter, file=file, output=output)
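
A sketch of calling this helper; the paths are hypothetical, and the keyword arguments mirror the Classifier parameters visible in code example #14:

classify_and_split(
    file="docs.json.gz",              # hypothetical input
    output=None,                      # the splitter writes the per-language files
    pattern="{language}.json.gz",     # one output file per detected language
    model=Path("bin") / "lid.bin",
    field="raw_content",
    out_field="language",
    top=1,
)
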
Code example #17
File: expand_corpus.py Project: jsedoc/cc_net
    if sample_file.exists():
        return sample_file
    dataset = _dataset(dataset, lang)
    extractor = ExtractSentences(sp_model(lang),
                                 train_lm(corpus, output_dir),
                                 field="raw_content")
    sampling = functools.partial(uniform_sampling_wrt_perplexity,
                                 rounding=100.0,
                                 cut=1000.0,
                                 samples=n // 10)

    print(f"Will sample data from {dataset} to {sample_file}")
    try:
        jsonql.run_pipes(extractor,
                         sampling,
                         file=dataset,
                         output=sample_file,
                         processes=PROCESSES)
    except Exception:
        sample_file.unlink()
        raise

    subprocess.run(["sort", "-n", "-o", sample_file, sample_file], check=True)
    subprocess.run(["head", sample_file], check=True)
    return sample_file


def mine(
    corpus: Path,
    output_dir: Path,
    threshold: float,
Code example #18
def test_write_to_stdout_handle_newlines(capsys):
    lines = [str(x) + "\n" for x in range(10)]
    jsonql.run_pipes(file=iter(lines))
    out = capsys.readouterr().out
    assert out == "".join(lines)
Code example #19
def test_multiprocess(capsys):
    mult = jsonql.Mapper(lambda x: f"2x = {2 * int(x)}")
    jsonql.run_pipes(mult, processes=2, file=(str(x) for x in range(10)))
    out = set(capsys.readouterr().out.strip("\n").split("\n"))
    assert set(f"2x = {2 * x}" for x in range(10)) == out