Example #1
0
def test_can_pickle_transformer(tmp_path):
    model = Path("bin/lid.bin")
    if not model.exists():
        return
    classifier = split_by_lang.Classifier(model, "text", "lang")
    classifier.__enter__()
    doc = dict(text="Hello world ! This is English btw.")
    original_results = classifier(doc)

    with open(tmp_path / "transformer.pkl", "wb") as o:
        pickle.dump(classifier, o)
    with open(tmp_path / "transformer.pkl", "rb") as f:
        classifier = pickle.load(f)

    assert original_results == classifier(doc)

    # Do it again with the unpickled object.
    with open(tmp_path / "transformer.pkl", "wb") as o:
        pickle.dump(classifier, o)
    with open(tmp_path / "transformer.pkl", "rb") as f:
        classifier = pickle.load(f)

    assert original_results == classifier(doc)
Example #2
0
def _mine_shard(conf: Config, hashes: List[Path], shard: int,
                output: Path) -> str:
    assert conf.pipeline
    tmp_output = tmp(output)
    if "hashes" in conf.experiments:
        # HACK: used for generating paper figures
        hashes_in_mem = shard
        hashes = hashes[:HASHES_IN_MEM[hashes_in_mem]]
        shard = 0
    cc_shard = conf.get_cc_shard(shard)

    steps: Dict[str, Optional[jsonql.Transformer]] = {}
    lang_id = Path("bin") / "lid.bin"
    steps["lid_before_dedup"] = split_by_lang.Classifier(
        model=lang_id,
        field="raw_content",
        out_field="lid_before_dedup",
        top=5)
    steps["dedup"] = dedup.DuplicatesRemover(field="raw_content",
                                             hashes_files=hashes)

    steps["lid"] = split_by_lang.Classifier(
        model=lang_id,
        field="raw_content",
        out_field="language",
        top=1,
        threshold=conf.lang_threshold,
    )
    steps["lid_after_dedup"] = split_by_lang.Classifier(
        model=lang_id, field="raw_content", out_field="lid_after_dedup", top=5)

    if conf.lang_blacklist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") not in set(conf.lang_blacklist)])
    elif conf.lang_whitelist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") in set(conf.lang_whitelist)])
    else:
        steps["keep_lang"] = None

    tok_field = "tokenized"
    steps["sp"] = perplexity.MultiSentencePiece(
        {l: conf.lm_dir / f"{l}.sp.model"
         for l in conf.get_lm_languages()},
        field="raw_content",
        output_field=tok_field,
        normalize=True,
    )
    steps["lm"] = perplexity.DocLM(
        {l: conf.lm_dir / f"{l}.arpa.bin"
         for l in conf.get_lm_languages()},
        field=tok_field,
        output_field="perplexity",
        normalize=False,  # Normalization is done before SentencePiece
        # load_method=kenlm.LoadMethod.PARALLEL_READ,
    )
    steps["pp_bucket"] = perplexity.PerplexityBucket(CUTOFF_CSV)
    steps["drop"] = perplexity.DropKeys(tok_field)

    steps["keep_bucket"] = None
    if conf.keep_bucket:
        steps["keep_bucket"] = jsonql.where(
            [lambda doc: doc.get("bucket", "all") in conf.keep_bucket])

    if "fetch_metadata" in conf.pipeline:
        # TODO: better default
        assert conf.metadata is not None
        steps["fetch_metadata"] = minify.MetadataFetcher(
            f"{conf.metadata}/{conf.dump}/")

    steps["minify"] = minify.Minifier()

    pattern = str(tmp_output / "{language}_{bucket}.json.gz")
    steps["split_by_lang"] = jsonql.split(pattern=str(pattern), mkdir=True)

    steps["split_by_segment"] = jsonql.split(
        split_fn=lambda doc: _get_segment(tmp_output, doc), mkdir=True)

    pipeline = filter(None, (steps[s] for s in conf.pipeline))

    jsonql.run_pipes(
        *pipeline,
        inputs=cc_shard,
        processes=conf.mine_num_processes,
        chunksize=100,
        # The splitter takes care of writing to files.
        output=tmp_output if not conf.will_split else None,
    )
    finalize(tmp_output, output)
    return f"Mined {output}"