def test_fetch(http_from_disk, tmp_path: Path):
    mini_docs = [
        {
            "url": "http://sample_chinese.com",
            "digest": "sha1:Y4E6URVYGIAFNVRTPZ5S3J64RTZTP6HJ",
            "cc_segment": "crawl-data/sample.warc.wet",
            "line_ids": encode_line_ids([2]),
            "bucket": "not_that_great",
        },
        {
            "url": "http://sample_english.com",
            "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
            "cc_segment": "crawl-data/sample.warc.wet",
            "line_ids": encode_line_ids([3]),
            "bucket": "top_notch",
        },
    ]
    with jsonql.open_write(tmp_path / "sample.json") as o:
        for mini in mini_docs:
            print(json.dumps(mini), file=o)

    fetcher = minify.MetadataFetcher(tmp_path)
    cc = process_wet_file.CCSegmentsReader(["crawl-data/sample.warc.wet"])
    docs = [d for d in fetcher.map(cc) if d is not None]
    assert cc.retrieved_segments == 1

    # Note: documents are retrieved as they are ordered in the .warc.wet file
    assert [
        "Facts are stubborn things, but statistics are more pliable.",
        "事實是固執的東西,但統計數字卻比較柔和。",
    ] == [d["raw_content"] for d in docs]
    assert ["top_notch", "not_that_great"] == [d["bucket"] for d in docs]
def test_minify_and_fetch(http_from_disk, tmp_path: Path):
    full_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge.
Facts are stubborn things, but statistics are more pliable.
Fiction is obliged to stick to possibilities. Truth isn't."""
    # We don't need no education.
    chosen_quotes = "\n".join(
        l for l in full_quotes.splitlines() if "Education" not in l
    )

    cc_doc = {
        "url": "http://sample_english.com",
        "date_download": "2019-03-18T00:00:00Z",
        "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
        "source_domain": "sample_english.com",
        "title": "Famous Mark Twain Quotes",
        "raw_content": full_quotes,
        "cc_segment": "crawl-data/sample.warc.wet",
        "nlines": 4,
        "length": 353,
    }

    ccnet_metadata = {
        "language": "en",
        "language_score": 0.99,
        "perplexity": 151.5,
        "bucket": "head",
        "raw_content": chosen_quotes,
        "nlines": 3,
        "length": len(chosen_quotes),
        "original_nlines": 4,
        "original_length": 353,
        "line_ids": [0, 2, 3],
    }
    ccnet_doc = dict(cc_doc, **ccnet_metadata)
    mini = minify.Minifier()(ccnet_doc.copy())
    assert mini is not ccnet_doc

    important_fields = [
        "url",
        "digest",
        "cc_segment",
        "language",
        "language_score",
        "perplexity",
        "bucket",
        "line_ids",
    ]
    expected = {k: ccnet_doc[k] for k in important_fields}
    expected["line_ids"] = encode_line_ids(expected["line_ids"])  # type: ignore
    assert expected == mini

    with jsonql.open_write(tmp_path / "sample.json") as o:
        print(json.dumps(mini), file=o)

    fetcher = minify.MetadataFetcher(tmp_path)
    # line_ids is removed when unminifying
    ccnet_doc.pop("line_ids")

    assert ccnet_doc == fetcher(cc_doc)
def _mine_shard(conf: Config, hashes: List[Path], shard: int, output: Path) -> str:
    assert conf.pipeline
    tmp_output = tmp(output)
    if "hashes" in conf.experiments:
        # HACK: used for generating paper figures
        hashes_in_mem = shard
        hashes = hashes[: HASHES_IN_MEM[hashes_in_mem]]
        shard = 0
    cc_shard = conf.get_cc_shard(shard)

    # Catalog of available pipeline steps, keyed by the names used in conf.pipeline.
    steps: Dict[str, Optional[jsonql.Transformer]] = {}
    lang_id = Path("bin") / "lid.bin"
    steps["lid_before_dedup"] = split_by_lang.Classifier(
        model=lang_id, field="raw_content", out_field="lid_before_dedup", top=5
    )
    steps["dedup"] = dedup.DuplicatesRemover(field="raw_content", hashes_files=hashes)
    steps["lid"] = split_by_lang.Classifier(
        model=lang_id,
        field="raw_content",
        out_field="language",
        top=1,
        threshold=conf.lang_threshold,
    )
    steps["lid_after_dedup"] = split_by_lang.Classifier(
        model=lang_id, field="raw_content", out_field="lid_after_dedup", top=5
    )

    if conf.lang_blacklist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") not in set(conf.lang_blacklist)]
        )
    elif conf.lang_whitelist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") in set(conf.lang_whitelist)]
        )
    else:
        steps["keep_lang"] = None

    tok_field = "tokenized"
    steps["sp"] = perplexity.MultiSentencePiece(
        {l: conf.lm_dir / f"{l}.sp.model" for l in conf.get_lm_languages()},
        field="raw_content",
        output_field=tok_field,
        normalize=True,
    )
    steps["lm"] = perplexity.DocLM(
        {l: conf.lm_dir / f"{l}.arpa.bin" for l in conf.get_lm_languages()},
        field=tok_field,
        output_field="perplexity",
        normalize=False,  # Normalization is done before SentencePiece
        # load_method=kenlm.LoadMethod.PARALLEL_READ,
    )
    steps["pp_bucket"] = perplexity.PerplexityBucket(CUTOFF_CSV)
    steps["drop"] = perplexity.DropKeys(tok_field)

    steps["keep_bucket"] = None
    if conf.keep_bucket:
        steps["keep_bucket"] = jsonql.where(
            [lambda doc: doc.get("bucket", "all") in conf.keep_bucket]
        )

    if "fetch_metadata" in conf.pipeline:
        # TODO: better default
        assert conf.metadata is not None
        steps["fetch_metadata"] = minify.MetadataFetcher(
            f"{conf.metadata}/{conf.dump}/"
        )

    steps["minify"] = minify.Minifier()

    pattern = str(tmp_output / "{language}_{bucket}.json.gz")
    steps["split_by_lang"] = jsonql.split(pattern=str(pattern), mkdir=True)

    steps["split_by_segment"] = jsonql.split(
        split_fn=lambda doc: _get_segment(tmp_output, doc), mkdir=True
    )

    # Only the steps listed in conf.pipeline are run; steps set to None are skipped.
    pipeline = filter(None, (steps[s] for s in conf.pipeline))

    jsonql.run_pipes(
        *pipeline,
        inputs=cc_shard,
        processes=conf.mine_num_processes,
        chunksize=100,
        # The splitter takes care of writing to files.
        output=tmp_output if not conf.will_split else None,
    )
    finalize(tmp_output, output)
    return f"Mined {output}"