def test_fetch(http_from_disk, tmp_path: Path):
    mini_docs = [
        {
            "url": "http://sample_chinese.com",
            "digest": "sha1:Y4E6URVYGIAFNVRTPZ5S3J64RTZTP6HJ",
            "cc_segment": "crawl-data/sample.warc.wet",
            "line_ids": encode_line_ids([2]),
            "bucket": "not_that_great",
        },
        {
            "url": "http://sample_english.com",
            "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
            "cc_segment": "crawl-data/sample.warc.wet",
            "line_ids": encode_line_ids([3]),
            "bucket": "top_notch",
        },
    ]
    with jsonql.open_write(tmp_path / "sample.json") as o:
        for mini in mini_docs:
            print(json.dumps(mini), file=o)

    fetcher = minify.MetadataFetcher(tmp_path)
    cc = process_wet_file.CCSegmentsReader(["crawl-data/sample.warc.wet"])
    docs = [d for d in fetcher.map(cc) if d is not None]
    assert cc.retrieved_segments == 1

    # Note: documents are retrieved as they are ordered in the .warc.wet file
    assert [
        "Facts are stubborn things, but statistics are more pliable.",
        "事實是固執的東西,但統計數字卻比較柔和。",
    ] == [d["raw_content"] for d in docs]
    assert ["top_notch", "not_that_great"] == [d["bucket"] for d in docs]
def test_minify_and_fetch(http_from_disk, tmp_path: Path):
    full_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge.
Facts are stubborn things, but statistics are more pliable.
Fiction is obliged to stick to possibilities. Truth isn't."""
    # We don't need no education.
    chosen_quotes = "\n".join(
        l for l in full_quotes.splitlines() if "Education" not in l
    )

    cc_doc = {
        "url": "http://sample_english.com",
        "date_download": "2019-03-18T00:00:00Z",
        "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
        "source_domain": "sample_english.com",
        "title": "Famous Mark Twain Quotes",
        "raw_content": full_quotes,
        "cc_segment": "crawl-data/sample.warc.wet",
        "nlines": 4,
        "length": 353,
    }

    ccnet_metadata = {
        "language": "en",
        "language_score": 0.99,
        "perplexity": 151.5,
        "bucket": "head",
        "raw_content": chosen_quotes,
        "nlines": 3,
        "length": len(chosen_quotes),
        "original_nlines": 4,
        "original_length": 353,
        "line_ids": [0, 2, 3],
    }
    ccnet_doc = dict(cc_doc, **ccnet_metadata)
    mini = minify.Minifier()(ccnet_doc.copy())
    assert mini is not ccnet_doc

    important_fields = [
        "url",
        "digest",
        "cc_segment",
        "language",
        "language_score",
        "perplexity",
        "bucket",
        "line_ids",
    ]
    expected = {k: ccnet_doc[k] for k in important_fields}
    expected["line_ids"] = encode_line_ids(expected["line_ids"])  # type: ignore
    assert expected == mini

    with jsonql.open_write(tmp_path / "sample.json") as o:
        print(json.dumps(mini), file=o)

    fetcher = minify.MetadataFetcher(tmp_path)
    # line_ids is removed when unminifying
    ccnet_doc.pop("line_ids")

    assert ccnet_doc == fetcher(cc_doc)
def _mine_shard(conf: Config, hashes: List[Path], shard: int, output: Path) -> str:
    assert conf.pipeline
    tmp_output = tmp(output)
    if "hashes" in conf.experiments:
        # HACK: used for generating paper figures
        hashes_in_mem = shard
        hashes = hashes[: HASHES_IN_MEM[hashes_in_mem]]
        shard = 0
    cc_shard = conf.get_cc_shard(shard)

    # Catalog of available pipeline steps, keyed by the names used in conf.pipeline.
    steps: Dict[str, Optional[jsonql.Transformer]] = {}
    lang_id = Path("bin") / "lid.bin"
    steps["lid_before_dedup"] = split_by_lang.Classifier(
        model=lang_id, field="raw_content", out_field="lid_before_dedup", top=5
    )
    steps["dedup"] = dedup.DuplicatesRemover(field="raw_content", hashes_files=hashes)
    steps["lid"] = split_by_lang.Classifier(
        model=lang_id,
        field="raw_content",
        out_field="language",
        top=1,
        threshold=conf.lang_threshold,
    )
    steps["lid_after_dedup"] = split_by_lang.Classifier(
        model=lang_id, field="raw_content", out_field="lid_after_dedup", top=5
    )

    if conf.lang_blacklist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") not in set(conf.lang_blacklist)]
        )
    elif conf.lang_whitelist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") in set(conf.lang_whitelist)]
        )
    else:
        steps["keep_lang"] = None

    tok_field = "tokenized"
    steps["sp"] = perplexity.MultiSentencePiece(
        {l: conf.lm_dir / f"{l}.sp.model" for l in conf.get_lm_languages()},
        field="raw_content",
        output_field=tok_field,
        normalize=True,
    )
    steps["lm"] = perplexity.DocLM(
        {l: conf.lm_dir / f"{l}.arpa.bin" for l in conf.get_lm_languages()},
        field=tok_field,
        output_field="perplexity",
        normalize=False,  # Normalization is done before SentencePiece
        # load_method=kenlm.LoadMethod.PARALLEL_READ,
    )
    steps["pp_bucket"] = perplexity.PerplexityBucket(CUTOFF_CSV)
    steps["drop"] = perplexity.DropKeys(tok_field)

    steps["keep_bucket"] = None
    if conf.keep_bucket:
        steps["keep_bucket"] = jsonql.where(
            [lambda doc: doc.get("bucket", "all") in conf.keep_bucket]
        )

    if "fetch_metadata" in conf.pipeline:
        # TODO: better default
        assert conf.metadata is not None
        steps["fetch_metadata"] = minify.MetadataFetcher(
            f"{conf.metadata}/{conf.dump}/"
        )

    steps["minify"] = minify.Minifier()

    pattern = str(tmp_output / "{language}_{bucket}.json.gz")
    steps["split_by_lang"] = jsonql.split(pattern=str(pattern), mkdir=True)

    steps["split_by_segment"] = jsonql.split(
        split_fn=lambda doc: _get_segment(tmp_output, doc), mkdir=True
    )

    # Only the steps listed in conf.pipeline are run; steps set to None are skipped.
    pipeline = filter(None, (steps[s] for s in conf.pipeline))

    jsonql.run_pipes(
        *pipeline,
        inputs=cc_shard,
        processes=conf.mine_num_processes,
        chunksize=100,
        # The splitter takes care of writing to files.
        output=tmp_output if not conf.will_split else None,
    )
    finalize(tmp_output, output)
    return f"Mined {output}"