def test_split(tmp_path: Path):
    data = [
        dict(text="Hello world", lang="en"),
        dict(text="Bonjour les amis", lang="fr"),
        dict(text="Rock your boat", lang="en"),
    ]
    with jsonql.split(tmp_path / "{lang}.json") as split:
        list(split.map(data))
        summary = split.summary()
    assert "Found 2 splits." in summary
    en_docs = list(jsonql.read_jsons(tmp_path / "en.json"))
    assert [data[0], data[2]] == en_docs
    fr_docs = list(jsonql.read_jsons(tmp_path / "fr.json"))
    assert [data[1]] == fr_docs
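# The "{lang}" placeholder above is filled in from each document's fields.
# A minimal sketch of that dispatch logic (hypothetical PatternSplitter,
# assuming str.format-style substitution; the real jsonql.split also tracks
# a summary and supports mkdir/split_fn, as used in _mine_shard below).
import json


class PatternSplitter:
    def __init__(self, pattern: Path):
        self.pattern = str(pattern)
        self.outputs: dict = {}

    def __call__(self, doc: dict) -> None:
        # str.format raises KeyError when the pattern names a missing field.
        name = self.pattern.format(**doc)
        if name not in self.outputs:
            self.outputs[name] = open(name, "a", encoding="utf-8")
        print(json.dumps(doc, ensure_ascii=False), file=self.outputs[name])

    def close(self) -> None:
        for out in self.outputs.values():
            out.close()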
def test_split(self):
    tmp = self.get_tmpdir()
    data = [
        dict(text="Hello world", lang="en"),
        dict(text="Bonjour les amis", lang="fr"),
        dict(text="Rock your boat", lang="en"),
    ]
    with jsonql.split(tmp("{lang}.json")) as split:
        list(split.map(data))
        summary = split.summary()
    self.assertIn("Found 2 splits.", summary)
    with open(tmp("en.json")) as f_en:
        en_docs = list(jsonql.read_jsons(f_en))
        self.assertEqual([data[0], data[2]], en_docs)
    with open(tmp("fr.json")) as f_fr:
        fr_docs = list(jsonql.read_jsons(f_fr))
        self.assertEqual([data[1]], fr_docs)
def test_split_bad_pattern(self):
    tmp = self.get_tmpdir()
    data = [dict(text="Hello world", lang="en")]
    with self.assertRaises(KeyError):
        with jsonql.split(tmp("{language}.json")) as split:
            list(split.map(data))
def _mine_shard(conf: Config, hashes: List[Path], shard: int, output: Path) -> str:
    assert conf.pipeline
    tmp_output = tmp(output)
    if "hashes" in conf.experiments:
        # HACK: used for generating paper figures
        hashes_in_mem = shard
        hashes = hashes[: HASHES_IN_MEM[hashes_in_mem]]
        shard = 0
    cc_shard = conf.get_cc_shard(shard)

    # Build every candidate step; conf.pipeline decides which ones run.
    steps: Dict[str, Optional[jsonql.Transformer]] = {}
    lang_id = Path("bin") / "lid.bin"
    steps["lid_before_dedup"] = split_by_lang.Classifier(
        model=lang_id, field="raw_content", out_field="lid_before_dedup", top=5
    )
    steps["dedup"] = dedup.DuplicatesRemover(field="raw_content", hashes_files=hashes)
    steps["lid"] = split_by_lang.Classifier(
        model=lang_id,
        field="raw_content",
        out_field="language",
        top=1,
        threshold=conf.lang_threshold,
    )
    steps["lid_after_dedup"] = split_by_lang.Classifier(
        model=lang_id, field="raw_content", out_field="lid_after_dedup", top=5
    )

    if conf.lang_blacklist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") not in set(conf.lang_blacklist)]
        )
    elif conf.lang_whitelist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") in set(conf.lang_whitelist)]
        )
    else:
        steps["keep_lang"] = None

    tok_field = "tokenized"
    steps["sp"] = perplexity.MultiSentencePiece(
        {l: conf.lm_dir / f"{l}.sp.model" for l in conf.get_lm_languages()},
        field="raw_content",
        output_field=tok_field,
        normalize=True,
    )
    steps["lm"] = perplexity.DocLM(
        {l: conf.lm_dir / f"{l}.arpa.bin" for l in conf.get_lm_languages()},
        field=tok_field,
        output_field="perplexity",
        normalize=False,  # Normalization is done before SentencePiece
        # load_method=kenlm.LoadMethod.PARALLEL_READ,
    )
    steps["pp_bucket"] = perplexity.PerplexityBucket(CUTOFF_CSV)
    steps["drop"] = perplexity.DropKeys(tok_field)

    steps["keep_bucket"] = None
    if conf.keep_bucket:
        steps["keep_bucket"] = jsonql.where(
            [lambda doc: doc.get("bucket", "all") in conf.keep_bucket]
        )

    if "fetch_metadata" in conf.pipeline:
        # TODO: better default
        assert conf.metadata is not None
        steps["fetch_metadata"] = minify.MetadataFetcher(f"{conf.metadata}/{conf.dump}/")

    steps["minify"] = minify.Minifier()

    pattern = str(tmp_output / "{language}_{bucket}.json.gz")
    steps["split_by_lang"] = jsonql.split(pattern=str(pattern), mkdir=True)
    steps["split_by_segment"] = jsonql.split(
        split_fn=lambda doc: _get_segment(tmp_output, doc), mkdir=True
    )

    # Keep conf.pipeline order and drop disabled (None) steps.
    pipeline = filter(None, (steps[s] for s in conf.pipeline))

    jsonql.run_pipes(
        *pipeline,
        inputs=cc_shard,
        processes=conf.mine_num_processes,
        chunksize=100,
        # The splitter takes care of writing to files.
        output=tmp_output if not conf.will_split else None,
    )
    finalize(tmp_output, output)
    return f"Mined {output}"
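# In _mine_shard, disabled steps stay in the dict as None so conf.pipeline
# can be applied uniformly; filter(None, ...) then drops them. A
# stripped-down illustration of that selection (hypothetical step names):
def _select_steps_example() -> None:
    steps = {
        "lower": lambda doc: {**doc, "text": doc["text"].lower()},
        "keep_lang": None,  # disabled under the current config
    }
    pipeline_names = ["lower", "keep_lang"]
    pipeline = list(filter(None, (steps[s] for s in pipeline_names)))
    assert len(pipeline) == 1  # the None "keep_lang" entry was dropped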
def classify_and_split(file, output, pattern, **kwargs):
    classifier = Classifier(**kwargs)
    splitter = jsonql.split(pattern)
    jsonql.run_pipes(classifier, splitter, file=file, output=output)
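# Hypothetical invocation, assuming Classifier accepts the same
# model/field/out_field keywords used in _mine_shard above; the paths are
# illustrative, not the module's defaults:
#
#     classify_and_split(
#         file="docs.json",
#         output=None,  # the splitter writes one file per language itself
#         pattern="out/{language}.json",
#         model=Path("bin") / "lid.bin",
#         field="raw_content",
#         out_field="language",
#     )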
def test_split_bad_pattern(tmp_path: Path):
    data = [dict(text="Hello world", lang="en")]
    with pytest.raises(KeyError):
        with jsonql.split(tmp_path / "{language}.json") as split:
            list(split.map(data))
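# The KeyError above comes straight from str.format: the document has no
# "language" field, so pattern substitution fails (assuming jsonql.split
# formats the pattern with the document's fields, as sketched earlier).
def test_format_missing_field():
    doc = dict(text="Hello world", lang="en")
    with pytest.raises(KeyError):
        "{language}.json".format(**doc)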