    - file: CC shard file
    - tags_file: dmoz tagging file, (like the one produced by `dl`)
    - output: ""
    """
    url2tags = load_tags(tags_file)
    with jsonql.smart_open(file) as f, jsonql.smart_open(output, "w") as o:
        for document in jsonql.read_jsons(f):
            if not document:
                continue
            url = document["url"]
            domain = document["source_domain"]

            if url in url2tags:
                tags = url2tags[url]
            elif domain in url2tags:
                tags = url2tags[domain]
            else:
                continue

            if len(tags) == 0:
                continue

            fasttext_tags = ["__label__" + tag for tag in tags]
            content = document["tokenized"].replace("\n", " ").lower()
            if len(content) > 200:
                print(" ".join(fasttext_tags), content, file=o)  # type: ignore


# Script entry point: hand `make_corpus` to `func_argparse.single_main`.
# NOTE(review): presumably this builds the command line from the function
# signature and docstring -- confirm against func_argparse.
if __name__ == "__main__":
    func_argparse.single_main(make_corpus)
return output for _input in inputs: if rm_original: _input.unlink() elif free_original: # Overwrite the previous file. # This frees up disk space and allows doit to properly track the success. _input.write_text(f"Resharded into {output}") if get_index(_input).is_file(): get_index(_input).unlink() return output def determine_groups(inputs: List[Path], target_size: int = 4 * 1024**3) -> List[List[Path]]: if len(inputs) == 0: return [] sample = inputs[:10] typical_size = sum(s.stat().st_size for s in sample) / len(sample) group_size = min(target_size // typical_size, len(inputs)) group_size = max(group_size, 1) return jsonql.grouper(inputs, group_size) if __name__ == "__main__": func_argparse.single_main(reshard)
f"Extracted {num_pars:_d} paragraphs from snapshot {snapshot}.") def dl(snapshot: str = None, outdir: Path = Path("data_cc100"), processes: int = 1) -> None: """ Download CC100 corpus. Will create one text file per language and CC snapshot. - snapshot: restrict to one snapshot. Useful for parallelization. - outdir: output directory - processes: number of processes to use """ if snapshot is None: snapshots = CC_100_SNAPSHOTS else: snapshots = snapshot.split(",") invalids = [s for s in snapshots if s not in CC_100_SNAPSHOTS] assert not invalids, f"Invalid snapshots {invalids}, chose from {CC_100_SNAPSHOTS}" for snapshot in snapshots: dl_snapshot(snapshot, outdir, processes) if __name__ == "__main__": import func_argparse func_argparse.single_main(dl)