Code Example #1
        - file: CC shard file
        - tags_file: dmoz tagging file (like the one produced by `dl`)
        - output: ""
    """
    url2tags = load_tags(tags_file)
    with jsonql.smart_open(file) as f, jsonql.smart_open(output, "w") as o:
        for document in jsonql.read_jsons(f):
            if not document:
                continue
            url = document["url"]
            domain = document["source_domain"]

            if url in url2tags:
                tags = url2tags[url]
            elif domain in url2tags:
                tags = url2tags[domain]
            else:
                continue

            if len(tags) == 0:
                continue

            fasttext_tags = ["__label__" + tag for tag in tags]
            content = document["tokenized"].replace("\n", " ").lower()
            if len(content) > 200:
                print(" ".join(fasttext_tags), content, file=o)  # type: ignore


if __name__ == "__main__":
    func_argparse.single_main(make_corpus)
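
The `load_tags` helper is not shown in this excerpt. Based on the lookups above (first by full URL, then by source domain), here is a minimal sketch of the mapping it is assumed to return, together with the fastText training-line format the loop emits; the example keys and tags are hypothetical:

from typing import Dict, List

# Hypothetical shape of the tags mapping (assumption: load_tags returns a dict
# keyed by both full URLs and bare domains, with lists of dmoz tags as values).
url2tags_example: Dict[str, List[str]] = {
    "http://example.com/music/jazz": ["Arts", "Music"],  # exact-URL match
    "example.com": ["Arts"],                             # domain-level fallback
}

# Each kept document becomes one fastText supervised-training line, e.g.:
#   __label__Arts __label__Music <lower-cased tokenized text, newlines replaced by spaces>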
Code Example #2
        return output

    for _input in inputs:
        if rm_original:
            _input.unlink()
        elif free_original:
            # Overwrite the previous file.
            # This frees up disk space and allows doit to properly track success.
            _input.write_text(f"Resharded into {output}")
        if get_index(_input).is_file():
            get_index(_input).unlink()

    return output


def determine_groups(inputs: List[Path],
                     target_size: int = 4 * 1024**3) -> List[List[Path]]:
    if len(inputs) == 0:
        return []

    sample = inputs[:10]
    typical_size = sum(s.stat().st_size for s in sample) / len(sample)
    group_size = min(target_size // typical_size, len(inputs))
    group_size = max(group_size, 1)

    return jsonql.grouper(inputs, group_size)


if __name__ == "__main__":
    func_argparse.single_main(reshard)
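
A small worked example of the grouping arithmetic in `determine_groups`, using a purely illustrative shard size (the 512 MiB figure is an assumption, not taken from the code above):

# Assume the first 10 input shards average ~512 MiB each (hypothetical figure).
typical_size = 512 * 1024**2
target_size = 4 * 1024**3                  # default target: 4 GiB per group
group_size = max(min(target_size // typical_size, 100), 1)  # 100 stands in for len(inputs)
assert group_size == 8
# jsonql.grouper(inputs, 8) would then yield lists of up to 8 paths,
# so each group totals roughly the 4 GiB target.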
Code Example #3
        f"Extracted {num_pars:_d} paragraphs from snapshot {snapshot}.")


def dl(snapshot: str = None,
       outdir: Path = Path("data_cc100"),
       processes: int = 1) -> None:
    """
    Download CC100 corpus.
    Will create one text file per language and CC snapshot.

    - snapshot: restrict to one snapshot. Useful for parallelization.
    - outdir: output directory
    - processes: number of processes to use
    """
    if snapshot is None:
        snapshots = CC_100_SNAPSHOTS
    else:
        snapshots = snapshot.split(",")

    invalids = [s for s in snapshots if s not in CC_100_SNAPSHOTS]
    assert not invalids, f"Invalid snapshots {invalids}, choose from {CC_100_SNAPSHOTS}"

    for snapshot in snapshots:
        dl_snapshot(snapshot, outdir, processes)


if __name__ == "__main__":
    import func_argparse

    func_argparse.single_main(dl)
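
A hedged sketch of calling `dl` directly; the snapshot labels below are placeholders and must all appear in CC_100_SNAPSHOTS (defined elsewhere in the module) for the assert above to pass:

# The `snapshot` argument accepts a comma-separated list:
snapshot = "2018-17,2018-22"               # illustrative values only
snapshots = snapshot.split(",")            # -> ["2018-17", "2018-22"]

# Typical invocations (left as comments so the sketch does not start downloads):
# dl()                                               # every CC-100 snapshot
# dl(snapshot="2018-17", outdir=Path("data_cc100"), processes=8)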