Exemple #1
0

@contextlib.contextmanager
def timer(name: str = "-"):
    """Context manager that prints how long the enclosed block took.

    The report has the form ``"<name> took <seconds>s"`` with one decimal and
    is printed when the block exits, including when it exits by raising.

    Arguments:
        name: label printed in front of the measured duration.
    """
    # perf_counter is a monotonic clock, so the measure can't be skewed by
    # system clock adjustments happening while the block runs.
    start = time.perf_counter()
    try:
        yield None
    finally:
        # Report even if the body raised; the exception keeps propagating.
        delay = time.perf_counter() - start
        print(f"{name} took {delay:.1f}s")


def benchmark(tmp_path: Path):
    """Time reading one CC segment without cache, while caching, and from cache.

    Each run fully consumes a `CCSegmentsReader` inside a `timer` block, so
    the three printed durations are comparable. The cached segment file is
    removed from `tmp_path` at the end.

    Arguments:
        tmp_path: directory used as `cache_dir` for the cached runs.
    """
    segments = [
        "crawl-data/CC-MAIN-2019-09/segments/1550249406966.99/wet/CC-MAIN-20190222220601-20190223002601-00441.warc.wet.gz"
    ]
    seg_file = tmp_path / "CC-MAIN-20190222220601-20190223002601-00441.warc.wet.gz"

    with timer("from network"):
        list(CCSegmentsReader(segments))

    with timer("from network, with caching"):
        list(CCSegmentsReader(segments, cache_dir=tmp_path))
    # The cached run is expected to have left the segment in the cache dir.
    assert seg_file.exists()

    with timer("from disk"):
        # Consume the reader like the two other runs: only building the
        # reader would time its construction, not the actual read.
        list(CCSegmentsReader(segments, cache_dir=tmp_path))
    seg_file.unlink()


if __name__ == "__main__":
    # Script entry point: hand `ls` and `dl` to func_argparse.
    # NOTE(review): presumably exposes each function as a CLI sub-command —
    # confirm against the func_argparse documentation.
    func_argparse.main(ls, dl)
Exemple #2
0
            urls.append(CC_NET_ROOT_FOLDER + dump + "/" + file)
    return urls


def reproduce(
    language: List[str] = None,
    dump: str = "2019-09",
    bucket: str = "head",
    output_dir: Path = DATA / "reconstruct",
    execution: str = "mp",
    parallelism: int = -1,
    cache_dir: Path = None,
):
    """Reproduce paper results from official CC snapshot and precomputed results.

    - language: languages forwarded to the url selection
    - dump: CC dump id
    - bucket: can be one of ("head", "middle", "tail", "all")
    - output_dir: output directory
    - execution: how to parallelize ("mp", "debug", "slurm", ...)
    - parallelism: parallelism setting forwarded to unminify
    - cache_dir: where the CC .wet files will be downloaded.
    """
    # NOTE(review): the meaning of `language=None` and `parallelism=-1` is
    # decided by `select_urls` / `unminify` — confirm there.
    output_dir.mkdir(exist_ok=True, parents=True)
    urls = select_urls(dump, language, bucket)
    # Results of a given dump go to their own sub-folder of `output_dir`.
    unminify(urls, output_dir / dump, execution, parallelism, cache_dir)


if __name__ == "__main__":
    # Imported here so func_argparse is only needed when run as a script.
    import func_argparse

    # Hand the listed functions to func_argparse.
    # NOTE(review): presumably exposes each one as a CLI sub-command — confirm.
    func_argparse.main(reproduce, minify_file, minify, unminify, unminify_file)
Exemple #3
0
    with open(src_file) as src_f, open(trg_file) as trg_f:
        src_l = src_f.readline()
        trg_l = trg_f.readline()
        while src_l and trg_l:
            src: SimpleBitext = parser(src_l)
            trg: SimpleBitext = parser(trg_l)
            if src.line_no <= trg.line_no:
                lines_src += 1
                src_l = src_f.readline()
            if trg.line_no <= src.line_no:
                lines_trg += 1
                trg_l = trg_f.readline()
            if trg.line_no == src.line_no:
                found_pairs += 1

    if found_pairs == lines_src and found_pairs == lines_trg:
        logging.info(
            f"Validated {src_file} and {trg_file}. Found {found_pairs} bitexts."
        )
    else:
        logging.error(
            f"Validated {src_file} and {trg_file}. "
            f"Found {found_pairs} bitexts, from {lines_src} in {src_file} and {lines_trg} in {trg_file}"
        )


if __name__ == "__main__":
    # Imported here so func_argparse is only needed when run as a script.
    import func_argparse

    # Hand `dl` and `finalize` to func_argparse.
    # NOTE(review): presumably exposes each one as a CLI sub-command — confirm.
    func_argparse.main(dl, finalize)
Exemple #4
0
        dumps.remove("..")
        dumps.remove("current")
        # We take the oldest dump since the most recent might be incomplete.
        # The page only link to the N latest dumps so the dump won't be too old.
        date = min(dumps)

    cirrus_url = "/".join((CIRRUS_URL, date))
    print("Will use the Wikipedia dump from:", date, cirrus_url)
    cirrus_page = BeautifulSoup(urllib.request.urlopen(cirrus_url),
                                features="html.parser")
    urls = {}
    for link in cirrus_page.findAll("a"):
        match = CIRRUS_DUMP_RE.match(link.get("href"))
        if not match:
            continue

        urls[match.group(1)] = "/".join([cirrus_url, link.get("href")])
    assert urls, f"No valid download urls found at {cirrus_url}"
    return urls


def wget(url: str, output: Path):
    """Download `url` to `output` using the `wget` command line tool.

    The download is written to `tmp(output)` first and only renamed to
    `output` once it passed the size sanity check, so a truncated download
    never ends up at the final location.

    Arguments:
        url: address of the file to download.
        output: final path of the downloaded file.

    Raises:
        subprocess.CalledProcessError: if `wget` exits with a non-zero status.
        AssertionError: if the downloaded file isn't bigger than 10_000 bytes.
    """
    partial = tmp(output)
    subprocess.run(["wget", url, "-O", partial, "-q"], check=True)
    # Validate before publishing: checking after the rename would leave a
    # bogus file at `output`.
    assert (
        partial.stat().st_size > 10_000
    ), f"File {output} downloaded from {url} looks too small"
    partial.replace(output)


if __name__ == "__main__":
    # Script entry point: hand `dl` and `opening` to func_argparse.
    # NOTE(review): presumably exposes each one as a CLI sub-command — confirm.
    func_argparse.main(dl, opening)
Exemple #5
0
    assert len(files) > 0, "No files given."
    output_dir.mkdir(exist_ok=True)

    outputs = [output_dir / str(f).split("/")[-1] for f in files]
    if cache_dir is None:
        cache_dir = output_dir / "wet_cache"
        cache_dir.mkdir(exist_ok=True)
    if str(cache_dir) == "none":
        cache_dir = None
    files = [f for f, o in zip(files, outputs) if not o.exists()]
    outputs = [o for o in outputs if not o.exists()]
    if not files:
        return
    ex = get_executor(
        "unminify",
        output_dir / "logs",
        execution,
        timeout_hour=8,
        cpus=1,
        task_parallelism=parallelism,
        mem_gb=32,
    )
    ex(fetch_metadata_file, files, outputs, itertools.repeat(cache_dir))


if __name__ == "__main__":
    # Imported here so func_argparse is only needed when run as a script.
    import func_argparse

    # Hand the listed functions to func_argparse.
    # NOTE(review): presumably exposes each one as a CLI sub-command — confirm.
    func_argparse.main(minify_file, minify, fetch_metadata,
                       fetch_metadata_file)
Exemple #6
0
    ex = submitit.AutoExecutor(output_dir / "mining_logs")
    ex.update_parameters(
        name="mine",
        cpus_per_task=PROCESSES,
        timeout_min=60 * 24 // PROCESSES,
        mem_gb=10,
    )
    jobs = ex.map_array(_mine, files, outputs, sp, lm, thresholds)
    print("Submited job array:", jobs[0])

    for j in submitit.helpers.as_completed(jobs):
        (i, o) = j.result()
        print("Mined sentences from", i, "to", o)

    return outputs


def _mine(file: Path, output: Path, sp: Path, lm: Path,
          threshold: float) -> Tuple[Path, Path]:
    """Run sentence extraction on one file and write the result to `output`.

    An `ExtractSentences` step reading the "raw_content" field is built from
    `sp`, `lm` and `threshold`, then run through `jsonql.run_pipes` using
    `PROCESSES` processes.

    Arguments:
        file: input file given to the pipeline.
        output: where the pipeline writes its result.
        sp, lm: paths handed to `ExtractSentences`
            (NOTE(review): presumably the SentencePiece and language models
            — confirm).
        threshold: threshold handed to `ExtractSentences`.

    Returns:
        The `(file, output)` pair, so the caller can tell which job finished.
    """
    sentence_step = ExtractSentences(
        sp, lm, field="raw_content", threshold=threshold
    )
    jsonql.run_pipes(
        sentence_step, file=file, output=output, processes=PROCESSES
    )
    return file, output


if __name__ == "__main__":
    # Script entry point: hand `sample` and `mine` to func_argparse.
    # NOTE(review): presumably exposes each one as a CLI sub-command — confirm.
    func_argparse.main(sample, mine)
Exemple #7
0
"""Say hello or goodbye to the user."""

import func_argparse


def hello(user: str, times: int = None):
    """Say hello.

    Arguments:
        user: name of the user
        times: how many times the greeting is repeated, once when omitted
    """
    if times is None:
        times = 1
    greeting = f"Hello {user}"
    # Repetitions are concatenated as-is, without any separator.
    print(greeting * times)


def bye(user: str, see_you: float = 1.0):
    """Say goodbye.

    Arguments:
        user: name of the user
        see_you: number of days until the next meeting, printed with one decimal
    """
    farewell = f"Goodbye {user}, see you in {see_you:.1f} days"
    print(farewell)


if __name__ == "__main__":
    # Script entry point. No function is passed explicitly.
    # NOTE(review): func_argparse presumably discovers the functions of this
    # module (`hello`, `bye`) by itself — confirm against its documentation.
    func_argparse.main()