@contextlib.contextmanager
def timer(name: str = "-"):
    start = time.time()
    yield None
    delay = time.time() - start
    print(f"{name} took {delay:.1f}s")


def benchmark(tmp_path: Path):
    segments = [
        "crawl-data/CC-MAIN-2019-09/segments/1550249406966.99/wet/CC-MAIN-20190222220601-20190223002601-00441.warc.wet.gz"
    ]
    seg_file = tmp_path / "CC-MAIN-20190222220601-20190223002601-00441.warc.wet.gz"

    with timer("from network"):
        list(CCSegmentsReader(segments))

    with timer("from network, with caching"):
        list(CCSegmentsReader(segments, cache_dir=tmp_path))
    assert seg_file.exists()

    with timer("from disk"):
        # Consume the reader, otherwise nothing is actually read back from the cache.
        list(CCSegmentsReader(segments, cache_dir=tmp_path))
    seg_file.unlink()


if __name__ == "__main__":
    func_argparse.main(ls, dl)
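# A minimal, self-contained check of the timer() helper above, without touching
# the network. This is a sketch of ours, not part of the original module; it
# only uses `time` and `timer`, which already exist in this file:
#
#   with timer("sleep"):
#       time.sleep(0.2)   # prints roughly "sleep took 0.2s"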
        urls.append(CC_NET_ROOT_FOLDER + dump + "/" + file)
    return urls


def reproduce(
    language: List[str] = None,
    dump: str = "2019-09",
    bucket: str = "head",
    output_dir: Path = DATA / "reconstruct",
    execution: str = "mp",
    parallelism: int = -1,
    cache_dir: Path = None,
):
    """Reproduce paper results from official CC snapshot and precomputed results.

    - dump: CC dump id
    - bucket: can be one of ("head", "middle", "tail", "all")
    - output_dir: output directory
    - execution: how to parallelize ("mp", "debug", "slurm", ...)
    - cache_dir: where the CC .wet files will be downloaded.
    """
    output_dir.mkdir(exist_ok=True, parents=True)
    urls = select_urls(dump, language, bucket)
    unminify(urls, output_dir / dump, execution, parallelism, cache_dir)


if __name__ == "__main__":
    import func_argparse

    func_argparse.main(reproduce, minify_file, minify, unminify, unminify_file)
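# func_argparse builds one sub-command per function passed to main() above, so
# reproduce() becomes the `reproduce` sub-command. Assuming its usual mapping of
# keyword parameters to --flags (and a hypothetical module path), an invocation
# could look like:
#
#   python -m cc_net.minify reproduce --dump 2019-09 --bucket head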
    with open(src_file) as src_f, open(trg_file) as trg_f:
        src_l = src_f.readline()
        trg_l = trg_f.readline()
        # Advance whichever side is behind; a pair is found when line numbers match.
        while src_l and trg_l:
            src: SimpleBitext = parser(src_l)
            trg: SimpleBitext = parser(trg_l)
            if src.line_no <= trg.line_no:
                lines_src += 1
                src_l = src_f.readline()
            if trg.line_no <= src.line_no:
                lines_trg += 1
                trg_l = trg_f.readline()
            if trg.line_no == src.line_no:
                found_pairs += 1

    if found_pairs == lines_src and found_pairs == lines_trg:
        logging.info(
            f"Validated {src_file} and {trg_file}. Found {found_pairs} bitexts."
        )
    else:
        logging.error(
            f"Validated {src_file} and {trg_file}. "
            f"Found {found_pairs} bitexts, from {lines_src} in {src_file} and {lines_trg} in {trg_file}"
        )


if __name__ == "__main__":
    import func_argparse

    func_argparse.main(dl, finalize)
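# The merge loop above only relies on parser() returning objects that expose a
# `line_no` attribute. A minimal stand-in for SimpleBitext (an assumption; the
# real class likely also carries the sentence text and more fields):
#
#   class SimpleBitext(NamedTuple):
#       line_no: int
#       text: str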
dumps.remove("..") dumps.remove("current") # We take the oldest dump since the most recent might be incomplete. # The page only link to the N latest dumps so the dump won't be too old. date = min(dumps) cirrus_url = "/".join((CIRRUS_URL, date)) print("Will use the Wikipedia dump from:", date, cirrus_url) cirrus_page = BeautifulSoup(urllib.request.urlopen(cirrus_url), features="html.parser") urls = {} for link in cirrus_page.findAll("a"): match = CIRRUS_DUMP_RE.match(link.get("href")) if not match: continue urls[match.group(1)] = "/".join([cirrus_url, link.get("href")]) assert urls, f"No valid download urls found at {cirrus_url}" return urls def wget(url: str, output: Path): subprocess.run(["wget", url, "-O", tmp(output), "-q"], check=True) tmp(output).replace(output) assert (output.stat().st_size > 10_000), f"File {output} downloaded from {url} looks too small" if __name__ == "__main__": func_argparse.main(dl, opening)
    assert len(files) > 0, "No files given."
    output_dir.mkdir(exist_ok=True)

    outputs = [output_dir / str(f).split("/")[-1] for f in files]
    if cache_dir is None:
        cache_dir = output_dir / "wet_cache"
        cache_dir.mkdir(exist_ok=True)
    if str(cache_dir) == "none":
        cache_dir = None

    files = [f for f, o in zip(files, outputs) if not o.exists()]
    outputs = [o for o in outputs if not o.exists()]
    if not files:
        return

    ex = get_executor(
        "unminify",
        output_dir / "logs",
        execution,
        timeout_hour=8,
        cpus=1,
        task_parallelism=parallelism,
        mem_gb=32,
    )
    ex(fetch_metadata_file, files, outputs, itertools.repeat(cache_dir))


if __name__ == "__main__":
    import func_argparse

    func_argparse.main(minify_file, minify, fetch_metadata, fetch_metadata_file)
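# The executor call above fans fetch_metadata_file out over the zipped argument
# lists, one task per remaining shard. Under a sequential/debug execution mode
# it is roughly equivalent to this sketch:
#
#   for f, o in zip(files, outputs):
#       fetch_metadata_file(f, o, cache_dir)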
    ex = submitit.AutoExecutor(output_dir / "mining_logs")
    ex.update_parameters(
        name="mine",
        cpus_per_task=PROCESSES,
        timeout_min=60 * 24 // PROCESSES,
        mem_gb=10,
    )
    jobs = ex.map_array(_mine, files, outputs, sp, lm, thresholds)
    print("Submitted job array:", jobs[0])

    for j in submitit.helpers.as_completed(jobs):
        (i, o) = j.result()
        print("Mined sentences from", i, "to", o)

    return outputs


def _mine(
    file: Path, output: Path, sp: Path, lm: Path, threshold: float
) -> Tuple[Path, Path]:
    extractor = ExtractSentences(sp, lm, field="raw_content", threshold=threshold)
    jsonql.run_pipes(extractor, file=file, output=output, processes=PROCESSES)
    return (file, output)


if __name__ == "__main__":
    func_argparse.main(sample, mine)
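# For quick local debugging, _mine() can also be called directly on a single
# shard, bypassing the submitit job array; the paths below are placeholders:
#
#   _mine(Path("shard.json.gz"), Path("mined.json.gz"),
#         Path("sp.model"), Path("lm.bin"), threshold=0.5)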
"""Say hello or goodbye to the user.""" import func_argparse def hello(user: str, times: int = None): """Say hello. Arguments: user: name of the user """ print(f"Hello {user}" * (1 if times is None else times)) def bye(user: str, see_you: float = 1.0): """Say goodbye.""" print(f"Goodbye {user}, see you in {see_you:.1f} days") if __name__ == "__main__": func_argparse.main()