def open_segment(self, segment: str) -> ContextManager[Iterable[str]]:
    url = "/".join((WET_URL_ROOT, segment))
    if not self.cache_dir:
        self.retrieved_segments += 1
        return jsonql.open_remote_file(url)

    file = self.cache_dir / segment.split("/")[-1]
    if not file.exists():
        self.retrieved_segments += 1
        # TODO: make this write thread-safe.
        # Create a different tmp file for each process to avoid collisions.
        h = hex(hash(file))[2:10]
        tmp = file.with_name(f"tmp_{h}." + file.name)
        content = jsonql.request_get_content(url)
        tmp.write_bytes(content)
        # Don't overwrite a file that might be read by another process.
        if not file.exists():
            shutil.move(tmp, file)
        else:
            tmp.unlink()
        # Read from memory if possible.
        return gzip.open(io.BytesIO(content), mode="rt")

    return jsonql.smart_open(file)
def dl_file(metadata_dir: str, outdir: Path, file: str):
    metadata = "/".join((metadata_dir, file))
    parser = get_typed_parser(NormalizedBitextPtr)
    found_bitext, missed_bitext, skipped_line = 0, 0, 0
    segment = ""
    segment_downloads: Dict[str, int] = defaultdict(int)
    raw_documents: Dict[str, str] = {}
    cleaned_documents: Dict[str, str] = {}

    outfile = outdir / file
    if outfile.exists():
        return
    o = FileWriterWithTmp(outfile)
    for i, line in enumerate(open_remote_file(metadata)):
        try:
            bitext: NormalizedBitextPtr = parser(line)
            # Add some more asserts in case the line is invalid but still parses.
            assert bitext.segment.startswith("crawl-data/")
            assert bitext.digest.startswith("sha1:")
        except AssertionError:
            logging.error(f"Skipping line {i}: {line}")
            skipped_line += 1
            continue

        if not segment or bitext.segment != segment:
            segment = bitext.segment
            segment_downloads[segment] += 1
            # Load the segment in RAM, purge the document cache.
            raw_documents = get_documents(segment)
            cleaned_documents = {}

        raw_doc = raw_documents.get(bitext.digest)
        if raw_doc is None:
            logging.error(f"Document not found: {bitext.digest} in {segment}")
            missed_bitext += 1
            continue

        clean_doc = cleaned_documents.get(bitext.digest)
        if clean_doc is None:
            clean_doc = clean_content(raw_doc)
            cleaned_documents[bitext.digest] = clean_doc

        text = clean_doc[bitext.ptr_start : bitext.ptr_end]
        score = getattr(bitext, "score", 0.0)
        bt = Bitext(bitext.lang_pair, bitext.line_no, score, text)
        print(*bt, sep="\t", file=o)
        found_bitext += 1

    o.close(True)
    logging.info(f"Found {found_bitext} sentences, missed {missed_bitext} sentences.")
    if skipped_line > 0:
        logging.error(f"Skipped {skipped_line} unparsable lines")

    expected_dl = len(segment_downloads)
    actual_dl = sum(segment_downloads.values())
    if actual_dl != expected_dl:
        logging.error(
            f"Some segments were downloaded twice. Total dl: {actual_dl}, distinct dl: {expected_dl}"
        )
def open_segment(self, segment: str) -> Iterable[str]:
    url = self.segment_url(segment)
    file: Optional[Path] = None
    if self.cache_dir:
        file = self.cache_dir / segment.split("/")[-1]

    if not file or not file.exists():
        self.retrieved_segments += 1

    return jsonql.open_remote_file(url, cache=file)
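# Hedged usage sketch (not part of the original module): reading one WET segment
# through CCSegmentsReader, which is assumed to be the class that owns the
# open_segment() methods above (it is used that way in _dl_shard below). The
# segment path and cache directory are illustrative placeholders, not real values.
def _example_read_segment() -> None:
    segment = "crawl-data/CC-MAIN-2019-09/segments/example/wet/example-00000.warc.wet.gz"
    reader = CCSegmentsReader([segment], cache_dir=Path("wet_cache"))
    for doc in reader:
        # Each doc is expected to expose at least "digest", "title" and "raw_content".
        print(doc["digest"], len(doc["raw_content"]))
        break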
def dl_file(metadata_dir: str, file: str, outdir: Path):
    metadata = "/".join((metadata_dir, file))
    parser = get_typed_parser(NormalizedBitextPtr)
    found_bitext, missed_bitext, skipped_line = 0, 0, 0
    segment = ""
    segment_downloads: Dict[str, int] = defaultdict(int)
    raw_documents: Dict[str, str] = {}
    cleaned_documents: Dict[str, str] = {}

    outfile = outdir / file
    with gzip.open(outfile, "wt") as o:
        for i, line in enumerate(open_remote_file(metadata)):
            try:
                bitext: NormalizedBitextPtr = parser(line)
            except AssertionError:
                logging.error(f"Skipping line {i}: {line}")
                skipped_line += 1
                continue

            if not segment or bitext.segment != segment:
                segment = bitext.segment
                segment_downloads[segment] += 1
                # Load the segment in RAM, purge the document cache.
                raw_documents = get_documents(segment)
                cleaned_documents = {}

            raw_doc = raw_documents.get(bitext.digest)
            if raw_doc is None:
                logging.error(f"Document not found: {bitext.digest} in {segment}")
                missed_bitext += 1
                continue

            clean_doc = cleaned_documents.get(bitext.digest)
            if clean_doc is None:
                clean_doc = clean_content(raw_doc)
                cleaned_documents[bitext.digest] = clean_doc

            text = clean_doc[bitext.ptr_start : bitext.ptr_end]
            bt = Bitext(bitext.lang_pair, bitext.line_no, text)
            print(*bt, sep="\t", file=o)
            found_bitext += 1

    logging.info(f"Found {found_bitext} sentences, missed {missed_bitext} sentences.")
    if skipped_line > 0:
        logging.error(f"Skipped {skipped_line} unparsable lines")

    expected_dl = len(segment_downloads)
    actual_dl = sum(segment_downloads.values())
    if actual_dl != expected_dl:
        logging.error(
            f"Some segments were downloaded twice. Total dl: {actual_dl}, distinct dl: {expected_dl}"
        )
def dl(outdir: Path = Path("data"), version: str = "v1.0"):
    """Downloads the given CCMatrix version and extracts the corresponding sentences."""
    metadata_dir = f"https://dl.fbaipublicfiles.com/laser/CCMatrix/{version}"
    file_list = [l.strip() for l in open_remote_file(metadata_dir + "/list.txt")]
    outdir.mkdir(exist_ok=True)
    outdir = outdir / version
    outdir.mkdir(exist_ok=True)

    for file in file_list:
        dl_file(metadata_dir, file, outdir)
def select_urls(
    dump: str, languages: Optional[List[str]] = None, bucket: str = "head"
) -> List[str]:
    urls = []
    languages_set = set(languages) if languages else None
    with jsonql.open_remote_file(CC_NET_ROOT_FOLDER + dump + "/files.txt") as f:
        for file in f:
            file = file.strip()
            lang, buck, shard = file.split(".")[0].split("_")
            if bucket != "all" and bucket != buck:
                continue
            if languages_set and lang not in languages_set:
                continue
            urls.append(CC_NET_ROOT_FOLDER + dump + "/" + file)
    return urls
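# Hedged usage sketch (not part of the original module): listing the cc_net shard
# urls for a couple of languages. The dump id "2019-09" is an illustrative value;
# the function only requires that {CC_NET_ROOT_FOLDER}{dump}/files.txt exists and
# that file names follow the {lang}_{bucket}_{shard} pattern parsed above.
def _example_select_urls() -> None:
    urls = select_urls("2019-09", languages=["en", "fr"], bucket="head")
    for url in urls[:3]:
        print(url)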
def load_tags(filename: Optional[Path] = None) -> TaggedUrls:
    if filename is None:
        with StringIO("".join(jsonql.open_remote_file(DMOZ_TAGS_URL))) as dmoz:
            tree = etree.parse(dmoz)
    else:
        tree = etree.parse(str(filename))

    root = tree.getroot()
    url2tags: Dict[str, Set[str]] = {}
    for external_page in root.iterfind("{http://dmoz.org/rdf/}ExternalPage"):
        url = external_page.get("about")
        domain = urlparse(url).netloc
        for topic in external_page.iterfind("{http://dmoz.org/rdf/}topic"):
            # Tags look like: Top/Arts/Animation/Anime/Collectibles
            tags = set(topic.text.split("/")[1:])
            add_tags(url, tags, url2tags)
            add_tags(domain, tags, url2tags)
    return url2tags
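# Hedged usage sketch (not part of the original module): loading the DMOZ dump and
# looking up the tags of a domain. The example domain is illustrative; load_tags()
# indexes both full urls and their domains via the add_tags() calls above.
def _example_load_tags() -> None:
    url2tags = load_tags()  # downloads DMOZ_TAGS_URL when no file is given
    tags = url2tags.get("example.com", set())
    print(sorted(tags))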
def dl(
    outdir: Path = Path("data"),
    version: str = KNOWN_VERSIONS[0],
    parallelism: int = 8,
):
    """
    Download bitext pointers from the FAIR dataset and extract the corresponding CC snippets.

    - version: specific version to download
    - outdir: directory where the data should go. Files will be in {outdir}/{version}/raw/
    """
    assert version in KNOWN_VERSIONS, f"Unknown version {version}, choose from {KNOWN_VERSIONS}"
    metadata_dir = f"https://dl.fbaipublicfiles.com/laser/CCMatrix/{version}"
    file_list = [l.strip() for l in open_remote_file(metadata_dir + "/list.txt")]
    outdir.mkdir(exist_ok=True)
    outdir = outdir / version / "raw"
    outdir.mkdir(exist_ok=True, parents=True)

    with multiprocessing.Pool(parallelism) as pool:
        dlf = functools.partial(dl_file, metadata_dir, outdir)
        pool.map(dlf, file_list)
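# Hedged usage sketch (not part of the original module): downloading the default
# CCMatrix version with a small process pool. Output lands in data/{version}/raw/,
# as described in the docstring above.
def _example_dl_ccmatrix() -> None:
    dl(outdir=Path("data"), version=KNOWN_VERSIONS[0], parallelism=4)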
def open_segment(self, segment: str) -> ContextManager[Iterable[str]]:
    url = "/".join((WET_URL_ROOT, segment))
    if not self.cache_dir:
        self.retrieved_segments += 1
        return jsonql.open_remote_file(url)

    file = self.cache_dir / segment.split("/")[-1]
    if not file.exists():
        self.retrieved_segments += 1
        # Use a per-process tmp file to avoid write collisions.
        tmp = file.with_name(f"tmp_{os.getpid()}." + file.name)
        content = jsonql.request_get_content(url)
        tmp.write_bytes(content)
        # Don't overwrite a file that might be read by another process.
        if not file.exists():
            shutil.move(tmp, file)
        else:
            tmp.unlink()
        # Read from memory if possible.
        return gzip.open(io.BytesIO(content), mode="rt")

    return jsonql.smart_open(file)
def cc_segments(dump_id: str, cache_dir: Optional[Path] = None) -> List[str]:
    wet_paths = cc_wet_paths_url(dump_id)
    cache_dir = cache_dir or jsonql._tmp_dir()
    wet_paths_cache = cache_dir / f"wet_{dump_id}.paths.gz"
    f = jsonql.open_remote_file(wet_paths, cache=wet_paths_cache)
    return [segment.strip() for segment in f]
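# Hedged usage sketch (not part of the original module): listing the WET segments
# of a Common Crawl dump. The dump id "2019-09" is an illustrative value; the
# paths file is cached in a temporary directory when no cache_dir is given.
def _example_cc_segments() -> None:
    segments = cc_segments("2019-09")
    print(len(segments), segments[0])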
def _dl_shard(snapshot: str, shard: int) -> Iterator[Paragraph]:
    """
    Download the metadata of one shard of a snapshot.

    Sample metadata:

    {
        "cc_segment": "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz",
        "digest": "sha1:222LWNHN5FM26XGS7WJSMI6IISTVWBKJ",
        "url": "http://personals.gearplay.com/ads/DRJONES.htm",
        "line_ids": [10],
        "languages": ["en_XX"],
        "lm_scores": [-2.658],
    }
    """
    name = f"snap_{snapshot.replace('-', '_')}_batch_{shard}.json.gz"
    url = "/".join([S3_BUCKET, VERSION, name])
    shard_metadata: Dict[str, Dict[str, dict]] = defaultdict(dict)
    try:
        cache_file: Optional[Path] = None
        if WET_CACHE is not None:
            cache_file = WET_CACHE / name
        metadata_file = jsonql.open_remote_file(url, cache_file)
    except Exception:
        logging.warning(f"Couldn't open {url}")
        return

    for meta in jsonql.read_jsons(metadata_file):
        shard_metadata[meta["cc_segment"]][meta["digest"]] = meta

    found_pars, missed_pars = 0, 0
    for seg, segment_metadata in shard_metadata.items():
        for doc in CCSegmentsReader([seg], cache_dir=WET_CACHE):
            if doc["digest"] not in segment_metadata:
                continue

            meta = segment_metadata[doc["digest"]]
            full_pars = [doc["title"]] + doc["raw_content"].split("\n")

            assert len(meta["line_ids"]) == len(meta["languages"])
            assert len(meta["line_ids"]) == len(meta["lm_scores"])
            for i, lang, score in zip(
                meta["line_ids"], meta["languages"], meta["lm_scores"]
            ):
                if snapshot != "2018-51" and lang in BIG_LANGUAGES:
                    # Big languages only come from the "2018-51" snapshot.
                    continue
                if i >= len(full_pars):
                    # This is because CC100 was created by saving only urls.
                    # Some urls appear in different snapshots with slightly
                    # different versions, but we don't know which one is correct.
                    # Here we read both versions, so some indices may end up
                    # being incorrect. This impacts ~3% of documents.
                    missed_pars += 1
                    continue

                yield Paragraph(lang, full_pars[i], score)
                found_pars += 1

    if missed_pars > 0:
        logging.warning(
            f"Missed {missed_pars} ({missed_pars / max(found_pars, 1):%}) paragraphs."
        )
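# Hedged usage sketch (not part of the original module): streaming the paragraphs
# of one metadata shard. The snapshot "2018-51" and shard index 0 are illustrative
# values taken from the sample metadata in the docstring above.
def _example_dl_shard() -> None:
    for paragraph in _dl_shard("2018-51", 0):
        print(paragraph)
        break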