Example 1
    def open_segment(self, segment: str) -> ContextManager[Iterable[str]]:
        url = "/".join((WET_URL_ROOT, segment))
        if not self.cache_dir:
            self.retrieved_segments += 1
            return jsonql.open_remote_file(url)

        file = self.cache_dir / segment.split("/")[-1]
        if not file.exists():
            self.retrieved_segments += 1
            # TODO: make this write thread-safe.
            # create a different tmp file for each process to avoid collisions.
            h = hex(hash(file))[2:10]
            tmp = file.with_name(f"tmp_{h}." + file.name)
            content = jsonql.request_get_content(url)
            tmp.write_bytes(content)
            # don't overwrite a file that another process might be reading.
            if not file.exists():
                shutil.move(tmp, file)
            else:
                tmp.unlink()
            # read from memory if possible
            f = gzip.open(io.BytesIO(content), mode="rt")
            return f

        return jsonql.smart_open(file)
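For context, a hedged caller-side sketch of how this method might be used, assuming it belongs to the CCSegmentsReader class that appears in Example 11; the reader construction and the loop body are illustrative, not part of the original code.

from pathlib import Path

# Segment name copied from the sample metadata in Example 11; any WET segment works.
segment = (
    "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/"
    "CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz"
)
reader = CCSegmentsReader([segment], cache_dir=Path("wet_cache"))
with reader.open_segment(segment) as lines:
    for line in lines:
        pass  # each line is one raw text line of the WET file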
Example 2
def dl_file(metadata_dir: str, outdir: Path, file: str):
    metadata = "/".join((metadata_dir, file))
    parser = get_typed_parser(NormalizedBitextPtr)
    found_bitext, missed_bitext, skipped_line = 0, 0, 0
    segment = ""
    segment_downloads: Dict[str, int] = defaultdict(int)
    raw_documents: Dict[str, str] = {}
    cleaned_documents: Dict[str, str] = {}

    outfile = outdir / file
    if outfile.exists():
        return
    o = FileWriterWithTmp(outfile)
    for i, line in enumerate(open_remote_file(metadata)):
        try:
            bitext: NormalizedBitextPtr = parser(line)
            # Add some extra asserts in case the line is invalid but still parses.
            assert bitext.segment.startswith("crawl-data/")
            assert bitext.digest.startswith("sha1:")
        except AssertionError:
            logging.error(f"Skipping line {i}: {line}")
            skipped_line += 1
            continue

        if not segment or bitext.segment != segment:
            segment = bitext.segment
            segment_downloads[segment] += 1
            # Load segment in RAM, purge document cache
            raw_documents = get_documents(segment)
            cleaned_documents = {}

        raw_doc = raw_documents.get(bitext.digest)
        if raw_doc is None:
            logging.error(f"Document not found: {bitext.digest} in {segment}")
            missed_bitext += 1
            continue

        clean_doc = cleaned_documents.get(bitext.digest)
        if clean_doc is None:
            clean_doc = clean_content(raw_doc)
            cleaned_documents[bitext.digest] = clean_doc

        text = clean_doc[bitext.ptr_start:bitext.ptr_end]
        score = getattr(bitext, "score", 0.0)
        bt = Bitext(bitext.lang_pair, bitext.line_no, score, text)
        print(*bt, sep="\t", file=o)
        found_bitext += 1

    o.close(True)
    logging.info(
        f"Found {found_bitext} sentences, missed {missed_bitext} sentences.")
    if skipped_line > 0:
        logging.error(f"Skipped {skipped_line} unparsable lines")
    expected_dl = len(segment_downloads)
    actual_dl = sum(segment_downloads.values())

    if actual_dl != expected_dl:
        logging.error(
            f"Some segments where downloaded twice. Total dl: {actual_dl}, distinct dl: {expected_dl}"
        )
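For reference, a minimal sketch of the record types this function relies on, reconstructed only from the attribute accesses and constructor calls above; the real definitions may use a different field order or carry extra fields.

from typing import NamedTuple

# Hypothetical reconstruction: fields are inferred from bitext.segment,
# bitext.digest, bitext.ptr_start/ptr_end and Bitext(lang_pair, line_no,
# score, text) above; the actual classes may differ.
class NormalizedBitextPtr(NamedTuple):
    lang_pair: str
    line_no: int
    segment: str    # e.g. "crawl-data/..."
    digest: str     # e.g. "sha1:..."
    ptr_start: int  # character offsets into the cleaned document
    ptr_end: int
    score: float    # read via getattr(..., "score", 0.0), so possibly absent


class Bitext(NamedTuple):
    lang_pair: str
    line_no: int
    score: float    # omitted in the Bitext variant used in Example 4
    text: str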
Example 3
    def open_segment(self, segment: str) -> Iterable[str]:
        url = self.segment_url(segment)
        file: Optional[Path] = None
        if self.cache_dir:
            file = self.cache_dir / segment.split("/")[-1]
        if not file or not file.exists():
            self.retrieved_segments += 1

        return jsonql.open_remote_file(url, cache=file)
Example 4
def dl_file(metadata_dir: str, file: str, outdir: Path):
    metadata = "/".join((metadata_dir, file))
    parser = get_typed_parser(NormalizedBitextPtr)
    found_bitext, missed_bitext, skipped_line = 0, 0, 0
    segment = ""
    segment_downloads: Dict[str, int] = defaultdict(int)
    raw_documents: Dict[str, str] = {}
    cleaned_documents: Dict[str, str] = {}

    outfile = outdir / file
    with gzip.open(outfile, "wt") as o:
        for i, line in enumerate(open_remote_file(metadata)):
            try:
                bitext: NormalizedBitextPtr = parser(line)
            except AssertionError:
                logging.error(f"Skipping line {i}: {line}")
                skipped_line += 1
                continue

            if not segment or bitext.segment != segment:
                segment = bitext.segment
                segment_downloads[segment] += 1
                # Load segment in RAM, purge document cache
                raw_documents = get_documents(segment)
                cleaned_documents = {}

            raw_doc = raw_documents.get(bitext.digest)
            if raw_doc is None:
                logging.error(
                    f"Document not found: {bitext.digest} in {segment}")
                missed_bitext += 1
                continue

            clean_doc = cleaned_documents.get(bitext.digest)
            if clean_doc is None:
                clean_doc = clean_content(raw_doc)
                cleaned_documents[bitext.digest] = clean_doc

            text = clean_doc[bitext.ptr_start:bitext.ptr_end]
            bt = Bitext(bitext.lang_pair, bitext.line_no, text)
            print(*bt, sep="\t", file=o)
            found_bitext += 1

    logging.info(
        f"Found {found_bitext} sentences, missed {missed_bitext} sentences.")
    if skipped_line > 0:
        logging.error(f"Skipped {skipped_line} unparsable lines")
    expected_dl = len(segment_downloads)
    actual_dl = sum(segment_downloads.values())

    if actual_dl != expected_dl:
        logging.error(
            f"Some segments where downloaded twice. Total dl: {actual_dl}, distinct dl: {expected_dl}"
        )
Example 5
def dl(outdir: Path = Path("data"), version: str = "v1.0"):
    """Checks that the segments in the given batch are valid."""
    metadata_dir = f"https://dl.fbaipublicfiles.com/laser/CCMatrix/{version}"
    file_list = [
        l.strip() for l in open_remote_file(metadata_dir + "/list.txt")
    ]
    outdir.mkdir(exist_ok=True)
    outdir = outdir / version
    outdir.mkdir(exist_ok=True)

    for file in file_list:
        dl_file(metadata_dir, file, outdir)
Example 6
def select_urls(
    dump: str, languages: Optional[List[str]] = None, bucket: str = "head"
) -> List[str]:
    urls = []
    languages_set = set(languages) if languages else None
    with jsonql.open_remote_file(CC_NET_ROOT_FOLDER + dump + "/files.txt") as f:
        for file in f:
            file = file.strip()
            lang, buck, shard = file.split(".")[0].split("_")
            if bucket != "all" and bucket != buck:
                continue
            if languages_set and lang not in languages_set:
                continue
            urls.append(CC_NET_ROOT_FOLDER + dump + "/" + file)
    return urls
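The parsing above assumes file names of the form {lang}_{bucket}_{shard} plus an extension; a small hedged illustration of just the split logic, using invented names:

# Invented example names; only the split mirrors select_urls above.
for name in ["en_head_0000.json.gz", "fr_middle_0001.json.gz"]:
    lang, buck, shard = name.split(".")[0].split("_")
    print(lang, buck, shard)  # -> en head 0000, then fr middle 0001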
Example 7
def load_tags(filename: Optional[Path] = None) -> TaggedUrls:
    if filename is None:
        with StringIO("".join(jsonql.open_remote_file(DMOZ_TAGS_URL))) as dmoz:
            tree = etree.parse(dmoz)
    else:
        tree = etree.parse(str(filename))

    root = tree.getroot()
    url2tags: Dict[str, Set[str]] = {}
    for external_page in root.iterfind("{http://dmoz.org/rdf/}ExternalPage"):
        url = external_page.get("about")
        domain = urlparse(url).netloc
        for topic in external_page.iterfind("{http://dmoz.org/rdf/}topic"):
            # print(url, topic.text)
            # Tags look like Top/Arts/Animation/Anime/Collectibles
            tags = set(topic.text.split("/")[1:])
            add_tags(url, tags, url2tags)
            add_tags(domain, tags, url2tags)
    return url2tags
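load_tags calls an add_tags helper that is not shown in this excerpt; a minimal plausible sketch, assuming it only merges tag sets per key (the real helper may do more):

from typing import Dict, Set

# Hypothetical helper mirroring the call sites add_tags(url, tags, url2tags)
# and add_tags(domain, tags, url2tags) above; not the original implementation.
def add_tags(key: str, tags: Set[str], url2tags: Dict[str, Set[str]]) -> None:
    url2tags.setdefault(key, set()).update(tags)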
Example 8
def dl(outdir: Path = Path("data"),
       version: str = KNOWN_VERSIONS[0],
       parallelism: int = 8):
    """
    Download bitext pointers from FAIR dataset and extract corresponding CC snippets.
    - version: Specific version to download
    - outdir: Directory where the data should go. Files will be in {outdir}/{version}/raw/
    """
    assert version in KNOWN_VERSIONS, f"Unknown version {version}, choose from {KNOWN_VERSIONS}"
    metadata_dir = f"https://dl.fbaipublicfiles.com/laser/CCMatrix/{version}"
    file_list = [
        l.strip() for l in open_remote_file(metadata_dir + "/list.txt")
    ]
    outdir.mkdir(exist_ok=True)
    outdir = outdir / version / "raw"
    outdir.mkdir(exist_ok=True, parents=True)

    with multiprocessing.Pool(parallelism) as pool:
        dlf = functools.partial(dl_file, metadata_dir, outdir)
        pool.map(dlf, file_list)
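A minimal invocation sketch, assuming the module-level KNOWN_VERSIONS constant from the signature is defined; the __main__ guard matters because multiprocessing re-imports the module in worker processes on spawn-based platforms.

from pathlib import Path

# Hedged usage sketch, not part of the original module.
if __name__ == "__main__":
    dl(outdir=Path("data"), version=KNOWN_VERSIONS[0], parallelism=4)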
Example 9
    def open_segment(self, segment: str) -> ContextManager[Iterable[str]]:
        url = "/".join((WET_URL_ROOT, segment))
        if not self.cache_dir:
            self.retrieved_segments += 1
            return jsonql.open_remote_file(url)

        file = self.cache_dir / segment.split("/")[-1]
        if not file.exists():
            self.retrieved_segments += 1
            tmp = file.with_name(f"tmp_{os.getpid()}." + file.name)
            content = jsonql.request_get_content(url)
            tmp.write_bytes(content)
            # don't overwrite a file that another process might be reading.
            if not file.exists():
                shutil.move(tmp, file)
            else:
                tmp.unlink()
            # read from memory if possible
            return gzip.open(io.BytesIO(content), mode="rt")

        return jsonql.smart_open(file)
Example 10
def cc_segments(dump_id: str, cache_dir: Optional[Path] = None) -> List[str]:
    wet_paths = cc_wet_paths_url(dump_id)
    cache_dir = cache_dir or jsonql._tmp_dir()
    wet_paths_cache = cache_dir / f"wet_{dump_id}.paths.gz"
    f = jsonql.open_remote_file(wet_paths, cache=wet_paths_cache)
    return [segment.strip() for segment in f]
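A hedged usage sketch combining cc_segments with the CCSegmentsReader used in Example 11 below; the dump id and cache path are illustrative.

from pathlib import Path

# Illustrative only: "2019-09" is an example dump id, and the cache path is arbitrary.
cache = Path("wet_cache")
segments = cc_segments("2019-09", cache_dir=cache)
for doc in CCSegmentsReader(segments[:1], cache_dir=cache):
    # Each doc exposes at least "digest", "title" and "raw_content" (see Example 11).
    print(doc["digest"], len(doc["raw_content"]))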
Example 11
def _dl_shard(snapshot: str, shard: int) -> Iterator[Paragraph]:
    """
    Download metadata from a shard.

    Sample metadata:

    {
        "cc_segment": "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz",
        "digest": "sha1:222LWNHN5FM26XGS7WJSMI6IISTVWBKJ",
        "url": "http://personals.gearplay.com/ads/DRJONES.htm",
        "line_ids": [10],
        "languages": ["en_XX"],
        "lm_scores": [-2.658],
    }
    """
    snapshot = snapshot.replace("-", "_")
    name = f"snap_{snapshot}_batch_{shard}.json.gz"
    url = "/".join([S3_BUCKET, VERSION, name])
    shard_metadata: Dict[str, Dict[str, dict]] = defaultdict(dict)
    try:
        cache_file: Optional[Path] = None
        if WET_CACHE is not None:
            cache_file = WET_CACHE / name
        metadata_file = jsonql.open_remote_file(url, cache_file)
    except Exception:
        logging.warning(f"Couldn't open {url}")
        return

    for meta in jsonql.read_jsons(metadata_file):
        shard_metadata[meta["cc_segment"]][meta["digest"]] = meta

    found_pars, missed_pars = 0, 0
    for seg, segment_metadata in shard_metadata.items():
        for doc in CCSegmentsReader([seg], cache_dir=WET_CACHE):
            if doc["digest"] not in segment_metadata:
                continue

            meta = segment_metadata[doc["digest"]]
            full_pars = [doc["title"]] + doc["raw_content"].split("\n")

            assert len(meta["line_ids"]) == len(meta["languages"])
            assert len(meta["line_ids"]) == len(meta["lm_scores"])
            for i, lang, score in zip(meta["line_ids"], meta["languages"],
                                      meta["lm_scores"]):
                if snapshot != "2018-51" and lang in BIG_LANGUAGES:
                    # Big languages only come from "2018-51" snapshot
                    continue
                if i >= len(full_pars):
                    # This is because CC100 was created by saving only urls.
                    # Some urls appear in different snapshots with slightly
                    # different versions, but we don't know which one is correct.
                    # Here we read both versions, but some indices may end
                    # up being incorrect. This impacts ~3% of documents.
                    missed_pars += 1
                    continue

                yield Paragraph(lang, full_pars[i], score)
                found_pars += 1
        if missed_pars > 0:
            logging.warning(
                f"Missed {missed_pars} ({missed_pars / found_pars:%}) paragraphes."
            )
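For reference, a minimal sketch of the Paragraph record yielded above, reconstructed only from the call Paragraph(lang, full_pars[i], score); the real field names may differ. A hedged usage line follows.

from typing import NamedTuple

# Hypothetical reconstruction; field names are guesses based on the arguments.
class Paragraph(NamedTuple):
    lang: str
    text: str
    lm_score: float


# Hedged usage sketch: stream paragraphs from shard 0 of the 2018-51 snapshot
# (snapshot id appears in the code above; the shard number is illustrative).
for par in _dl_shard("2018-51", 0):
    print(par.lang, par.lm_score, par.text[:80])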