Example #1
def make_corpus(file: Path, tags_file: Path = None, output: Path = None) -> None:
    """
    Loads a tags file and creates a training dataset from the given webpages.

    Arguments:
        - file: CC shard file
        - tags_file: dmoz tagging file (like the one produced by `dl`)
        - output: file where the fastText training corpus is written
    """
    url2tags = load_tags(tags_file)
    with jsonql.smart_open(file) as f, jsonql.smart_open(output, "w") as o:
        for document in jsonql.read_jsons(f):
            if not document:
                continue
            url = document["url"]
            domain = document["source_domain"]

            if url in url2tags:
                tags = url2tags[url]
            elif domain in url2tags:
                tags = url2tags[domain]
            else:
                continue

            if len(tags) == 0:
                continue

            fasttext_tags = ["__label__" + tag for tag in tags]
            content = document["tokenized"].replace("\n", " ").lower()
            if len(content) > 200:
                print(" ".join(fasttext_tags), content, file=o)  # type: ignore
Example #2
def test_blocked_gzip(tmp_path):
    file = tmp_path / "test.gz"
    # Each object is 10-11 bytes long, so each 20-byte block holds 2 of them.
    content = [f'{{"xx": {i}}}' for i in range(80)]
    with jsonql.BlockedGzipWriter(file, "wt", block_size="20B") as o:
        for line in content:
            print(line, file=o)

    with jsonql.JsonReader(strict=True) as jr:
        with jsonql.smart_open(file) as f:
            read_as_one_file = list(jr.map(f))

        expected = list(jr.map(content))
        assert expected == read_as_one_file

        with jsonql.smart_open(str(file) + "[0/40]") as f:
            reader = list(f)
        assert expected[:2] == list(jr.map(l for l in reader))

        with jsonql.smart_open(str(file) + "[39/40]") as f:
            reader = list(f)
        assert expected[-2:] == list(jr.map(l for l in reader))

        readers = jsonql.get_block_readers(file, 9)
        read_as_several_files = [list(jr.map(r)) for r in readers]
        # 40 splits of 2 docs, 9 readers -> 5 splits, 10 docs per reader
        assert list(jsonql.grouper(expected, 10)) == read_as_several_files
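The `[i/n]` suffix used above lets smart_open read the i-th of n gzip blocks instead of the whole file, and get_block_readers spreads the blocks across a fixed number of readers. A minimal sketch of both, assuming jsonql is importable as in these examples and that test.gz was written by BlockedGzipWriter as above:

from pathlib import Path

import jsonql  # assumed importable as in the examples on this page

file = Path("test.gz")  # hypothetical blocked-gzip file

# Read only the first of 40 gzip blocks instead of the whole file.
with jsonql.smart_open(str(file) + "[0/40]") as f:
    first_block = list(f)

# Spread the blocks over 9 readers, each yielding a contiguous run of lines.
readers = jsonql.get_block_readers(file, 9)
per_reader = [list(r) for r in readers]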
Example #3
    def open_segment(self, segment: str) -> ContextManager[Iterable[str]]:
        url = "/".join((WET_URL_ROOT, segment))
        if not self.cache_dir:
            self.retrieved_segments += 1
            return jsonql.open_remote_file(url)

        file = self.cache_dir / segment.split("/")[-1]
        if not file.exists():
            self.retrieved_segments += 1
            # TODO: make this write thread-safe.
            # create a different tmp file for each process to avoid collisions.
            h = hex(hash(file))[2:10]
            tmp = file.with_name(f"tmp_{h}." + file.name)
            content = jsonql.request_get_content(url)
            tmp.write_bytes(content)
            # don't overwrite a file that might be being read by another process.
            if not file.exists():
                shutil.move(tmp, file)
            else:
                tmp.unlink()
            # read from memory if possible
            f = gzip.open(io.BytesIO(content), mode="rt")
            return f

        return jsonql.smart_open(file)
Example #4
def check_regroup(tmp_path, regroup_fn, check_blocks_boundaries=False):
    n_shards = 4
    n_docs = 20
    shards = [
        [dict(id=i, shard=s, raw_content="hello world") for i in range(n_docs)]
        for s in range(n_shards)
    ]
    shards_files = [tmp_path / f"{s:04d}.json.gz" for s in range(n_shards)]
    for shard, shard_file in zip(shards, shards_files):
        jsonql.run_pipes(file=iter(shard), output=shard_file)

    regroup_file = tmp_path / "regroup.json.gz"
    start = time.time()
    regroup_fn(shards_files, regroup_file)
    duration = time.time() - start
    print(f"{regroup_fn.__module__}.{regroup_fn.__name__} took {duration}s")

    with jsonql.smart_open(regroup_file) as f:
        regrouped = list(jsonql.read_jsons(f))

    assert [doc for shard in shards for doc in shard] == regrouped

    readers = jsonql.get_block_readers(regroup_file, n_shards)
    if not check_blocks_boundaries:
        assert [doc for shard in shards for doc in shard] == [
            doc for reader in readers for doc in jsonql.read_jsons(reader)
        ]
        return

    for shard, reader in zip(shards, readers):
        block = list(jsonql.read_jsons(reader))
        assert shard == block
Example #5
def unminify_file(file: Union[Path, str], output: Path, cache_dir: Path = None):
    unminifier = Unminifier(cache_dir)
    with jsonql.smart_open(file) as f:
        mini = list(jsonql.read_jsons(f))
    unminifier.look_for(mini)

    tmp = output.with_name("tmp." + output.name)
    jsonql.run_pipes(unminifier, file=iter(mini), output=tmp)
    shutil.move(tmp, output)
    f_size = Path(file).stat().st_size if Path(file).exists() else 0
    o_size = output.stat().st_size
    mb = 1024 ** 2
    return f"Unminified {output} ({f_size // mb:_}Mb -> {o_size // mb:_}Mb)"
Example #6
    def test_smart_open(self):
        tmp = self.get_tmpdir()

        def readlines(filename):
            with jsonql.smart_open(filename) as f:
                return list(jsonql.lines(f))

        with jsonql.smart_open(tmp("a.txt"), "w") as o:
            print("a", file=o)
        self.assertEqual(readlines(tmp("a.txt")), ["a"])

        # with jsonql.smart_open(tmp("a.json.gz"), "w") as o:
        #     print("a", file=o)
        # self.assertEqual(readlines(tmp("a.json.gz")), ["a"])

        with jsonql.smart_open([tmp("a0.txt"), tmp("a1.txt")], "w") as o:
            print("a", file=o)
        self.assertEqual(readlines(tmp("a0.txt")), ["a"])
        self.assertFalse(os.path.isfile(tmp("a1.txt")))

        with jsonql.smart_open([tmp("b0.txt"), tmp("b1.txt")], "w", max_size="1k") as o:
            print("0" * 2000, file=o)
            print("1" * 2000, file=o)
        self.assertEqual(readlines(tmp("b0.txt")), ["0" * 2000])
        self.assertEqual(readlines(tmp("b1.txt")), ["1" * 2000])

        with jsonql.smart_open(tmp("a_????.json"), "w") as o:
            print("a", file=o)
        self.assertEqual(readlines(tmp("a_0000.json")), ["a"])
        self.assertFalse(os.path.isfile(tmp("a_0001.json")))
        self.assertEqual(readlines(tmp("a_*.json")), ["a"])

        with jsonql.smart_open(tmp("b_??.json"), "w", max_size="1k") as o:
            print("0" * 2000, file=o)
            print("1" * 2000, file=o)
        self.assertEqual(readlines(tmp("b_00.json")), ["0" * 2000])
        self.assertEqual(readlines(tmp("b_01.json")), ["1" * 2000])
        self.assertEqual(readlines(tmp("b_*.json")), ["0" * 2000, "1" * 2000])
Example #7
def _validate_test(conf: Config, generate: bool = False):
    stats: Dict[str, dict] = {}
    for file in sorted((conf.output_dir / "regroup" / conf.dump).glob("*.json.gz")):
        fname = f"regroup/{conf.dump}/{file.name}"
        with jsonql.smart_open(file) as lines:
            # The order of documents is not guaranteed inside a shard.
            content = "\n".join(sorted(lines))
            size = len(content)
            checksum = hashlib.sha1(bytes(content, encoding="utf-8")).hexdigest()
        stats[fname] = {"size": size, "checksum": checksum}

    print("*** Stats ***")
    print(json.dumps(stats, indent=2))
    stats_file = Path(__file__).parent / "data" / "test_stats.json"
    if generate:
        print("Saving stats to", stats_file)
        stats_file.write_text(json.dumps(stats, indent=2))
        return

    expected_stats: Dict[str, dict] = {}
    if stats_file.exists():
        expected_stats = json.loads(stats_file.read_text())

    if expected_stats == stats:
        print("Everything looks good !")
        return

    print("*** Expected Stats ***")
    print(json.dumps(expected_stats, indent=2))

    print("*** Diff ***")
    for fname in sorted(expected_stats.keys()):
        print(fname)
        assert fname in stats, "missing file " + fname
        if expected_stats[fname]["size"] != stats[fname]["size"]:
            print(
                "  - Expected size",
                expected_stats[fname]["size"],
                ", size",
                stats[fname]["size"],
            )
        if expected_stats[fname]["checksum"] != stats[fname]["checksum"]:
            print(
                "  - Expected checksum",
                expected_stats[fname]["checksum"],
                ", checksum",
                stats[fname]["checksum"],
            )
Example #8
    def segments(self) -> List[str]:
        if self._segments:
            return self._segments
        # code by ray: read wet.paths.gz from local disk instead of the remote URL
        segments_file = os.path.join(data_dir, self.dump + "wet.paths.gz")
        with jsonql.smart_open(segments_file) as f:
            segments = [segment.strip() for segment in f]
        n = len(segments)
        i_min = (self.shard * n) // self.num_shards
        i_max = ((self.shard + 1) * n) // self.num_shards
        if self.num_segments_per_shard > 0:
            i_max = min(i_max, i_min + self.num_segments_per_shard)
        self._segments = segments[i_min:i_max]
        return self._segments
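The slicing above gives each shard a contiguous, near-equal range of segments. A small worked example with hypothetical numbers:

# Worked example of the shard slicing above (hypothetical numbers):
# 10 segments split across 4 shards; shard 1 gets segments[2:5].
n, num_shards, shard = 10, 4, 1
i_min = (shard * n) // num_shards        # 2
i_max = ((shard + 1) * n) // num_shards  # 5
assert (i_min, i_max) == (2, 5)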
Example #9
def _dump_sentence_hashes(source: Path, output: Path, field: str):
    treated = 0
    started = time.time()
    with jsonql.smart_open(source, "r") as f, open(output, "wb") as o:
        for doc in jsonql.read_jsons(f):
            content = doc.get(field)
            if not content:
                continue
            h = compute_hashes(content)
            if h is None:
                continue
            h.tofile(o)
            treated += 1
            if treated % 100_000 == 0:
                delay = time.time() - started
                log(f"Computed {treated} documents hashes in {delay / 3600:.2f}h ({treated / delay} doc / s)"
                    )
Example #10
def perplexity_to_bin(file: Path, output: Path, models, tok_field: str):
    pp_field = "perplexity"
    lm = DocLM(models, tok_field, output_field=pp_field)
    stats: List[float] = []
    max_stats = 1_000_000
    batch_size = 100_000
    i = 0
    batch = []
    with jsonql.smart_open(file) as f, open(output, "wb") as o:
        for doc in jsonql.read_jsons(f):
            i += 1
            pp = lm(doc)[pp_field]
            if len(stats) < max_stats:
                stats.append(pp)
            batch.append(pp)
            if len(batch) >= batch_size:
                np.array(batch, dtype=np.float32).tofile(o)
                batch = []
        if len(batch) > 0:
            np.array(batch, dtype=np.float32).tofile(o)
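Because the batches above are written as raw float32 values with numpy's tofile, the resulting binary file can be read back with numpy.fromfile. A minimal sketch (the output path is hypothetical):

import numpy as np

# Load the perplexities written by perplexity_to_bin (hypothetical path).
pp = np.fromfile("perplexities.bin", dtype=np.float32)
# e.g. pick cutoffs that would split documents into equal-sized perplexity bins:
cutoffs = np.percentile(pp, [25, 50, 75])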
Example #11
    def open_segment(self, segment: str) -> ContextManager[Iterable[str]]:
        url = "/".join((WET_URL_ROOT, segment))
        if not self.cache_dir:
            self.retrieved_segments += 1
            return jsonql.open_remote_file(url)

        file = self.cache_dir / segment.split("/")[-1]
        if not file.exists():
            self.retrieved_segments += 1
            tmp = file.with_name(f"tmp_{os.getpid()}." + file.name)
            content = jsonql.request_get_content(url)
            tmp.write_bytes(content)
            # don't overwrite a file that might be being read by another process.
            if not file.exists():
                shutil.move(tmp, file)
            else:
                tmp.unlink()
            # read from memory if possible
            return gzip.open(io.BytesIO(content), mode="rt")

        return jsonql.smart_open(file)
Example #12
    def __iter__(self) -> Iterator[dict]:
        n = len(self.segments)
        for i, segment in enumerate(self.segments):
            start = time.time()
            # TODO: start downloading the next segment in the background
            # code by ray: keep only the segment's file name from the wet.paths.gz entry
            # and read it from local disk rather than the remote URL
            segment = segment.split("/")[-1]
            segment_file = os.path.join(data_dir, self.dump, segment)
            with jsonql.smart_open(segment_file) as f:
                for doc in parse_warc_file(iter(f), self.min_len):
                    doc["cc_segment"] = segment
                    yield doc

            if i + 1 >= n:
                continue
            end = time.time()
            delay = (end - start) / 3600 * (n - 1 - i)
            logger.info(
                f"Parsed {i + 1} / {n} files. Estimated remaining time: {delay:.1f}h"
            )
Example #13
def readlines(filename):
    with jsonql.smart_open(filename) as f:
        return list(jsonql.lines(f))