def make_corpus(file: Path, tags_file: Path = None, output: Path = None) -> None:
    """
    Loads a tags file and creates a training dataset using the given webpages.

    Arguments:
        - file: CC shard file
        - tags_file: dmoz tagging file (like the one produced by `dl`)
        - output: ""
    """
    url2tags = load_tags(tags_file)
    with jsonql.smart_open(file) as f, jsonql.smart_open(output, "w") as o:
        for document in jsonql.read_jsons(f):
            if not document:
                continue
            url = document["url"]
            domain = document["source_domain"]

            if url in url2tags:
                tags = url2tags[url]
            elif domain in url2tags:
                tags = url2tags[domain]
            else:
                continue

            if len(tags) == 0:
                continue

            fasttext_tags = ["__label__" + tag for tag in tags]
            content = document["tokenized"].replace("\n", " ").lower()
            if len(content) > 200:
                print(" ".join(fasttext_tags), content, file=o)  # type: ignore

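# Hypothetical follow-up (not part of the source): the corpus written above uses the
# fastText supervised format ("__label__tag ... text"), so it can be fed directly to
# fastText. The function name and hyper-parameters below are illustrative only.
def train_dmoz_classifier(corpus: Path):
    import fasttext  # assumes the `fasttext` pip package is installed

    # train_supervised expects one "__label__... text" example per line.
    return fasttext.train_supervised(input=str(corpus), epoch=5, wordNgrams=2)
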
def test_blocked_gzip(tmp_path):
    file = tmp_path / "test.gz"
    # Each object is 10/11 bytes long. We have 2 of them per block.
    content = [f'{{"xx": {i}}}' for i in range(80)]
    with jsonql.BlockedGzipWriter(file, "wt", block_size="20B") as o:
        for line in content:
            print(line, file=o)

    with jsonql.JsonReader(strict=True) as jr:
        with jsonql.smart_open(file) as f:
            read_as_one_file = list(jr.map(f))
        expected = list(jr.map(content))
        assert expected == read_as_one_file

        with jsonql.smart_open(str(file) + "[0/40]") as f:
            reader = list(f)
        assert expected[:2] == list(jr.map(l for l in reader))

        with jsonql.smart_open(str(file) + "[39/40]") as f:
            reader = list(f)
        assert expected[-2:] == list(jr.map(l for l in reader))

        readers = jsonql.get_block_readers(file, 9)
        read_as_several_files = [list(jr.map(r)) for r in readers]
        # 40 splits of 2 docs, 9 readers -> 5 splits, 10 docs per reader
        assert list(jsonql.grouper(expected, 10)) == read_as_several_files

def open_segment(self, segment: str) -> ContextManager[Iterable[str]]:
    url = "/".join((WET_URL_ROOT, segment))
    if not self.cache_dir:
        self.retrieved_segments += 1
        return jsonql.open_remote_file(url)

    file = self.cache_dir / segment.split("/")[-1]
    if not file.exists():
        self.retrieved_segments += 1
        # TODO: make this write thread-safe.
        # Create a different tmp file for each process to avoid collisions.
        h = hex(hash(file))[2:10]
        tmp = file.with_name(f"tmp_{h}." + file.name)
        content = jsonql.request_get_content(url)
        tmp.write_bytes(content)
        # Don't overwrite a file that might be being read by another process.
        if not file.exists():
            shutil.move(tmp, file)
        else:
            tmp.unlink()
        # Read from memory if possible.
        f = gzip.open(io.BytesIO(content), mode="rt")
        return f

    return jsonql.smart_open(file)

def check_regroup(tmp_path, regroup_fn, check_blocks_boundaries=False):
    n_shards = 4
    n_docs = 20
    shards = [
        [dict(id=i, shard=s, raw_content="hello world") for i in range(n_docs)]
        for s in range(n_shards)
    ]
    shards_files = [tmp_path / f"{s:04d}.json.gz" for s in range(n_shards)]
    for shard, shard_file in zip(shards, shards_files):
        jsonql.run_pipes(file=iter(shard), output=shard_file)
    regroup_file = tmp_path / "regroup.json.gz"
    start = time.time()
    regroup_fn(shards_files, regroup_file)
    duration = time.time() - start
    print(f"{regroup_fn.__module__}.{regroup_fn.__name__} took {duration}s")

    with jsonql.smart_open(regroup_file) as f:
        regrouped = list(jsonql.read_jsons(f))
    assert [doc for shard in shards for doc in shard] == regrouped

    readers = jsonql.get_block_readers(regroup_file, n_shards)
    if not check_blocks_boundaries:
        assert [doc for shard in shards for doc in shard] == [
            doc for reader in readers for doc in jsonql.read_jsons(reader)
        ]
        return

    for shard, reader in zip(shards, readers):
        block = [doc for doc in jsonql.read_jsons(reader)]
        assert shard == block

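# A minimal regroup_fn sketch (an assumption, not the project's implementation): it
# simply concatenates the documents of every shard into one output file with
# jsonql.run_pipes, mirroring how the shards are written above. It is meant for the
# check_blocks_boundaries=False case; it makes no promise about where gzip block
# boundaries fall in the regrouped file.
def naive_regroup(shards_files, output):
    def documents():
        for shard_file in shards_files:
            with jsonql.smart_open(shard_file) as f:
                yield from jsonql.read_jsons(f)

    jsonql.run_pipes(file=documents(), output=output)
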
def unminify_file(file: Union[Path, str], output: Path, cache_dir: Path = None):
    unminifier = Unminifier(cache_dir)
    with jsonql.smart_open(file) as f:
        mini = [m for m in jsonql.read_jsons(f)]
    unminifier.look_for(mini)

    tmp = output.with_name("tmp." + output.name)
    jsonql.run_pipes(unminifier, file=iter(mini), output=tmp)
    shutil.move(tmp, output)
    f_size = Path(file).stat().st_size if Path(file).exists() else 0
    o_size = output.stat().st_size
    mb = 1024 ** 2
    return f"Unminified {output} ({f_size // mb:_}Mb -> {o_size // mb:_}Mb)"

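# Illustrative call (the paths below are hypothetical, not from the source): rebuild
# full documents for one minified shard, caching downloaded WET segments so that
# retries do not re-download them.
#
#   print(unminify_file("minified/0000.json.gz", Path("unminified/0000.json.gz"),
#                       cache_dir=Path("wet_cache")))
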
def test_smart_open(self):
    tmp = self.get_tmpdir()

    def readlines(filename):
        with jsonql.smart_open(filename) as f:
            return list(jsonql.lines(f))

    with jsonql.smart_open(tmp("a.txt"), "w") as o:
        print("a", file=o)
    self.assertEqual(readlines(tmp("a.txt")), ["a"])

    # with jsonql.smart_open(tmp("a.json.gz"), "w") as o:
    #     print("a", file=o)
    # self.assertEqual(readlines(tmp("a.json.gz")), ["a"])

    with jsonql.smart_open([tmp("a0.txt"), tmp("a1.txt")], "w") as o:
        print("a", file=o)
    self.assertEqual(readlines(tmp("a0.txt")), ["a"])
    self.assertFalse(os.path.isfile(tmp("a1.txt")))

    with jsonql.smart_open([tmp("b0.txt"), tmp("b1.txt")], "w", max_size="1k") as o:
        print("0" * 2000, file=o)
        print("1" * 2000, file=o)
    self.assertEqual(readlines(tmp("b0.txt")), ["0" * 2000])
    self.assertEqual(readlines(tmp("b1.txt")), ["1" * 2000])

    with jsonql.smart_open(tmp("a_????.json"), "w") as o:
        print("a", file=o)
    self.assertEqual(readlines(tmp("a_0000.json")), ["a"])
    self.assertFalse(os.path.isfile(tmp("a_0001.json")))
    self.assertEqual(readlines(tmp("a_*.json")), ["a"])

    with jsonql.smart_open(tmp("b_??.json"), "w", max_size="1k") as o:
        print("0" * 2000, file=o)
        print("1" * 2000, file=o)
    self.assertEqual(readlines(tmp("b_00.json")), ["0" * 2000])
    self.assertEqual(readlines(tmp("b_01.json")), ["1" * 2000])
    self.assertEqual(readlines(tmp("b_*.json")), ["0" * 2000, "1" * 2000])

def _validate_test(conf: Config, generate: bool = False):
    stats: Dict[str, dict] = {}
    for file in sorted((conf.output_dir / "regroup" / conf.dump).glob("*.json.gz")):
        fname = f"regroup/{conf.dump}/{file.name}"
        with jsonql.smart_open(file) as lines:
            # The order of documents is not guaranteed inside a shard.
            content = "\n".join(sorted(lines))
        size = len(content)
        checksum = hashlib.sha1(bytes(content, encoding="utf-8")).hexdigest()
        stats[fname] = {"size": size, "checksum": checksum}

    print("*** Stats ***")
    print(json.dumps(stats, indent=2))
    stats_file = Path(__file__).parent / "data" / "test_stats.json"
    if generate:
        print("Saving stats to", stats_file)
        stats_file.write_text(json.dumps(stats, indent=2))
        return

    expected_stats: Dict[str, dict] = {}
    if stats_file.exists():
        expected_stats = json.loads(stats_file.read_text())

    if expected_stats == stats:
        print("Everything looks good !")
        return

    print("*** Expected Stats ***")
    print(json.dumps(expected_stats, indent=2))

    print("*** Diff ***")
    for fname in sorted(expected_stats.keys()):
        print(fname)
        # Compare against the freshly computed stats; fail if the file disappeared.
        assert fname in stats, "missing file " + fname
        if expected_stats[fname]["size"] != stats[fname]["size"]:
            print(
                " - Expected size",
                expected_stats[fname]["size"],
                ", size",
                stats[fname]["size"],
            )
        if expected_stats[fname]["checksum"] != stats[fname]["checksum"]:
            print(
                " - Expected checksum",
                expected_stats[fname]["checksum"],
                ", checksum",
                stats[fname]["checksum"],
            )

def segments(self) -> List[str]:
    if self._segments:
        return self._segments
    # code by ray: read wet.paths.gz from local disk instead of the remote URL.
    segments_file = os.path.join(data_dir, self.dump + "wet.paths.gz")
    with jsonql.smart_open(segments_file) as f:
        segments = [segment.strip() for segment in f]
    n = len(segments)
    i_min = (self.shard * n) // self.num_shards
    i_max = ((self.shard + 1) * n) // self.num_shards
    if self.num_segments_per_shard > 0:
        i_max = min(i_max, i_min + self.num_segments_per_shard)
    self._segments = segments[i_min:i_max]
    return self._segments

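# Worked example of the shard slicing above, with illustrative numbers (not from the
# source): 100 segments split across 8 shards gives shard 2 the range [25, 37).
def _shard_slice(n: int, shard: int, num_shards: int) -> range:
    i_min = (shard * n) // num_shards
    i_max = ((shard + 1) * n) // num_shards
    return range(i_min, i_max)

# _shard_slice(100, 2, 8) -> range(25, 37), i.e. 12 segments for this shard; the
# union of all shards covers every segment exactly once. num_segments_per_shard then
# only clamps i_max for quick partial runs.
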
def _dump_sentence_hashes(source: Path, output: Path, field: str):
    treated = 0
    started = time.time()
    with jsonql.smart_open(source, "r") as f, open(output, "wb") as o:
        for doc in jsonql.read_jsons(f):
            content = doc.get(field)
            if not content:
                continue
            h = compute_hashes(content)
            if h is None:
                continue
            h.tofile(o)
            treated += 1
            if treated % 100_000 == 0:
                delay = time.time() - started
                log(
                    f"Computed {treated} document hashes in {delay / 3600:.2f}h ({treated / delay} doc / s)"
                )

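# Reading the dump back (a sketch under an assumption): compute_hashes returns a numpy
# array, and sentence hashes are assumed to be stored as unsigned 64-bit integers;
# adjust hash_dtype if compute_hashes uses a different dtype.
def _load_sentence_hashes(path: Path, hash_dtype=np.uint64) -> np.ndarray:
    # The file is a raw concatenation of h.tofile(o) calls, so a single fromfile
    # recovers all hashes in document order.
    return np.fromfile(path, dtype=hash_dtype)
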
def perplexity_to_bin(file: Path, output: Path, models, tok_field: str):
    pp_field = "perplexity"
    lm = DocLM(models, tok_field, output_field=pp_field)
    stats: List[float] = []
    max_stats = 1_000_000
    batch_size = 100_000
    i = 0
    batch = []
    with jsonql.smart_open(file) as f, open(output, "wb") as o:
        for doc in jsonql.read_jsons(f):
            i += 1
            pp = lm(doc)[pp_field]
            if len(stats) < max_stats:
                stats.append(pp)
            batch.append(pp)
            if len(batch) >= batch_size:
                np.array(batch, dtype=np.float32).tofile(o)
                batch = []
        if len(batch) > 0:
            np.array(batch, dtype=np.float32).tofile(o)

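# Downstream sketch (not from the source): the file written above is a flat stream of
# float32 perplexities, so percentile cut-offs for bucketing documents can be computed
# with numpy. The function name and number of bins are illustrative.
def perplexity_cutoffs(pp_file: Path, n_bins: int = 3) -> np.ndarray:
    pp = np.fromfile(pp_file, dtype=np.float32)
    # Interior percentiles only, e.g. [33.3, 66.7] for 3 bins.
    quantiles = np.linspace(0, 100, n_bins + 1)[1:-1]
    return np.percentile(pp, quantiles)
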
def open_segment(self, segment: str) -> ContextManager[Iterable[str]]:
    url = "/".join((WET_URL_ROOT, segment))
    if not self.cache_dir:
        self.retrieved_segments += 1
        return jsonql.open_remote_file(url)

    file = self.cache_dir / segment.split("/")[-1]
    if not file.exists():
        self.retrieved_segments += 1
        tmp = file.with_name(f"tmp_{os.getpid()}." + file.name)
        content = jsonql.request_get_content(url)
        tmp.write_bytes(content)
        # Don't overwrite a file that might be being read by another process.
        if not file.exists():
            shutil.move(tmp, file)
        else:
            tmp.unlink()
        # Read from memory if possible.
        return gzip.open(io.BytesIO(content), mode="rt")

    return jsonql.smart_open(file)

def __iter__(self) -> Iterator[dict]:
    n = len(self.segments)
    for i, segment in enumerate(self.segments):
        start = time.time()
        # TODO: start downloading the next segment in the background
        # code by ray: keep only the file name from the wet.paths entry and read the
        # segment from local disk rather than from the remote URL.
        segment = segment.split("/")[-1]
        segment_file = os.path.join(data_dir, self.dump, segment)
        with jsonql.smart_open(segment_file) as f:
            for doc in parse_warc_file(iter(f), self.min_len):
                doc["cc_segment"] = segment
                yield doc

        if i + 1 >= n:
            continue
        end = time.time()
        delay = (end - start) / 3600 * (n - 1 - i)
        logger.info(
            f"Parsed {i + 1} / {n} files. Estimated remaining time: {delay:.1f}h"
        )
