def check_regroup(tmp_path, regroup_fn, check_blocks_boundaries=False):
    n_shards = 4
    n_docs = 20
    shards = [
        [dict(id=i, shard=s, raw_content="hello world") for i in range(n_docs)]
        for s in range(n_shards)
    ]
    shards_files = [tmp_path / f"{s:04d}.json.gz" for s in range(n_shards)]
    for shard, shard_file in zip(shards, shards_files):
        jsonql.run_pipes(inputs=shard, output=shard_file)
    regroup_file = tmp_path / "regroup.json.gz"
    start = time.time()
    regroup_fn(shards_files, regroup_file)
    duration = time.time() - start
    print(f"{regroup_fn.__module__}.{regroup_fn.__name__} took {duration}s")

    regrouped = list(jsonql.read_jsons(regroup_file))
    assert [doc for shard in shards for doc in shard] == regrouped

    readers = jsonql.get_block_readers(regroup_file, n_shards)
    if not check_blocks_boundaries:
        assert [doc for shard in shards for doc in shard] == [
            doc for reader in readers for doc in jsonql.read_jsons(reader)
        ]
        return

    for shard, reader in zip(shards, readers):
        block = [doc for doc in jsonql.read_jsons(reader)]
        assert shard == block

def test_remove_duplicates_sharded(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)

    h = tmp_path / "hashes"
    h.mkdir()
    h0 = FlatHashSet()
    h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
    h0.add([str_hash("_world")])
    h0.dump(h / "part_0.bin")
    assert {
        str_hash("hello"): False,
        str_hash("_world"): True,
        str_hash("i'm so original"): False,
    } == as_dict(h0)

    h1 = FlatHashSet()
    h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
    h1.add([str_hash("_good morning")])
    h1.dump(h / "part_1.bin")
    assert {
        str_hash("_good morning"): True,
        str_hash("_world"): False,
        str_hash("i'm originaler"): False,
    } == as_dict(h1)

    res = tmp_path / "res"
    res.mkdir()
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data / "part_0.json", data / "part_1.json"],
        outputs=[res / "part_0.json", res / "part_1.json"],
        field="text",
        hashes_dir=h,
    )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    # First pass removes "_world", second "_good morning".
    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(text=text("I'm originaler"), original_nlines=3, nlines=1, line_ids=[2])
    ]
    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)

def test_split(tmp_path: Path):
    data = [
        dict(text="Hello world", lang="en"),
        dict(text="Boujour les amis", lang="fr"),
        dict(text="Rock your boat", lang="en"),
    ]
    with jsonql.split(tmp_path / "{lang}.json") as split:
        list(split.map(data))
        summary = split.summary()
    assert "Found 2 splits." in summary

    en_docs = list(jsonql.read_jsons(tmp_path / "en.json"))
    assert [data[0], data[2]] == en_docs

    fr_docs = list(jsonql.read_jsons(tmp_path / "fr.json"))
    assert [data[1]] == fr_docs

def make_corpus(file: Path, tags_file: Path = None, output: Path = None) -> None:
    """
    Loads a tags file and creates a training dataset using the given webpages.

    Arguments:
        - file: CC shard file
        - tags_file: dmoz tagging file (like the one produced by `dl`)
        - output: file receiving the fastText-formatted training corpus
    """
    url2tags = load_tags(tags_file)
    with jsonql.smart_open(file) as f, jsonql.smart_open(output, "w") as o:
        for document in jsonql.read_jsons(f):
            if not document:
                continue
            url = document["url"]
            domain = document["source_domain"]

            if url in url2tags:
                tags = url2tags[url]
            elif domain in url2tags:
                tags = url2tags[domain]
            else:
                continue

            if len(tags) == 0:
                continue

            fasttext_tags = ["__label__" + tag for tag in tags]
            content = document["tokenized"].replace("\n", " ").lower()
            if len(content) > 200:
                print(" ".join(fasttext_tags), content, file=o)  # type: ignore

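# Hedged usage sketch (not part of the original code; the paths below are
# placeholders): build a fastText training corpus from one tokenized CC shard
# and a dmoz tag mapping such as the one produced by `dl`.
def _example_make_corpus() -> None:
    make_corpus(
        file=Path("cc_shard_0000.json.gz"),
        tags_file=Path("dmoz_tags.json.gz"),
        output=Path("dmoz_corpus.txt"),
    )
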
def test_dedup_fast(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)
    parts = [data / "part_0.json", data / "part_1.json"]

    res = tmp_path / "res"
    res.mkdir()
    h = tmp_path / "hashes.bin"
    field = "text"
    jsonql.run_pipes(dedup.HashesCollector(field, output=h), file=parts)
    for part in parts:
        jsonql.run_pipes(
            dedup.DuplicatesRemover(field, [h]), file=part, output=res / part.name
        )
        # The second, identical call re-processes the same input and overwrites
        # the output with the same content.
        jsonql.run_pipes(
            dedup.DuplicatesRemover(field, [h]), file=part, output=res / part.name
        )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(
            text=text("Good morning", "I'm originaler"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)

    words = [w for part in [part_0, part_1] for doc in part for w in doc]
    expected = {str_hash(s.lower()): s.startswith("_") for s in words}
    assert expected == load_hashes(h)

def test_split(self):
    tmp = self.get_tmpdir()
    data = [
        dict(text="Hello world", lang="en"),
        dict(text="Boujour les amis", lang="fr"),
        dict(text="Rock your boat", lang="en"),
    ]
    with jsonql.split(tmp("{lang}.json")) as split:
        list(split.map(data))
        summary = split.summary()
    self.assertIn("Found 2 splits.", summary)

    with open(tmp("en.json")) as f_en:
        en_docs = list(jsonql.read_jsons(f_en))
        self.assertEqual([data[0], data[2]], en_docs)

    with open(tmp("fr.json")) as f_fr:
        fr_docs = list(jsonql.read_jsons(f_fr))
        self.assertEqual([data[1]], fr_docs)

def unminify_file(file: Union[Path, str], output: Path, cache_dir: Path = None):
    unminifier = Unminifier(cache_dir)
    with jsonql.smart_open(file) as f:
        mini = [m for m in jsonql.read_jsons(f)]
    unminifier.look_for(mini)

    tmp = output.with_name("tmp." + output.name)
    jsonql.run_pipes(unminifier, file=iter(mini), output=tmp)
    shutil.move(tmp, output)
    f_size = Path(file).stat().st_size if Path(file).exists() else 0
    o_size = output.stat().st_size
    mb = 1024 ** 2
    return f"Unminified {output} ({f_size // mb:_}Mb -> {o_size // mb:_}Mb)"

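# Hedged usage sketch (not part of the original code; the paths are placeholders):
# rebuild full documents from a minified shard. The result is written through a
# "tmp." file and only moved into place once the pipeline has finished.
def _example_unminify_file() -> None:
    msg = unminify_file(
        "minified_0000.json.gz",
        Path("unminified_0000.json.gz"),
        cache_dir=Path("wet_cache"),
    )
    print(msg)
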
def test_blocked_gzip(tmp_path: Path):
    file = tmp_path / "test.gz"
    f = str(file)
    # Each object is 10/11 bytes long. We have two of them per block.
    content = ['{"xx": %d}' % i for i in range(80)]
    with jsonql.BlockedGzipWriter(file, "wt", block_size="20B") as o:
        for line in content:
            print(line, file=o)

    jr = jsonql.JsonReader(strict=True)
    expected = list(jr.map(content))
    # Read as one file.
    assert expected == list(jsonql.read_jsons(file))
    # Read the first block.
    assert expected[:2] == list(jsonql.read_jsons(f + "[0/40]"))
    # Read the last block.
    assert expected[-2:] == list(jsonql.read_jsons(f + "[39/40]"))

    readers = jsonql.get_block_readers(file, 9)
    read_as_several_files = [list(jsonql.read_jsons(r)) for r in readers]
    # 40 splits of 2 docs, 9 readers -> 5 splits, 10 docs per reader.
    assert list(jsonql.grouper(expected, 10)) == read_as_several_files

def extract_opening_text(source, n_docs: int = 10_000):
    i = 0
    for doc in jsonql.read_jsons(source):
        if not doc:
            continue
        text = doc.get("opening_text")
        if not text:
            continue

        yield text_normalizer.normalize(text)
        i += 1
        if i >= n_docs:
            break

def finalize(source, dedup_hashes, min_len):
    n_chars, n_chars_kept = 0, 0
    with open(dedup_hashes, "rb") as hashes:
        for doc in jsonql.read_jsons(source):
            content = doc.get(field)
            if not content or len(content) < min_len:
                continue
            sentences = content.split("\n")
            doc_hashes = np.fromfile(hashes, dtype=HASH_TYPE, count=len(sentences))
            chars, kept_chars = finalize_doc(doc, field, doc_hashes)
            n_chars += chars
            n_chars_kept += kept_chars
            yield doc

    selectivity = n_chars_kept / n_chars if n_chars else 0
    log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")

def _dump_sentence_hashes(source: Path, output: Path, field: str):
    treated = 0
    started = time.time()
    with open(output, "wb") as o:
        for doc in jsonql.read_jsons(source):
            content = doc.get(field)
            if not content:
                continue
            h = compute_hashes(content)
            if h is None:
                continue
            h.tofile(o)
            treated += 1
            if treated % 100_000 == 0:
                delay = time.time() - started
                log(
                    f"Computed {treated} documents hashes in {delay / 3600:.2f}h ({treated / delay} doc / s)"
                )

def deduplicate_two_pass(
    file: jsonql.FileDescriptor, field: str = "raw_content"
) -> Iterable[dict]:
    """Remove duplicates of the given file (even removing the first occurrence).

    This is what is done in the paper, and in mine.py
    """
    try:
        if isinstance(file, Path):
            hash_file: Path = file.with_suffix(".bin")
        else:
            hash_file = jsonql._tmp(Path("hashes.bin"))
        jsonql.run_pipes(
            jsonql.JsonReader(), HashesCollector(field, output=hash_file), file=file
        )
        dup_remover = DuplicatesRemover(field, [hash_file])
        return dup_remover.map(jsonql.read_jsons(file))
    finally:
        if hash_file.exists():
            hash_file.unlink()

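# Hedged usage sketch (not part of the original code; the file name is a
# placeholder): deduplicate_two_pass first collects every line hash into a side
# ".bin" file, then drops all duplicated lines, including their first occurrence.
def _example_deduplicate_two_pass() -> None:
    kept = list(deduplicate_two_pass(Path("shard_0000.json.gz"), field="raw_content"))
    print(f"Kept {len(kept)} documents after two-pass deduplication")
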
def fetch_metadata(self, segment: str) -> None:
    meta_file = self.meta_file(segment)
    k = get_doc_key
    self.metadata = {}
    collision = 0
    for m in jsonql.read_jsons(meta_file):
        key = k(m["digest"])
        if key in self.metadata:
            collision += 1
        self.metadata[key] = m

    self.log(f"Loaded {len(self.metadata)} metadatas from {meta_file}")
    if collision > 0:
        self._logger.warning(f"Found {collision} collisions !")

    self.segment = segment
    if segment in self._segments:
        self.log("Cache miss")
        self.segments_read_twice += 1
    self._segments.add(segment)

def perplexity_to_bin(file: Path, output: Path, models, tok_field: str):
    pp_field = "perplexity"
    lm = DocLM(models, tok_field, output_field=pp_field)
    stats: List[float] = []
    max_stats = 1_000_000
    batch_size = 100_000
    i = 0
    batch = []
    with jsonql.smart_open(file) as f, open(output, "wb") as o:
        for doc in jsonql.read_jsons(f):
            i += 1
            pp = lm(doc)[pp_field]
            if len(stats) < max_stats:
                stats.append(pp)
            batch.append(pp)
            if len(batch) >= batch_size:
                np.array(batch, dtype=np.float32).tofile(o)
                batch = []
        if len(batch) > 0:
            np.array(batch, dtype=np.float32).tofile(o)

def deduplicate(
    file: jsonql.ReadableFileLike, field: str = "raw_content"
) -> Iterable[dict]:
    """Remove duplicates of the given file (but keep the first occurrence)."""
    dup_remover = DuplicatesRemover(field, [], collect=True)
    return dup_remover.map(jsonql.read_jsons(file))

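# Hedged usage sketch (not part of the original code; the file name is a
# placeholder): the single-pass deduplicate builds its hash set on the fly, so
# the first occurrence of each line is kept and no hash file is written to disk.
def _example_deduplicate() -> None:
    for doc in deduplicate(Path("shard_0000.json.gz"), field="raw_content"):
        pass  # each yielded doc keeps first occurrences, later duplicate lines are dropped
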
def _dl_shard(snapshot: str, shard: int) -> Iterator[Paragraph]:
    """
    Download metadata from a shard.

    Sample metadata:

    {
        "cc_segment": "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz",
        "digest": "sha1:222LWNHN5FM26XGS7WJSMI6IISTVWBKJ",
        "url": "http://personals.gearplay.com/ads/DRJONES.htm",
        "line_ids": [10],
        "languages": ["en_XX"],
        "lm_scores": [-2.658],
    }
    """
    snapshot = snapshot.replace("-", "_")
    name = f"snap_{snapshot}_batch_{shard}.json.gz"

    url = "/".join([S3_BUCKET, VERSION, name])
    shard_metadata: Dict[str, Dict[str, dict]] = defaultdict(dict)
    try:
        cache_file: Optional[Path] = None
        if WET_CACHE is not None:
            cache_file = WET_CACHE / name
        metadata_file = jsonql.open_remote_file(url, cache_file)
    except Exception:
        logging.warning(f"Couldn't open {url}")
        return

    for meta in jsonql.read_jsons(metadata_file):
        shard_metadata[meta["cc_segment"]][meta["digest"]] = meta

    found_pars, missed_pars = 0, 0
    for seg, segment_metadata in shard_metadata.items():
        for doc in CCSegmentsReader([seg], cache_dir=WET_CACHE):
            if doc["digest"] not in segment_metadata:
                continue

            meta = segment_metadata[doc["digest"]]
            full_pars = [doc["title"]] + doc["raw_content"].split("\n")

            assert len(meta["line_ids"]) == len(meta["languages"])
            assert len(meta["line_ids"]) == len(meta["lm_scores"])
            for i, lang, score in zip(
                meta["line_ids"], meta["languages"], meta["lm_scores"]
            ):
                if snapshot != "2018-51" and lang in BIG_LANGUAGES:
                    # Big languages only come from the "2018-51" snapshot.
                    continue
                if i >= len(full_pars):
                    # This is because CC100 was created by saving only urls.
                    # Some urls appear in different snapshots with slightly
                    # different versions, but we don't know which one is correct.
                    # Here we read both versions, but some indices may end up
                    # being incorrect. This impacts ~3% of documents.
                    missed_pars += 1
                    continue

                yield Paragraph(lang, full_pars[i], score)
                found_pars += 1

    if missed_pars > 0:
        logging.warning(
            f"Missed {missed_pars} ({missed_pars / found_pars:%}) paragraphs."
        )

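# Hedged usage sketch (not part of the original code; the snapshot and shard
# index are illustrative): stream the reconstructed paragraphs for one metadata
# shard of a snapshot and count them.
def _example_dl_shard() -> None:
    n_paragraphs = sum(1 for _ in _dl_shard("2018-51", shard=0))
    print(f"Reconstructed {n_paragraphs} paragraphs from shard 0")
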
def deduplicate(
    source, field, hashes=None, output_hashes=None, add_hashes=True, finalize=True
):
    """
    DOES TOO MANY THINGS
    Removes duplicate lines found in the field `field` of the source documents.

    Finds duplicate lines based on the hashes. Hashes can either be computed
    while reading the documents or loaded from a binary file.

    If `add_hashes` is set to False, only the given hashes will be considered.
    This gives better control over the memory footprint.
    """
    hash_field = field + "_hash"
    if isinstance(hashes, str) or isinstance(hashes, Path):
        seen = FlatHashSet()
        seen.load(hashes)
    elif hashes is not None:
        seen = hashes
    else:
        seen = FlatHashSet()
    log(f"Loaded {len(seen)} unique hashes.")
    n_doc = 0
    batch_size = 100_000
    n_lines, n_lines_kept = 0, 0
    n_chars, n_chars_kept = 0, 0
    t = time.time()

    def log_stats(start_time):
        end_time = time.time()
        speed = batch_size / (end_time - start_time)
        if add_hashes:
            log(
                f"Saw {len(seen)} unique hashes over {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
            )
        else:
            log(f"Processed {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]")
        max_mem = mem_footprint_gb()
        log(f"Used up to {max_mem:.1f}GB of RAM.")
        selectivity = n_lines_kept / n_lines if n_lines else 0
        log(f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")
        if finalize:
            selectivity = n_chars_kept / n_chars if n_chars else 0
            log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")

    for doc in jsonql.read_jsons(source):
        n_doc += 1
        if n_doc % batch_size == 0:
            log_stats(t)
            t = time.time()

        hashes = doc.get(hash_field) or compute_hashes(doc.get(field))
        if hashes is None:
            continue
        if isinstance(hashes, list):
            hashes = np.array(hashes, dtype=HASH_TYPE)

        duplicate = seen.__contains__(hashes)
        if add_hashes:
            seen.add(hashes, duplicate)

        keep = duplicate < 1
        kept = keep.sum()
        hashes = hashes * keep
        doc[hash_field] = list(int(x) for x in hashes)
        n_lines += keep.size
        n_lines_kept += kept
        if finalize:
            chars, kept_chars = finalize_doc(doc, field)
            n_chars += chars
            n_chars_kept += kept_chars
        if kept > 0:
            yield doc

    log_stats(t)

    if output_hashes:
        log(f"Dumping {len(seen)} hashes to {output_hashes}.")
        seen.dump(output_hashes)

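# Hedged usage sketch (not part of the original code; the paths are placeholders):
# the legacy deduplicate generator above can reuse a precomputed FlatHashSet dump
# and persist the updated set for the next shard. The generator must be consumed
# for the stats to be logged and the hashes to be dumped.
def _example_legacy_deduplicate() -> None:
    docs = deduplicate(
        "shard_0000.json.gz",
        field="raw_content",
        hashes="hashes_0000.bin",
        output_hashes="hashes_0001.bin",
    )
    for _ in docs:
        pass
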
def test_remove_duplicates_sharded(self):
    data = self.get_tmpdir()
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data("part_0.json"), part_0)
    part_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data("part_1.json"), part_1)

    h = self.get_tmpdir()
    h0 = FlatHashSet()
    h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
    h0.add([str_hash("_world")])
    h0.dump(h("part_0.bin"))
    self.assertEqual(
        {
            str_hash("hello"): False,
            str_hash("_world"): True,
            str_hash("i'm so original"): False,
        },
        as_dict(h0),
    )

    h1 = FlatHashSet()
    h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
    h1.add([str_hash("_good morning")])
    h1.dump(h("part_1.bin"))
    self.assertEqual(
        {
            str_hash("_good morning"): True,
            str_hash("_world"): False,
            str_hash("i'm originaler"): False,
        },
        as_dict(h1),
    )

    res = self.get_tmpdir()
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data("part_0.json"), data("part_1.json")],
        outputs=[res("part_0.json"), res("part_1.json")],
        field="text",
        hashes_dir=h(),
    )

    with open(res("part_0.json")) as o:
        lines = o.readlines()
        print(lines)
        results_0 = list(jsonql.read_jsons(lines))
    expected_0 = [
        dict(text=text("Hello", "I'm so original"), original_nlines=3, nlines=2)
    ]
    assert_documents_equal(expected_0, results_0, ignoring=CUMBERSOME)

    with open(res("part_1.json")) as o:
        results_1 = [json.loads(l) for l in o.readlines()]
    # First pass removes "_world", second "_good morning".
    expected_1 = [dict(text=text("I'm originaler"), original_nlines=3, nlines=1)]
    assert_documents_equal(expected_1, results_1, ignoring=CUMBERSOME)