예제 #1
0
파일: dedup.py 프로젝트: zl827154659/cc_net
 def __init__(self,
              field: str,
              output: Path = None,
              hashes: AbstractDedupHashSet = None):
     """Set up the collector.

     Args:
         field: name of the document field this instance works on.
         output: optional path stored on the instance as `self.output`.
         hashes: optional pre-existing hash set to reuse; a fresh
             `FlatHashSet` is created when omitted.
     """
     super().__init__()
     self.field = field
     self.output = output
     # Counter starting at zero; updated outside of this method.
     self.n_lines = 0
     if hashes is None:
         hashes = FlatHashSet()
     self.hashes = hashes
     # Size of the hash set at construction time.
     self.num_hashes_start = len(hashes)
예제 #2
0
def test_dedup_with_np_dump(tmp_path: Path):
    """Hashes dumped by `HashesCollector` can be read back with `load_np`."""
    hashes = tmp_path / "hashes.bin"
    documents = [
        {"text": text("_Hello", "_World", "I'm so original")},
        {"text": text("_world", "I'm originaler", "_Hello")},
    ]
    with dedup.HashesCollector(field="text", output=hashes) as collector:
        # Consume the whole result of `map` so every document is processed.
        list(collector.map(documents))

    reloaded = FlatHashSet()
    reloaded.load_np(hashes)

    # Lower-cased lines of both documents, each listed once.
    unique_lines = ["_hello", "_world", "i'm so original", "i'm originaler"]
    expected = {str_hash(line) for line in unique_lines}
    assert expected == set(reloaded.keys())
예제 #3
0
    def test_dedup_with_np_dump(self):
        """Hashes dumped by `HashesCollector` can be read back with `load_np`."""
        tmp = self.get_tmpdir()

        documents = [
            {"text": text("_Hello", "_World", "I'm so original")},
            {"text": text("_world", "I'm originaler", "_Hello")},
        ]
        collector_cm = dedup.HashesCollector(field="text",
                                             output=tmp("hashes.bin"))
        with collector_cm as collector:
            # Consume the whole result of `map` so every document is processed.
            list(collector.map(documents))

        reloaded = FlatHashSet()
        reloaded.load_np(tmp("hashes.bin"))

        # Lower-cased lines of both documents, each listed once.
        unique_lines = [
            "_hello", "_world", "i'm so original", "i'm originaler"
        ]
        expected = {str_hash(line) for line in unique_lines}
        self.assertEqual(expected, set(reloaded.keys()))
예제 #4
0
파일: dedup.py 프로젝트: jsedoc/cc_net
class HashesCollector(jsonql.Transformer):
    """
    Accumulate, in a single hash set, the hashes computed from the `field`
    of every document passed to `do`, and optionally dump them on `close`.
    """

    parallelisable = False

    def __init__(self,
                 field: str,
                 output: Path = None,
                 hashes: AbstractDedupHashSet = None):
        """
        Args:
            field: name of the document field to hash.
            output: where `close` dumps the hashes; nothing is dumped when unset.
            hashes: existing hash set to extend; a new `FlatHashSet` otherwise.
        """
        super().__init__()
        self.field = field
        self.output = output
        self.n_lines = 0
        self.num_hashes_end = 0
        if hashes is None:
            hashes = FlatHashSet()
        self.hashes = hashes
        # Hashes already present before this collector added anything.
        self.num_hashes_start = len(hashes)

    def summary(self) -> List[str]:
        """Extend the parent summary with hash, line and memory figures."""
        summ = super().summary()
        # After `close` the set is released, so fall back to the saved size.
        if self.hashes is None:
            total = self.num_hashes_end
        else:
            total = len(self.hashes)
        h = (total - self.num_hashes_start) // 1000
        max_mem = mem_footprint_gb()
        n = self.n_lines // 1000
        summ.append(
            f"Found {h:_}k unique hashes over {n:_}k lines. Using {max_mem:.1f}GB of RAM."
        )
        return summ

    def do(self, doc: dict) -> None:
        """Add the hashes of `doc[self.field]` to the set; returns nothing."""
        doc_hashes = compute_hashes(doc.get(self.field))
        if doc_hashes is not None:
            self.hashes.add(doc_hashes)
            self.n_lines += doc_hashes.size

    def close(self):
        """Dump the hashes to `self.output` and release them.

        Nothing happens when no output is configured or when `self.hashes`
        is falsy (already released, or - presumably - empty).
        """
        if not (self.output and self.hashes):
            return
        self.hashes.dump(self.output)
        self.log(f"Saved {len(self.hashes)} hashes to {self.output}")
        # Remember the final size: `summary` needs it once the set is gone.
        self.num_hashes_end = len(self.hashes)
        # Drop the reference so the memory can be reclaimed even if this
        # transformer object itself is still referenced elsewhere.
        self.hashes = None  # type: ignore
예제 #5
0
파일: dedup.py 프로젝트: jsedoc/cc_net
    def _prepare(self):
        """Load every file of `self.hashes_files` into `self.duplicates`.

        Does nothing when `self.duplicates` is already set. The time and
        memory footprint are logged after each file and once at the end.
        """
        if self.duplicates is not None:
            return
        self.duplicates = FlatHashSet()

        start = time.time()
        for h in self.hashes_files:
            shard_start = time.time()
            self.duplicates.load(str(h))
            delay = time.time() - shard_start
            # Use `.1f`: a bare `.1` precision on a float keeps a single
            # significant digit and switches to exponent notation
            # (e.g. 12.3 is rendered as "1e+01").
            self.log(
                f"Loaded hashes from {h} ({mem_footprint_gb():.3f}GB total, took {delay / 60:.1f}m)"
            )

        delay = time.time() - start
        self.log(
            f"Loaded {len(self.duplicates):_d} hashes from {len(self.hashes_files)} files. ({mem_footprint_gb():.1f}GB total, took {delay / 60:.1f}m)"
        )
예제 #6
0
파일: dedup.py 프로젝트: zl827154659/cc_net
def deduplicate_concatenated(files,
                             outputs,
                             field,
                             output_hashes,
                             finalize=True):
    """Deduplicate several files at once, using the same set of hashes for all.

    Args:
        files: input files, processed in order.
        outputs: output files, one per input (same length as `files`).
        field: document field forwarded to `deduplicate`.
        output_hashes: if truthy, path where the shared hash set is dumped
            once all files have been processed.
        finalize: forwarded to `deduplicate`.
    """
    hashes = FlatHashSet()
    dedup_kwargs = dict(
        field=field,
        hashes=hashes,
        add_hashes=True,
        # The shared set is dumped below, not by each `deduplicate` run.
        output_hashes=None,
        finalize=finalize,
    )

    assert len(files) == len(outputs)
    for f, o in zip(files, outputs):
        jsonql.run_pipe(deduplicate, dedup_kwargs, file=f, output=o)
        log(f"Saw {len(hashes)} hashes.")

    # Dump once, after the last file: dumping inside the loop rewrote the
    # whole (growing) set to the same path after every single file.
    if output_hashes:
        log(f"Dumping {len(hashes)} hashes to {output_hashes}.")
        hashes.dump(output_hashes)
예제 #7
0
파일: dedup.py 프로젝트: jsedoc/cc_net
def merge_shard(hash_files, output):
    """Merge all `hash_files` into one hash set and dump it to `output`.

    The first file seeds the set; each remaining file is folded in with
    `merge`, printing the running size after every step.
    """
    seed_file = hash_files[0]
    remaining_files = hash_files[1:]

    h = FlatHashSet()
    h.load(seed_file)
    for hash_file in remaining_files:
        # `output=None`: intermediate results stay in memory only.
        h = merge(h, hash_file, output=None)
        print(f"Merged {hash_file}. We now have {len(h)} hashes.")

    h.dump(output)
    print(f"Saved {len(h)} hashes to {output}.")
예제 #8
0
파일: dedup.py 프로젝트: jsedoc/cc_net
def _load_hash_set(hashes):
    """Return `hashes` as a hash set, loading it from disk when it is a path."""
    if isinstance(hashes, (str, Path)):
        loaded = FlatHashSet()
        loaded.load(str(hashes))
        return loaded
    return hashes


def merge(hashes_1, hashes_2, output):
    """Merge `hashes_2` into `hashes_1` and return the merged set.

    Args:
        hashes_1: hash set, or path (`str` or `Path`) of a dumped hash set.
            When a hash set object is given it is modified in place.
        hashes_2: hash set, or path (`str` or `Path`) of a dumped hash set.
        output: if truthy, path where the merged set is dumped.

    Returns:
        The hash set built from `hashes_1`, updated with the keys of `hashes_2`.
    """
    h1 = _load_hash_set(hashes_1)
    h2 = _load_hash_set(hashes_2)

    h2_np = np.fromiter(h2.keys(), dtype=FlatHashSet.dtype, count=len(h2))
    # Call `__contains__` directly: the `in` operator would collapse the
    # result to a single bool, and an array-valued answer is needed here.
    dup = h1.__contains__(h2_np)

    # Dups between h1 and h2 will be set to 1, keys unique to h2 are copied to
    # h1 with their value.
    h1[h2_np] = dup
    if output:
        h1.dump(output)
    return h1
예제 #9
0
파일: dedup.py 프로젝트: jsedoc/cc_net
class DuplicatesRemover(jsonql.Transformer):
    """Drop, from the `field` of each document, the lines flagged as duplicates.

    The hash set is loaded from `hashes_files` by `_prepare`. Documents with no
    line left to keep are dropped entirely.
    """

    # The hashes can't be pickled so they will have to be read back from disk.
    warn_when_pickling = True

    def __init__(self,
                 field: str,
                 hashes_files: List[Path],
                 collect: bool = False):
        """
        Args:
            field: name of the document field to deduplicate.
            hashes_files: files loaded into the hash set by `_prepare`.
            collect: when True, `do` calls `add` on the hash set with each
                document's hashes instead of only looking them up.
        """
        super().__init__()
        self.field = field
        self.collect = collect

        self.hashes_files = hashes_files
        # Filled by `_prepare`.
        self.duplicates: Optional[AbstractDedupHashSet] = None

        self.n_lines, self.n_lines_kept = 0, 0
        self.n_chars, self.n_chars_kept = 0, 0

    def _prepare(self):
        """Load every file of `self.hashes_files` into `self.duplicates`.

        Does nothing when `self.duplicates` is already set.
        """
        if self.duplicates is not None:
            return
        self.duplicates = FlatHashSet()

        start = time.time()
        for h in self.hashes_files:
            shard_start = time.time()
            self.duplicates.load(str(h))
            delay = time.time() - shard_start
            # Use `.1f`: a bare `.1` precision on a float keeps a single
            # significant digit and switches to exponent notation
            # (e.g. 12.3 is rendered as "1e+01").
            self.log(
                f"Loaded hashes from {h} ({mem_footprint_gb():.3f}GB total, took {delay / 60:.1f}m)"
            )

        delay = time.time() - start
        self.log(
            f"Loaded {len(self.duplicates):_d} hashes from {len(self.hashes_files)} files. ({mem_footprint_gb():.1f}GB total, took {delay / 60:.1f}m)"
        )

    def do(self, doc: dict) -> Optional[dict]:
        """Remove duplicated lines from `doc`.

        Returns:
            The document (passed through `finalize_doc`), or None when the
            field is empty/missing or when no line is kept.
        """
        content = doc.get(self.field)
        if not content:
            return None
        doc_hashes = compute_hashes(content)

        assert self.duplicates is not None
        seen = (self.duplicates.add(doc_hashes)
                if self.collect else self.duplicates[doc_hashes])
        # Keep the entries whose `seen` value compares below True.
        keep = seen < True
        kept = keep.sum()
        if kept == 0:
            # NOTE: documents dropped here are not included in the
            # n_lines / n_chars counters reported by `summary`.
            return None
        # Entries that are not kept get their hash multiplied by False (0).
        doc_hashes = doc_hashes * keep
        self.n_lines += keep.size
        self.n_lines_kept += kept
        chars, kept_chars = finalize_doc(doc, self.field, hashes=doc_hashes)
        self.n_chars += chars
        self.n_chars_kept += kept_chars
        return doc

    def summary(self) -> List[str]:
        """Extend the parent summary with throughput and kept lines/chars ratios."""
        summ = super().summary()
        end_time = time.time()
        n_lines_kept, n_lines, n_docs = self.n_lines_kept, self.n_lines, self.processed
        speed = n_docs / (end_time - self.start_time)
        summ.append(
            f"Processed {self.n_lines} lines in {n_docs} docs. [{speed:.1f} doc/s]"
        )
        selectivity = self.n_lines_kept / self.n_lines if n_lines else 0
        summ.append(
            f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")

        n_chars_kept, n_chars = self.n_chars_kept, self.n_chars
        selectivity = n_chars_kept / n_chars if n_chars else 0
        summ.append(
            f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")
        return summ
예제 #10
0
파일: dedup.py 프로젝트: jsedoc/cc_net
def remove_duplicates_sharded(
    files: List[Path],
    outputs: List[Path],
    hashes_dir: FilesOrDir,
    field: str,
    group_hashes: int = 1,
    tmp_dir: Path = None,
    min_len: int = 0,
):
    """Remove duplicates in several passes, when all hashes don't fit in RAM.

    Note: The current implementation is not doing a 'perfect' deduplication.
    If a hash appear exactly once in each shard of hashes it won't be detected
    as a duplicate. This can be fixed if hashes are fully dedup beforehand.

    Args:
        files: input files.
        outputs: output files, one per input (same length as `files`).
        hashes_dir: either a list of hash files, or a directory whose
            `.bin` files are used (in sorted order).
        field: document field to deduplicate.
        group_hashes: number of hash files loaded together for one pass.
        tmp_dir: parent directory for the temporary hash files; the
            system default is used when unset.
        min_len: in the multi-pass path, documents whose field is shorter
            than this are skipped when writing the outputs.
    """
    assert len(files) == len(outputs)

    if isinstance(hashes_dir, list):
        hashes_files = hashes_dir
    else:
        hashes_files = sorted(h for h in Path(hashes_dir).iterdir()
                              if h.suffix == ".bin")

    assert len(hashes_files) > 0, f"no hashes files found in: {hashes_dir}"

    if len(hashes_files) <= group_hashes:
        log(f"All hashes can be done in one pass, using DuplicatesRemover on {files}"
            )
        rm_dups = DuplicatesRemover(field, hashes_files)
        rm_dups._prepare()
        run_par((jsonql.run_pipes, (rm_dups, ), dict(file=f, output=o))
                for f, o in zip(files, outputs))
        return

    log(f"Starting deduplicate_sharded on {files}.")

    def finalize(source, dedup_hashes, min_len):
        """Yield the documents of `source`, finalized with their stored hashes."""
        n_chars, n_chars_kept = 0, 0
        with open(dedup_hashes, "rb") as hashes:
            for doc in jsonql.read_jsons(source):
                content = doc.get(field)
                # NOTE(review): skipped documents consume no hashes from the
                # file - confirm `_dump_sentence_hashes` skips the same ones.
                if not content or len(content) < min_len:
                    continue
                sentences = content.split("\n")
                # Read one hash per line of this document.
                doc_hashes = np.fromfile(hashes,
                                         dtype=HASH_TYPE,
                                         count=len(sentences))
                chars, kept_chars = finalize_doc(doc, field, doc_hashes)
                n_chars += chars
                n_chars_kept += kept_chars
                yield doc
        selectivity = n_chars_kept / n_chars if n_chars else 0
        log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")

    # The context manager removes the temporary directory even when one of
    # the passes raises, instead of only on the success path.
    with tempfile.TemporaryDirectory(
            dir=str(tmp_dir) if tmp_dir else None) as tmp_root:

        def tmp_files(i):
            """Temporary hash file of pass `i` for each input file."""
            return [
                Path(tmp_root) / (f.name.split(".")[0] + f".{i}.bin")
                for f in files
            ]

        last = tmp_files(0)
        run_par((_dump_sentence_hashes, (f, tmp, field), {})
                for f, tmp in zip(files, last))

        for i, group in enumerate(jsonql.grouper(hashes_files, group_hashes)):
            hashes = FlatHashSet()
            for h in group:
                hashes.load(h)
                log(f"Loaded {h}, up to {len(hashes)} hashes ({mem_footprint_gb()}GB)"
                    )

            intermediates = tmp_files(i + 1)
            # Remove hashes in parallel. Since modern OS have "copy-on-write" and
            # `hashes` is read-only, we will only have one version of it in RAM.
            run_par((_remove_duplicate_hashes, (hashes, f, tmp), {})
                    for f, tmp in zip(last, intermediates))
            # Force hashes to be freed, before we start allocating a new one.
            del hashes
            gc.collect()

            # The previous pass files are no longer needed.
            for tmp in last:
                os.remove(tmp)
            last = intermediates

        dedup_hashes = last
        run_par([(
            jsonql.run_pipe,
            (finalize, ),
            dict(kwargs=dict(dedup_hashes=h, min_len=min_len),
                 file=f,
                 output=o),
        ) for h, f, o in zip(dedup_hashes, files, outputs)])
예제 #11
0
    def test_remove_duplicates_sharded(self):
        """Run `remove_duplicates_sharded` over two shards and check both outputs."""
        data = self.get_tmpdir()
        part_0 = [["Hello", "_World", "I'm so original"]]
        write_docs(data("part_0.json"), part_0)
        part_1 = [["_Good morning", "_World", "I'm originaler"]]
        write_docs(data("part_1.json"), part_1)

        h = self.get_tmpdir()

        # Shard 0: every line of part_0, with "_world" added a second time.
        h0 = FlatHashSet()
        h0.add([str_hash(line.lower()) for doc in part_0 for line in doc])
        h0.add([str_hash("_world")])
        h0.dump(h("part_0.bin"))
        expected_h0 = {
            str_hash("hello"): False,
            str_hash("_world"): True,
            str_hash("i'm so original"): False,
        }
        self.assertEqual(expected_h0, as_dict(h0))

        # Shard 1: every line of part_1, with "_good morning" added a second time.
        h1 = FlatHashSet()
        h1.add([str_hash(line.lower()) for doc in part_1 for line in doc])
        h1.add([str_hash("_good morning")])
        h1.dump(h("part_1.bin"))
        expected_h1 = {
            str_hash("_good morning"): True,
            str_hash("_world"): False,
            str_hash("i'm originaler"): False,
        }
        self.assertEqual(expected_h1, as_dict(h1))

        res = self.get_tmpdir()
        # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
        dedup.remove_duplicates_sharded(
            files=[data("part_0.json"), data("part_1.json")],
            outputs=[res("part_0.json"), res("part_1.json")],
            field="text",
            hashes_dir=h(),
        )

        with open(res("part_0.json")) as o:
            lines = o.readlines()
            print(lines)
            results_0 = list(jsonql.read_jsons(lines))
        expected_0 = [{
            "text": text("Hello", "I'm so original"),
            "original_nlines": 3,
            "nlines": 2,
        }]
        assert_documents_equal(expected_0, results_0, ignoring=CUMBERSOME)

        with open(res("part_1.json")) as o:
            results_1 = [json.loads(raw) for raw in o.readlines()]
        # First pass removes "_world", second "_good morning".
        expected_1 = [{
            "text": text("I'm originaler"),
            "original_nlines": 3,
            "nlines": 1,
        }]

        assert_documents_equal(expected_1, results_1, ignoring=CUMBERSOME)
예제 #12
0
파일: dedup.py 프로젝트: zl827154659/cc_net
def deduplicate(source,
                field,
                hashes=None,
                output_hashes=None,
                add_hashes=True,
                finalize=True):
    """
    Removes duplicate lines found in the field `field` of the source documents.

    Finds duplicate lines based on the hashes. Either hashes can be computed when
    reading the documents or they can be loaded from a binary file.

    If `add_hashes` is set to False only the given hashes will be considered.
    This grants a better control on memory footprint.

    NOTE: this generator does many things at once (loading hashes, removing
    duplicates, finalizing documents, logging stats, dumping hashes).

    Args:
        source: documents source, read with `jsonql.read_jsons`.
        field: name of the document field to deduplicate.
        hashes: path (`str` or `Path`) of a hash file to load, an existing
            hash set to use directly, or None to start from an empty set.
        output_hashes: if truthy, where the hash set is dumped once the
            source is exhausted.
        add_hashes: whether the hashes of the documents are added to the set.
        finalize: whether `finalize_doc` is applied to each document.

    Yields:
        The documents that still have at least one line kept.
    """
    hash_field = field + "_hash"
    if isinstance(hashes, (str, Path)):
        seen = FlatHashSet()
        seen.load(hashes)
    elif hashes is not None:
        seen = hashes
    else:
        seen = FlatHashSet()
    log(f"Loaded {len(seen)} unique hashes.")
    n_doc = 0
    # Value of `n_doc` at the time of the previous stats report.
    n_doc_logged = 0
    batch_size = 100_000
    n_lines, n_lines_kept = 0, 0
    n_chars, n_chars_kept = 0, 0
    t = time.time()

    def log_stats(start_time, n_batch_docs):
        """Log throughput for the last `n_batch_docs` docs plus running totals."""
        elapsed = time.time() - start_time
        # The last report covers a partial batch, so the speed must use the
        # real number of documents; also avoid dividing by a zero duration.
        speed = n_batch_docs / elapsed if elapsed > 0 else 0.0

        if add_hashes:
            log(f"Saw {len(seen)} unique hashes over {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
                )
        else:
            log(f"Processed {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
                )
        max_mem = mem_footprint_gb()
        log(f"Used up to {max_mem:.1f}GB of RAM.")
        selectivity = n_lines_kept / n_lines if n_lines else 0
        log(f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")
        if finalize:
            selectivity = n_chars_kept / n_chars if n_chars else 0
            log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%})."
                )

    for doc in jsonql.read_jsons(source):
        n_doc += 1
        if n_doc % batch_size == 0:
            log_stats(t, n_doc - n_doc_logged)
            n_doc_logged = n_doc
            t = time.time()

        # Reuse the hashes stored in the document when present, otherwise
        # compute them. A distinct name avoids shadowing the `hashes` argument.
        doc_hashes = doc.get(hash_field) or compute_hashes(doc.get(field))
        if doc_hashes is None:
            continue
        if isinstance(doc_hashes, list):
            doc_hashes = np.array(doc_hashes, dtype=HASH_TYPE)

        # Call `__contains__` directly: the `in` operator would collapse the
        # result to a single bool.
        duplicate = seen.__contains__(doc_hashes)
        if add_hashes:
            seen.add(doc_hashes, duplicate)

        keep = duplicate < 1
        kept = keep.sum()
        # Entries that are not kept get their hash multiplied by False (0).
        doc_hashes = doc_hashes * keep
        doc[hash_field] = [int(x) for x in doc_hashes]
        n_lines += keep.size
        n_lines_kept += kept
        if finalize:
            chars, kept_chars = finalize_doc(doc, field)
            n_chars += chars
            n_chars_kept += kept_chars
        if kept > 0:
            yield doc

    log_stats(t, n_doc - n_doc_logged)

    if output_hashes:
        log(f"Dumping {len(seen)} hashes to {output_hashes}.")
        seen.dump(output_hashes)
예제 #13
0
파일: dedup.py 프로젝트: zl827154659/cc_net
 def close(self):
     """Dump the hashes to `self.output`, then reset `self.hashes`.

     Nothing happens when `self.output` or `self.hashes` is falsy.
     """
     if not (self.output and self.hashes):
         return
     self.hashes.dump(self.output)
     # Swap in a fresh set so the dumped one can be reclaimed even if this
     # transformer object is still referenced elsewhere.
     self.hashes = FlatHashSet()
예제 #14
0
def test_remove_duplicates_sharded(tmp_path: Path):
    """Run `remove_duplicates_sharded` over two shards and check both outputs."""
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)

    h = tmp_path / "hashes"
    h.mkdir()

    # Shard 0: every line of part_0, with "_world" added a second time.
    h0 = FlatHashSet()
    h0.add([str_hash(line.lower()) for doc in part_0 for line in doc])
    h0.add([str_hash("_world")])
    h0.dump(h / "part_0.bin")
    expected_h0 = {
        str_hash("hello"): False,
        str_hash("_world"): True,
        str_hash("i'm so original"): False,
    }
    assert expected_h0 == as_dict(h0)

    # Shard 1: every line of part_1, with "_good morning" added a second time.
    h1 = FlatHashSet()
    h1.add([str_hash(line.lower()) for doc in part_1 for line in doc])
    h1.add([str_hash("_good morning")])
    h1.dump(h / "part_1.bin")
    expected_h1 = {
        str_hash("_good morning"): True,
        str_hash("_world"): False,
        str_hash("i'm originaler"): False,
    }
    assert expected_h1 == as_dict(h1)

    res = tmp_path / "res"
    res.mkdir()
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data / "part_0.json", data / "part_1.json"],
        outputs=[res / "part_0.json", res / "part_1.json"],
        field="text",
        hashes_dir=h,
    )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [{
        "text": text("Hello", "I'm so original"),
        "original_nlines": 3,
        "nlines": 2,
        "line_ids": [0, 2],
    }]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    # First pass removes "_world", second "_good morning".
    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [{
        "text": text("I'm originaler"),
        "original_nlines": 3,
        "nlines": 1,
        "line_ids": [2],
    }]

    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)