Esempio n. 1
0
def test_remove_duplicates_sharded(tmp_path: Path):
    """Check sharded duplicate removal against two hand-built hash files.

    Two single-document shards are written under ``tmp_path / "data"`` and one
    hash file per shard under ``tmp_path / "hashes"``. In each hash set one
    hash is added a second time; the assertions on ``as_dict`` expect that
    hash to map to ``True`` and the others to ``False``. After running
    ``dedup.remove_duplicates_sharded`` the flagged lines must be gone from
    the outputs.
    """
    data_dir = tmp_path / "data"
    shard_0 = [["Hello", "_World", "I'm so original"]]
    shard_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data_dir / "part_0.json", shard_0)
    write_docs(data_dir / "part_1.json", shard_1)

    hashes_dir = tmp_path / "hashes"
    hashes_dir.mkdir()

    # Shard 0: "_world" is added twice so that it is the flagged hash.
    hashes_0 = FlatHashSet()
    hashes_0.add([str_hash(line.lower()) for doc in shard_0 for line in doc])
    hashes_0.add([str_hash("_world")])
    hashes_0.dump(hashes_dir / "part_0.bin")
    expected_flags_0 = {
        str_hash("hello"): False,
        str_hash("_world"): True,
        str_hash("i'm so original"): False,
    }
    assert expected_flags_0 == as_dict(hashes_0)

    # Shard 1: "_good morning" is added twice so that it is the flagged hash.
    hashes_1 = FlatHashSet()
    hashes_1.add([str_hash(line.lower()) for doc in shard_1 for line in doc])
    hashes_1.add([str_hash("_good morning")])
    hashes_1.dump(hashes_dir / "part_1.bin")
    expected_flags_1 = {
        str_hash("_good morning"): True,
        str_hash("_world"): False,
        str_hash("i'm originaler"): False,
    }
    assert expected_flags_1 == as_dict(hashes_1)

    out_dir = tmp_path / "res"
    out_dir.mkdir()
    shard_names = ["part_0.json", "part_1.json"]
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data_dir / name for name in shard_names],
        outputs=[out_dir / name for name in shard_names],
        field="text",
        hashes_dir=hashes_dir,
    )

    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    results_0 = list(jsonql.read_jsons(out_dir / "part_0.json"))
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    # First pass removes "_world", second "_good morning".
    expected_1 = [
        dict(text=text("I'm originaler"), original_nlines=3, nlines=1, line_ids=[2])
    ]
    results_1 = list(jsonql.read_jsons(out_dir / "part_1.json"))
    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)
Esempio n. 2
0
class HashesCollector(jsonql.Transformer):
    """
    Accumulate the hashes of the lines found in the `field` of the source documents.

    The hashes are stored in `hashes` (a fresh `FlatHashSet` unless one is
    given) and, when `output` is set, dumped to that path on `close`.
    """

    parallelisable = False

    def __init__(self,
                 field: str,
                 output: Path = None,
                 hashes: AbstractDedupHashSet = None):
        super().__init__()
        self.field = field
        self.output = output
        self.n_lines = 0
        if hashes is None:
            hashes = FlatHashSet()
        self.hashes = hashes
        # Size of the set before any document is seen, so that `summary`
        # only reports the hashes added by this collector.
        self.num_hashes_start = len(self.hashes)
        self.num_hashes_end = 0

    def summary(self) -> List[str]:
        """Extend the parent summary with hash, line and memory statistics."""
        report = super().summary()
        if self.hashes is None:
            # The set was released by `close`; use the size recorded there.
            h = self.num_hashes_end
        else:
            h = len(self.hashes)
        h = (h - self.num_hashes_start) // 1000
        n = self.n_lines // 1000
        max_mem = mem_footprint_gb()
        report.append(
            f"Found {h:_}k unique hashes over {n:_}k lines. Using {max_mem:.1f}GB of RAM."
        )
        return report

    def do(self, doc: dict) -> None:
        """Add the hashes of `doc[self.field]` to the set; emits nothing."""
        doc_hashes = compute_hashes(doc.get(self.field))
        if doc_hashes is not None:
            self.hashes.add(doc_hashes)
            self.n_lines += doc_hashes.size

    def close(self):
        """Dump the hashes to `output` (if both are truthy) and release them."""
        if not (self.output and self.hashes):
            return
        self.hashes.dump(self.output)
        self.log(f"Saved {len(self.hashes)} hashes to {self.output}")
        # Remember the final size: `summary` needs it once the set is gone.
        self.num_hashes_end = len(self.hashes)
        # Drop the reference so the memory can be reclaimed even if the
        # transformer itself is kept alive somewhere else.
        self.hashes = None  # type: ignore
Esempio n. 3
0
class DuplicatesRemover(jsonql.Transformer):
    """Remove from each document the lines flagged as duplicates.

    The duplicate flags come from hash files that are loaded into a single
    `FlatHashSet` by `_prepare`. Line and character counters are kept for
    `summary`.
    """

    # The hashes can't be pickled so they will have to be read back from disk.
    warn_when_pickling = True

    def __init__(self,
                 field: str,
                 hashes_files: List[Path],
                 collect: bool = False):
        """
        Remove duplicates

        Args:
            field: key of the document holding the text to deduplicate.
            hashes_files: hash files loaded into the duplicate set by `_prepare`.
            collect: when True, `do` also adds the hashes of each document to
                the set (using the result of `add` as the "seen" flags)
                instead of only looking them up.
        """
        super().__init__()
        self.field = field
        self.collect = collect

        self.hashes_files = hashes_files
        # Filled lazily by `_prepare`.
        self.duplicates: Optional[AbstractDedupHashSet] = None

        self.n_lines, self.n_lines_kept = 0, 0
        self.n_chars, self.n_chars_kept = 0, 0

    def _prepare(self):
        """Load all the hash files into `self.duplicates` (only once)."""
        if self.duplicates is not None:
            return
        self.duplicates = FlatHashSet()

        start = time.time()
        for h in self.hashes_files:
            shard_start = time.time()
            self.duplicates.load(str(h))
            delay = time.time() - shard_start
            # `.1f` (not `.1`): without a presentation type the precision is
            # a number of significant digits and 12.3 would print as `1e+01`.
            self.log(
                f"Loaded hashes from {h} ({mem_footprint_gb():.3f}GB total, took {delay / 60:.1f}m)"
            )

        delay = time.time() - start
        self.log(
            f"Loaded {len(self.duplicates):_d} hashes from {len(self.hashes_files)} files. ({mem_footprint_gb():.1f}GB total, took {delay / 60:.1f}m)"
        )

    def do(self, doc: dict) -> Optional[dict]:
        """Strip duplicated lines from `doc`.

        Returns:
            The document after `finalize_doc` was applied to it, or None when
            the field is missing/empty or when no line is kept.
        """
        content = doc.get(self.field)
        if not content:
            return None
        doc_hashes = compute_hashes(content)

        # `_prepare` must have run before the first document is processed.
        assert self.duplicates is not None
        seen = (self.duplicates.add(doc_hashes)
                if self.collect else self.duplicates[doc_hashes])
        # Element-wise: keep the entries whose "seen" flag is falsy.
        keep = seen < True
        kept = keep.sum()
        if kept == 0:
            # NOTE(review): documents dropped here are not counted in
            # `n_lines` / `n_chars`, so `summary` only covers documents that
            # kept at least one line -- confirm this is intended.
            return None
        # Zero out the hashes of the dropped lines before finalizing.
        doc_hashes = doc_hashes * keep
        self.n_lines += keep.size
        self.n_lines_kept += kept
        chars, kept_chars = finalize_doc(doc, self.field, hashes=doc_hashes)
        self.n_chars += chars
        self.n_chars_kept += kept_chars
        return doc

    def summary(self) -> List[str]:
        """Extend the parent summary with throughput and selectivity stats."""
        summ = super().summary()
        end_time = time.time()
        n_lines_kept, n_lines, n_docs = self.n_lines_kept, self.n_lines, self.processed
        # Guard against a zero interval (summary requested right at start).
        elapsed = end_time - self.start_time
        speed = n_docs / elapsed if elapsed > 0 else 0.0
        summ.append(
            f"Processed {self.n_lines} lines in {n_docs} docs. [{speed:.1f} doc/s]"
        )
        selectivity = n_lines_kept / n_lines if n_lines else 0
        summ.append(
            f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")

        n_chars_kept, n_chars = self.n_chars_kept, self.n_chars
        selectivity = n_chars_kept / n_chars if n_chars else 0
        summ.append(
            f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")
        return summ
Esempio n. 4
0
    def test_remove_duplicates_sharded(self):
        """Check sharded duplicate removal against two hand-built hash files.

        In each hash set one hash is added a second time; the assertions on
        `as_dict` expect that hash to map to True and the others to False.
        After `dedup.remove_duplicates_sharded` the flagged lines must be
        missing from the output shards.
        """
        # `get_tmpdir()` results are called with a file name to build a path
        # and without argument to get the directory itself (see `h()` below).
        data = self.get_tmpdir()
        part_0 = [["Hello", "_World", "I'm so original"]]
        write_docs(data("part_0.json"), part_0)
        part_1 = [["_Good morning", "_World", "I'm originaler"]]
        write_docs(data("part_1.json"), part_1)

        h = self.get_tmpdir()
        h0 = FlatHashSet()
        h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
        # Second insertion of "_world": it becomes the flagged hash of shard 0.
        h0.add([str_hash("_world")])
        h0.dump(h("part_0.bin"))
        self.assertEqual(
            {
                str_hash("hello"): False,
                str_hash("_world"): True,
                str_hash("i'm so original"): False,
            },
            as_dict(h0),
        )

        h1 = FlatHashSet()
        h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
        # Second insertion of "_good morning": the flagged hash of shard 1.
        h1.add([str_hash("_good morning")])
        h1.dump(h("part_1.bin"))
        self.assertEqual(
            {
                str_hash("_good morning"): True,
                str_hash("_world"): False,
                str_hash("i'm originaler"): False,
            },
            as_dict(h1),
        )

        res = self.get_tmpdir()
        # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
        dedup.remove_duplicates_sharded(
            files=[data("part_0.json"),
                   data("part_1.json")],
            outputs=[res("part_0.json"),
                     res("part_1.json")],
            field="text",
            hashes_dir=h(),
        )

        with open(res("part_0.json")) as o:
            lines = o.readlines()
            results_0 = list(jsonql.read_jsons(lines))
        expected_0 = [
            dict(text=text("Hello", "I'm so original"),
                 original_nlines=3,
                 nlines=2)
        ]
        assert_documents_equal(expected_0, results_0, ignoring=CUMBERSOME)

        with open(res("part_1.json")) as o:
            results_1 = [json.loads(line) for line in o.readlines()]
        # First pass removes "_world", second "_good morning".
        expected_1 = [
            dict(text=text("I'm originaler"), original_nlines=3, nlines=1)
        ]

        assert_documents_equal(expected_1, results_1, ignoring=CUMBERSOME)
Esempio n. 5
0
def deduplicate(source,
                field,
                hashes=None,
                output_hashes=None,
                add_hashes=True,
                finalize=True):
    """
    DOES TOO MANY THINGS
    Removes duplicate lines found in the field `field` of the source documents.

    Finds duplicate lines based on the hashes. Either hashes can be computed when
    reading the documents or they can be loaded from a binary file.

    If `add_hashes` is set to False only the given hashes will be considered.
    This grants a better control on memory footprint.

    This is a generator: nothing happens until it is iterated.

    Args:
        source: passed to `jsonql.read_jsons` to get the documents.
        field: key of the text to deduplicate; per-line hashes are read from
            and written back to `field + "_hash"`.
        hashes: a `str`/`Path` to load into a new `FlatHashSet`, an existing
            hash set used as is, or None to start from an empty set.
        output_hashes: if truthy, the hash set is dumped there once the
            source is exhausted.
        add_hashes: whether the hashes of each document are added to the set.
        finalize: whether `finalize_doc` is applied to each document (and
            character statistics are logged).

    Yields:
        The documents that keep at least one line.
    """
    hash_field = field + "_hash"
    if isinstance(hashes, (str, Path)):
        seen = FlatHashSet()
        seen.load(hashes)
    elif hashes is not None:
        seen = hashes
    else:
        seen = FlatHashSet()
    log(f"Loaded {len(seen)} unique hashes.")
    n_doc = 0
    # Value of `n_doc` at the previous stats report, used to compute the
    # number of documents covered by each report.
    n_doc_logged = 0
    batch_size = 100_000
    n_lines, n_lines_kept = 0, 0
    n_chars, n_chars_kept = 0, 0
    t = time.time()

    def log_stats(start_time, n_batch_docs):
        # Speed is based on the documents actually read since `start_time`:
        # the last report usually covers less than `batch_size` documents.
        elapsed = time.time() - start_time
        speed = n_batch_docs / elapsed if elapsed > 0 else 0.0

        if add_hashes:
            log(f"Saw {len(seen)} unique hashes over {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
                )
        else:
            log(f"Processed {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
                )
        max_mem = mem_footprint_gb()
        log(f"Used up to {max_mem:.1f}GB of RAM.")
        selectivity = n_lines_kept / n_lines if n_lines else 0
        log(f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")
        if finalize:
            selectivity = n_chars_kept / n_chars if n_chars else 0
            log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%})."
                )

    for doc in jsonql.read_jsons(source):
        n_doc += 1
        if n_doc % batch_size == 0:
            log_stats(t, n_doc - n_doc_logged)
            n_doc_logged = n_doc
            t = time.time()

        # Reuse the hashes stored in the document when present (and
        # non-empty), otherwise compute them from the text. A distinct name
        # is used so the `hashes` argument is not rebound.
        doc_hashes = doc.get(hash_field) or compute_hashes(doc.get(field))
        if doc_hashes is None:
            continue
        if isinstance(doc_hashes, list):
            doc_hashes = np.array(doc_hashes, dtype=HASH_TYPE)

        # Called explicitly: the `in` operator would coerce the result to a
        # single bool, while one flag per hash is needed below.
        duplicate = seen.__contains__(doc_hashes)
        if add_hashes:
            seen.add(doc_hashes, duplicate)

        keep = duplicate < 1
        kept = keep.sum()
        # Zero out the hashes of the dropped lines.
        doc_hashes = doc_hashes * keep
        doc[hash_field] = [int(x) for x in doc_hashes]
        n_lines += keep.size
        n_lines_kept += kept
        if finalize:
            chars, kept_chars = finalize_doc(doc, field)
            n_chars += chars
            n_chars_kept += kept_chars
        if kept > 0:
            yield doc

    log_stats(t, n_doc - n_doc_logged)

    if output_hashes:
        log(f"Dumping {len(seen)} hashes to {output_hashes}.")
        seen.dump(output_hashes)