Example 1
def merge(hashes_1, hashes_2, output):
    # Each input may be an in-memory FlatHashSet or a path to a dumped hash file.
    if isinstance(hashes_1, str):
        h1 = FlatHashSet()
        h1.load(hashes_1)
    else:
        h1 = hashes_1

    if isinstance(hashes_2, str):
        h2 = FlatHashSet()
        h2.load(hashes_2)
    else:
        h2 = hashes_2

    # Collect h2's keys into a numpy array and run a vectorized membership test against h1.
    h2_np = np.fromiter(h2.keys(), dtype=FlatHashSet.dtype, count=len(h2))
    dup = h1.__contains__(h2_np)

    # Dups between h1 and h2 will be set to 1, keys unique to h2 are copied to
    # h1 with their value.
    h1[h2_np] = dup
    if output:
        h1.dump(output)
    return h1
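
A minimal usage sketch for merge, assuming FlatHashSet and HASH_TYPE come from cc_net's flat_hash_set module and mirroring the add/__contains__ calls used in these examples; the key values and the helper _make_set are made up for illustration.

import numpy as np
from cc_net.flat_hash_set import HASH_TYPE, FlatHashSet  # assumed import path

def _make_set(values):
    # Build a small hash set the way deduplicate() does: add(hashes, contains).
    h = FlatHashSet()
    keys = np.array(values, dtype=HASH_TYPE)
    h.add(keys, h.__contains__(keys))
    return h

h1 = _make_set([1, 2, 3])
h2 = _make_set([3, 4, 5])

# Keys unique to h2 are copied into h1; the shared key 3 is flagged as a duplicate.
merged = merge(h1, h2, output=None)  # pass a file path instead of None to dump the result
assert len(merged) == 5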
Example 2
def deduplicate(source,
                field,
                hashes=None,
                output_hashes=None,
                add_hashes=True,
                finalize=True):
    """
    DOES TOO MANY THINGS
    Removes duplicate lines found in the field `field` of the source documents.

    Finds duplicate lines based on the hashes. Either hashes can be computed when
    reading the documents or they can be loaded from a binary file.

    If `add_hashes` is set to False only the given hashes will be considered.
    This grants a better control on memory footprint.
    """
    hash_field = field + "_hash"
    if isinstance(hashes, (str, Path)):
        seen = FlatHashSet()
        seen.load(hashes)
    elif hashes is not None:
        seen = hashes
    else:
        seen = FlatHashSet()
    log(f"Loaded {len(seen)} unique hashes.")
    n_doc = 0
    batch_size = 100_000
    n_lines, n_lines_kept = 0, 0
    n_chars, n_chars_kept = 0, 0
    t = time.time()

    def log_stats(start_time):
        end_time = time.time()
        speed = batch_size / (end_time - start_time)

        if add_hashes:
            log(f"Saw {len(seen)} unique hashes over {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
                )
        else:
            log(f"Processed {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
                )
        max_mem = mem_footprint_gb()
        log(f"Used up to {max_mem:.1f}GB of RAM.")
        selectivity = n_lines_kept / n_lines if n_lines else 0
        log(f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")
        if finalize:
            selectivity = n_chars_kept / n_chars if n_chars else 0
            log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%})."
                )

    for doc in jsonql.read_jsons(source):
        n_doc += 1
        if n_doc % batch_size == 0:
            log_stats(t)
            t = time.time()

        # Use precomputed per-line hashes if the doc already carries them, otherwise compute them.
        hashes = doc.get(hash_field) or compute_hashes(doc.get(field))
        if hashes is None:
            continue
        if isinstance(hashes, list):
            hashes = np.array(hashes, dtype=HASH_TYPE)

        # Vectorized membership test: an array aligned with `hashes`, nonzero where the
        # line hash has already been seen.
        duplicate = seen.__contains__(hashes)
        if add_hashes:
            seen.add(hashes, duplicate)

        keep = duplicate < 1
        kept = keep.sum()
        # Zero out the hashes of duplicated lines; only kept lines retain a non-zero hash.
        hashes = hashes * keep
        doc[hash_field] = [int(x) for x in hashes]
        n_lines += keep.size
        n_lines_kept += kept
        if finalize:
            chars, kept_chars = finalize_doc(doc, field)
            n_chars += chars
            n_chars_kept += kept_chars
        if kept > 0:
            yield doc

    log_stats(t)

    if output_hashes:
        log(f"Dumping {len(seen)} hashes to {output_hashes}.")
        seen.dump(output_hashes)
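
A hedged sketch of how deduplicate might be driven end to end. The shard path, the raw_content field name, the dump path, and the assumption that jsonql.read_jsons accepts a plain file path are illustrative choices, not taken from the examples above.

from cc_net.flat_hash_set import FlatHashSet  # assumed import path

seen = FlatHashSet()  # start empty, or pass a path to previously dumped hashes instead

docs = deduplicate(
    "shard_0000.json.gz",       # hypothetical input readable by jsonql.read_jsons
    field="raw_content",        # hypothetical field holding the document text
    hashes=seen,
    output_hashes="dedup.bin",  # hypothetical dump target for the accumulated hashes
    add_hashes=True,
    finalize=True,
)

# deduplicate() is a generator: documents stream out with their duplicated lines removed.
for doc in docs:
    print(doc.get("raw_content", "")[:80])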