Esempio n. 1
0
def generate_clks(
    pii_data,  # type: Sequence[Sequence[str]]
    schema,  # type: Schema
    secret,  # type: AnyStr
    validate=True,  # type: bool
    callback=None,  # type: Optional[Callable[[int, Sequence[int]], None]]
    use_multiprocessing=True  # type: bool
):
    # type: (...) -> List[str]
    """Hash the given PII rows into serialised Bloom-filter CLKs.

    :param pii_data: One sequence of string identifiers per entity.
    :param schema: Linkage schema describing the fields and KDF settings.
    :param secret: Secret from which the per-field hashing keys are derived.
    :param validate: When True, validate every row against the schema
        before any hashing starts.
    :param callback: Optional progress hook; invoked once per completed
        chunk with the number of CLKs in that chunk and the chunk's stats.
    :param use_multiprocessing: Hash chunks in worker processes (True) or
        worker threads (False).
    :return: The serialised CLKs, in input order.
    """
    # Generate two keys for each identifier from the secret, one key per
    # hashing method used when computing the bloom filters. More could be
    # created if required via `num_hashing_methods` in `generate_key_lists`.
    key_lists = generate_key_lists(secret,
                                   len(schema.fields),
                                   key_size=schema.kdf_key_size,
                                   salt=schema.kdf_salt,
                                   info=schema.kdf_info,
                                   kdf=schema.kdf_type,
                                   hash_algo=schema.kdf_hash)

    if validate:
        validate_entries(schema.fields, pii_data)

    # Chunks PII
    log.info("Hashing {} entities".format(len(pii_data)))
    chunk_size = 200 if len(pii_data) <= 10000 else 1000
    futures = []

    # Compute Bloom filter from the chunks and then serialise it
    pool_executor = ProcessPoolExecutor if use_multiprocessing else \
        ThreadPoolExecutor # type: Union[Type[ProcessPoolExecutor], Type[ThreadPoolExecutor]]

    # The cast of `callback` is loop-invariant, so do it once up front.
    unpacked_callback = None
    if callback is not None:
        unpacked_callback = cast(Callable[[int, Sequence[int]], None],
                                 callback)

    with pool_executor() as executor:
        for chunk in chunks(pii_data, chunk_size):
            future = executor.submit(
                hash_and_serialize_chunk,
                chunk,
                key_lists,
                schema,
            )
            if unpacked_callback is not None:
                # Bind the callback as a default argument to avoid the
                # late-binding-closure pitfall, and fetch the future's
                # result only once instead of twice.
                def _report_progress(f, _cb=unpacked_callback):
                    chunk_clks, chunk_stats = f.result()
                    _cb(len(chunk_clks), chunk_stats)
                future.add_done_callback(_report_progress)
            futures.append(future)

        results = []
        for future in futures:
            # The per-chunk stats are intentionally discarded here; they are
            # only ever consumed through the progress callback above.
            clks, _clk_stats = future.result()
            results.extend(clks)

    return results
Esempio n. 2
0
    def test_invalid_data(self):
        """Each invalid entry variant must make validation raise EntryError."""
        failure_msg = 'Expected invalid entry to throw EntryError.'

        invalid_rows = [
            # 'John' — invalid case in the first field.
            [['John', 'DOE', '*****@*****.**',
              '23', '2015-10-21', 'free']],
            # 'doe' — invalid case in the second field.
            [['john', 'doe', '*****@*****.**',
              '23', '2015-10-21', 'free']],
        ]
        for bad_row in invalid_rows:
            with self.assertRaises(EntryError, msg=failure_msg):
                validate_entries(self.fields, bad_row)
Esempio n. 3
0
def generate_clks(
    pii_data,  # type: Sequence[Sequence[str]]
    schema,  # type: Schema
    keys,  # type: Tuple[AnyStr, AnyStr]
    validate=True,  # type: bool
    callback=None  # type: Optional[Callable[[int, Sequence[int]], None]]
):
    # type: (...) -> List[str]
    """Hash the given PII rows into serialised Bloom-filter CLKs.

    :param pii_data: One sequence of string identifiers per entity.
    :param schema: Linkage schema describing the fields and KDF settings.
    :param keys: Pair of secrets from which the per-field hashing keys
        are derived.
    :param validate: When True, validate every row against the schema
        before any hashing starts.
    :param callback: Optional progress hook; invoked once per completed
        chunk with the number of CLKs in that chunk and the chunk's stats.
    :return: The serialised CLKs, in input order.
    """
    # generate two keys for each identifier
    key_lists = generate_key_lists(
        keys,
        len(schema.fields),
        key_size=schema.hashing_globals.kdf_key_size,
        salt=schema.hashing_globals.kdf_salt,
        info=schema.hashing_globals.kdf_info,
        kdf=schema.hashing_globals.kdf_type,
        hash_algo=schema.hashing_globals.kdf_hash)

    if validate:
        validate_entries(schema.fields, pii_data)

    # Chunks PII
    log.info("Hashing {} entities".format(len(pii_data)))
    chunk_size = 200 if len(pii_data) <= 10000 else 1000
    futures = []

    # The cast of `callback` is loop-invariant, so do it once up front.
    unpacked_callback = None
    if callback is not None:
        unpacked_callback = cast(Callable[[int, Sequence[int]], None],
                                 callback)

    # Compute Bloom filter from the chunks and then serialise it
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for chunk in chunks(pii_data, chunk_size):
            future = executor.submit(
                hash_and_serialize_chunk,
                chunk,
                key_lists,
                schema,
            )
            if unpacked_callback is not None:
                # Bind the callback as a default argument to avoid the
                # late-binding-closure pitfall, and fetch the future's
                # result only once instead of twice.
                def _report_progress(f, _cb=unpacked_callback):
                    chunk_clks, chunk_stats = f.result()
                    _cb(len(chunk_clks), chunk_stats)
                future.add_done_callback(_report_progress)
            futures.append(future)

        results = []
        for future in futures:
            # The per-chunk stats are intentionally discarded here; they are
            # only ever consumed through the progress callback above.
            clks, _clk_stats = future.result()
            results.extend(clks)

    return results
Esempio n. 4
0
 def test_good_data(self):
     """A fully valid entry must pass validation without raising."""
     valid_row = [[
         'john', 'DOE', '*****@*****.**', '23', '2015-10-21', 'free'
     ]]
     # Success is simply the absence of an exception.
     validate_entries(self.fields, valid_row)