def test_missing_column_names(self):
    # 'age' is deliberately missing from the header.
    column_names = ['given name',
                    'surname',
                    'email address',
                    'join date',
                    'account type']
    msg = 'Expected missing column name to throw FormatError.'
    with self.assertRaises(FormatError, msg=msg):
        validate_header(self.fields, column_names)
def test_invalid_column_names(self):
    # 'nonexistent field' does not appear in the schema.
    column_names = ['given name',
                    'surname',
                    'email address',
                    'age',
                    'join date',
                    'nonexistent field']
    msg = 'Expected invalid column name to throw FormatError.'
    with self.assertRaises(FormatError, msg=msg):
        validate_header(self.fields, column_names)
def generate_clk_from_csv(input_f,           # type: TextIO
                          keys,              # type: Tuple[AnyStr, AnyStr]
                          schema,            # type: Schema
                          validate=True,     # type: bool
                          header=True,       # type: Union[bool, AnyStr]
                          progress_bar=True  # type: bool
                          ):
    # type: (...) -> List[str]
    """ Generate Bloom filters from CSV file, then serialise them.

        This function also computes and outputs the Hamming weight
        (a.k.a. popcount -- the number of bits set to high) of the
        generated Bloom filters.

        :param input_f: A file-like object of CSV data to hash.
        :param keys: A tuple of two secret keys.
        :param schema: Schema specifying the record formats and
            hashing settings.
        :param validate: Set to `False` to disable validation of
            data against the schema. Note that this will silence
            warnings whose aim is to keep the hashes consistent between
            data sources; this may affect linkage accuracy.
        :param header: Set to `False` if the CSV file does not have
            a header. Set to `'ignore'` if the CSV file does have a
            header but it should not be checked against the schema.
        :param bool progress_bar: Set to `False` to disable the progress
            bar.
        :return: A list of serialized Bloom filters.
    """
    if header not in {False, True, 'ignore'}:
        raise ValueError(
            "header must be False, True or 'ignore' but is {}."
            .format(header))

    log.info("Hashing data")

    # Read from the CSV file, validating the header against the schema
    # unless told otherwise.
    reader = unicode_reader(input_f)
    if header:
        column_names = next(reader)
        if header != 'ignore':
            validate_header(schema.fields, column_names)

    start_time = time.time()

    # Read the remaining rows into memory, stripping whitespace from
    # every element.
    pii_data = []
    for line in reader:
        pii_data.append(tuple(element.strip() for element in line))
    validate_row_lengths(schema.fields, pii_data)

    if progress_bar:
        stats = OnlineMeanVariance()
        with tqdm(desc="generating CLKs", total=len(pii_data), unit='clk',
                  unit_scale=True,
                  postfix={'mean': stats.mean(), 'std': stats.std()}) as pbar:
            def callback(tics, clk_stats):
                stats.update(clk_stats)
                pbar.set_postfix(mean=stats.mean(), std=stats.std(),
                                 refresh=False)
                pbar.update(tics)

            results = generate_clks(pii_data, schema, keys,
                                    validate=validate, callback=callback)
    else:
        results = generate_clks(pii_data, schema, keys, validate=validate)

    log.info("Hashing took {:.2f} seconds".format(time.time() - start_time))
    return results
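# A minimal usage sketch for generate_clk_from_csv, assuming clkhash's
# module layout (clkhash.clk, clkhash.schema.from_json_file); the schema
# file name, CSV content and secret keys below are illustrative
# assumptions, not part of this module.
import io

from clkhash import schema
from clkhash.clk import generate_clk_from_csv

with open('schema.json') as schema_f:  # hypothetical schema file
    linkage_schema = schema.from_json_file(schema_f)

csv_f = io.StringIO(
    'given name,surname,email address,age,join date,account type\n'
    'Jane,Doe,jane@example.com,34,2015-06-19,standard\n')

# keys is a Tuple[AnyStr, AnyStr]: two secret keys shared by both
# data providers.
clks = generate_clk_from_csv(csv_f, ('secret1', 'secret2'), linkage_schema,
                             progress_bar=False)
print('{} CLKs generated'.format(len(clks)))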
def test_good_column_names(self):
    column_names = ['given name',
                    'surname',
                    'email address',
                    'age',
                    'join date',
                    'account type']
    validate_header(self.fields, column_names)  # This should not throw
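# The three header tests above pin down validate_header's contract: raise
# FormatError when a schema field is missing from the header or when the
# header names a column the schema does not know, and return silently on
# an exact match. The sketch below is one plausible implementation of that
# contract, not clkhash's actual code; it assumes each field spec exposes
# an `identifier` attribute holding the expected column name.
def validate_header_sketch(fields, column_names):
    expected = [field.identifier for field in fields]
    if list(column_names) != expected:
        raise FormatError(
            'Header {} does not match schema fields {}.'
            .format(list(column_names), expected))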