def test_timbrazil_ingest_voice_data(dagbag, populate_s3, mock_anonymizer):
    """Run the ``timbrazil.anonymize_voice`` DAG and inspect its output.

    The raw input and the anonymized output are both read back from the
    mocked buckets; the test checks the expected shapes, the presence of
    the ``execution_date`` and ``uuid`` columns, and that the PII columns
    do not carry the raw values.
    """
    mock_anonymizer({'5548999027010': '1234567890'})

    # seed the mocked s3 buckets used by the dag
    populate_s3('timbrazil-internal', 'timbrazil-public')

    # execute every task of the dag
    run_dag_tasks(dagbag.get_dag('timbrazil.anonymize_voice'))

    # read back the dag input and output for comparison
    prefix = 'ocs/ocs_moc/2020/04/01/00'
    raw_object = get_object(
        bucket='timbrazil-internal',
        key=f'{prefix}/ocs_moc_01_20200401_0001.GZ',
    )
    raw = pd.read_csv(
        raw_object,
        compression='gzip',
        header=None,
        skiprows=1,
        delimiter='|',
    )
    anonymized_object = get_object(
        bucket='timbrazil-public',
        key=f'{prefix}/ocs_moc_01_20200401_0001.csv',
    )
    anonymized = pd.read_csv(anonymized_object)

    # structural checks on the output and the input
    output_columns = set(anonymized.columns)
    assert 'execution_date' in output_columns
    assert 'uuid' in output_columns
    assert anonymized.shape == (2, 456)
    assert raw.shape == (3, 454)

    # pii must not survive: no cell of the selected output columns may
    # equal the corresponding raw cell
    pii_columns = [22, 32, 372, 373, 374, 375, 376, 377, 391, 398]
    anonymized_labels = [str(column - 1) for column in pii_columns]
    # NOTE(review): DataFrame.eq aligns both frames on row and column
    # labels. The labels selected from `anonymized` are strings while the
    # ones selected from `raw` are integers, so this comparison may not
    # overlap on any cell -- confirm the check is not vacuous.
    matches = anonymized[anonymized_labels].eq(raw[pii_columns])
    assert not matches.any().any()
def test_anonymization(self, mock_anonymizer, populate_s3):
    """Check the ``uuid`` column written by ``PIIOperator``.

    Explicit test where some non numeric values are interspersed with
    numeric ones in an msisdn column. The expected result lists, row by
    row, either the mocked uuid or a missing value.
    """
    mock_anonymizer({"8675309": "1234567890"})
    populate_s3("bucket/", "out/")

    run_operator(
        PIIOperator(
            task_id="ingest",
            input_path="bucket/prefix",
            output_path="out/prefix",
            key_pattern="anonymization.csv",
            msisdn_column="msisdn",
        ))

    path = "out/prefix/2020/04/01/00/anonymization.csv"
    df = pd.read_csv(get_object(path=path), dtype=str)

    # NaN never compares equal to itself, so comparing a list against
    # literal np.nan entries only passes when pandas returns that exact
    # object (list equality takes an identity shortcut). Map every missing
    # value to None so the assertion does not depend on object identity.
    uuids = [None if pd.isna(value) else value for value in df.uuid]
    assert uuids == [
        "1234567890",
        "1234567890",
        None,
        None,
        None,
        "1234567890",
        None,
        "1234567890",
    ]
def test_pii_ingestion(self, path, out_prefix, populate_s3):
    """Run ``PIIOperator`` with a transform callback and check the output.

    ``path`` is used as the operator's input path; ``out_prefix`` selects
    whether the output path is ``out/prefix`` or ``out``. The written csv
    is read back and checked for the msisdn value, shape and column order.
    """
    populate_s3("bucket/prefix", "out/")

    def callback(df, *_extra):
        # tag the frame so the test can tell the callback was applied
        df["callback_column"] = True
        return df

    if out_prefix:
        out_path = "out/prefix"
    else:
        out_path = "out"

    operator = PIIOperator(
        task_id="ingest",
        input_path=path,
        output_path=out_path,
        key_pattern=r"key.csv",
        pii_columns=["msisdn"],
        transform_func=callback,
    )
    run_operator(operator)

    # read the written file back from the output location
    result_path = f"s3://{out_path}/2020/04/01/00/key.csv"
    result = pd.read_csv(get_object(path=result_path))

    # the msisdn must no longer be the value 123 (as text or number)
    assert result.msisdn[0] not in {"123", 123}
    assert result.shape == (1, 4)
    expected_columns = [
        "msisdn",
        "pet",
        "callback_column",
        "execution_date",
    ]
    assert result.columns.tolist() == expected_columns
def test_anonymization_cache(self, mock_anonymizer, populate_s3):
    """Exercise the uuid lookup file of ``PIIOperator``.

    A first run is given ``uuid_write_path`` and the gzipped lookup file
    is read back and compared with the expected content. A second run is
    given the same file as ``uuid_read_path``; nothing is asserted on it
    beyond the run completing.
    """
    mock_anonymizer({"8675309": "1234567890"})
    populate_s3("bucket/", "out/")
    uuid_lookup = "out/msisdn_lookup.csv.gz"

    # test dump
    dump_operator = PIIOperator(
        task_id="ingest",
        input_path="bucket/prefix",
        output_path="out/prefix",
        uuid_write_path=uuid_lookup,
        key_pattern="anonymization.csv",
        msisdn_column="msisdn",
    )
    run_operator(dump_operator)

    with gzip.open(get_object(path=uuid_lookup), "rt") as lookup_file:
        content = lookup_file.read()
    assert content == "8675309,1234567890\n"

    # test load
    load_operator = PIIOperator(
        task_id="ingest",
        input_path="bucket/prefix",
        output_path="out/prefix",
        uuid_read_path=uuid_lookup,
        key_pattern="anonymization.csv",
        msisdn_column="msisdn",
    )
    run_operator(load_operator)
def test_timbrazil_ingest_sms(dagbag, populate_s3, mock_anonymizer):
    """Run the ``timbrazil.anonymize_sms`` DAG and check its output object.

    The test passes when ``get_object`` returns a truthy value for the
    expected key in the ``timbrazil-public`` bucket.
    """
    mock_anonymizer({'5548999027010': '1234567890'})
    populate_s3('timbrazil-internal', 'timbrazil-public')

    # execute every task of the dag
    run_dag_tasks(dagbag.get_dag('timbrazil.anonymize_sms'))

    output = get_object(
        bucket='timbrazil-public',
        key='ocs/ocs_sms/2020/04/01/00/ocs_sms_01_20200401_0001.csv',
    )
    assert output
def test_timbrazil_ingest_large_file(dagbag, populate_s3, mock_anonymizer,
                                     monkeypatch):
    """Run ``timbrazil.anonymize_voice`` with ``chunksize`` patched to 1.

    The first task of the DAG gets its ``chunksize`` attribute set to 1 to
    emulate a large file; the raw input and anonymized output are then read
    back and their shapes checked.
    """
    mock_anonymizer({'5548999027010': '1234567890'})

    # seed the mocked s3 buckets used by the dag
    populate_s3('timbrazil-internal', 'timbrazil-public')

    voice_dag = dagbag.get_dag('timbrazil.anonymize_voice')
    # set chunksize to 1 to emulate a large file
    first_task = voice_dag.tasks[0]
    monkeypatch.setattr(first_task, 'chunksize', 1)
    run_dag_tasks(voice_dag)

    # read back the dag input and output for comparison
    prefix = 'ocs/ocs_moc/2020/04/01/00'
    raw_object = get_object(
        bucket='timbrazil-internal',
        key=f'{prefix}/ocs_moc_01_20200401_0001.GZ',
    )
    raw = pd.read_csv(
        raw_object,
        compression='gzip',
        header=None,
        skiprows=1,
        delimiter='|',
    )
    anonymized_object = get_object(
        bucket='timbrazil-public',
        key=f'{prefix}/ocs_moc_01_20200401_0001.csv',
    )
    anonymized = pd.read_csv(anonymized_object)

    assert anonymized.shape == (2, 456)
    assert raw.shape == (3, 454)