def test__anonymize_data(self):
    """Anonymize data in tables with pii fields.

    Two tables are registered on a mocked hyper transformer; only the one
    whose metadata is tagged ``'a'`` reports a pii field. After running
    ``DataNavigator._anonymize_data`` the navigator tables are compared
    against the expected frames and every recorded ``_get_pii_fields`` call
    is checked against the allowed calls.
    """
    # Setup
    def pii_fields_for(ht_meta):
        # Only metadata tagged 'a' exposes pii fields; anything else -> None.
        return ['a_fields'] if ht_meta['ht'] == 'a' else None

    ht_mock = Mock()
    ht_mock._get_pii_fields.side_effect = pii_fields_for
    ht_mock.table_dict = {
        'a_table': (pd.DataFrame({'a_fields': [1, 2, 3]}), {'ht': 'a'}),
        'another_table': (pd.DataFrame({'a_fields': [4, 5, 6]}), {'ht': 'b'}),
    }

    tables = {
        'a_table': Table(pd.DataFrame({'a_fields': [1, 2, 3]}), {'some': 'metadata'}),
        'another_table': Table(pd.DataFrame(), {'another': 'metadata'}),
    }

    # Run
    data_navigator_mock = Mock()
    data_navigator_mock.tables = tables
    data_navigator_mock.ht = ht_mock

    DataNavigator._anonymize_data(data_navigator_mock)

    # Asserts
    allowed_calls = [call({'ht': 'a'}), call({'ht': 'b'})]
    expected_frames = {
        'a_table': pd.DataFrame({'a_fields': [1, 2, 3]}),
        'another_table': pd.DataFrame(),
    }

    pd.testing.assert_frame_equal(tables['a_table'].data, expected_frames['a_table'])
    pd.testing.assert_frame_equal(
        tables['another_table'].data, expected_frames['another_table'])

    # Every recorded call must be one of the allowed ones (order-agnostic).
    assert all(
        recorded in allowed_calls
        for recorded in ht_mock._get_pii_fields.call_args_list
    )
def test_model_database_vine_modeler_single_table(self):
    """model_database works fine with vine modeler.

    Models a single parent-less, child-less table with ``VineCopula`` and
    then checks that the table shows up both in the fitted models and in
    the output of ``Sampler.sample_all``.
    """
    # Setup - plain fixtures
    raw_frame = pd.DataFrame({
        'column_A': ['a', 'b', 'd', 'c'],
        'column_B': range(4),
    })
    # NOTE(review): the field names here ('A', 'B') differ from the column
    # keys -- kept exactly as in the original fixture.
    table_meta = {
        'name': 'table_name',
        'fields': {
            'column_A': {'name': 'A', 'type': 'categorical'},
            'column_B': {'name': 'B', 'type': 'number', 'subtype': 'integer'},
        },
    }
    transformed_frame = pd.DataFrame({
        'column_A': [0.1, 0.2, 0.5, 1.0],
        'column_B': range(4),
    })

    # Setup - objects under test
    data_navigator = MagicMock(spec=DataNavigator)
    modeler = Modeler(data_navigator=data_navigator, model=VineCopula)

    # Setup - Mock
    data_navigator.tables = {'table_name': Table(raw_frame, table_meta)}
    data_navigator.get_parents.return_value = set()
    data_navigator.get_children.return_value = set()
    data_navigator.transformed_data = {'table_name': transformed_frame}
    data_navigator.meta = {'tables': [{'name': table_meta}]}
    data_navigator.ht = MagicMock()
    data_navigator.ht.transformers = {
        ('table_name', 'column_A'): None,
        ('table_name', 'column_B'): None,
    }

    # Run
    modeler.model_database()

    # Check
    assert 'table_name' in modeler.models

    samples = Sampler(data_navigator, modeler).sample_all()
    assert 'table_name' in samples
def test__get_extensions(self, get_foreign_mock, extension_mock):
    """_get_extensions return the conditional modelling parameters for each children.

    Both mock arguments are injected by the caller (presumably ``patch``
    decorators outside this view -- TODO confirm). The extension mock is
    configured to return ``None`` for every call.
    """
    # Setup
    shared_frame = pd.DataFrame({'foreign_key': [0, 1]})
    shared_meta = {'fields': []}

    data_navigator = MagicMock()
    # Both child tables intentionally share the same frame and metadata.
    data_navigator.tables = {
        name: Table(shared_frame, shared_meta)
        for name in ('first_children', 'second_children')
    }
    data_navigator.get_children.return_value = {}

    modeler = Modeler(data_navigator)
    modeler.tables = {}

    extension_mock.side_effect = lambda x, y, z: None
    get_foreign_mock.return_value = 'foreign_key'

    pk = 'primary_key'
    children = ['first_children', 'second_children']

    expected_result = [
        pd.DataFrame([{
            '__first_children_column_1': 1,
            '__first_children_column_2': 2,
        }]),
        pd.DataFrame([{
            '__second_children_column_1': 1,
            '__second_children_column_2': 2,
        }]),
    ]

    # Run
    result = modeler._get_extensions(pk, children)

    # Check
    # NOTE(review): this passes vacuously when ``result`` is empty.
    assert all([
        frame.equals(expected_result[position])
        for position, frame in enumerate(result)
    ])
def test__fill_text_columns(self):
    """Fill columns.

    The ``DEMO`` table declares one field referencing another table and two
    regex-constrained fields. The test asserts that ``sample_rows`` is
    invoked exactly once, with ``('table_ref', 1)``.
    """
    # Setup
    referencing_field = {
        'name': 'a_field',
        'type': 'id',
        'ref': {'table': 'table_ref', 'field': 'table_ref_id'},
    }
    numeric_id_field = {'name': 'b_field', 'type': 'id', 'regex': '^[0-9]{10}$'}
    text_field = {'name': 'c_field', 'type': 'text', 'regex': '^[a-z]{10}$'}

    demo_meta = {
        'fields': {
            'a_field': referencing_field,
            'b_field': numeric_id_field,
            'c_field': text_field,
        }
    }

    data_navigator_mock = Mock()
    data_navigator_mock.tables = {'DEMO': Table(pd.DataFrame(), demo_meta)}

    sample_rows_mock = Mock(return_value={'table_ref_id': {'name': 'table_ref_id'}})

    # Run
    sampler_mock = Mock()
    sampler_mock.dn = data_navigator_mock
    sampler_mock.sample_rows = sample_rows_mock

    row = pd.DataFrame({'c_field': ['foo', 'bar', 'tar']})
    labels = ['a_field', 'b_field', 'c_field']

    Sampler._fill_text_columns(sampler_mock, row, labels, 'DEMO')

    # Asserts
    sample_rows_mock.assert_called_once_with('table_ref', 1)
def test_get_meta_data(self):
    """Retrieve table meta.

    ``DataNavigator.get_meta_data`` is called unbound on a mock navigator
    holding a single ``DEMO`` table; the stored meta is expected back.
    """
    # Setup
    data_navigator_mock = Mock()
    data_navigator_mock.tables = {'DEMO': Table(None, 'meta')}

    # Run
    result = DataNavigator.get_meta_data(data_navigator_mock, 'DEMO')

    # Asserts
    assert result == 'meta'
def test_get_data(self):
    """Retrieve table data.

    ``DataNavigator.get_data`` is called unbound on a mock navigator holding
    a single ``DEMO`` table; the stored frame is expected back.
    """
    # Setup
    stored_frame = pd.DataFrame({'foo': [0, 1]})

    data_navigator_mock = Mock()
    data_navigator_mock.tables = {'DEMO': Table(stored_frame, 'meta')}

    # Run
    result = DataNavigator.get_data(data_navigator_mock, 'DEMO')

    # Asserts
    # Compared against an independently built frame, not the stored object.
    pd.testing.assert_frame_equal(result, pd.DataFrame({'foo': [0, 1]}))
def test_CPA(self, extensions_mock, merge_mock):
    """CPA will append extensions to the original table.

    Both mock arguments are injected by the caller (presumably ``patch``
    decorators outside this view -- TODO confirm). The test checks that the
    merged table is stored on the modeler, that the merge is performed on
    the primary key, and that the primary key column is written back once.
    """
    # Setup
    table_name = 'table'
    original_table = Table(
        pd.DataFrame({'table_pk': range(5)}),
        {'primary_key': 'table_pk'},
    )
    transformed_table = pd.DataFrame({'table_pk': range(5)})

    data_navigator = MagicMock(spec=DataNavigator)
    data_navigator.tables = {'table': original_table}
    data_navigator.transformed_data = {'table': transformed_table}
    data_navigator.get_children.return_value = 'children of table'

    modeler = Modeler(data_navigator)

    extension = MagicMock()
    extensions_mock.return_value = [extension]

    extended_table = MagicMock()
    merge_mock.return_value = extended_table

    # Run
    modeler.CPA(table_name)

    # Check
    assert modeler.tables[table_name] == extended_table

    extensions_mock.assert_called_once_with(modeler, 'table_pk', 'children of table')
    merge_mock.assert_called_once_with(
        transformed_table,
        extension.reset_index.return_value,
        how='left',
        on='table_pk',
    )
    data_navigator.get_children.assert_called_once_with('table')
    extension.reset_index.assert_called_once_with()
    extended_table.drop.assert_not_called()

    # The primary key column must have been assigned exactly once.
    setitem_calls = extended_table.__setitem__.call_args_list
    assert len(setitem_calls) == 1

    recorded = setitem_calls[0]
    positional, keywords = recorded[0], recorded[1]
    assert keywords == {}
    assert len(positional) == 2
    assert positional[0] == 'table_pk'
    assert positional[1].equals(transformed_table['table_pk'])
def test_sample_all_with_reset_primary_key(self):
    """Check sample_all with reset_primary_keys True.

    ``Sampler.sample_all`` is called unbound on a mock sampler; the test
    asserts that the primary-key generators are reset exactly once.
    """
    # Setup
    dn_mock = Mock()
    dn_mock.tables = {'DEMO': Table(pd.DataFrame(), {'some': 'meta'})}
    dn_mock.get_parents.return_value = True

    reset_primary_keys_generators_mock = Mock()

    sampler_mock = Mock()
    sampler_mock.dn = dn_mock
    sampler_mock._reset_primary_keys_generators = reset_primary_keys_generators_mock

    # Run
    Sampler.sample_all(sampler_mock, reset_primary_keys=True)

    # Asserts
    reset_primary_keys_generators_mock.assert_called_once_with()
def test__get_relashionships(self):
    """_get_relationships returns parents, children and foreign_keys dicts.

    ``update_mapping`` is mocked to return ``'child'`` then ``'parent'``,
    so the first two elements of the result are those sentinels. The third
    element maps ``(table, parent)`` to ``(parent field, foreign key)``.
    """
    # Setup
    meta = {
        'fields': {
            'a_field': {
                'name': 'a_field',
                'ref': {
                    'table': 'DEMO_2',
                    'field': 'DEMO_2_ID'
                }
            }
        }
    }
    tables = {'DEMO': Table('data', meta)}

    update_mock = Mock()
    update_mock.side_effect = ['child', 'parent']

    # Run
    data_navigator_mock = Mock()
    data_navigator_mock.update_mapping = update_mock

    result = DataNavigator._get_relationships(data_navigator_mock, tables)

    # Asserts
    expect = 'child', 'parent', {('DEMO', 'DEMO_2'): ('DEMO_2_ID', 'a_field')}
    exp_args_list = [
        call({}, 'DEMO_2', 'DEMO'),
        call({}, 'DEMO', 'DEMO_2')
    ]

    assert result == expect
    # The original compared these lists without ``assert``, so the expected
    # calls were never actually verified.
    assert update_mock.call_args_list == exp_args_list
def test_model_database_gaussian_copula_single_table(self):
    """model_database can model a single table using the gaussian copula model.

    Models a single parent-less, child-less table with
    ``GaussianMultivariate`` and then checks that the table shows up both
    in the fitted models and in the output of ``Sampler.sample_all``.
    """
    # Setup - plain fixtures
    table_data = pd.DataFrame({
        'column_A': ['a', 'b', 'd', 'c'],
        'column_B': range(4),
    })
    # Table-level metadata keeps the fields as a dict keyed by column...
    table_metadata = {
        'name': 'table_name',
        'fields': {
            'column_A': {'name': 'column_A', 'type': 'categorical'},
            'column_B': {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
        },
    }
    # ...while the navigator-level metadata lists them.
    metadata = {
        'name': 'table_name',
        'fields': [
            {'name': 'column_A', 'type': 'categorical'},
            {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
        ],
    }
    transformed = pd.DataFrame({
        'column_A': [0.1, 0.2, 0.5, 1.0],
        'column_B': range(4),
    })

    # Setup - objects under test
    data_navigator = MagicMock(spec=DataNavigator)
    modeler = Modeler(data_navigator=data_navigator, model=GaussianMultivariate)

    # Setup - Mocks - DataNavigator
    data_navigator.tables = {'table_name': Table(table_data, table_metadata)}
    data_navigator.get_parents.return_value = set()
    data_navigator.get_children.return_value = set()
    data_navigator.transformed_data = {'table_name': transformed}
    data_navigator.meta = {'tables': [metadata]}
    data_navigator.ht = MagicMock()
    data_navigator.ht.transformers = {
        ('table_name', 'column_A'): None,
        ('table_name', 'column_B'): None,
    }

    # Run
    modeler.model_database()

    # Check
    assert 'table_name' in modeler.models

    samples = Sampler(data_navigator, modeler).sample_all()
    assert 'table_name' in samples
def test_CPA_transformed_index(self, extension_mock):
    """CPA is able to merge extensions in tables with transformed index.

    The parent's primary key is categorical in the raw data ('A', 'B', 'C')
    but numeric in the transformed data, while the mocked extension is
    indexed by the raw key values. The test checks that the extension
    columns are attached to the matching transformed parent rows.

    ``extension_mock`` is injected by the caller (presumably a ``patch``
    decorator outside this view -- TODO confirm).
    """
    # Setup
    data_navigator = MagicMock(spec=DataNavigator)
    modeler = Modeler(data_navigator)

    # Setup - Mock
    parent_data = pd.DataFrame([
        {'parent_id': 'A', 'values': 1},
        {'parent_id': 'B', 'values': 2},
        {'parent_id': 'C', 'values': 3},
    ])
    parent_meta = {
        'name': 'parent',
        'primary_key': 'parent_id',
        'fields': {
            'parent_id': {
                'name': 'parent_id',
                'type': 'categorical',
                'regex': '^[A-Z]$'
            },
            'values': {
                'name': 'values',
                'type': 'number',
                'subtype': 'integer'
            }
        }
    }

    child_data = pd.DataFrame([
        {'child_id': 1, 'parent_id': 'A', 'value': 0.1},
        {'child_id': 2, 'parent_id': 'A', 'value': 0.2},
        {'child_id': 3, 'parent_id': 'A', 'value': 0.3},
        {'child_id': 4, 'parent_id': 'B', 'value': 0.4},
        {'child_id': 5, 'parent_id': 'B', 'value': 0.5},
        {'child_id': 6, 'parent_id': 'B', 'value': 0.6},
        {'child_id': 7, 'parent_id': 'C', 'value': 0.7},
        {'child_id': 8, 'parent_id': 'C', 'value': 0.8},
        {'child_id': 9, 'parent_id': 'C', 'value': 0.9},
    ])
    child_meta = {
        'name': 'child',
        'primary_key': 'child_id',
        'fields': {
            'child_id': {
                'name': 'child_id',
                'type': 'number'
            },
            'parent_id': {
                'name': 'parent_id',
                'type': 'category',
                'ref': {
                    'table': 'parent',
                    'field': 'parent_id'
                }
            },
            'value': {
                'name': 'value',
                'type': 'number'
            }
        }
    }

    data_navigator.tables = {
        'parent': Table(parent_data, parent_meta),
        'child': Table(child_data, child_meta)
    }

    children_map = {'parent': {'child'}}
    parent_map = {'child': {'parent'}}

    # Tables missing from the maps have no relatives.
    data_navigator.get_children.side_effect = lambda x: children_map.get(x, set())
    data_navigator.get_parents.side_effect = lambda x: parent_map.get(x, set())

    transformed_parent = pd.DataFrame([
        {'parent_id': 0.1, 'values': 1},
        {'parent_id': 0.4, 'values': 2},
        {'parent_id': 0.8, 'values': 3},
    ])
    transformed_child = pd.DataFrame([
        {'child_id': 1, 'parent_id': 0.15, 'value': 0.1},
        {'child_id': 2, 'parent_id': 0.10, 'value': 0.2},
        {'child_id': 3, 'parent_id': 0.20, 'value': 0.3},
        {'child_id': 4, 'parent_id': 0.35, 'value': 0.4},
        {'child_id': 5, 'parent_id': 0.50, 'value': 0.5},
        {'child_id': 6, 'parent_id': 0.55, 'value': 0.6},
        {'child_id': 7, 'parent_id': 0.70, 'value': 0.7},
        {'child_id': 8, 'parent_id': 0.80, 'value': 0.8},
        {'child_id': 9, 'parent_id': 0.85, 'value': 0.9},
    ])
    data_navigator.transformed_data = {
        'parent': transformed_parent,
        'child': transformed_child
    }

    # The extension is indexed by the raw (untransformed) parent keys.
    extension = pd.DataFrame(
        data=[
            {'param_1': 0.5, 'param_2': 0.4},
            {'param_1': 0.7, 'param_2': 0.2},
            {'param_1': 0.2, 'param_2': 0.1},
        ],
        index=list('ABC')
    )
    extension.index.name = 'parent_id'
    extension_mock.return_value = [extension]

    expected_extended_parent = pd.DataFrame(
        [
            {'parent_id': 0.1, 'values': 1, 'param_1': 0.5, 'param_2': 0.4},
            {'parent_id': 0.4, 'values': 2, 'param_1': 0.7, 'param_2': 0.2},
            {'parent_id': 0.8, 'values': 3, 'param_1': 0.2, 'param_2': 0.1},
        ],
        columns=['parent_id', 'values', 'param_1', 'param_2']
    )

    # Run
    modeler.CPA('parent')

    # Check
    # The original evaluated this membership test without ``assert``,
    # making it a no-op.
    assert 'parent' in modeler.tables
    assert modeler.tables['parent'].equals(expected_extended_parent)

    data_navigator.get_children.assert_called_once_with('parent')
    extension_mock.assert_called_once_with('parent_id', {'child'})
def test_load_data(self, read_mock, dn_mock):
    """load_data to build a DataNavigator.

    Both mock arguments are injected by the caller (presumably ``patch``
    decorators outside this view -- TODO confirm). The test inspects the
    positional arguments passed to ``dn_mock``: the meta filename, the meta
    dict and the tables dict.
    """
    # SetUp
    loader_meta = {
        'path': '',
        'tables': [{
            'use': True,
            'name': 'DEMO',
            'path': 'some_path.csv',
            'fields': [{'name': 'a_field', 'foo': 'foo'}],
        }],
    }

    dn_mock.return_value = Mock()
    format_mock = Mock(return_value={'some': 'meta'})
    read_mock.return_value = pd.DataFrame({'foo': [0, 1]})

    # Run
    csv_data_loader_mock = Mock()
    csv_data_loader_mock.meta = loader_meta
    csv_data_loader_mock.meta_filename = 'meta_filename.json'
    csv_data_loader_mock._format_table_meta = format_mock

    CSVDataLoader.load_data(csv_data_loader_mock)

    # Asserts
    # Expectations are built from fresh literals rather than from
    # ``loader_meta`` so they do not alias the object handed to the loader.
    exp_format_args = {
        'use': True,
        'name': 'DEMO',
        'path': 'some_path.csv',
        'fields': [{'name': 'a_field', 'foo': 'foo'}],
    }
    exp_data_navigator_meta = {'path': '', 'tables': [exp_format_args]}
    exp_data_navigator_tables = {
        'DEMO': Table(pd.DataFrame({'foo': [0, 1]}), {'some': 'meta'})
    }

    positional_args = dn_mock.call_args[0]
    assert_meta_filename, assert_meta, assert_tables = positional_args

    format_mock.assert_called_once_with(exp_format_args)

    assert assert_meta_filename == 'meta_filename.json'
    assert assert_meta == exp_data_navigator_meta
    assert assert_tables.keys() == exp_data_navigator_tables.keys()

    pd.testing.assert_frame_equal(
        assert_tables['DEMO'].data,
        exp_data_navigator_tables['DEMO'].data,
    )
def test_transform_synthesized_rows_no_pk(self, get_table_meta_mock, fill_mock):
    """transform_synthesized_rows will update internal state and reverse transform rows.

    The table metadata declares no primary key. Both mock arguments are
    injected by the caller (presumably ``patch`` decorators outside this
    view -- TODO confirm).
    """
    # Setup - Class Instantiation
    data_navigator = MagicMock()
    modeler = MagicMock()
    sampler = Sampler(data_navigator, modeler)

    # Setup - Mock configuration
    table_name = 'table'
    table_metadata = {
        'fields': {
            'column_A': {'type': 'number', 'subtype': 'integer'},
            'column_B': {'name': 'column', 'type': 'number'},
        },
        'primary_key': None,
    }
    empty_frame = pd.DataFrame(columns=['column_A', 'column_B'])

    data_navigator.tables = {'table': Table(empty_frame, table_metadata)}
    data_navigator.ht.transformers = {
        ('table', 'column_A'): None,
        ('table', 'column_B'): None,
    }
    data_navigator.ht.reverse_transform_table.return_value = pd.DataFrame({
        'column_A': ['some', 'transformed values'],
        'column_B': ['another', 'transformed column'],
    })

    get_table_meta_mock.return_value = {'original': 'meta', 'fields': []}

    # Column order follows the field part of each transformer key.
    filled_columns = [field for _, field in data_navigator.ht.transformers]
    fill_mock.return_value = pd.DataFrame(
        {
            'column_A': ['filled', 'text_values'],
            'column_B': ['nothing', 'numerical'],
        },
        columns=filled_columns,
    )

    # Setup - Method arguments / expected result
    synthesized_rows = pd.DataFrame({
        'column_A': [1.7, 2.5],
        'column_B': [4.7, 5.1],
        'model_parameters': ['some', 'parameters'],
    })
    expected_result = pd.DataFrame({
        'column_A': ['some', 'transformed values'],
        'column_B': ['another', 'transformed column'],
    })

    # Run
    result = sampler._transform_synthesized_rows(synthesized_rows, table_name)

    # Check - Result
    assert result.equals(expected_result)

    # Check - Mock calls
    get_table_meta_mock.assert_called_once_with(sampler, data_navigator.meta, 'table')
    fill_mock.assert_called_once_with(
        sampler, synthesized_rows, ['column_A', 'column_B'], 'table')

    reverse_calls = data_navigator.ht.reverse_transform_table.call_args_list
    assert len(reverse_calls) == 1

    positional, keywords = reverse_calls[0]
    assert len(positional) == 2
    assert positional[0].equals(fill_mock.return_value)
    assert positional[1] == get_table_meta_mock.return_value
    assert keywords == {}
def test_model_database_vine_modeler_single_table(self):
    """model_database works fine with vine modeler.

    After modelling a single parent-less, child-less table, exactly one
    fitted ``VineCopula`` model is expected, ``get_parents`` must have been
    called once and ``get_children`` twice, and the modeler's table must
    equal the transformed data.
    """
    # Setup - plain fixtures
    raw_frame = pd.DataFrame({
        'column_A': ['a', 'b', 'd', 'c'],
        'column_B': range(4),
    })
    table_meta = {
        'name': 'table_name',
        'fields': {
            'column_A': {'name': 'column_A', 'type': 'categorical'},
            'column_B': {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
        },
    }
    navigator_meta = {
        'name': 'table_name',
        'fields': [
            {'name': 'column_A', 'type': 'categorical'},
            {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
        ],
    }
    transformed_frame = pd.DataFrame({
        'column_A': [0.1, 0.2, 0.5, 1.0],
        'column_B': range(4),
    })
    reversed_frame = pd.DataFrame(
        {
            'column_A': ['b', 'c', 'd', 'a'],
            'column_B': [1.0, 2.0, 3.0, 4.0],
        },
        columns=['column_A', 'column_B'],
    )

    # Setup - objects under test
    data_navigator = MagicMock(spec=DataNavigator)
    modeler = Modeler(data_navigator=data_navigator, model=VineCopula)

    # Setup - Mock
    data_navigator.tables = {'table_name': Table(raw_frame, table_meta)}
    data_navigator.get_parents.return_value = set()
    data_navigator.get_children.return_value = set()
    data_navigator.transformed_data = {'table_name': transformed_frame}
    data_navigator.meta = {'tables': [navigator_meta]}

    ht = MagicMock(spec=HyperTransformer)
    ht.transformers = {
        ('table_name', 'column_A'): None,
        ('table_name', 'column_B'): None,
    }
    ht.reverse_transform_table.return_value = reversed_frame
    data_navigator.ht = ht

    # Run
    modeler.model_database()

    # Check
    assert len(modeler.models) == 1

    fitted_model = modeler.models['table_name']
    assert isinstance(fitted_model, VineCopula)
    assert fitted_model.fitted is True

    single_call = (('table_name', ), )
    assert data_navigator.get_parents.call_args_list == [single_call]
    assert data_navigator.get_children.call_args_list == [single_call, single_call]

    assert modeler.tables['table_name'].equals(
        modeler.dn.transformed_data['table_name'])
def test_model_database_kde_distribution(self):
    """model_database works fine with kde distribution.

    The modeler is built with ``distribution=KDEUnivariate`` and no explicit
    model. The resulting single model is expected to be a fitted
    ``GaussianMultivariate`` whose ``distribution`` is the fully qualified
    name of the KDE class.
    """
    # Setup - plain fixtures
    table_data = pd.DataFrame({
        'column_A': ['a', 'b', 'd', 'c'],
        'column_B': range(4),
    })
    table_metadata = {
        'name': 'table_name',
        'fields': {
            'column_A': {'name': 'column_A', 'type': 'categorical'},
            'column_B': {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
        },
    }
    metadata = {
        'name': 'table_name',
        'fields': [
            {'name': 'column_A', 'type': 'categorical'},
            {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
        ],
    }
    transformed = pd.DataFrame({
        'column_A': [0.1, 0.2, 0.5, 1.0],
        'column_B': range(4),
    })
    reverse_transform_dataframe = pd.DataFrame(
        {
            'column_A': ['b', 'c', 'd', 'a'],
            'column_B': [1.0, 2.0, 3.0, 4.0],
        },
        columns=['column_A', 'column_B'],
    )

    # Setup - objects under test
    data_navigator = MagicMock(spec=DataNavigator)
    modeler = Modeler(data_navigator=data_navigator, distribution=KDEUnivariate)

    # Setup - Mocks - DataNavigator
    data_navigator.tables = {'table_name': Table(table_data, table_metadata)}
    data_navigator.get_parents.return_value = set()
    data_navigator.get_children.return_value = set()
    data_navigator.transformed_data = {'table_name': transformed}
    data_navigator.meta = {'tables': [metadata]}

    ht = MagicMock(spec=HyperTransformer)
    ht.transformers = {
        ('table_name', 'column_A'): None,
        ('table_name', 'column_B'): None,
    }
    ht.reverse_transform_table.return_value = reverse_transform_dataframe
    data_navigator.ht = ht

    # Run
    modeler.model_database()

    # Check
    assert len(modeler.models) == 1
    assert 'table_name' in modeler.models

    fitted_model = modeler.models['table_name']
    assert isinstance(fitted_model, GaussianMultivariate)
    assert fitted_model.distribution == 'copulas.univariate.kde.KDEUnivariate'
    assert fitted_model.fitted is True

    single_call = (('table_name', ), )
    assert data_navigator.get_parents.call_args_list == [single_call]
    assert data_navigator.get_children.call_args_list == [single_call, single_call]