def test__unflatten_dict_respect_covariance_matrix(self):
    """_unflatten_dict rebuilds flattened covariance entries into a square matrix."""
    # Setup
    size = 40
    sampler = Sampler(MagicMock(), MagicMock())

    def cell(row, col):
        # Each cell encodes its own coordinates so misplaced values are detectable.
        return '{}, {}'.format(row, col)

    flat = {}
    for row in range(size):
        for col in range(size):
            flat['covariance__{}__{}'.format(row, col)] = cell(row, col)

    matrix = np.array([[cell(row, col) for col in range(size)] for row in range(size)])
    expected_result = {'covariance': matrix.tolist()}

    # Run
    result = sampler._unflatten_dict(flat)

    # Check
    assert result == expected_result
def fit(self, metadata, tables=None, root_path=None):
    """Fit this SDV instance to the dataset data.

    The metadata is validated against ``tables``, the database is modeled and a
    ``Sampler`` is built from the resulting models.

    Args:
        metadata (dict, str or Metadata):
            Metadata dict, path to the metadata JSON file or Metadata instance itself.
        tables (dict):
            Dictionary with the table names as key and ``pandas.DataFrame`` instances
            as values. If ``None`` is given, the tables will be loaded from the paths
            indicated in ``metadata``. Defaults to ``None``.
        root_path (str or None):
            Path to the dataset directory. If ``None`` and metadata is a path, the
            metadata location is used. If ``None`` and metadata is a dict, the current
            working directory is used.
    """
    # Anything that is not already a Metadata instance gets wrapped into one.
    if not isinstance(metadata, Metadata):
        metadata = Metadata(metadata, root_path)

    self.metadata = metadata
    self.metadata.validate(tables)

    self.modeler = Modeler(self.metadata, self.model, self.model_kwargs)
    self.modeler.model_database(tables)

    self.sampler = Sampler(
        self.metadata,
        self.modeler.models,
        self.model,
        self.model_kwargs
    )
def test__unflatten_dict_child_name(self):
    """_unflatten_dict keeps ``__CHILD_TABLE`` style keys together instead of splitting them."""
    # Setup
    navigator_mock = MagicMock()
    modeler_mock = MagicMock()
    sampler = Sampler(navigator_mock, modeler_mock)

    expected_result = {
        'first_key': {
            'a': {
                'b': 1
            },
            '__CHILD_TABLE': {
                'model_param': 0
            }
        },
        'distribs': {
            '__CHILD_TABLE__distribs__UNIT_PRICE__std': {
                'mean': 0
            }
        }
    }
    flat = {
        'first_key__a__b': 1,
        'first_key____CHILD_TABLE__model_param': 0,
        'distribs____CHILD_TABLE__distribs__UNIT_PRICE__std__mean': 0
    }

    # Run
    result = sampler._unflatten_dict(flat)

    # Check
    assert result == expected_result
    modeler_mock.assert_not_called()
    navigator_mock.assert_not_called()
def test_sample_table(self, rows_mock):
    """sample_table asks the mocked row sampler for as many rows as the table has.

    The first element of the table's ``data.shape`` is expected to be forwarded to
    ``rows_mock`` and only the entry for the requested table is returned.
    """
    # Setup
    table_mock = MagicMock()
    table_mock.data.shape = ('rows', 'columns')

    navigator_mock = MagicMock(spec=DataNavigator)
    navigator_mock.tables = {'table': table_mock}
    modeler_mock = MagicMock(spec=Modeler)
    sampler = Sampler(data_navigator=navigator_mock, modeler=modeler_mock)

    rows_mock.return_value = {'table': 'samples'}

    # Run
    result = sampler.sample_table('table', reset_primary_keys=False)

    # Check
    assert result == 'samples'
    rows_mock.assert_called_once_with(
        sampler, 'table', 'rows', sample_children=False, reset_primary_keys=False)
def test__unflatten_dict_child_name(self):
    """_unflatten_dict keeps child table keys intact, looking children up by table name."""
    # Setup
    navigator_mock = MagicMock()
    navigator_mock.get_children.return_value = ['CHILD_TABLE']
    modeler_mock = MagicMock()
    sampler = Sampler(navigator_mock, modeler_mock)

    expected_result = {
        'first_key': {
            'a': 1,
            '__CHILD_TABLE': {
                'model_param': 0
            }
        },
        'distribs': {
            '__CHILD_TABLE__distribs__UNIT_PRICE__std': {
                'mean': 0
            }
        }
    }
    flat = {
        'first_key__a': 1,
        'first_key____CHILD_TABLE__model_param': 0,
        'distribs____CHILD_TABLE__distribs__UNIT_PRICE__std__mean': 0
    }

    # Run
    result = sampler._unflatten_dict(flat, 'TABLE_NAME')

    # Check
    assert result == expected_result
    modeler_mock.assert_not_called()
    navigator_mock.get_children.assert_called_once_with('TABLE_NAME')
def test_sample_all(self, rows_mock):
    """sample_all samples only the tables for which get_parents returns a falsy value."""
    # Setup
    navigator_mock = MagicMock()
    navigator_mock.tables = ['TABLE_A', 'TABLE_B']
    # Only TABLE_A is reported as having no parents.
    navigator_mock.get_parents.side_effect = lambda x: x != 'TABLE_A'
    modeler_mock = MagicMock()
    sampler = Sampler(navigator_mock, modeler_mock)

    def fake_dataframe(*args, **kwargs):
        # Mimic the row sampler by recording its output in the shared dict.
        kwargs['sampled_data'][args[1]] = 'sampled_data'

    rows_mock.side_effect = fake_dataframe

    # Run
    result = sampler.sample_all(num_rows=5)

    # Check
    assert result == {'TABLE_A': 'sampled_data'}

    expected_get_parents_call_list = [
        (('TABLE_A', ), {}),
        (('TABLE_B', ), {})
    ]
    assert navigator_mock.get_parents.call_args_list == expected_get_parents_call_list

    rows_mock.assert_called_once_with(
        sampler, 'TABLE_A', 5, sampled_data={'TABLE_A': 'sampled_data'})
def test__get_missing_valid_rows(self):
    """_get_missing_valid_rows returns a number of missing rows and a dataframe.

    The dataframe contains ``valid_rows`` concatenated to ``synthesized`` with the
    index reset. The integer is the difference between ``num_rows`` and the number
    of rows in the returned dataframe.
    """
    # Setup
    navigator_mock = MagicMock(spec=DataNavigator)
    modeler_mock = MagicMock(spec=Modeler)
    sampler = Sampler(navigator_mock, modeler_mock)

    columns = list('AB')
    new_index = range(3, 5)
    synthesized = pd.DataFrame(columns=columns, index=new_index)
    drop_indices = pd.Series(False, index=new_index)
    valid_rows = pd.DataFrame(columns=columns, index=range(2))

    # Run
    missing_rows, valid_rows = sampler._get_missing_valid_rows(
        synthesized, drop_indices, valid_rows, 5)

    # Check
    expected_rows = pd.DataFrame(columns=list('AB'), index=[0, 1, 2, 3])
    assert missing_rows == 1
    assert valid_rows.equals(expected_rows)

    navigator_mock.assert_not_called()
    assert navigator_mock.method_calls == []
    modeler_mock.assert_not_called()
    assert modeler_mock.method_calls == []
def test_sample_rows_parent_table(self, primary_mock, parent_mock, sample_mock,
                                  update_mock, trans_mock):
    """sample_rows uses the model stored in modeler.models when the table has no parents."""
    # Setup
    navigator_mock = MagicMock(spec=DataNavigator)
    modeler_mock = MagicMock(spec=Modeler)
    modeler_mock.models = {'parent_table': 'model for parent table'}
    sampler = Sampler(data_navigator=navigator_mock, modeler=modeler_mock)

    sampled_rows = pd.DataFrame()
    primary_mock.return_value = ('primary_key', pd.Series(range(5)))
    parent_mock.return_value = None  # no parent row: the table is a root table
    sample_mock.return_value = sampled_rows
    update_mock.return_value = {'table_name': 'samples'}
    trans_mock.return_value = 'transformed rows'

    # Run
    result = sampler.sample_rows('parent_table', 5)

    # Check
    assert result == {'parent_table': 'transformed rows'}
    assert sampler.sampled == {'table_name': 'samples'}

    primary_mock.assert_called_once_with(sampler, 'parent_table', 5)
    parent_mock.assert_called_once_with(sampler, 'parent_table')
    sample_mock.assert_called_once_with(
        sampler, 'model for parent table', 5, 'parent_table')
    update_mock.assert_called_once_with(
        {}, 'parent_table', ('primary_key', sample_mock.return_value))
    trans_mock.assert_called_once_with(sampler, sample_mock.return_value, 'parent_table')
def test__get_missing_valid_rows_excess_rows(self):
    """When more rows than requested are available, the result is trimmed to num_rows."""
    # Setup
    navigator_mock = MagicMock(spec=DataNavigator)
    modeler_mock = MagicMock(spec=Modeler)
    sampler = Sampler(navigator_mock, modeler_mock)

    columns = list('AB')
    new_index = range(3, 7)
    synthesized = pd.DataFrame(columns=columns, index=new_index)
    drop_indices = pd.Series(False, index=new_index)
    valid_rows = pd.DataFrame(columns=columns, index=range(2))

    # Run
    missing_rows, valid_rows = sampler._get_missing_valid_rows(
        synthesized, drop_indices, valid_rows, 5)

    # Check
    expected_rows = pd.DataFrame(columns=list('AB'), index=range(5))
    assert missing_rows == 0
    assert valid_rows.equals(expected_rows)

    navigator_mock.assert_not_called()
    assert navigator_mock.method_calls == []
    modeler_mock.assert_not_called()
    assert modeler_mock.method_calls == []
def test__sample_model(self, qualified_mock):
    """_sample_model requests num_rows from the model and wraps them in a DataFrame."""
    # Setup
    sampler = Sampler(MagicMock(spec=DataNavigator), MagicMock(spec=Modeler))

    sampled_values = np.array([
        [1, 1, 1],
        [2, 2, 2],
        [3, 3, 3]
    ])
    model = MagicMock()
    model.sample.return_value = sampled_values
    qualified_mock.return_value = 'package.module.full_qualified_name'

    column_names = list('ABC')
    expected_result = pd.DataFrame(sampled_values, columns=column_names)

    # Run
    result = sampler._sample_model(model, 3, column_names)

    # Check
    assert result.equals(expected_result)
    qualified_mock.assert_called_once_with(model)
    model.sample.assert_called_once_with(3)
def test__sample_model_vine(self, qualified_mock):
    """_sample_model calls model.sample repeatedly when the model is a VineCopula.

    With three rows requested, three separate ``model.sample(3)`` calls are
    expected, each contributing one row of the resulting DataFrame.
    """
    # Setup
    sampler = Sampler(MagicMock(spec=DataNavigator), MagicMock(spec=Modeler))

    row_samples = [
        np.array([1, 1, 1]),
        np.array([2, 2, 2]),
        np.array([3, 3, 3])
    ]
    model = MagicMock()
    model.sample.side_effect = row_samples
    qualified_mock.return_value = 'copulas.multivariate.vine.VineCopula'

    column_names = list('ABC')
    expected_result = pd.DataFrame(row_samples, columns=column_names)

    # Run
    result = sampler._sample_model(model, 3, column_names)

    # Check
    assert result.equals(expected_result)
    qualified_mock.assert_called_once_with(model)
    assert model.sample.call_args_list == [((3, ), ), ((3, ), ), ((3, ), )]
def test__unflatten_dict(self):
    """_unflatten_dict turns ``__``-separated keys into nested dictionaries."""
    # Setup
    navigator_mock = MagicMock()
    modeler_mock = MagicMock()
    sampler = Sampler(navigator_mock, modeler_mock)

    expected_result = {
        'a': {
            'first_key': {
                'a': 1,
                'b': 2
            },
        },
        'b': {
            'second_key': {
                'x': 0
            },
        }
    }
    flat = {
        'a__first_key__a': 1,
        'a__first_key__b': 2,
        'b__second_key__x': 0
    }

    # Run
    result = sampler._unflatten_dict(flat)

    # Check
    assert result == expected_result
    navigator_mock.assert_not_called()
    modeler_mock.assert_not_called()
def test_sample_all(self, rows_mock, child_mock, reset_mock, concat_mock):
    """sample_all samples parentless tables one row at a time and resets the indices.

    For five requested rows, ``rows_mock`` is expected to be called five times with a
    single row each, ``child_mock`` five times, and the concatenated result is
    expected to be passed through ``reset_mock``.
    """
    # Setup
    navigator_mock = MagicMock()
    navigator_mock.tables = ['TABLE_A', 'TABLE_B']
    # Only TABLE_A is reported as having no parents.
    navigator_mock.get_parents.side_effect = lambda x: x != 'TABLE_A'
    modeler_mock = MagicMock()
    sampler = Sampler(navigator_mock, modeler_mock)

    def fake_dataframe(name, number):
        records = [{name: 0} for _ in range(number)]
        return pd.DataFrame(records, index=[0] * number)

    rows_mock.side_effect = fake_dataframe
    concat_mock.return_value = 'concatenated_dataframe'

    # Run
    result = sampler.sample_all(num_rows=5)

    # Check
    expected_get_parents_call_list = [(('TABLE_A',), {}), (('TABLE_B',), {})]
    expected_rows_mock_call_list = [(('TABLE_A', 1), {}) for _ in range(5)]

    assert navigator_mock.get_parents.call_args_list == expected_get_parents_call_list
    assert result == reset_mock.return_value
    assert rows_mock.call_args_list == expected_rows_mock_call_list
    assert child_mock.call_count == 5
    reset_mock.assert_called_once_with({'TABLE_A': 'concatenated_dataframe'})
def test_model_database_vine_modeler_single_table(self):
    """model_database and sample_all work with a VineCopula on a single table."""
    # Setup
    navigator_mock = MagicMock(spec=DataNavigator)
    modeler = Modeler(data_navigator=navigator_mock, model=VineCopula)

    # Setup - Mock
    table_meta = {
        'name': 'table_name',
        'fields': {
            'column_A': {
                'name': 'A',
                'type': 'categorical'
            },
            'column_B': {
                'name': 'B',
                'type': 'number',
                'subtype': 'integer'
            }
        }
    }
    raw_data = pd.DataFrame({
        'column_A': list('abdc'),
        'column_B': range(4)
    })
    transformed = pd.DataFrame({
        'column_A': [0.1, 0.2, 0.5, 1.0],
        'column_B': range(4)
    })

    navigator_mock.tables = {'table_name': Table(raw_data, table_meta)}
    # A single, isolated table: no parents and no children.
    navigator_mock.get_parents.return_value = set()
    navigator_mock.get_children.return_value = set()
    navigator_mock.transformed_data = {'table_name': transformed}
    navigator_mock.meta = {
        'tables': [
            {
                'name': table_meta
            }
        ]
    }
    navigator_mock.ht = MagicMock()
    navigator_mock.ht.transformers = {
        ('table_name', 'column_A'): None,
        ('table_name', 'column_B'): None
    }

    # Run
    modeler.model_database()

    # Check
    assert 'table_name' in modeler.models

    sampler = Sampler(navigator_mock, modeler)
    samples = sampler.sample_all()
    assert 'table_name' in samples
def test__unflatten_gaussian_copula(self):
    """_unflatten_gaussian_copula adds the distribution name, type and fitted flag.

    The triangular covariance is also expected to come back as a square matrix.
    """
    # Setup
    navigator_mock = MagicMock()
    modeler_mock = MagicMock()
    modeler_mock.model_kwargs = {'distribution': 'distribution_name'}
    sampler = Sampler(navigator_mock, modeler_mock)

    expected_result = {
        'some': 'key',
        'distribution': 'distribution_name',
        'covariance': [
            [1, 0],
            [0, 1]
        ],
        'distribs': {
            0: {
                'type': 'distribution_name',
                'fitted': True,
                'first': 'distribution',
                'std': 1
            },
            1: {
                'type': 'distribution_name',
                'fitted': True,
                'second': 'distribution',
                'std': 1
            }
        }
    }
    model_parameters = {
        'some': 'key',
        'covariance': [
            [1],
            [0, 1]
        ],
        'distribs': {
            0: {
                'first': 'distribution',
                'std': 0
            },
            1: {
                'second': 'distribution',
                'std': 0
            }
        }
    }

    # Run
    result = sampler._unflatten_gaussian_copula(model_parameters)

    # Check
    assert result == expected_result
    navigator_mock.assert_not_called()
    modeler_mock.assert_not_called()
def test__unflatten_gaussian_copula_negative_std(self):
    """_unflatten_gaussian_copula maps zero or negative std values to positive ones.

    The expected values are ``1`` for a std of ``0`` and ``np.exp(-1)`` for a std
    of ``-1``.
    """
    # Setup
    navigator_mock = MagicMock()
    modeler_mock = MagicMock()
    modeler_mock.model_kwargs = {'distribution': 'distribution_name'}
    sampler = Sampler(navigator_mock, modeler_mock)

    expected_result = {
        'some': 'key',
        'distribution': 'distribution_name',
        'covariance': [
            [1, 0],
            [0, 1]
        ],
        'distribs': {
            0: {
                'type': 'distribution_name',
                'fitted': True,
                'first': 'distribution',
                'std': 1
            },
            1: {
                'type': 'distribution_name',
                'fitted': True,
                'second': 'distribution',
                'std': np.exp(-1)
            }
        }
    }
    model_parameters = {
        'some': 'key',
        'covariance': [
            [1],
            [0, 1]
        ],
        'distribs': {
            0: {
                'first': 'distribution',
                'std': 0
            },
            1: {
                'second': 'distribution',
                'std': -1
            }
        }
    }

    # Run
    result = sampler._unflatten_gaussian_copula(model_parameters)

    # Check
    assert result == expected_result
    navigator_mock.assert_not_called()
    modeler_mock.assert_not_called()
def test__prepare_sampled_covariance(self):
    """_prepare_sampled_covariance turns a ragged covariance into the expected matrix."""
    # Setup
    sampler = Sampler(None, None)
    covariance = [[0, 1], [1]]

    # Run
    result = sampler._prepare_sampled_covariance(covariance)

    # Asserts
    expected = np.array([
        [1., 1.],
        [1., 1.0]
    ])
    np.testing.assert_almost_equal(result, expected)
def fit(self):
    """Load the dataset, transform it, model the database and build a sampler."""
    self.dn = CSVDataLoader(self.meta_file_name).load_data()

    # The data has to be transformed before the modeler can use it.
    self.dn.transform_data()

    self.modeler = Modeler(self.dn)
    self.modeler.model_database()

    self.sampler = Sampler(self.dn, self.modeler)
def test___init__(self):
    """Sampler stores metadata and models and starts with empty primary key state."""
    # Setup
    models = {'test': Mock()}

    # Run
    sampler = Sampler('test_metadata', models)

    # Asserts
    assert sampler.metadata == 'test_metadata'
    assert sampler.models == models
    assert sampler.primary_key == dict()
    assert sampler.remaining_primary_key == dict()
def test_sample_rows_children_table(self, primary_mock, parent_mock, model_mock,
                                    extension_mock, sample_mock, update_mock, trans_mock):
    """sample_rows builds the model from the parent extension when the table has parents."""
    # Setup
    navigator_mock = MagicMock(spec=DataNavigator)
    navigator_mock.foreign_keys = {
        ('child_table', 'parent_name'): ('parent_pk', 'child_fk')
    }
    modeler_mock = MagicMock(spec=Modeler)
    sampler = Sampler(data_navigator=navigator_mock, modeler=modeler_mock)

    parent_rows = pd.DataFrame({'foreign_key': [0, 1, 2]})
    primary_mock.return_value = ('primary_key', pd.Series(range(5)))
    parent_mock.return_value = ('parent_name', 'foreign_key', parent_rows)
    extension_mock.return_value = 'extension'
    model_mock.return_value = 'model from extension'
    sample_mock.return_value = pd.DataFrame()
    update_mock.return_value = {'table_name': 'samples'}
    trans_mock.return_value = 'transformed_rows'

    # Run
    result = sampler.sample_rows('child_table', 5)

    # Check
    assert result == {'child_table': 'transformed_rows'}
    assert sampler.sampled == {'table_name': 'samples'}

    primary_mock.assert_called_once_with(sampler, 'child_table', 5)
    parent_mock.assert_called_once_with(sampler, 'child_table')
    sample_mock.assert_called_once_with(
        sampler, 'model from extension', 5, 'child_table')
    update_mock.assert_called_once_with(
        {}, 'child_table', ('primary_key', sample_mock.return_value))
    trans_mock.assert_called_once_with(sampler, sample_mock.return_value, 'child_table')

    # The extension call receives a DataFrame, so its arguments are checked one by one.
    extension_calls = extension_mock.call_args_list
    assert len(extension_calls) == 1

    args, kwargs = extension_calls[0]
    assert kwargs == {}
    assert len(args) == 4
    assert args[0] == sampler
    assert args[1].equals(pd.DataFrame({'foreign_key': [0]}))
    assert args[2] == 'child_table'
    assert args[3] == 'parent_name'

    model_mock.assert_called_once_with(sampler, 'extension')
def test__get_primary_keys_no_pk(self):
    """Without a primary key in the metadata, _get_primary_keys returns a pair of None."""
    # Setup
    navigator_mock = MagicMock(spec=DataNavigator)
    navigator_mock.get_meta_data.return_value = {}
    modeler_mock = MagicMock(spec=Modeler)
    sampler = Sampler(data_navigator=navigator_mock, modeler=modeler_mock)

    # Run
    primary_key, primary_key_values = sampler._get_primary_keys('table', 5)

    # Check
    assert primary_key is None
    assert primary_key_values is None
def test___init__(self):
    """Sampler stores every constructor argument and starts with empty primary key state."""
    # Setup
    models = {'test': Mock()}
    model_kwargs = {'model': 'kwargs'}
    table_sizes = {'table': 'sizes'}

    # Run
    sampler = Sampler('test_metadata', models, SDVModel, model_kwargs, table_sizes)

    # Asserts
    assert sampler.metadata == 'test_metadata'
    assert sampler.models == models
    assert sampler.primary_key == dict()
    assert sampler.remaining_primary_key == dict()
    assert sampler.model == SDVModel
    assert sampler.model_kwargs == {'model': 'kwargs'}
    assert sampler.table_sizes == {'table': 'sizes'}
def fit(self):
    """Load the dataset, check its structure, transform it and model the database.

    Raises:
        ValueError: If the provided dataset has an unsupported structure.
    """
    self.dn = CSVDataLoader(self.meta_file_name).load_data()

    # Reject unsupported datasets before spending time on transforming and modeling.
    self._check_unsupported_dataset_structure()

    self.dn.transform_data()

    self.modeler = Modeler(self.dn)
    self.modeler.model_database()

    self.sampler = Sampler(self.dn, self.modeler)
def test__reset_primary_keys_generators(self):
    """_reset_primary_keys_generators empties both the generators and the counters."""
    # Setup
    sampler = Sampler(
        data_navigator=MagicMock(spec=DataNavigator),
        modeler=MagicMock(spec=Modeler)
    )
    sampler.primary_key = {'table': 'generator for table'}
    sampler.remaining_primary_key = {'table': 'counter for table'}

    # Run
    sampler._reset_primary_keys_generators()

    # Check
    assert sampler.primary_key == dict()
    assert sampler.remaining_primary_key == dict()
def test__prepare_sampled_covariance(self):
    """_prepare_sampled_covariance expands a lower-triangular list into a symmetric matrix."""
    # Setup
    sampler = Sampler(MagicMock(), MagicMock())

    covariance = [
        [1.0],
        [0.5, 1.0],
        [0.5, 0.5, 1.0]
    ]
    expected_result = np.array([
        [1.0, 0.5, 0.5],
        [0.5, 1.0, 0.5],
        [0.5, 0.5, 1.0]
    ])

    # Run
    result = sampler._prepare_sampled_covariance(covariance)

    # Check
    matches = result == expected_result
    assert matches.all().all()
def test__square_matrix(self):
    """_square_matrix pads a triangular list of lists with zeros into a square matrix."""
    # Setup
    sampler = Sampler(MagicMock(), MagicMock())

    triangular_matrix = [
        [1],
        [1, 1],
        [1, 1, 1]
    ]
    expected_result = [
        [1, 0, 0],
        [1, 1, 0],
        [1, 1, 1]
    ]

    # Run
    result = sampler._square_matrix(triangular_matrix)

    # Check
    assert result == expected_result
def test__unflatten_dict_mixed_array(self):
    """_unflatten_dict turns numeric key segments into lists, nested or holding dicts."""
    # Setup
    navigator_mock = MagicMock()
    modeler_mock = MagicMock()
    sampler = Sampler(navigator_mock, modeler_mock)

    expected_result = {
        'first_key': [
            [1, 0],
            [0, 1]
        ],
        'second_key': [
            {
                'std': 0.5,
                'mean': 0.5
            },
            {
                'std': 0.25,
                'mean': 0.25
            }
        ]
    }
    flat = {
        'first_key__0__0': 1,
        'first_key__0__1': 0,
        'first_key__1__0': 0,
        'first_key__1__1': 1,
        'second_key__0__std': 0.5,
        'second_key__0__mean': 0.5,
        'second_key__1__std': 0.25,
        'second_key__1__mean': 0.25
    }

    # Run
    result = sampler._unflatten_dict(flat)

    # Check
    assert result == expected_result
    navigator_mock.assert_not_called()
    modeler_mock.assert_not_called()
def test__get_primary_keys_raises_error(self):
    """_get_primary_keys raises a ValueError when fewer values remain than requested."""
    # Setup
    table_meta = {
        'primary_key': 'table_pk',
        'fields': {
            'table_pk': {
                'regex': 'regex for table_pk',
                'type': 'number',
                'subtype': 'integer'
            },
        }
    }
    navigator_mock = MagicMock(spec=DataNavigator)
    navigator_mock.get_meta_data.return_value = table_meta
    modeler_mock = MagicMock(spec=Modeler)

    sampler = Sampler(data_navigator=navigator_mock, modeler=modeler_mock)
    # A generator already exists for the table, but it has no values left.
    sampler.primary_key['table'] = 'a generator'
    sampler.remaining_primary_key['table'] = 0

    # Run / Check
    with self.assertRaises(ValueError):
        sampler._get_primary_keys('table', 5)
def test__get_primary_keys_create_generator(self, exrex_gen_mock, exrex_count_mock):
    """If there's a primary key, but no generator, a new one is created and used.

    The generator and the count are expected to be built from the regex found in
    the table metadata; after taking 5 of the 10 available values, 5 remain.
    """
    # Setup
    data_navigator = MagicMock(spec=DataNavigator)
    data_navigator.get_meta_data.return_value = {
        'primary_key': 'table_pk',
        'fields': {
            'table_pk': {
                'regex': 'regex for table_pk',
                'type': 'number',
                'subtype': 'integer'
            },
        }
    }
    modeler = MagicMock(spec=Modeler)
    sampler = Sampler(data_navigator=data_navigator, modeler=modeler)

    exrex_gen_mock.return_value = (str(x) for x in range(10))
    exrex_count_mock.return_value = 10

    expected_primary_key = 'table_pk'
    expected_primary_key_values = pd.Series(range(5))

    # Run
    result = sampler._get_primary_keys('table', 5)

    # Check
    primary_key, primary_key_values = result
    assert primary_key == expected_primary_key
    # The comparison result was previously discarded, so the generated values
    # were never actually verified; it has to be asserted.
    assert primary_key_values.equals(expected_primary_key_values)

    assert sampler.primary_key['table'] == exrex_gen_mock.return_value
    assert sampler.remaining_primary_key['table'] == 5

    data_navigator.get_meta_data.assert_called_once_with('table')
    exrex_count_mock.assert_called_once_with('regex for table_pk')
    exrex_gen_mock.assert_called_once_with('regex for table_pk')
def test__sample_valid_rows_raises_unfitted_model(self):
    """_sample_valid_rows raises a ValueError when the given model is None."""
    # Setup
    navigator_mock = MagicMock(spec=DataNavigator)
    navigator_mock.get_parents.return_value = set()
    modeler_mock = MagicMock(spec=Modeler)
    sampler = Sampler(navigator_mock, modeler_mock)

    # Run
    with self.assertRaises(ValueError):
        sampler._sample_valid_rows(None, 5, 'table_name')

    # Check
    modeler_mock.assert_not_called()
    assert len(modeler_mock.method_calls) == 0

    navigator_mock.assert_not_called()
    navigator_mock.get_parents.assert_called_once_with('table_name')