def test_parameters():
    """Check that model parameters survive a round trip through the metadata dict.

    A ``GaussianCopula`` is created with explicit parameters, its metadata is
    exported via ``get_metadata().to_dict()`` and used to build a second model.
    The second model must map the ``'O'`` dtype to ``'label_encoding'``.
    """
    original = GaussianCopula(
        field_distributions={'foo': 'beta'},
        default_distribution='gaussian_kde',
        categorical_transformer='label_encoding',
    )
    exported_metadata = original.get_metadata().to_dict()

    rebuilt = GaussianCopula(table_metadata=exported_metadata)

    dtype_transformers = rebuilt._metadata._dtype_transformers
    assert dtype_transformers['O'] == 'label_encoding'
def test___init___copies_metadata():
    """Test that ``__init__`` copies the metadata it is given.

    Modifications made by one model must not leak into the input metadata
    or into another model built from the same metadata.

    Setup:
        - Build and fit two models from the same metadata and data, each with
          a different ``default_distribution``.

    Expected behavior:
        - The metadata held by each model and the provided metadata all
          compare as different.
        - Every distribution reported by each model matches that model's own
          ``default_distribution``.
    """
    # Setup
    metadata, data = load_tabular_demo('student_placements', metadata=True)
    shared_kwargs = {
        'table_metadata': metadata,
        'categorical_transformer': 'label_encoding',
    }

    # Run
    gamma_model = GaussianCopula(default_distribution='gamma', **shared_kwargs)
    gamma_model.fit(data)
    beta_model = GaussianCopula(default_distribution='beta', **shared_kwargs)
    beta_model.fit(data)

    # Assert
    assert gamma_model._metadata != metadata
    assert gamma_model._metadata != beta_model._metadata
    assert beta_model._metadata != metadata

    gamma_fqn = 'copulas.univariate.gamma.GammaUnivariate'
    beta_fqn = 'copulas.univariate.beta.BetaUnivariate'
    gamma_distributions = gamma_model.get_distributions().values()
    beta_distributions = beta_model.get_distributions().values()
    assert all(name == gamma_fqn for name in gamma_distributions)
    assert all(name == beta_fqn for name in beta_distributions)
def test_conditional_sampling_constraint_uses_reject_sampling(gm_mock, isinstance_mock):
    """Test that ``sample_conditions`` handles constraints with conditions.

    Only the condition columns that are not part of the constraint are
    expected to reach the underlying model's ``sample`` call.

    Setup:
        - The model is given a ``FixedCombinations`` constraint over ``city``
          and ``state`` and is asked to sample with two conditions, one of
          which (``state``) belongs to the constraint.
        - The mocked underlying model returns two batches of numeric data.

    Input:
        - Conditions

    Side Effects:
        - The underlying ``sample`` method is called twice, and one of the
          calls receives only ``{'age.value': 30}`` as conditions.
    """
    # Setup
    isinstance_mock.side_effect = _isinstance_side_effect
    constraint = FixedCombinations(column_names=['city', 'state'])
    data = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30],
    })
    model = GaussianCopula(
        constraints=[constraint],
        categorical_transformer='label_encoding',
    )
    first_batch = pd.DataFrame({
        'city#state.value': [0, 1, 2, 0, 0],
        'age.value': [30, 30, 30, 30, 30],
    })
    second_batch = pd.DataFrame({
        'city#state.value': [1],
        'age.value': [30],
    })
    gm_mock.return_value.sample.side_effect = [first_batch, second_batch]
    model.fit(data)

    # Run
    condition = Condition({'age': 30, 'state': 'CA'}, num_rows=5)
    sampled_data = model.sample_conditions(conditions=[condition])

    # Assert
    expected_transformed_conditions = {'age.value': 30}
    expected_data = pd.DataFrame({
        'city': ['LA', 'SF', 'LA', 'LA', 'SF'],
        'state': ['CA', 'CA', 'CA', 'CA', 'CA'],
        'age': [30, 30, 30, 30, 30],
    })
    underlying_sample = model._model.sample
    assert len(underlying_sample.mock_calls) == 2
    underlying_sample.assert_any_call(50, conditions=expected_transformed_conditions)
    pd.testing.assert_frame_equal(sampled_data, expected_data)
def test_fit_with_unique_constraint_on_data_which_has_index_column():
    """Test ``fit`` with a ``Unique`` constraint on data that has an ``index`` column.

    The model must fit and sample without error when the training data
    contains a column literally named ``index`` next to other columns.

    Setup:
        - The model is given a ``Unique`` constraint on ``test_column`` and
          ``key`` as the primary key.

    Input:
        - Data, Unique constraint

    Github Issue:
        - Tests that https://github.com/sdv-dev/SDV/issues/616 does not occur
    """
    # Setup
    training_data = pd.DataFrame({
        "key": [1, 2, 3, 4, 5],
        "index": ["A", "B", "C", "D", "E"],
        "test_column": ["A1", "B2", "C3", "D4", "E5"],
    })
    unique_constraint = Unique(column_names=["test_column"])
    model = GaussianCopula(primary_key="key", constraints=[unique_constraint])

    # Run
    model.fit(training_data)
    sampled = model.sample(2)

    # Assert
    assert len(sampled) == 2
    assert sampled["test_column"].is_unique
def test_conditional_sampling_dataframe():
    """Sample with a conditions DataFrame passed to ``sample``.

    The output must have one row per condition row, and its ``column2``
    values must match the requested values element by element.
    """
    training_data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
    })
    model = GaussianCopula()
    model.fit(training_data)

    conditions = pd.DataFrame({"column2": ["b", "b", "b", "c", "c"]})
    sampled = model.sample(conditions=conditions)

    expected_column2 = np.array(["b", "b", "b", "c", "c"])
    assert sampled.shape[0] == len(conditions["column2"])
    assert (sampled["column2"] == expected_column2).all()
def test_conditional_sampling_two_conditions():
    """Sample five rows while conditioning on two columns via a dict.

    Every sampled row must carry the requested ``column2`` and ``column3``
    values.
    """
    training_data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
        "column3": ["d", "e", "f"] * 10,
    })
    model = GaussianCopula()
    model.fit(training_data)

    requested = {"column2": "b", "column3": "f"}
    sampled = model.sample(5, conditions=requested)

    assert list(sampled.column2) == ['b'] * 5
    assert list(sampled.column3) == ['f'] * 5
def test_conditional_sampling_dict():
    """Sample thirty rows while conditioning on one column via a dict.

    The sampled table must have the same shape as the training data and
    ``column2`` must contain only the requested value.
    """
    training_data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
    })
    model = GaussianCopula()
    model.fit(training_data)

    sampled = model.sample(30, conditions={"column2": "b"})

    assert sampled.shape == training_data.shape
    observed_values = set(sampled["column2"].unique())
    assert observed_values == {"b"}
def test_conditional_sampling_two_conditions():
    """Sample with a single ``Condition`` that fixes two columns.

    ``sample_conditions`` is asked for five rows; every sampled row must
    carry the requested ``column2`` and ``column3`` values.
    """
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })
    model = GaussianCopula()
    model.fit(training_data)

    condition = Condition({'column2': 'b', 'column3': 'f'}, num_rows=5)
    sampled = model.sample_conditions(conditions=[condition])

    assert list(sampled.column2) == ['b'] * 5
    assert list(sampled.column3) == ['f'] * 5
def test_conditional_sampling_dataframe():
    """Sample the remaining columns for a DataFrame of known values.

    ``sample_remaining_columns`` must return one row per input row, with
    ``column2`` matching the provided values element by element.
    """
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
    })
    model = GaussianCopula()
    model.fit(training_data)

    known_columns = pd.DataFrame({'column2': ['b', 'b', 'b', 'c', 'c']})
    sampled = model.sample_remaining_columns(known_columns)

    expected_column2 = np.array(['b', 'b', 'b', 'c', 'c'])
    assert sampled.shape[0] == len(known_columns['column2'])
    assert (sampled['column2'] == expected_column2).all()
def test_conditional_sampling_dict():
    """Sample thirty rows through a single one-column ``Condition``.

    The sampled table must have the same shape as the training data and
    ``column2`` must contain only the requested value.
    """
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
    })
    model = GaussianCopula()
    model.fit(training_data)

    condition = Condition({'column2': 'b'}, num_rows=30)
    sampled = model.sample_conditions(conditions=[condition])

    assert sampled.shape == training_data.shape
    observed_values = set(sampled['column2'].unique())
    assert observed_values == {'b'}
def test_conditional_sampling_numerical():
    """Sample five rows while conditioning on a numerical column via a dict.

    Every sampled row must have ``column1`` equal to the requested value.
    """
    training_data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
        "column3": ["d", "e", "f"] * 10,
    })
    model = GaussianCopula()
    model.fit(training_data)

    requested = {"column1": 1.0}
    sampled = model.sample(5, conditions=requested)

    assert list(sampled.column1) == [1.0] * 5
def test_conditional_sampling_numerical():
    """Sample five rows through a ``Condition`` on a numerical column.

    Every row returned by ``sample_conditions`` must have ``column1`` equal
    to the requested value.
    """
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })
    model = GaussianCopula()
    model.fit(training_data)

    condition = Condition({'column1': 1.0}, num_rows=5)
    sampled = model.sample_conditions(conditions=[condition])

    assert list(sampled.column1) == [1.0] * 5
def test__rebuild_correlation_matrix_outside(self):
    """Test ``_rebuild_correlation_matrix`` with an invalid correlation input.

    When the triangular input holds values outside of [-1, 1], the rebuilt
    square matrix is expected to have them scaled down into the valid range.

    Input:
        - list of lists with values outside of -1 and 1

    Expected Output:
        - the square correlation matrix, with a unit diagonal and the
          off-diagonal values scaled down
    """
    # Setup
    out_of_range_triangle = [
        [1.0],
        [2.0, 1.0],
    ]

    # Run
    correlation = GaussianCopula._rebuild_correlation_matrix(out_of_range_triangle)

    # Assert
    expected = [
        [1.0, 0.5, 1.0],
        [0.5, 1.0, 0.5],
        [1.0, 0.5, 1.0],
    ]
    assert expected == correlation
def test__rebuild_correlation_matrix_valid(self):
    """Test ``_rebuild_correlation_matrix`` with a valid correlation input.

    When every triangular value is between -1 and 1, the square matrix is
    expected to be rebuilt with those same values and a unit diagonal.

    Input:
        - list of lists with values between -1 and 1

    Expected Output:
        - the square correlation matrix holding the input values
    """
    # Setup
    in_range_triangle = [
        [0.1],
        [0.2, 0.3],
    ]

    # Run
    correlation = GaussianCopula._rebuild_correlation_matrix(in_range_triangle)

    # Assert
    expected = [
        [1.0, 0.1, 0.2],
        [0.1, 1.0, 0.3],
        [0.2, 0.3, 1.0],
    ]
    assert expected == correlation
def test__get_nearest_correlation_matrix_valid(self):
    """Test ``_get_nearest_correlation_matrix`` with a psd input.

    A positive semi-definite matrix must be handed back untouched.

    Input:
        - matrix which is positive semi-definite.

    Expected Output:
        - the very same object that was passed in, with unchanged values.
    """
    # Setup
    identity_rows = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]
    correlation_matrix = np.array(identity_rows)

    # Run
    output = GaussianCopula._get_nearest_correlation_matrix(correlation_matrix)

    # Assert
    expected = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]
    assert expected == output.tolist()
    # Identity check: the input must be returned as is, not copied.
    assert output is correlation_matrix
def test_sample_no_transformed_columns(self):
    """Test the ``sample`` method when the transformed conditions have no columns.

    If transforming the conditions yields a DataFrame without columns,
    ``sample`` is expected to pass ``None`` as the transformed condition
    when conditionally sampling.

    Setup:
        - Mock ``_make_conditions_df`` to return the conditions dataframe and
          ``_metadata.get_fields`` to return the conditioned column.
        - Mock ``_metadata.transform`` to return a dataframe with an index
          but no columns.
        - Mock ``_conditionally_sample_rows`` to return the sampled rows.
        - Mock ``_metadata.make_ids_unique`` to return the expected output.

    Input:
        - number of rows
        - one set of conditions

    Output:
        - the expected sampled rows

    Side Effects:
        - ``_conditionally_sample_rows`` is called once with the given
          condition and a transformed condition of ``None``.
    """
    # Setup
    gaussian_copula = Mock(spec_set=GaussianCopula)
    expected = pd.DataFrame(['a', 'a', 'a'])

    conditions_df = pd.DataFrame({'a': ['a', 'a', 'a']})
    empty_transformed = pd.DataFrame({}, index=[0, 1, 2])
    sampled_rows = pd.DataFrame({
        'a': ['a', 'a', 'a'],
        COND_IDX: [0, 1, 2],
    })

    gaussian_copula._make_conditions_df.return_value = conditions_df
    gaussian_copula._metadata.get_fields.return_value = ['a']
    gaussian_copula._metadata.transform.return_value = empty_transformed
    gaussian_copula._conditionally_sample_rows.return_value = sampled_rows
    gaussian_copula._metadata.make_ids_unique.return_value = expected

    # Run
    out = GaussianCopula.sample(gaussian_copula, num_rows=3, conditions={'a': 'a'})

    # Asserts
    expected_conditions = DataFrameMatcher(
        pd.DataFrame({
            COND_IDX: [0, 1, 2],
            'a': ['a', 'a', 'a'],
        })
    )
    gaussian_copula._conditionally_sample_rows.assert_called_once_with(
        expected_conditions,
        100,
        10,
        {'a': 'a'},
        None,
        0.01,
        False,
    )
    pd.testing.assert_frame_equal(out, expected)
def test_sample_conditions(self):
    """Test the ``sample_conditions`` method.

    The arguments must be forwarded to ``_sample_conditions`` and its
    return value handed back to the caller.

    Input:
        - valid conditions

    Side Effects:
        - ``_sample_conditions`` is called once with the conditions, ``100``,
          the batch size, the randomize flag and the output file path.
    """
    # Setup
    model = Mock(spec_set=GaussianCopula)
    conditions = [Condition({'column1': 'b'}, num_rows=5)]
    batch_size = 1
    randomize_samples = False
    output_file_path = 'test.csv'

    # Run
    out = GaussianCopula.sample_conditions(
        model,
        conditions,
        batch_size=batch_size,
        randomize_samples=randomize_samples,
        output_file_path=output_file_path,
    )

    # Assert
    model._sample_conditions.assert_called_once_with(
        conditions,
        100,
        batch_size,
        randomize_samples,
        output_file_path,
    )
    assert out == model._sample_conditions.return_value
def test_sample_remaining_columns(self):
    """Test the ``sample_remaining_columns`` method.

    The arguments must be forwarded to ``_sample_remaining_columns`` and its
    return value handed back to the caller.

    Input:
        - valid DataFrame

    Side Effects:
        - ``_sample_remaining_columns`` is called once with the DataFrame,
          ``100``, the batch size, the randomize flag and the output file
          path.
    """
    # Setup
    model = Mock(spec_set=GaussianCopula)
    known_columns = pd.DataFrame([{'cola': 'a'}] * 5)
    batch_size = 1
    randomize_samples = False
    output_file_path = 'test.csv'

    # Run
    out = GaussianCopula.sample_remaining_columns(
        model,
        known_columns,
        batch_size=batch_size,
        randomize_samples=randomize_samples,
        output_file_path=output_file_path,
    )

    # Assert
    forwarded = model._sample_remaining_columns
    forwarded.assert_called_once_with(
        known_columns,
        100,
        batch_size,
        randomize_samples,
        output_file_path,
    )
    assert out == forwarded.return_value
def test__sample(self):
    """Test the ``GaussianCopula._sample`` method.

    ``_sample`` is expected to delegate to ``self._model.sample`` with the
    requested number of rows and to return whatever that call returns.

    Input:
        - Integer

    Expected Output:
        - ``self._model.sample.return_value``

    Side Effects:
        - ``self._model.sample`` is called once with the given integer
    """
    # Setup
    n_rows = 2
    expected = pd.DataFrame([1, 2, 3])
    gaussian_copula = Mock(spec_set=GaussianCopula)
    gaussian_copula._model.sample.return_value = expected

    # Run
    out = GaussianCopula._sample(gaussian_copula, n_rows)

    # Asserts
    gaussian_copula._model.sample.assert_called_once_with(n_rows)
    assert expected.equals(out)
def test__get_nearest_correlation_matrix_invalid(self):
    """Test ``_get_nearest_correlation_matrix`` with a non psd input.

    A matrix that is not positive semi-definite must be modified so that
    the result is positive semi-definite.

    Input:
        - matrix which is not positive semi-definite.

    Expected Output:
        - modified matrix which is positive semi-definite.
    """
    # Setup
    not_psd_matrix = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, -1],
    ])

    # Run
    output = GaussianCopula._get_nearest_correlation_matrix(not_psd_matrix)

    # Assert
    expected = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]
    assert expected == output.tolist()

    # The input has at least one negative eigenvalue; the output has none.
    input_eigenvalues, _ = scipy.linalg.eigh(not_psd_matrix)
    result_eigenvalues, _ = scipy.linalg.eigh(output)
    assert (input_eigenvalues < 0).any()
    assert (result_eigenvalues >= 0).all()
def test_fit_with_unique_constraint_on_data_subset():
    """Test ``fit`` with a ``Unique`` constraint on a subset of the data.

    The model must fit and sample without error when it is trained on rows
    selected out of a larger table, so that the index of the training data
    has gaps.

    Setup:
        - The model is given a ``Unique`` constraint on ``test_column`` and
          ``key`` as the primary key.
        - Only the rows at positions 1, 3 and 4 of the table are used.

    Input:
        - Subset of data, unique constraint

    Github Issue:
        - Tests that https://github.com/sdv-dev/SDV/issues/610 does not occur
    """
    # Setup
    full_table = pd.DataFrame({
        "key": [1, 2, 3, 4, 5],
        "test_column": ["A", "B", "C", "D", "E"],
    })
    unique_constraint = Unique(column_names=["test_column"])
    subset = full_table.iloc[[1, 3, 4]]
    model = GaussianCopula(primary_key="key", constraints=[unique_constraint])

    # Run
    model.fit(subset)
    sampled = model.sample(2)

    # Assert
    assert len(sampled) == 2
    assert sampled["test_column"].is_unique
def test__sample_rows_previous_rows_appended_correctly():
    """Test ``_sample_rows`` when ``previous_rows`` is given.

    The previous rows and the newly sampled rows must be combined into a
    single frame whose index is reset.

    Input:
        - num_rows is 5
        - previous_rows is a DataFrame of 3 existing rows.

    Output:
        - 5 sampled rows with index set to [0, 1, 2, 3, 4]
        - the number of valid rows, 5
    """
    # Setup
    model = GaussianCopula()
    existing_rows = pd.DataFrame({
        'column1': [1, 2, 3],
        'column2': [4, 5, 6],
        'column3': [7, 8, 9],
    })
    fresh_rows = pd.DataFrame({
        'column1': [4, 5],
        'column2': [7, 8],
        'column3': [10, 11],
    })
    model._metadata = Mock()
    model._sample = Mock()
    model._sample.return_value = fresh_rows
    model._metadata.reverse_transform.return_value = fresh_rows
    # Treat every row as valid so that nothing gets filtered out.
    model._metadata.filter_valid = lambda x: x

    # Run
    sampled, num_valid = model._sample_rows(5, previous_rows=existing_rows)

    # Assert
    expected = pd.DataFrame({
        'column1': [1, 2, 3, 4, 5],
        'column2': [4, 5, 6, 7, 8],
        'column3': [7, 8, 9, 10, 11],
    })
    assert num_valid == 5
    pd.testing.assert_frame_equal(sampled, expected)
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    The origin of this test can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194
    """
    users = load_demo(metadata=False)['users']

    age_as_categorical = {'age': {'type': 'categorical'}}
    gc = GaussianCopula(
        field_types=age_as_categorical,
        categorical_transformer='categorical',
    )
    gc.fit(users)
    sampled = gc.sample()

    # Both the real and the sampled column must keep the int64 dtype.
    assert users['age'].dtype == np.int64
    assert sampled['age'].dtype == np.int64
def test_ids_only():
    """Ensure that tables that only contain id fields can be modeled.

    The sampled table must have the same shape as the input and be equal
    to it.
    """
    ids_only = pd.DataFrame({
        'id': range(10),
        'other_id': range(10),
    })
    id_field_types = {
        'id': {'type': 'id'},
        'other_id': {'type': 'id'},
    }

    model = GaussianCopula(field_types=id_field_types)
    model.fit(ids_only)
    sampled = model.sample()

    assert sampled.shape == ids_only.shape
    assert ids_only.equals(sampled)
def test__fit(self, mock_warnings, gm_mock):
    """Test the ``GaussianCopula._fit`` method.

    The ``_fit`` method is expected to:
        - Create a ``GaussianMultivariate`` object with the field
          distributions as its ``distribution``.
        - Store the ``GaussianMultivariate`` instance in ``self._model``.
        - Fit that instance with the given table data, unmodified.
        - Call the ``_update_metadata`` method.
        - Silence ``scipy`` warnings while doing so.

    Setup:
        - A mocked model whose ``_field_distributions`` is preset.
        - ``mock_warnings`` and ``gm_mock`` stand in for the warnings module
          and for ``GaussianMultivariate``.

    Input:
        - pandas.DataFrame

    Expected Output:
        - None

    Side Effects:
        - ``_field_distributions`` is left unchanged
        - GaussianMultivariate is called with the field distributions
        - GaussianMultivariate output is stored as ``self._model``
        - ``self._model.fit`` is called with the input dataframe
        - ``self._update_metadata`` is called without arguments
    """
    # Setup
    gaussian_copula = Mock(spec_set=GaussianCopula)
    gaussian_copula._field_distributions = {'a': 'a_distribution'}
    data = pd.DataFrame({'a': [1, 2, 3]})

    # Run
    out = GaussianCopula._fit(gaussian_copula, data)

    # Asserts
    assert out is None
    assert gaussian_copula._field_distributions == {'a': 'a_distribution'}
    gm_mock.assert_called_once_with(distribution={'a': 'a_distribution'})
    assert gaussian_copula._model == gm_mock.return_value

    expected_data = pd.DataFrame({'a': [1, 2, 3]})
    # First recorded ``fit`` call -> positional args -> first argument.
    first_fit_call = gaussian_copula._model.fit.call_args_list[0]
    positional_args = first_fit_call[0]
    passed_table_data = positional_args[0]
    pd.testing.assert_frame_equal(expected_data, passed_table_data)

    gaussian_copula._update_metadata.assert_called_once_with()
    mock_warnings.catch_warnings.assert_called_once()
    mock_warnings.filterwarnings.assert_called_once_with('ignore', module='scipy')
def test___init__metadata_object(self):
    """Test ``__init__`` passing a ``Table`` object.

    The model must end up with its own metadata, equivalent in content to
    the ``Table`` provided but not equal to it as an object, and the
    provided ``Table`` must not receive the extra model kwargs.

    Input:
        - table_metadata
        - default_distribution

    Side Effects:
        - the model metadata has the same fields as the provided one
        - ``field_distributions`` and ``categorical_transformer`` match the
          provided model kwargs
        - ``default_distribution`` is not added to the provided model kwargs
        - the model metadata is different than the object provided
    """
    # Setup
    gaussian_copula_kwargs = {
        'field_distributions': {
            'a_field': 'gaussian',
        },
        'categorical_transformer': 'categorical_fuzzy',
    }
    metadata_dict = {
        'name': 'test',
        'fields': {
            'a_field': {
                'type': 'categorical',
            },
        },
        'model_kwargs': {
            'GaussianCopula': gaussian_copula_kwargs,
        },
    }
    table_metadata = Table.from_dict(metadata_dict)

    # Run
    gc = GaussianCopula(
        default_distribution='gamma',
        table_metadata=table_metadata,
    )

    # Assert
    assert gc._metadata.get_fields() == table_metadata.get_fields()

    kwargs = gc._metadata.get_model_kwargs('GaussianCopula')
    provided_kwargs = table_metadata.get_model_kwargs('GaussianCopula')
    assert kwargs['field_distributions'] == provided_kwargs['field_distributions']
    assert kwargs['categorical_transformer'] == provided_kwargs['categorical_transformer']
    assert 'default_distribution' not in provided_kwargs
    assert gc._metadata != table_metadata
def test_sample_empty_transformed_conditions():
    """Test that ``None`` is passed to ``_sample_batch`` if transformed conditions are empty.

    The ``sample`` method is expected to:
        - Return the sampled data and pass ``None`` to ``_sample_batch`` as
          the transformed conditions.

    Setup:
        - ``_sample_batch`` is mocked to return two rows.
        - After fitting, the metadata is replaced by a mock whose
          ``transform`` returns an empty DataFrame.

    Input:
        - Number of rows to sample
        - Conditions

    Output:
        - Sampled data
    """
    # Setup
    model = GaussianCopula()
    data = pd.DataFrame({
        'column1': list(range(100)),
        'column2': list(range(100)),
        'column3': list(range(100)),
    })

    conditions = {'column1': 25}
    conditions_series = pd.Series([25, 25, 25, 25, 25], name='column1')

    batch = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })
    model._sample_batch = Mock()
    model._sample_batch.return_value = batch
    model.fit(data)

    metadata_mock = Mock()
    model._metadata = metadata_mock
    metadata_mock.get_fields.return_value = ['column1', 'column2', 'column3']
    metadata_mock.transform.return_value = pd.DataFrame()
    metadata_mock.make_ids_unique.side_effect = lambda x: x

    # Run
    output = model.sample(5, conditions=conditions, graceful_reject_sampling=True)

    # Assert
    expected_output = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })
    _, transform_args, transform_kwargs = model._metadata.transform.mock_calls[0]
    pd.testing.assert_series_equal(transform_args[0]['column1'], conditions_series)
    assert transform_kwargs['on_missing_column'] == 'drop'
    model._metadata.transform.assert_called_once()
    model._sample_batch.assert_called_with(5, 100, 10, conditions, None, 0.01)
    pd.testing.assert_frame_equal(output, expected_output)
def test__validate_distribution_fqn(self):
    """Test the ``_validate_distribution`` method passing a FQN distribution name.

    If the input is an importable FQN of a Python object, return the input.

    Input:
        - A univariate distribution FQN.

    Output:
        - The same FQN string that was passed in.
    """
    fqn = 'copulas.univariate.GaussianUnivariate'

    validated = GaussianCopula._validate_distribution(fqn)

    assert validated == 'copulas.univariate.GaussianUnivariate'
def test_get_parameters_non_parametric(self):
    """Test the ``get_parameters`` method when the model is not parametric.

    If at least one distribution in the model is not parametric, a
    ``NonParametricError`` should be raised.

    Setup:
        - ``self._model`` is set to a fitted ``GaussianMultivariate`` that
          uses ``GaussianKDE`` as its ``distribution``.

    Side Effects:
        - A NonParametricError is raised.
    """
    # Setup
    kde_model = GaussianMultivariate(distribution=GaussianKDE())
    kde_model.fit(pd.DataFrame([1, 1, 1]))
    gc = Mock()
    gc._model = kde_model

    # Run, Assert
    with pytest.raises(NonParametricError):
        GaussianCopula.get_parameters(gc)
def test___init__metadata_dict(self, init_mock, from_dict_mock):
    """Test ``__init__`` passing a table_metadata dict.

    In this case, the metadata is expected to be loaded from the dict and
    the loaded object passed to the parent ``__init__``.

    Input:
        - table_metadata
        - distribution
        - categorical_transformer

    Side Effects:
        - attributes are set to the right values
        - the mocked parent ``__init__`` is called once with the value
          returned by the mocked ``from_dict``
    """
    # Setup
    model_kwargs = {
        'GaussianCopula': {
            'distribution': {
                'a_field': 'gaussian',
            },
            'categorical_transformer': 'categorical_fuzzy',
        },
    }
    table_metadata = {
        'fields': {
            'a_field': {
                'type': 'categorical',
            },
        },
        'model_kwargs': model_kwargs,
    }

    # Run
    gc = GaussianCopula(
        distribution={'a_field': 'gaussian'},
        categorical_transformer='categorical_fuzzy',
        table_metadata=table_metadata,
    )

    # Assert
    assert gc._distribution == {'a_field': 'gaussian'}
    assert gc._categorical_transformer == 'categorical_fuzzy'
    assert gc._DTYPE_TRANSFORMERS == {'O': 'categorical_fuzzy'}

    expected_init_kwargs = {
        'field_names': None,
        'primary_key': None,
        'field_types': None,
        'field_transformers': None,
        'anonymize_fields': None,
        'constraints': None,
        'table_metadata': from_dict_mock.return_value,
    }
    init_mock.assert_called_once_with(**expected_init_kwargs)