def test_recreate():
    """Fit and sample a ``GaussianCopula`` built three different ways.

    The model is created without metadata, with the metadata object taken from
    the first fitted model, and with that same metadata converted to a dict.
    Each sample must have the shape and dtypes of the demo ``users`` table and
    must not contain any row made up entirely of nulls.
    """
    users = load_demo(metadata=False)['users']

    def check_sample(synthetic):
        # Same checks for every variant: shape, dtypes, no all-null row.
        assert synthetic.shape == users.shape
        assert (synthetic.dtypes == users.dtypes).all()
        assert (synthetic.notnull().sum(axis=1) != 0).all()

    # No metadata
    plain_model = GaussianCopula()
    plain_model.fit(users)
    check_sample(plain_model.sample())

    # Metadata
    metadata_model = GaussianCopula(table_metadata=plain_model.get_metadata())
    metadata_model.fit(users)
    check_sample(metadata_model.sample())

    # Metadata dict
    metadata_dict = plain_model.get_metadata().to_dict()
    dict_model = GaussianCopula(table_metadata=metadata_dict)
    dict_model.fit(users)
    check_sample(dict_model.sample())
def test_sample_no_transformed_columns(self):
    """Test ``sample`` when the transformed conditions have no columns.

    If transforming the conditions yields a DataFrame without columns, no
    transformed condition should be forwarded when sampling conditionally.

    Setup:
        - ``_make_conditions_df`` returns the expected conditions DataFrame.
        - ``_metadata.get_fields`` returns the conditioned column.
        - ``_metadata.transform`` returns a column-less DataFrame.
        - ``_conditionally_sample_rows`` returns the sampled rows.
        - ``_metadata.make_ids_unique`` returns the expected output.

    Input:
        - number of rows
        - one set of conditions

    Output:
        - the expected sampled rows

    Side Effects:
        - ``_conditionally_sample_rows`` is called once with the given
          condition and ``None`` as the transformed condition.
    """
    # Setup
    values = ['a', 'a', 'a']
    row_ids = [0, 1, 2]
    expected = pd.DataFrame(['a', 'a', 'a'])

    instance = Mock(spec_set=GaussianCopula)
    instance._make_conditions_df.return_value = pd.DataFrame({'a': list(values)})
    instance._metadata.get_fields.return_value = ['a']
    instance._metadata.transform.return_value = pd.DataFrame({}, index=list(row_ids))
    instance._conditionally_sample_rows.return_value = pd.DataFrame({
        'a': list(values),
        COND_IDX: list(row_ids),
    })
    instance._metadata.make_ids_unique.return_value = expected

    # Run
    out = GaussianCopula.sample(instance, num_rows=3, conditions={'a': 'a'})

    # Asserts
    expected_conditions = pd.DataFrame({
        COND_IDX: list(row_ids),
        'a': list(values),
    })
    instance._conditionally_sample_rows.assert_called_once_with(
        DataFrameMatcher(expected_conditions),
        100,
        10,
        {'a': 'a'},
        None,
        0.01,
        False,
    )
    pd.testing.assert_frame_equal(out, expected)
def test_gaussian_copula():
    """Rebuild a fitted ``GaussianCopula`` from its metadata and parameters.

    A model is fitted on the demo ``users`` table with explicit field types,
    an anonymized ``country`` field and the ``one_hot_encoding`` categorical
    transformer. A second model is created from the first one's metadata,
    receives the first one's parameters and is sampled. The sample is checked
    for shape, for a sequential ``user_id`` and for country values that differ
    from the originals; the metadata ``fields`` entry is checked as well.
    """
    # NOTE(review): another ``test_gaussian_copula`` appears later in the
    # visible source. If both live in the same module the later definition
    # replaces this one and this test is never collected -- confirm.
    users = load_demo(metadata=False)['users']

    numerical_integer = {'type': 'numerical', 'subtype': 'integer'}
    categorical = {'type': 'categorical'}

    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types={'age': dict(numerical_integer), 'country': dict(categorical)},
        primary_key='user_id',
        anonymize_fields={'country': 'country_code'},
        categorical_transformer='one_hot_encoding',
    )
    gc.fit(users)

    fitted_parameters = gc.get_parameters()
    new_gc = GaussianCopula(
        table_metadata=gc.get_metadata(),
        categorical_transformer='one_hot_encoding',
    )
    new_gc.set_parameters(fitted_parameters)

    sampled = new_gc.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    expected_ids = list(range(0, len(users)))
    assert list(sampled['user_id']) == expected_ids

    # country codes have been replaced with new ones
    original_countries = set(users.country.unique())
    assert set(sampled.country.unique()) != original_countries

    metadata = gc.get_metadata().to_dict()
    expected_fields = {
        'user_id': {'type': 'id', 'subtype': 'integer'},
        'country': dict(categorical),
        'gender': dict(categorical),
        'age': dict(numerical_integer),
    }
    assert metadata['fields'] == expected_fields

    assert 'model_kwargs' in metadata
def test_sample_empty_transformed_conditions():
    """Test that ``_sample_batch`` gets ``None`` for empty transformed conditions.

    ``sample`` is expected to return the sampled data and to pass ``None`` to
    ``_sample_batch`` as the transformed conditions when the metadata
    transform produces an empty DataFrame.

    Input:
        - Number of rows to sample
        - Conditions

    Output:
        - Sampled data
    """
    # Setup
    column_names = ('column1', 'column2', 'column3')
    training = pd.DataFrame({name: list(range(100)) for name in column_names})

    conditions = {'column1': 25}
    conditions_series = pd.Series([25, 25, 25, 25, 25], name='column1')

    batch_output = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })

    model = GaussianCopula()
    model._sample_batch = Mock()
    model._sample_batch.return_value = batch_output
    model.fit(training)

    # The real metadata is swapped for a mock only after fitting.
    model._metadata = Mock()
    model._metadata.get_fields.return_value = ['column1', 'column2', 'column3']
    model._metadata.transform.return_value = pd.DataFrame()
    model._metadata.make_ids_unique.side_effect = lambda x: x

    # Run
    output = model.sample(5, conditions=conditions, graceful_reject_sampling=True)

    # Assert
    expected_output = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })
    transform_call = model._metadata.transform.mock_calls[0]
    transform_args, transform_kwargs = transform_call[1], transform_call[2]
    pd.testing.assert_series_equal(transform_args[0]['column1'], conditions_series)
    assert transform_kwargs['on_missing_column'] == 'drop'
    model._metadata.transform.assert_called_once()
    model._sample_batch.assert_called_with(5, 100, 10, conditions, None, 0.01)
    pd.testing.assert_frame_equal(output, expected_output)
def test_fit_with_unique_constraint_on_data_which_has_index_column():
    """Test ``fit`` with a ``Unique`` constraint on data with an ``index`` column.

    The data contains a column literally named ``index`` next to the primary
    key and the constrained column. Fitting and sampling must run without
    error and the sampled constrained column must hold unique values.

    Setup:
        - The model receives the ``Unique`` constraint and the primary key.
        - The ``Unique`` constraint is set on ``test_column``.

    Input:
        - Data, Unique constraint

    Github Issue:
        - Tests that https://github.com/sdv-dev/SDV/issues/616 does not occur
    """
    # Setup
    frame = pd.DataFrame({
        "key": [1, 2, 3, 4, 5],
        "index": ["A", "B", "C", "D", "E"],
        "test_column": ["A1", "B2", "C3", "D4", "E5"],
    })
    unique_constraint = Unique(column_names=["test_column"])
    model = GaussianCopula(primary_key="key", constraints=[unique_constraint])

    # Run
    model.fit(frame)
    samples = model.sample(2)

    # Assert
    assert len(samples) == 2
    assert samples["test_column"].is_unique
def test_conditional_sampling_dict():
    """Sample with a dict condition and check the conditioned column.

    Thirty rows are requested with ``column2`` fixed to ``"b"``; the result
    must have the shape of the training data and ``"b"`` must be the only
    value present in ``column2``.
    """
    training = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
    })

    model = GaussianCopula()
    model.fit(training)
    sampled = model.sample(30, conditions={"column2": "b"})

    assert sampled.shape == training.shape
    observed_values = set(sampled["column2"].unique())
    assert observed_values == {"b"}
def test_conditional_sampling_dataframe():
    """Sample with a DataFrame of conditions and check the result row by row.

    One row is expected per condition row, and ``column2`` of the sample must
    equal the requested values in the same order.
    """
    training = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
    })
    requested = ["b", "b", "b", "c", "c"]
    conditions = pd.DataFrame({"column2": list(requested)})

    model = GaussianCopula()
    model.fit(training)
    sampled = model.sample(conditions=conditions)

    assert sampled.shape[0] == len(conditions["column2"])
    matches = sampled["column2"] == np.array(requested)
    assert matches.all()
def test_conditional_sampling_two_conditions():
    """Sample with two simultaneous conditions and check both columns.

    Five rows are requested with ``column2`` fixed to ``"b"`` and ``column3``
    fixed to ``"f"``; every sampled row must carry both values.
    """
    training = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
        "column3": ["d", "e", "f"] * 10,
    })

    model = GaussianCopula()
    model.fit(training)
    samples = model.sample(5, conditions={"column2": "b", "column3": "f"})

    assert list(samples.column2) == ['b'] * 5
    assert list(samples.column3) == ['f'] * 5
def test_conditional_sampling_numerical():
    """Sample with a condition on a numerical column.

    Five rows are requested with ``column1`` fixed to ``1.0``; every sampled
    row must carry exactly that value.
    """
    training = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
        "column3": ["d", "e", "f"] * 10,
    })

    model = GaussianCopula()
    model.fit(training)
    sampled = model.sample(5, conditions={"column1": 1.0})

    assert list(sampled.column1) == [1.0] * 5
def test_fit_with_unique_constraint_on_data_subset():
    """Test ``fit`` with a ``Unique`` constraint on a subset of the data.

    The subset is taken with ``iloc``, so its index has gaps relative to the
    original frame. Fitting and sampling must run without error and the
    sampled constrained column must hold unique values.

    Setup:
        - The model receives a ``Unique`` constraint and is fitted on a
          subset of the specified data.

    Input:
        - Subset of data, unique constraint

    Github Issue:
        - Tests that https://github.com/sdv-dev/SDV/issues/610 does not occur
    """
    # Setup
    full_frame = pd.DataFrame({
        "key": [1, 2, 3, 4, 5],
        "test_column": ["A", "B", "C", "D", "E"],
    })
    unique_constraint = Unique(column_names=["test_column"])

    # Keep rows 1, 3 and 4 only, leaving holes in the index.
    subset = full_frame.iloc[[1, 3, 4]]

    model = GaussianCopula(primary_key="key", constraints=[unique_constraint])

    # Run
    model.fit(subset)
    samples = model.sample(2)

    # Assert
    assert len(samples) == 2
    assert samples["test_column"].is_unique
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    ``age`` is declared as categorical and modeled with the ``categorical``
    transformer; both the original and the sampled column must be ``int64``.

    The origin of this test can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194
    """
    users = load_demo(metadata=False)['users']

    age_as_categorical = {'age': {'type': 'categorical'}}
    gc = GaussianCopula(
        field_types=age_as_categorical,
        categorical_transformer='categorical',
    )
    gc.fit(users)

    sampled = gc.sample()

    for frame in (users, sampled):
        assert frame['age'].dtype == np.int64
def test_ids_only():
    """Ensure tables containing nothing but id fields can be modeled.

    Both columns are declared with type ``id``; the sample must have the same
    shape as the input and be equal to it.
    """
    id_columns = ('id', 'other_id')
    ids_only = pd.DataFrame({name: range(10) for name in id_columns})

    model = GaussianCopula(field_types={name: {'type': 'id'} for name in id_columns})
    model.fit(ids_only)
    sampled = model.sample()

    assert sampled.shape == ids_only.shape
    assert ids_only.equals(sampled)
def test_sample_batches_transform_conditions_correctly():
    """Test that transformed conditions are batched correctly.

    ``sample`` is expected to call ``_sample_batch`` once for every unique
    group of transformed conditions. The mocked transform maps the five
    requested conditions onto three distinct transformed values, so three
    batch calls are expected.

    Input:
        - Number of rows to sample
        - Conditions

    Output:
        - Sampled data
    """
    # Setup
    column_names = ('column1', 'column2', 'column3')
    training = pd.DataFrame({name: list(range(100)) for name in column_names})

    conditions = {'column1': [25, 25, 25, 30, 30]}
    conditions_series = pd.Series([25, 25, 25, 30, 30], name='column1')

    # One mocked batch result per expected ``_sample_batch`` call.
    batch_results = [
        pd.DataFrame({
            'column1': [25, 25, 25],
            'column2': [37, 37, 37],
            'column3': [93, 93, 93],
        }),
        pd.DataFrame({
            'column1': [30],
            'column2': [37],
            'column3': [93],
        }),
        pd.DataFrame({
            'column1': [30],
            'column2': [37],
            'column3': [93],
        }),
    ]

    model = GaussianCopula()
    model._sample_batch = Mock()
    model._sample_batch.side_effect = batch_results
    model.fit(training)

    # The real metadata is swapped for a mock only after fitting.
    model._metadata = Mock()
    model._metadata.get_fields.return_value = ['column1', 'column2', 'column3']
    model._metadata.transform.return_value = pd.DataFrame(
        [[50], [50], [50], [60], [70]],
        columns=['transformed_column'],
    )

    # Run
    model.sample(5, conditions=conditions, graceful_reject_sampling=True)

    # Assert
    transform_call = model._metadata.transform.mock_calls[0]
    transform_args, transform_kwargs = transform_call[1], transform_call[2]
    pd.testing.assert_series_equal(transform_args[0]['column1'], conditions_series)
    assert transform_kwargs['on_missing_column'] == 'drop'
    model._metadata.transform.assert_called_once()

    batch_mock = model._sample_batch
    batch_mock.assert_any_call(
        3, 100, 10, {'column1': 25}, {'transformed_column': 50}, 0.01)
    batch_mock.assert_any_call(
        1, 100, 10, {'column1': 30}, {'transformed_column': 60}, 0.01)
    batch_mock.assert_any_call(
        1, 100, 10, {'column1': 30}, {'transformed_column': 70}, 0.01)
def test_conditional_sampling_constraint_uses_columns_model_reject_sampling(
        column_model_mock):
    """Test that ``sample`` handles constraints with conditions.

    With ``fit_columns_model=True`` the constraint is expected to sample the
    columns it needs that are missing from the conditions. Values produced by
    the column model should all end up valid, because reject sampling is
    applied to the ones that are not.

    Setup:
        - The model receives a ``GreaterThan`` constraint and is asked to
          sample with one condition on one of the constraint columns.
        - ``column_model_mock`` (presumably injected by a patch decorator
          outside this block -- confirm) returns batches that include rows
          expected to be rejected.

    Input:
        - Conditions

    Side Effects:
        - The mocked column model is sampled three times and only the kept
          rows appear in the output.
    """
    # Setup
    training = pd.DataFrame({
        'age_joined': [22.0, 21.0, 15.0, 18.0, 29.0],
        'age': [27.0, 28.0, 26.0, 21.0, 30.0],
        'experience_years': [6.0, 7.0, 11.0, 3.0, 7.0],
    })
    greater_than = GreaterThan(
        low='age_joined',
        high='age',
        handling_strategy='transform',
        fit_columns_model=True,
        drop='high',
    )
    model = GaussianCopula(constraints=[greater_than])

    # Successive batches handed back by the mocked column model.
    first_batch = pd.DataFrame({
        'age_joined': [26.0, 18.0, 31.0, 29.0, 32.0],
        'age': [30.0, 30.0, 30.0, 30.0, 30.0]
    })
    second_batch = pd.DataFrame({
        'age_joined': [28.0, 33.0, 31.0],
        'age': [30.0, 30.0, 30.0]
    })
    third_batch = pd.DataFrame({
        'age_joined': [27.0],
        'age': [30.0]
    })
    column_sampler = column_model_mock.return_value.sample
    column_sampler.side_effect = [first_batch, second_batch, third_batch]
    model.fit(training)

    # Run
    sampled_data = model.sample(5, conditions={'age': 30.0})

    # Assert
    assert len(column_model_mock.return_value.sample.mock_calls) == 3

    expected_result = pd.DataFrame({
        'age_joined': [26.0, 18.0, 29.0, 28.0, 27.0],
        'age': [30.0, 30.0, 30.0, 30.0, 30.0]
    })
    pd.testing.assert_frame_equal(
        sampled_data[['age_joined', 'age']],
        expected_result[['age_joined', 'age']],
    )
def test_conditional_sampling_constraint_uses_columns_model(gm_mock):
    """Test that ``sample`` handles constraints with conditions.

    With ``fit_columns_model=True`` the constraint is expected to sample the
    columns it needs that are missing from the conditions, and then apply its
    transformation so the conditions can be met.

    Setup:
        - The model receives a ``UniqueCombinations`` constraint and is asked
          to sample with two conditions, one of which the constraint depends
          on.
        - ``gm_mock`` (presumably injected by a patch decorator outside this
          block -- confirm) supplies the numeric batches returned by the
          underlying model.

    Input:
        - Conditions

    Side Effects:
        - Every call to the underlying ``sample`` receives conditions holding
          ``age`` equal to 30 and a ``city#state`` entry.
    """
    # Setup
    unique_combinations = UniqueCombinations(
        columns=['city', 'state'],
        handling_strategy='transform',
        fit_columns_model=True,
    )
    training = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30]
    })
    model = GaussianCopula(
        constraints=[unique_combinations],
        categorical_transformer='label_encoding',
    )

    # Successive batches handed back by the mocked underlying model.
    numeric_batches = [
        pd.DataFrame({
            'city#state': [2],
            'age': [30]
        }),
        pd.DataFrame({
            'city#state': [1, 1, 0, 0, 0],
            'age': [30, 30, 30, 30, 30]
        }),
        pd.DataFrame({
            'city#state': [0, 0, 1, 1, 1],
            'age': [30, 30, 30, 30, 30]
        }),
    ]
    gm_mock.return_value.sample.side_effect = numeric_batches
    model.fit(training)

    # Run
    sampled_data = model.sample(5, conditions={'age': 30, 'state': 'CA'})

    # Assert
    expected_states = pd.Series(['CA', 'CA', 'CA', 'CA', 'CA'], name='state')
    expected_ages = pd.Series([30, 30, 30, 30, 30], name='age')

    sample_calls = model._model.sample.mock_calls
    assert 2 <= len(sample_calls) <= 3
    # Index 2 of a recorded call holds its keyword arguments.
    assert all(call_[2]['conditions']['age'] == 30 for call_ in sample_calls)
    assert all('city#state' in call_[2]['conditions'] for call_ in sample_calls)

    pd.testing.assert_series_equal(sampled_data['age'], expected_ages)
    pd.testing.assert_series_equal(sampled_data['state'], expected_states)
    assert all(city in ('SF', 'LA') for city in sampled_data['city'])
def test_conditional_sampling_constraint_uses_reject_sampling(gm_mock):
    """Test that ``sample`` handles constraints with conditions.

    With ``fit_columns_model=False`` the columns that belong to a constraint
    cannot be conditioned on directly, so they are expected to be dropped
    from the transformed conditions and enforced through reject sampling.

    Setup:
        - The model receives a ``UniqueCombinations`` constraint and is asked
          to sample with two conditions, one of which the constraint depends
          on.
        - ``gm_mock`` (presumably injected by a patch decorator outside this
          block -- confirm) supplies the numeric batches returned by the
          underlying model.

    Input:
        - Conditions

    Side Effects:
        - The underlying ``sample`` is called twice, both times with only
          the ``age`` condition.
    """
    # Setup
    unique_combinations = UniqueCombinations(
        columns=['city', 'state'],
        handling_strategy='transform',
        fit_columns_model=False,
    )
    training = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30]
    })
    model = GaussianCopula(
        constraints=[unique_combinations],
        categorical_transformer='label_encoding',
    )

    # Successive batches handed back by the mocked underlying model.
    numeric_batches = [
        pd.DataFrame({
            'city#state': [0, 1, 2, 0, 0],
            'age': [30, 30, 30, 30, 30]
        }),
        pd.DataFrame({
            'city#state': [1],
            'age': [30]
        }),
    ]
    gm_mock.return_value.sample.side_effect = numeric_batches
    model.fit(training)

    # Run
    sampled_data = model.sample(5, conditions={'age': 30, 'state': 'CA'})

    # Assert
    expected_transformed_conditions = {'age': 30}
    expected_data = pd.DataFrame({
        'city': ['LA', 'SF', 'LA', 'LA', 'SF'],
        'state': ['CA', 'CA', 'CA', 'CA', 'CA'],
        'age': [30, 30, 30, 30, 30]
    })

    underlying_sample = model._model.sample
    assert len(underlying_sample.mock_calls) == 2
    underlying_sample.assert_any_call(5, conditions=expected_transformed_conditions)
    underlying_sample.assert_any_call(1, conditions=expected_transformed_conditions)
    pd.testing.assert_frame_equal(sampled_data, expected_data)
def test_gaussian_copula():
    """Check parameter extraction and re-creation of a ``GaussianCopula``.

    Two scenarios are covered on the demo ``users`` table:

    - With ``gaussian_kde`` as the default distribution, ``get_parameters``
      must raise ``NonParametricError``.
    - With ``bounded`` as the default distribution, the parameters can be
      extracted and loaded into a second model built from the first one's
      metadata. The sample of that second model is validated for shape,
      sequential ``user_id`` values and replaced country codes, and the
      metadata dict of the fitted model is validated field by field.
    """
    # NOTE(review): another ``test_gaussian_copula`` appears earlier in the
    # visible source. If both live in the same module this definition
    # replaces the earlier one, which is then never collected -- confirm and
    # rename one of them.
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical'
        }
    }
    anonymize_fields = {'country': 'country_code'}

    # Arguments shared by both scenarios; only the default distribution differs.
    common_kwargs = {
        'field_names': ['user_id', 'country', 'gender', 'age'],
        'field_types': field_types,
        'primary_key': 'user_id',
        'anonymize_fields': anonymize_fields,
        'field_distributions': {'age': 'gamma'},
    }

    # If distribution is non parametric, get_parameters fails
    gc = GaussianCopula(default_distribution='gaussian_kde', **common_kwargs)
    gc.fit(users)
    with pytest.raises(NonParametricError):
        # Only the exception matters here, so the result is not bound.
        gc.get_parameters()

    # If distribution is parametric, copula can be recreated
    gc = GaussianCopula(default_distribution='bounded', **common_kwargs)
    gc.fit(users)

    parameters = gc.get_parameters()
    new_gc = GaussianCopula(table_metadata=gc.get_metadata())
    new_gc.set_parameters(parameters)

    # Validate sampled data
    sampled = new_gc.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(len(users)))

    # country codes have been replaced with new ones
    assert set(sampled.country.unique()) != set(users.country.unique())

    # Validate metadata
    metadata = gc.get_metadata().to_dict()

    assert metadata['fields'] == {
        'user_id': {
            'type': 'id',
            'subtype': 'integer',
            'transformer': 'integer',
        },
        'country': {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'country_code',
            'transformer': 'one_hot_encoding',
        },
        'gender': {
            'type': 'categorical',
            'transformer': 'one_hot_encoding',
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
            'transformer': 'integer',
        }
    }

    assert 'model_kwargs' in metadata
    assert 'GaussianCopula' in metadata['model_kwargs']