def test__fit(self, gct_mock, ht_mock, ctgan_fit_mock):
    """Test the ``CopulaGAN._fit`` method.

    The ``_fit`` method is expected to:
    - Build transformers for all the non-categorical data columns based on the
      field distributions.
    - Create a HyperTransformer with all the transformers.
    - Fit and transform the data with the HyperTransformer.
    - Call CTGAN fit.

    Args:
        gct_mock: mock standing in for ``GaussianCopulaTransformer``
            (presumably injected by a ``patch`` decorator -- confirm).
        ht_mock: mock standing in for ``HyperTransformer``
            (presumably injected by a ``patch`` decorator -- confirm).
        ctgan_fit_mock: mock standing in for the parent CTGAN fit
            (presumably injected by a ``patch`` decorator -- confirm).

    Setup:
        - mock _field_distribution and _default_distribution to return the desired
          distribution values

    Input:
        - pandas.DataFrame

    Expected Output:
        - None

    Side Effects:
        - GaussianCopulaTransformer is called with the expected distributions.
        - HyperTransformer is called to create a hyper transformer object.
        - HyperTransformer fit_transform is called with the expected data.
        - CTGAN's fit method is called exactly once.
    """
    # Setup
    model = Mock(spec_set=CopulaGAN)
    model._field_distributions = {'a': 'a_distribution'}
    model._default_distribution = 'default_distribution'
    model._metadata.get_fields.return_value = {
        'a': {},
        'b': {},
        'c': {
            'type': 'categorical'
        }
    }

    # Run
    data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [5, 6, 7],
        'c': ['c', 'c', 'c'],
    })
    out = CopulaGAN._fit(model, data)

    # Asserts
    assert out is None
    assert model._field_distributions == {'a': 'a_distribution'}
    gct_mock.assert_has_calls([
        call(distribution='a_distribution'),
        call(distribution='default_distribution'),
    ])
    assert gct_mock.call_count == 2

    assert model._ht == ht_mock.return_value

    # ``called_once_with`` is not an assertion: on a mock it is just an
    # auto-created attribute, so the check silently passed (and raises
    # AttributeError on Python 3.12+). Use the real assertion helpers.
    ht_mock.return_value.fit_transform.assert_called_once_with(DataFrameMatcher(data))

    # NOTE(review): the argument handed to the CTGAN fit is presumably the
    # output of the mocked ``fit_transform`` rather than ``data`` itself;
    # confirm against ``CopulaGAN._fit`` before pinning the exact argument.
    ctgan_fit_mock.assert_called_once()
def test__sample_conditions_with_multiple_conditions(self):
    """Check ``BaseTabularModel._sample_conditions`` with multiple conditions.

    ``_make_condition_dfs`` is mocked to return two condition dataframes, and
    ``_sample_with_conditions`` is expected to be called once for each of them.

    Input:
        - 2 conditions with different columns

    Output:
        - The expected sampled rows
    """
    # Setup
    first_values = {'cola': 'a'}
    second_values = {'colb': 1}
    requested = [
        Condition(first_values, num_rows=2),
        Condition(second_values, num_rows=3),
    ]
    first_sampled = pd.DataFrame({'a': ['a', 'a'], 'b': [1, 2]})
    second_sampled = pd.DataFrame({'a': ['b', 'c', 'a'], 'b': [1, 1, 1]})
    expected = pd.DataFrame({
        'a': ['a', 'a', 'b', 'c', 'a'],
        'b': [1, 2, 1, 1, 1],
    })

    model = Mock(spec_set=CTGAN)
    model._validate_file_path.return_value = None
    model._make_condition_dfs.return_value = [
        pd.DataFrame([first_values] * 2),
        pd.DataFrame([second_values] * 3),
    ]
    model._sample_with_conditions.side_effect = [first_sampled, second_sampled]

    # Run
    out = BaseTabularModel._sample_conditions(model, requested, 100, None, True, None)

    # Asserts
    # Fresh frames are built for the matchers so that they never alias the
    # objects handed to the code under test.
    expected_calls = [
        call(DataFrameMatcher(pd.DataFrame([values] * rows)), 100, None, ANY, None)
        for values, rows in ((first_values, 2), (second_values, 3))
    ]
    model._sample_with_conditions.assert_has_calls(expected_calls)
    pd.testing.assert_frame_equal(out, expected)
def test_sample_no_transformed_columns(self):
    """Check ``GaussianCopula.sample`` when no columns survive the transform.

    When the transformed conditions DataFrame has no columns, ``sample`` is
    expected not to forward any transformed conditions while conditionally
    sampling.

    Setup:
        - ``_make_conditions_df`` returns a dataframe holding the conditions.
        - ``_metadata.get_fields`` returns the conditioned column name.
        - ``_metadata.transform`` returns a column-less dataframe.
        - ``_conditionally_sample_rows`` returns the sampled rows.
        - ``_metadata.make_ids_unique`` returns the expected output.

    Input:
        - number of rows
        - one set of conditions

    Output:
        - the expected sampled rows

    Side Effects:
        - ``_conditionally_sample_rows`` is called once with the condition
          ``{'a': 'a'}`` followed by ``None``.
    """
    # Setup
    expected = pd.DataFrame(['a', 'a', 'a'])
    conditions_df = pd.DataFrame({'a': ['a', 'a', 'a']})
    empty_transformed = pd.DataFrame({}, index=[0, 1, 2])
    sampled_rows = pd.DataFrame({
        'a': ['a', 'a', 'a'],
        COND_IDX: [0, 1, 2],
    })

    instance = Mock(spec_set=GaussianCopula)
    instance._make_conditions_df.return_value = conditions_df
    instance._conditionally_sample_rows.return_value = sampled_rows
    metadata = instance._metadata
    metadata.get_fields.return_value = ['a']
    metadata.transform.return_value = empty_transformed
    metadata.make_ids_unique.return_value = expected

    # Run
    out = GaussianCopula.sample(instance, num_rows=3, conditions={'a': 'a'})

    # Asserts
    expected_conditions = pd.DataFrame({
        COND_IDX: [0, 1, 2],
        'a': ['a', 'a', 'a'],
    })
    instance._conditionally_sample_rows.assert_called_once_with(
        DataFrameMatcher(expected_conditions), 100, 10, {'a': 'a'}, None, 0.01, False)
    pd.testing.assert_frame_equal(out, expected)
def test__sample_remaining_columns(self):
    """Check the ``BaseTabularModel._sample_remaining_columns`` method.

    Given a valid DataFrame, ``_sample_with_conditions`` is expected to be
    called with that same DataFrame.

    Input:
        - DataFrame with condition column values populated.

    Output:
        - The expected sampled rows.

    Side Effects:
        - ``_sample_with_conditions`` is called once.
    """
    # Setup
    known_columns = pd.DataFrame([{'cola': 'a'}] * 5)
    sampled = pd.DataFrame({
        'cola': ['a'] * 5,
        'colb': [1, 2, 1, 1, 1],
    })

    model = Mock(spec_set=CTGAN)
    model._validate_file_path.return_value = None
    model._sample_with_conditions.return_value = sampled

    # Run
    out = BaseTabularModel._sample_remaining_columns(
        model,
        known_columns,
        100,
        None,
        True,
        None,
    )

    # Asserts
    model._sample_with_conditions.assert_called_once_with(
        DataFrameMatcher(known_columns),
        100,
        None,
        ANY,
        None,
    )
    pd.testing.assert_frame_equal(out, sampled)
def test_fit_with_null_values(self):
    """Check ``TabularPreset.fit`` on data that contains null values.

    The wrapped model's ``fit`` should receive the given data, and the share
    of nulls per column should be stored on the preset.

    Input:
        - fit data with one missing value out of three

    Side Effects:
        - The model's ``fit`` method is called with the same data.
        - ``_null_percentages`` is set to ``{'a': 1.0 / 3}``.
    """
    # Setup
    raw = {'a': [1, 2, np.nan]}
    metadata = Mock(**{'to_dict.return_value': {'fields': {'a': {}}}})
    model = Mock(_metadata=metadata)
    preset = Mock(_model=model, _null_column=False, _null_percentages=None)

    # Run
    TabularPreset.fit(preset, pd.DataFrame(raw))

    # Assert
    expected_frame = pd.DataFrame(raw)
    model.fit.assert_called_once_with(DataFrameMatcher(expected_frame))
    assert preset._null_percentages == {'a': 1.0 / 3}
def test_fit_null_column_True(self):
    """Check ``TabularPreset.fit`` when null columns are being modeled.

    With ``_null_column`` set to ``True`` the wrapped model's ``fit`` should
    still be called with the given data, and no null percentages are stored.

    Setup:
        - _null_column is True

    Input:
        - fit data

    Side Effects:
        - The model's ``fit`` method is called with the same data.
        - ``_null_percentages`` is ``None``
    """
    # Setup
    metadata = Mock(**{'to_dict.return_value': {'fields': {}}})
    model = Mock(_metadata=metadata)
    preset = Mock(_model=model, _null_column=True, _null_percentages=None)

    # Run
    TabularPreset.fit(preset, pd.DataFrame())

    # Assert
    expected_frame = pd.DataFrame()
    model.fit.assert_called_once_with(DataFrameMatcher(expected_frame))
    assert preset._null_percentages is None
def test_fit(self):
    """Check the ``TabularPreset.fit`` method.

    The wrapped model's ``fit`` should be called with the given data.

    Input:
        - fit data

    Side Effects:
        - The model's ``fit`` method is called with the same data.
        - ``_null_percentages`` is still ``None`` afterwards.
    """
    # Setup
    # ``_null_column`` is intentionally left unset, as in the original setup.
    metadata = Mock(**{'to_dict.return_value': {'fields': {}}})
    model = Mock(_metadata=metadata)
    preset = Mock(_model=model, _null_percentages=None)

    # Run
    TabularPreset.fit(preset, pd.DataFrame())

    # Assert
    expected_frame = pd.DataFrame()
    model.fit.assert_called_once_with(DataFrameMatcher(expected_frame))
    assert preset._null_percentages is None
def test__sample_with_conditions_no_transformed_columns(self):
    """Check ``BaseTabularModel._sample_with_conditions`` with no transformed columns.

    When the transformed conditions DataFrame has no columns, no transformed
    conditions are expected to be forwarded while conditionally sampling.

    Setup:
        - ``_make_condition_dfs`` returns the condition dataframe.
        - ``_metadata.get_fields`` returns the conditioned column name.
        - ``_metadata.transform`` returns a column-less dataframe.
        - ``_conditionally_sample_rows`` returns the sampled rows.
        - ``_metadata.make_ids_unique`` returns the expected output.

    Input:
        - the condition dataframe
        - 100 and ``None`` as the remaining positional arguments

    Output:
        - the expected sampled rows

    Side Effects:
        - ``_conditionally_sample_rows`` is called once with the condition
          ``{'a': 'a'}`` followed by ``None``.
    """
    # Setup
    expected = pd.DataFrame(['a', 'a', 'a'])
    conditions_df = pd.DataFrame({'a': ['a', 'a', 'a']})
    empty_transformed = pd.DataFrame({}, index=[0, 1, 2])
    sampled_rows = pd.DataFrame({
        'a': ['a', 'a', 'a'],
        COND_IDX: [0, 1, 2],
    })

    model = Mock(spec_set=CTGAN)
    model._make_condition_dfs.return_value = conditions_df
    model._conditionally_sample_rows.return_value = sampled_rows
    metadata = model._metadata
    metadata.get_fields.return_value = ['a']
    metadata.transform.return_value = empty_transformed
    metadata.make_ids_unique.return_value = expected

    # Run
    out = BaseTabularModel._sample_with_conditions(model, conditions_df, 100, None)

    # Asserts
    expected_conditions = pd.DataFrame({
        COND_IDX: [0, 1, 2],
        'a': ['a', 'a', 'a'],
    })
    model._conditionally_sample_rows.assert_called_once_with(
        DataFrameMatcher(expected_conditions),
        {'a': 'a'},
        None,
        100,
        None,
        progress_bar=None,
        output_file_path=None,
    )
    pd.testing.assert_frame_equal(out, expected)
def test__sample_batch_with_batch_size_per_try(self):
    """Test the `BaseTabularModel._sample_batch` method with `batch_size_per_try`.

    Expect that the expected calls to `_sample_rows` are made.

    Input:
        - num_rows = 10
        - batch_size_per_try = 5

    Output:
        - The requested number of sampled rows.

    Side Effect:
        - Call `_sample_rows` method twice with the expected number of rows.
    """
    # Setup
    model = Mock(spec_set=CTGAN)
    sampled_data = pd.DataFrame({
        'column1': [28, 28, 21, 1, 2],
        'column2': [37, 37, 1, 4, 5],
        'column3': [93, 93, 6, 4, 12],
    })
    model._sample_rows.side_effect = [
        (sampled_data, 5),
        # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` builds
        # the same 10-row frame (original index labels are kept).
        (pd.concat([sampled_data, sampled_data], ignore_index=False), 10),
    ]

    # Run
    output = BaseTabularModel._sample_batch(model, num_rows=10, batch_size_per_try=5)

    # Assert
    # ``has_calls`` is not an assertion helper: on a mock it is an
    # auto-created (always truthy) attribute, so ``assert mock.has_calls(...)``
    # could never fail, and it raises AttributeError on Python 3.12+.
    model._sample_rows.assert_has_calls([
        call(5, None, None, 0.01, DataFrameMatcher(pd.DataFrame())),
        call(5, None, None, 0.01, DataFrameMatcher(sampled_data)),
    ])
    assert model._sample_rows.call_count == 2
    assert len(output) == 10