def test__make_condition_dfs_with_multiple_conditions_same_column(model):
    """Test ``_make_condition_dfs`` works correctly with multiple conditions.

    The ``_make_condition_dfs`` method is expected to:
    - Combine conditions for conditions with the same columns.

    Input:
    - Conditions
    Output:
    - Conditions as ``[DataFrame]``
    """
    # Setup
    column_values1 = {'column2': 'M'}
    column_values2 = {'column2': 'N'}
    conditions = [
        Condition(column_values=column_values1, num_rows=2),
        Condition(column_values=column_values2, num_rows=3),
    ]
    expected_conditions = pd.DataFrame([column_values1] * 2 + [column_values2] * 3)

    # Run
    result_conditions_list = model._make_condition_dfs(conditions=conditions)

    # Assert
    assert len(result_conditions_list) == 1
    result_conditions = result_conditions_list[0]
    assert isinstance(result_conditions, pd.DataFrame)
    assert len(result_conditions) == 5
    # ``all(df == other)`` iterates over the column labels (truthy strings) and
    # therefore can never fail; compare the frames' contents explicitly instead.
    pd.testing.assert_frame_equal(result_conditions, expected_conditions)
def test__sample_conditions_with_multiple_conditions(self):
    """Test the `BaseTabularModel._sample_conditions` method with multiple conditions.

    When `_make_condition_dfs` returns several condition dataframes, expect
    `_sample_with_conditions` to be called once for each condition dataframe.

    Input:
    - 2 conditions with different columns
    Output:
    - The expected sampled rows
    """
    # Setup
    first_values = {'cola': 'a'}
    second_values = {'colb': 1}
    first_condition = Condition(first_values, num_rows=2)
    second_condition = Condition(second_values, num_rows=3)

    first_sampled = pd.DataFrame({'a': ['a', 'a'], 'b': [1, 2]})
    second_sampled = pd.DataFrame({'a': ['b', 'c', 'a'], 'b': [1, 1, 1]})
    expected = pd.DataFrame({
        'a': ['a', 'a', 'b', 'c', 'a'],
        'b': [1, 2, 1, 1, 1],
    })

    model = Mock(spec_set=CTGAN)
    model._validate_file_path.return_value = None
    model._make_condition_dfs.return_value = [
        pd.DataFrame([first_values] * 2),
        pd.DataFrame([second_values] * 3),
    ]
    model._sample_with_conditions.side_effect = [first_sampled, second_sampled]

    # Run
    out = BaseTabularModel._sample_conditions(
        model, [first_condition, second_condition], 100, None, True, None)

    # Asserts
    expected_calls = [
        call(DataFrameMatcher(pd.DataFrame([first_values] * 2)), 100, None, ANY, None),
        call(DataFrameMatcher(pd.DataFrame([second_values] * 3)), 100, None, ANY, None),
    ]
    model._sample_with_conditions.assert_has_calls(expected_calls)
    pd.testing.assert_frame_equal(out, expected)
def test__conditionally_sample_rows_graceful_reject_sampling_true(self):
    """Test the `BaseTabularModel._conditionally_sample_rows` method.

    When `_conditionally_sample_rows` is called with `graceful_reject_sampling`
    as True, expect no error even though `_sample_batch` yields no rows.

    Input:
    - An impossible condition
    Returns:
    - Empty DataFrame
    """
    # Setup
    impossible_values = {'cola': 'c'}
    condition = Condition(impossible_values, num_rows=2)
    transformed_conditions = pd.DataFrame([impossible_values] * 2)
    dataframe = pd.DataFrame([impossible_values] * 2)

    model = Mock(spec_set=CTGAN)
    model._validate_file_path.return_value = None
    model._sample_batch.return_value = pd.DataFrame()

    # Run
    sampled = BaseTabularModel._conditionally_sample_rows(
        model,
        dataframe,
        condition,
        transformed_conditions,
        graceful_reject_sampling=True,
    )

    # Assert
    assert len(sampled) == 0
    expected_args = (2, None, None, condition, transformed_conditions, 0.01, None, None)
    model._sample_batch.assert_called_once_with(*expected_args)
def test__sample_conditions_no_rows(self):
    """Test `BaseTabularModel._sample_conditions` with invalid condition.

    If no valid rows are returned for any condition, expect a ValueError.

    Input:
    - condition that is impossible to satisfy
    Side Effects:
    - ValueError is thrown
    """
    # Setup
    model = Mock(spec_set=CTGAN)
    condition = Condition(
        {'column1': 'b'},
        num_rows=5,
    )
    # ``_make_condition_dfs`` yields a *list* of condition dataframes. Returning
    # a bare DataFrame would make the caller iterate over its column labels
    # rather than over dataframes, so wrap the frame in a list.
    model._make_condition_dfs.return_value = [
        pd.DataFrame([{'column1': 'b'}] * 5),
    ]
    model._sample_with_conditions.return_value = pd.DataFrame()

    # Run and assert
    with pytest.raises(
        ValueError,
        match='Unable to sample any rows for the given conditions.',
    ):
        BaseTabularModel._sample_conditions(model, [condition], 100, None, True, None)
def test__make_condition_dfs_specifying_num_rows(model):
    """Test ``_make_condition_dfs`` works correctly when ``num_rows`` is passed.

    The ``_make_condition_dfs`` method is expected to:
    - Return as many condition rows as specified with ``num_rows`` as a ``DataFrame``.

    Input:
    - Conditions
    - Num_rows
    Output:
    - Conditions as ``[DataFrame]``
    """
    # Setup
    num_rows = 10
    column_values = {'column2': 'M'}
    conditions = [Condition(column_values=column_values, num_rows=num_rows)]
    expected_conditions = pd.DataFrame([column_values] * num_rows)

    # Run
    result_conditions_list = model._make_condition_dfs(conditions=conditions)

    # Assert
    assert len(result_conditions_list) == 1
    result_conditions = result_conditions_list[0]
    assert isinstance(result_conditions, pd.DataFrame)
    assert len(result_conditions) == num_rows
    # ``all(df == other)`` iterates over the column labels (truthy strings) and
    # therefore can never fail; compare the frames' contents explicitly instead.
    pd.testing.assert_frame_equal(result_conditions, expected_conditions)
def test__make_condition_dfs_without_num_rows(model):
    """Test ``_make_condition_dfs`` works correctly when ``num_rows`` is not passed.

    The ``_make_condition_dfs`` method is expected to:
    - Return conditions as a ``DataFrame`` for one row.

    Input:
    - Conditions
    Output:
    - Conditions as ``[DataFrame]``
    """
    # Setup
    column_values = {'column2': 'M'}
    conditions = [Condition(column_values=column_values)]
    expected_conditions = pd.DataFrame([column_values])

    # Run
    result_conditions_list = model._make_condition_dfs(conditions=conditions)

    # Assert
    assert len(result_conditions_list) == 1
    result_conditions = result_conditions_list[0]
    assert isinstance(result_conditions, pd.DataFrame)
    assert len(result_conditions) == 1
    # ``all(df == other)`` iterates over the column labels (truthy strings) and
    # therefore can never fail; compare the frames' contents explicitly instead.
    pd.testing.assert_frame_equal(result_conditions, expected_conditions)
def test_conditional_sampling_constraint_uses_reject_sampling(gm_mock, isinstance_mock):
    """Test that conditional sampling handles constraints with conditions.

    Sampling is expected to properly apply constraint transformations by
    dropping columns that cannot be conditionally sampled on due to them
    being part of a constraint.

    Setup:
    - The model is given a ``FixedCombinations`` constraint over ``city`` and
      ``state`` and is then asked to sample with two conditions, one of which
      (``state``) the constraint depends on. The constraint is expected to skip
      its transformations since only some of its columns are provided by the
      conditions, and the model will use reject sampling to meet the
      constraint instead.
    Input:
    - Conditions
    Side Effects:
    - Correct columns to condition on are passed to underlying sample method
    """
    # Setup
    isinstance_mock.side_effect = _isinstance_side_effect
    fixed_combinations = FixedCombinations(column_names=['city', 'state'])
    training_data = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30],
    })
    model = GaussianCopula(
        constraints=[fixed_combinations],
        categorical_transformer='label_encoding',
    )
    first_batch = pd.DataFrame({
        'city#state.value': [0, 1, 2, 0, 0],
        'age.value': [30, 30, 30, 30, 30],
    })
    second_batch = pd.DataFrame({
        'city#state.value': [1],
        'age.value': [30],
    })
    gm_mock.return_value.sample.side_effect = [first_batch, second_batch]
    model.fit(training_data)

    # Run
    condition = Condition({'age': 30, 'state': 'CA'}, num_rows=5)
    sampled_data = model.sample_conditions(conditions=[condition])

    # Assert
    expected_data = pd.DataFrame({
        'city': ['LA', 'SF', 'LA', 'LA', 'SF'],
        'state': ['CA', 'CA', 'CA', 'CA', 'CA'],
        'age': [30, 30, 30, 30, 30],
    })
    underlying_sample = model._model.sample
    assert len(underlying_sample.mock_calls) == 2
    underlying_sample.assert_any_call(50, conditions={'age.value': 30})
    pd.testing.assert_frame_equal(sampled_data, expected_data)
def test_conditional_sampling_graceful_reject_sampling_True_dict(model):
    """Expect ``sample_conditions`` to raise ``ValueError`` for an unseen combination.

    Every training row holds the same value in all three columns, so the
    requested combination (28, 37, 93) never occurs in the fitted data.
    """
    # Setup
    column_names = ('column1', 'column2', 'column3')
    training_data = pd.DataFrame({name: list(range(100)) for name in column_names})
    model.fit(training_data)
    unseen_combination = Condition({'column1': 28, 'column2': 37, 'column3': 93})

    # Run and assert
    with pytest.raises(ValueError):
        model.sample_conditions(conditions=[unseen_combination])
def test_conditional_sampling_dict():
    """Sample from ``TVAE`` conditioned on a single categorical value.

    Expect as many rows as requested (the same shape as the training data)
    and only the conditioned value in ``column2``.
    """
    # Setup
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
    })
    tvae = TVAE(epochs=1)
    tvae.fit(training_data)

    # Run
    only_b = Condition({'column2': 'b'}, num_rows=30)
    sampled = tvae.sample_conditions(conditions=[only_b])

    # Assert
    assert sampled.shape == training_data.shape
    observed_values = set(sampled['column2'].unique())
    assert observed_values == {'b'}
def test_conditional_sampling_two_conditions():
    """Sample from ``TVAE`` with one condition that fixes two columns.

    Expect every sampled row to carry the requested value in both
    ``column2`` and ``column3``.
    """
    # Setup
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })
    tvae = TVAE(epochs=1)
    tvae.fit(training_data)

    # Run
    requested = Condition({'column2': 'b', 'column3': 'f'}, num_rows=5)
    samples = tvae.sample_conditions(conditions=[requested])

    # Assert
    assert list(samples['column2']) == ['b'] * 5
    assert list(samples['column3']) == ['f'] * 5
def test__sample_conditions_with_value_zero(model):
    """Check ``_sample_conditions`` when the conditioned value is zero.

    Two single-row conditions are requested on ``column1``, one written as the
    integer ``0`` and one as the float ``0.0``; two rows are expected back.
    """
    # Setup
    column_names = ('column1', 'column2', 'column3')
    training_data = pd.DataFrame(
        {name: list(range(100)) for name in column_names}
    ).astype(float)
    zero_conditions = [
        Condition({'column1': 0}, num_rows=1),
        Condition({'column1': 0.0}, num_rows=1),
    ]
    model.fit(training_data)

    # Run
    output = model._sample_conditions(zero_conditions, 100, None, True, None)

    # Assert
    assert len(output) == 2, 'Expected 2 valid rows.'
def test__make_condition_dfs_with_multiple_conditions_different_columns(model):
    """Test ``_make_condition_dfs`` works correctly with multiple conditions.

    The ``_make_condition_dfs`` method is expected to:
    - Return multiple DataFrames if conditions are not able to be combined.

    Input:
    - Conditions
    Output:
    - Conditions as ``[DataFrame]``
    """
    # Setup
    column_values1 = {'column2': 'M'}
    column_values2 = {'column3': 'N'}
    conditions = [
        Condition(column_values=column_values1, num_rows=2),
        Condition(column_values=column_values2, num_rows=3),
    ]
    expected_conditions1 = pd.DataFrame([column_values1] * 2)
    expected_conditions2 = pd.DataFrame([column_values2] * 3)

    # Run
    result_conditions_list = model._make_condition_dfs(conditions=conditions)

    # Assert
    assert len(result_conditions_list) == 2

    # ``all(df == other)`` iterates over the column labels (truthy strings) and
    # therefore can never fail; compare the frames' contents explicitly instead.
    result_conditions1 = result_conditions_list[0]
    assert isinstance(result_conditions1, pd.DataFrame)
    assert len(result_conditions1) == 2
    pd.testing.assert_frame_equal(result_conditions1, expected_conditions1)

    result_conditions2 = result_conditions_list[1]
    assert isinstance(result_conditions2, pd.DataFrame)
    assert len(result_conditions2) == 3
    pd.testing.assert_frame_equal(result_conditions2, expected_conditions2)
def test_conditional_sampling_numerical():
    """Sample from ``TVAE`` conditioned on a numerical column value.

    Expect every sampled row to carry the requested value in ``column1``.
    """
    # Setup
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })
    tvae = TVAE(epochs=1)
    tvae.fit(training_data)

    # Run
    numeric_condition = Condition({'column1': 1.0}, num_rows=5)
    sampled = tvae.sample_conditions(conditions=[numeric_condition])

    # Assert
    assert list(sampled['column1']) == [1.0] * 5
def test__sample_conditions_graceful_reject_sampling(model):
    """Check ``_sample_conditions`` when fewer rows than requested are produced.

    ``_sample_batch`` is replaced by a mock that returns only two rows while
    the condition asks for five; the output is expected to contain two rows.
    """
    # Setup
    column_names = ('column1', 'column2', 'column3')
    training_data = pd.DataFrame({name: list(range(100)) for name in column_names})
    requested = [Condition({'column1': 'this is not used'}, num_rows=5)]
    two_rows = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })
    model._sample_batch = Mock(return_value=two_rows)
    model.fit(training_data)

    # Run
    output = model._sample_conditions(requested, 100, None, True, None)

    # Assert
    assert len(output) == 2, 'Only expected 2 valid rows.'
def test__conditionally_sample_rows_graceful_reject_sampling_false(self):
    """Test the `BaseTabularModel._conditionally_sample_rows` method.

    When `_conditionally_sample_rows` is called with `graceful_reject_sampling`
    as False, expect an error to be raised if `_sample_batch` yields no rows.

    Input:
    - An impossible condition
    Side Effect:
    - A ValueError is thrown
    """
    # Setup
    impossible_values = {'cola': 'c'}
    condition = Condition(impossible_values, num_rows=2)
    transformed_conditions = pd.DataFrame([impossible_values] * 2)
    dataframe = pd.DataFrame([impossible_values] * 2)

    model = Mock(spec_set=CTGAN)
    model._validate_file_path.return_value = None
    model._sample_batch.return_value = pd.DataFrame()

    # Run and assert
    expected_message = 'Unable to sample any rows for the given conditions'
    with pytest.raises(ValueError, match=expected_message):
        BaseTabularModel._conditionally_sample_rows(
            model,
            dataframe,
            condition,
            transformed_conditions,
            graceful_reject_sampling=False,
        )

    # Checked after the ``with`` block so it still runs although the call raises.
    expected_args = (2, None, None, condition, transformed_conditions, 0.01, None, None)
    model._sample_batch.assert_called_once_with(*expected_args)
def test_sample_conditions(self):
    """Test `BaseTabularModel.sample_conditions` method.

    Expect `sample_conditions` to forward the conditions, together with the
    remaining arguments `100, None, True, None`, to `_sample_conditions` and
    to return its result.

    Input:
    - valid conditions
    Side Effects:
    - The expected `_sample_conditions` call.
    """
    # Setup
    model = Mock(spec_set=CTGAN)
    condition = Condition({'column1': 'b'}, num_rows=5)

    # Run
    out = BaseTabularModel.sample_conditions(model, [condition])

    # Assert
    expected_args = ([condition], 100, None, True, None)
    model._sample_conditions.assert_called_once_with(*expected_args)
    assert out == model._sample_conditions.return_value