Exemple #1
0
def test__make_condition_dfs_with_multiple_conditions_same_column(model):
    """Test ``_make_condition_dfs`` works correctly with multiple conditions.

    The ``_make_condition_dfs`` method is expected to:
        - Merge conditions that target the same columns into a single
          ``DataFrame`` holding the rows of every condition, in order.

    Input:
        - Conditions

    Output:
        - Conditions as ``[DataFrame]``
    """
    # Setup
    column_values1 = {'column2': 'M'}
    column_values2 = {'column2': 'N'}
    conditions = [
        Condition(column_values=column_values1, num_rows=2),
        Condition(column_values=column_values2, num_rows=3),
    ]
    expected_conditions = pd.DataFrame([column_values1] * 2 +
                                       [column_values2] * 3)

    # Run
    result_conditions_list = model._make_condition_dfs(conditions=conditions)

    # Assert
    assert len(result_conditions_list) == 1
    result_conditions = result_conditions_list[0]
    assert isinstance(result_conditions, pd.DataFrame)
    assert len(result_conditions) == 5
    # ``all(df == other)`` iterates over the column labels (truthy strings), so
    # it can never fail; compare the actual cell values instead.
    pd.testing.assert_frame_equal(result_conditions, expected_conditions)
Exemple #2
0
    def test__sample_conditions_with_multiple_conditions(self):
        """Check ``BaseTabularModel._sample_conditions`` with several condition frames.

        ``_make_condition_dfs`` is mocked to return two dataframes, so
        ``_sample_with_conditions`` is expected to be invoked once per dataframe
        and the rows sampled for each of them returned together.

        Input:
            - 2 conditions with different columns
        Output:
            - The rows sampled for both conditions, in call order
        """
        # Setup
        first_values = {'cola': 'a'}
        second_values = {'colb': 1}
        first_condition = Condition(first_values, num_rows=2)
        second_condition = Condition(second_values, num_rows=3)

        first_sample = pd.DataFrame({'a': ['a', 'a'], 'b': [1, 2]})
        second_sample = pd.DataFrame({'a': ['b', 'c', 'a'], 'b': [1, 1, 1]})

        model = Mock(spec_set=CTGAN)
        model._validate_file_path.return_value = None
        model._make_condition_dfs.return_value = [
            pd.DataFrame([first_values] * 2),
            pd.DataFrame([second_values] * 3),
        ]
        model._sample_with_conditions.side_effect = [first_sample, second_sample]

        # Run
        conditions = [first_condition, second_condition]
        out = BaseTabularModel._sample_conditions(
            model, conditions, 100, None, True, None)

        # Asserts
        expected_calls = [
            call(DataFrameMatcher(pd.DataFrame([values] * num_rows)), 100,
                 None, ANY, None)
            for values, num_rows in ((first_values, 2), (second_values, 3))
        ]
        model._sample_with_conditions.assert_has_calls(expected_calls)

        expected = pd.DataFrame({
            'a': ['a', 'a', 'b', 'c', 'a'],
            'b': [1, 2, 1, 1, 1],
        })
        pd.testing.assert_frame_equal(out, expected)
Exemple #3
0
    def test__conditionally_sample_rows_graceful_reject_sampling_true(self):
        """Check ``BaseTabularModel._conditionally_sample_rows`` in graceful mode.

        With ``graceful_reject_sampling=True`` and ``_sample_batch`` mocked to
        return no rows, the call is expected to complete without raising.

        Input:
            - A condition for which the mocked batch sampler yields nothing
        Returns:
            - Empty DataFrame
        """
        # Setup
        condition_values = {'cola': 'c'}
        condition = Condition(condition_values, num_rows=2)
        dataframe = pd.DataFrame([condition_values] * 2)
        transformed_conditions = pd.DataFrame([condition_values] * 2)

        model = Mock(spec_set=CTGAN)
        model._validate_file_path.return_value = None
        model._sample_batch.return_value = pd.DataFrame()

        # Run
        sampled = BaseTabularModel._conditionally_sample_rows(
            model,
            dataframe,
            condition,
            transformed_conditions,
            graceful_reject_sampling=True,
        )

        # Assert
        assert len(sampled) == 0
        expected_args = (2, None, None, condition, transformed_conditions,
                         0.01, None, None)
        model._sample_batch.assert_called_once_with(*expected_args)
Exemple #4
0
    def test__sample_conditions_no_rows(self):
        """Test `BaseTabularModel._sample_conditions` with invalid condition.

        If no valid rows are returned for any condition, expect a ValueError.

        Input:
            - condition that is impossible to satisfy
        Side Effects:
            - ValueError is thrown
        """
        # Setup
        model = Mock(spec_set=CTGAN)
        condition = Condition(
            {'column1': 'b'},
            num_rows=5,
        )
        # ``_make_condition_dfs`` is consumed as a list of DataFrames; a bare
        # DataFrame would be iterated column name by column name instead.
        model._make_condition_dfs.return_value = [
            pd.DataFrame([{'column1': 'b'}] * 5),
        ]
        # Every condition dataframe yields no sampled rows.
        model._sample_with_conditions.return_value = pd.DataFrame()

        # Run and assert
        with pytest.raises(
                ValueError,
                match='Unable to sample any rows for the given conditions.'):
            BaseTabularModel._sample_conditions(model, [condition], 100, None,
                                                True, None)
Exemple #5
0
def test__make_condition_dfs_specifying_num_rows(model):
    """Test ``_make_condition_dfs`` works correctly when ``num_rows`` is passed.

    The ``_make_condition_dfs`` method is expected to:
    - Return as many condition rows as specified with ``num_rows`` as a ``DataFrame``.

    Input:
        - Conditions
        - Num_rows

    Output:
        - Conditions as ``[DataFrame]``
    """
    # Setup
    num_rows = 10
    column_values = {'column2': 'M'}
    conditions = [Condition(column_values=column_values, num_rows=num_rows)]
    expected_conditions = pd.DataFrame([column_values] * num_rows)

    # Run
    result_conditions_list = model._make_condition_dfs(conditions=conditions)

    # Assert
    assert len(result_conditions_list) == 1
    result_conditions = result_conditions_list[0]
    assert isinstance(result_conditions, pd.DataFrame)
    assert len(result_conditions) == num_rows
    # ``all(df == other)`` iterates over the column labels (truthy strings), so
    # it can never fail; compare the actual cell values instead.
    pd.testing.assert_frame_equal(result_conditions, expected_conditions)
Exemple #6
0
def test__make_condition_dfs_without_num_rows(model):
    """Test ``_make_condition_dfs`` works correctly when ``num_rows`` is not passed.

    The ``_make_condition_dfs`` method is expected to:
        - Return conditions as a ``DataFrame`` for one row.

    Input:
        - Conditions

    Output:
        - Conditions as ``[DataFrame]``
    """
    # Setup
    column_values = {'column2': 'M'}
    conditions = [Condition(column_values=column_values)]
    expected_conditions = pd.DataFrame([column_values])

    # Run
    result_conditions_list = model._make_condition_dfs(conditions=conditions)

    # Assert
    assert len(result_conditions_list) == 1
    result_conditions = result_conditions_list[0]
    assert isinstance(result_conditions, pd.DataFrame)
    assert len(result_conditions) == 1
    # ``all(df == other)`` iterates over the column labels (truthy strings), so
    # it can never fail; compare the actual cell values instead.
    pd.testing.assert_frame_equal(result_conditions, expected_conditions)
Exemple #7
0
def test_conditional_sampling_constraint_uses_reject_sampling(
        gm_mock, isinstance_mock):
    """Check conditional sampling when a condition column belongs to a constraint.

    Setup:
    - A ``GaussianCopula`` with a ``FixedCombinations`` constraint over ``city``
    and ``state`` is fitted, with the underlying model's ``sample`` mocked to
    return two fixed numeric batches.
    - Sampling is conditioned on ``age`` and ``state``; only ``state`` is part
    of the constraint.

    Input:
    - Conditions
    Side Effects:
    - The underlying ``sample`` method is called twice, and one of those calls
    receives only the transformed ``age`` condition, i.e. ``state`` is not
    forwarded as a condition (the test name suggests it is enforced through
    reject sampling instead).
    """
    # Setup
    isinstance_mock.side_effect = _isinstance_side_effect
    training_data = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30]
    })
    model = GaussianCopula(
        constraints=[FixedCombinations(column_names=['city', 'state'])],
        categorical_transformer='label_encoding',
    )

    # Numeric batches handed back by the mocked underlying model, in call order.
    first_batch = pd.DataFrame({
        'city#state.value': [0, 1, 2, 0, 0],
        'age.value': [30, 30, 30, 30, 30]
    })
    second_batch = pd.DataFrame({
        'city#state.value': [1],
        'age.value': [30]
    })
    gm_mock.return_value.sample.side_effect = [first_batch, second_batch]
    model.fit(training_data)

    # Run
    condition = Condition({'age': 30, 'state': 'CA'}, num_rows=5)
    sampled_data = model.sample_conditions(conditions=[condition])

    # Assert
    underlying_sample = model._model.sample
    assert len(underlying_sample.mock_calls) == 2
    underlying_sample.assert_any_call(50, conditions={'age.value': 30})

    expected_data = pd.DataFrame({
        'city': ['LA', 'SF', 'LA', 'LA', 'SF'],
        'state': ['CA', 'CA', 'CA', 'CA', 'CA'],
        'age': [30, 30, 30, 30, 30]
    })
    pd.testing.assert_frame_equal(sampled_data, expected_data)
Exemple #8
0
def test_conditional_sampling_graceful_reject_sampling_True_dict(model):
    """Check ``sample_conditions`` raises for a combination absent from the data.

    Every training row holds the same value in all three columns, while the
    condition asks for three different values, so that combination never
    appears in the fitted data.

    Side Effects:
        - ``ValueError`` is raised
    """
    column_names = ('column1', 'column2', 'column3')
    data = pd.DataFrame({name: list(range(100)) for name in column_names})
    model.fit(data)

    impossible_condition = Condition({
        'column1': 28,
        'column2': 37,
        'column3': 93,
    })

    with pytest.raises(ValueError):
        model.sample_conditions(conditions=[impossible_condition])
Exemple #9
0
def test_conditional_sampling_dict():
    """Check ``TVAE.sample_conditions`` honours a single categorical condition.

    Thirty rows are requested with ``column2`` fixed to ``'b'``; the output is
    expected to have the same shape as the training data and to contain only
    ``'b'`` in ``column2``.
    """
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10
    })

    model = TVAE(epochs=1)
    model.fit(data)

    condition = Condition({'column2': 'b'}, num_rows=30)
    sampled = model.sample_conditions(conditions=[condition])

    assert sampled.shape == data.shape
    observed_values = set(sampled['column2'].unique())
    assert observed_values == {'b'}
Exemple #10
0
def test_conditional_sampling_two_conditions():
    """Check ``TVAE.sample_conditions`` with two columns fixed in one condition.

    Five rows are requested with ``column2 == 'b'`` and ``column3 == 'f'``;
    every sampled row is expected to carry both values.
    """
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10
    })

    model = TVAE(epochs=1)
    model.fit(data)

    condition = Condition({'column2': 'b', 'column3': 'f'}, num_rows=5)
    samples = model.sample_conditions(conditions=[condition])

    assert list(samples['column2']) == ['b'] * 5
    assert list(samples['column3']) == ['f'] * 5
Exemple #11
0
def test__sample_conditions_with_value_zero(model):
    """Check ``_sample_conditions`` handles a condition value of zero.

    Two single-row conditions are passed, one with the integer ``0`` and one
    with the float ``0.0`` for ``column1``; two rows are expected in total.
    """
    column_names = ('column1', 'column2', 'column3')
    data = pd.DataFrame({name: list(range(100)) for name in column_names})
    data = data.astype(float)

    conditions = [
        Condition({'column1': zero}, num_rows=1)
        for zero in (0, 0.0)
    ]

    model.fit(data)
    output = model._sample_conditions(conditions, 100, None, True, None)

    assert len(output) == 2, 'Expected 2 valid rows.'
Exemple #12
0
def test__make_condition_dfs_with_multiple_conditions_different_columns(model):
    """Test ``_make_condition_dfs`` works correctly with multiple conditions.

    The ``_make_condition_dfs`` method is expected to:
        - Return multiple DataFrames if conditions are not able to be combined.

    Input:
        - Conditions

    Output:
        - Conditions as ``[DataFrame]``
    """
    # Setup
    column_values1 = {'column2': 'M'}
    column_values2 = {'column3': 'N'}
    conditions = [
        Condition(column_values=column_values1, num_rows=2),
        Condition(column_values=column_values2, num_rows=3),
    ]
    expected_conditions1 = pd.DataFrame([column_values1] * 2)
    expected_conditions2 = pd.DataFrame([column_values2] * 3)

    # Run
    result_conditions_list = model._make_condition_dfs(conditions=conditions)

    # Assert
    assert len(result_conditions_list) == 2

    # ``all(df == other)`` iterates over the column labels (truthy strings), so
    # it can never fail; compare the actual cell values instead.
    result_conditions1 = result_conditions_list[0]
    assert isinstance(result_conditions1, pd.DataFrame)
    assert len(result_conditions1) == 2
    pd.testing.assert_frame_equal(result_conditions1, expected_conditions1)

    result_conditions2 = result_conditions_list[1]
    assert isinstance(result_conditions2, pd.DataFrame)
    assert len(result_conditions2) == 3
    pd.testing.assert_frame_equal(result_conditions2, expected_conditions2)
Exemple #13
0
def test_conditional_sampling_numerical():
    """Check ``TVAE.sample_conditions`` with a condition on a numerical column.

    Five rows are requested with ``column1`` fixed to ``1.0``; every sampled
    row is expected to carry exactly that value.
    """
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10
    })

    model = TVAE(epochs=1)
    model.fit(data)

    condition = Condition({'column1': 1.0}, num_rows=5)
    sampled = model.sample_conditions(conditions=[condition])

    assert list(sampled['column1']) == [1.0] * 5
Exemple #14
0
def test__sample_conditions_graceful_reject_sampling(model):
    """Check ``_sample_conditions`` returns fewer rows than requested without failing.

    ``_sample_batch`` is replaced by a mock that always returns the same two
    rows, while the condition asks for five; the output is expected to contain
    only those two rows.
    """
    column_names = ('column1', 'column2', 'column3')
    data = pd.DataFrame({name: list(range(100)) for name in column_names})

    condition = Condition(
        {'column1': 'this is not used'},
        num_rows=5,
    )

    # The mocked batch sampler ignores the condition and yields two fixed rows.
    fixed_batch = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })
    model._sample_batch = Mock(return_value=fixed_batch)

    model.fit(data)
    output = model._sample_conditions([condition], 100, None, True, None)

    assert len(output) == 2, 'Only expected 2 valid rows.'
Exemple #15
0
    def test__conditionally_sample_rows_graceful_reject_sampling_false(self):
        """Check ``BaseTabularModel._conditionally_sample_rows`` in strict mode.

        With ``graceful_reject_sampling=False`` and ``_sample_batch`` mocked to
        return no rows, the call is expected to raise.

        Input:
            - A condition for which the mocked batch sampler yields nothing
        Side Effect:
            - A ValueError is thrown
        """
        # Setup
        condition_values = {'cola': 'c'}
        condition = Condition(condition_values, num_rows=2)
        transformed_conditions = pd.DataFrame([condition_values] * 2)

        model = Mock(spec_set=CTGAN)
        model._validate_file_path.return_value = None
        model._sample_batch.return_value = pd.DataFrame()

        # Run and assert
        expected_message = 'Unable to sample any rows for the given conditions'
        with pytest.raises(ValueError, match=expected_message):
            BaseTabularModel._conditionally_sample_rows(
                model,
                pd.DataFrame([condition_values] * 2),
                condition,
                transformed_conditions,
                graceful_reject_sampling=False,
            )

        expected_args = (2, None, None, condition, transformed_conditions,
                         0.01, None, None)
        model._sample_batch.assert_called_once_with(*expected_args)
Exemple #16
0
    def test_sample_conditions(self):
        """Check ``BaseTabularModel.sample_conditions`` delegates to ``_sample_conditions``.

        The public method is called with only the conditions; the private one
        is expected to receive them followed by ``100, None, True, None``, and
        its return value is expected to be handed back unchanged.

        Input:
            - valid conditions
        Side Effects:
            - The expected `_sample_conditions` call.
        """
        # Setup
        model = Mock(spec_set=CTGAN)
        condition = Condition({'column1': 'b'}, num_rows=5)

        # Run
        out = BaseTabularModel.sample_conditions(model, [condition])

        # Assert
        private_sampler = model._sample_conditions
        private_sampler.assert_called_once_with(
            [condition], 100, None, True, None)
        assert out == private_sampler.return_value