Example #1
0
    def test_sample_conditions(self):
        """Test ``sample_conditions`` method.

        Expect the correct args to be passed to ``_sample_conditions``.

        Input:
            - valid conditions
        Side Effects:
            - The expected ``_sample_conditions`` call.
        """
        # Setup
        model = Mock(spec_set=GaussianCopula)
        condition = Condition(
            {'column1': 'b'},
            num_rows=5,
        )
        batch_size = 1
        randomize_samples = False
        output_file_path = 'test.csv'

        # Run
        out = GaussianCopula.sample_conditions(
            model,
            [condition],
            batch_size=batch_size,
            randomize_samples=False,
            output_file_path=output_file_path,
        )

        # Assert
        model._sample_conditions.assert_called_once_with(
            [condition], 100, batch_size, randomize_samples, output_file_path)
        assert out == model._sample_conditions.return_value
Example #2
0
def test_conditional_sampling_constraint_uses_reject_sampling(
        gm_mock, isinstance_mock):
    """Test that the ``sample`` method handles constraints with conditions.

    The ``sample`` method is expected to properly apply constraint
    transformations by dropping columns that cannot be conditonally sampled
    on due to them being part of a constraint.

    Setup:
    - The model is being passed a ``UniqueCombination`` constraint and then
    asked to sample with two conditions, one of which the constraint depends on.
    The constraint is expected to skip its transformations since only some of
    the columns are provided by the conditions and the model will use reject
    sampling to meet the constraint instead.

    Input:
    - Conditions
    Side Effects:
    - Correct columns to condition on are passed to underlying sample method
    """
    # Setup
    isinstance_mock.side_effect = _isinstance_side_effect
    constraint = FixedCombinations(column_names=['city', 'state'])
    data = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30]
    })
    model = GaussianCopula(constraints=[constraint],
                           categorical_transformer='label_encoding')
    sampled_numeric_data = [
        pd.DataFrame({
            'city#state.value': [0, 1, 2, 0, 0],
            'age.value': [30, 30, 30, 30, 30]
        }),
        pd.DataFrame({
            'city#state.value': [1],
            'age.value': [30]
        })
    ]
    gm_mock.return_value.sample.side_effect = sampled_numeric_data
    model.fit(data)

    # Run
    conditions = [Condition({'age': 30, 'state': 'CA'}, num_rows=5)]
    sampled_data = model.sample_conditions(conditions=conditions)

    # Assert
    expected_transformed_conditions = {'age.value': 30}
    expected_data = pd.DataFrame({
        'city': ['LA', 'SF', 'LA', 'LA', 'SF'],
        'state': ['CA', 'CA', 'CA', 'CA', 'CA'],
        'age': [30, 30, 30, 30, 30]
    })
    sample_calls = model._model.sample.mock_calls
    assert len(sample_calls) == 2
    model._model.sample.assert_any_call(
        50, conditions=expected_transformed_conditions)
    pd.testing.assert_frame_equal(sampled_data, expected_data)
Example #3
0
def test_conditional_sampling_two_conditions():
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = [Condition({'column2': 'b', 'column3': 'f'}, num_rows=5)]
    samples = model.sample_conditions(conditions=conditions)
    assert list(samples.column2) == ['b'] * 5
    assert list(samples.column3) == ['f'] * 5
Example #4
0
def test_conditional_sampling_dict():
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = [Condition({'column2': 'b'}, num_rows=30)]
    sampled = model.sample_conditions(conditions=conditions)

    assert sampled.shape == data.shape
    assert set(sampled['column2'].unique()) == set(['b'])
Example #5
0
def test_conditional_sampling_numerical():
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = [Condition({
        'column1': 1.0,
    }, num_rows=5)]
    sampled = model.sample_conditions(conditions=conditions)

    assert list(sampled.column1) == [1.0] * 5