Ejemplo n.º 1
0
def test_parameters():
    """Check that the categorical transformer survives a metadata round trip.

    A model is built with explicit settings, its metadata is exported to a
    dict, and a second model is created from that dict. The second model is
    expected to map the object dtype (``'O'``) to ``label_encoding``.
    """
    original = GaussianCopula(
        field_distributions={'foo': 'beta'},
        default_distribution='gaussian_kde',
        categorical_transformer='label_encoding',
    )
    exported_metadata = original.get_metadata().to_dict()
    rebuilt = GaussianCopula(table_metadata=exported_metadata)

    dtype_transformers = rebuilt._metadata._dtype_transformers
    assert dtype_transformers['O'] == 'label_encoding'
Ejemplo n.º 2
0
def test___init___copies_metadata():
    """Verify that ``__init__`` does not share the supplied metadata object.

    Two models are built from the same metadata, each with a different
    ``default_distribution``, and both are fitted on the same data.

    Expected behavior:
        - Neither model's metadata compares equal to the supplied metadata,
          nor to the other model's metadata.
        - Every distribution reported by the first model is the gamma
          univariate and every one reported by the second is the beta one.
    """
    # Setup
    metadata, data = load_tabular_demo('student_placements', metadata=True)
    gamma_fqn = 'copulas.univariate.gamma.GammaUnivariate'
    beta_fqn = 'copulas.univariate.beta.BetaUnivariate'

    # Run
    gamma_model = GaussianCopula(
        table_metadata=metadata,
        categorical_transformer='label_encoding',
        default_distribution='gamma',
    )
    gamma_model.fit(data)
    beta_model = GaussianCopula(
        table_metadata=metadata,
        categorical_transformer='label_encoding',
        default_distribution='beta',
    )
    beta_model.fit(data)

    # Assert
    assert gamma_model._metadata != metadata
    assert gamma_model._metadata != beta_model._metadata
    assert beta_model._metadata != metadata
    for fitted_distribution in gamma_model.get_distributions().values():
        assert fitted_distribution == gamma_fqn
    for fitted_distribution in beta_model.get_distributions().values():
        assert fitted_distribution == beta_fqn
Ejemplo n.º 3
0
def test_conditional_sampling_constraint_uses_reject_sampling(
        gm_mock, isinstance_mock):
    """Test that ``sample_conditions`` handles constraints with conditions.

    The model is expected to drop, from the transformed conditions, the
    columns that cannot be conditionally sampled on because they are part
    of a constraint.

    Setup:
    - The model is passed a ``FixedCombinations`` constraint over ``city``
    and ``state`` and is then asked to sample with two conditions, one of
    which (``state``) the constraint depends on.
    - The underlying model's ``sample`` is mocked to return two predefined
    batches of numeric data.

    Args:
        gm_mock: mock whose ``return_value.sample`` supplies the batches
            (presumably injected by a ``patch`` decorator that is not visible
            in this chunk -- TODO confirm).
        isinstance_mock: mock whose side effect is set to
            ``_isinstance_side_effect`` (presumably a patched ``isinstance``
            -- TODO confirm).

    Input:
    - Conditions
    Side Effects:
    - Only the ``age.value`` condition is passed to the underlying sample method
    - The underlying sample method is called exactly twice
    """
    # Setup
    isinstance_mock.side_effect = _isinstance_side_effect
    constraint = FixedCombinations(column_names=['city', 'state'])
    data = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30]
    })
    model = GaussianCopula(constraints=[constraint],
                           categorical_transformer='label_encoding')
    # One DataFrame is returned per call to the mocked ``sample``, in order.
    sampled_numeric_data = [
        pd.DataFrame({
            'city#state.value': [0, 1, 2, 0, 0],
            'age.value': [30, 30, 30, 30, 30]
        }),
        pd.DataFrame({
            'city#state.value': [1],
            'age.value': [30]
        })
    ]
    gm_mock.return_value.sample.side_effect = sampled_numeric_data
    model.fit(data)

    # Run
    conditions = [Condition({'age': 30, 'state': 'CA'}, num_rows=5)]
    sampled_data = model.sample_conditions(conditions=conditions)

    # Assert
    # ``state`` belongs to the constraint, so only ``age`` is expected to
    # reach the underlying model as a transformed condition.
    expected_transformed_conditions = {'age.value': 30}
    expected_data = pd.DataFrame({
        'city': ['LA', 'SF', 'LA', 'LA', 'SF'],
        'state': ['CA', 'CA', 'CA', 'CA', 'CA'],
        'age': [30, 30, 30, 30, 30]
    })
    sample_calls = model._model.sample.mock_calls
    assert len(sample_calls) == 2
    # NOTE(review): 50 is presumably num_rows (5) times an internal batch
    # multiplier -- confirm against the sampling implementation.
    model._model.sample.assert_any_call(
        50, conditions=expected_transformed_conditions)
    pd.testing.assert_frame_equal(sampled_data, expected_data)
Ejemplo n.º 4
0
def test_fit_with_unique_constraint_on_data_which_has_index_column():
    """Fit and sample with a ``Unique`` constraint when a column is named ``index``.

    The data contains a primary key column, a column literally called
    ``index`` and the constrained ``test_column``. Fitting and sampling are
    expected to run without error and to return unique values in the
    constrained column.

    Setup:
    - The model receives the ``Unique`` constraint and the primary key column.
    - The ``Unique`` constraint is set on ``test_column``.

    Input:
    - Data, Unique constraint

    Github Issue:
    - Tests that https://github.com/sdv-dev/SDV/issues/616 does not occur
    """
    # Setup
    table = pd.DataFrame({
        'key': [1, 2, 3, 4, 5],
        'index': ['A', 'B', 'C', 'D', 'E'],
        'test_column': ['A1', 'B2', 'C3', 'D4', 'E5'],
    })
    unique_constraint = Unique(column_names=['test_column'])
    copula = GaussianCopula(primary_key='key', constraints=[unique_constraint])

    # Run
    copula.fit(table)
    sampled = copula.sample(2)

    # Assert
    assert len(sampled) == 2
    assert sampled['test_column'].is_unique
Ejemplo n.º 5
0
def test_conditional_sampling_dataframe():
    """Sample with a DataFrame of conditions and check they are honoured.

    The sampled table must have one row per condition row, and its
    ``column2`` values must match the requested values position by position.
    """
    requested_values = ['b', 'b', 'b', 'c', 'c']
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
    })
    conditions = pd.DataFrame({'column2': requested_values})

    copula = GaussianCopula()
    copula.fit(training_data)
    result = copula.sample(conditions=conditions)

    assert result.shape[0] == len(conditions['column2'])
    matches = result['column2'] == np.array(requested_values)
    assert matches.all()
Ejemplo n.º 6
0
def test_conditional_sampling_two_conditions():
    """Sample five rows conditioned on two categorical columns at once.

    Every sampled row must carry the requested value in both ``column2``
    and ``column3``.
    """
    num_rows = 5
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })

    copula = GaussianCopula()
    copula.fit(training_data)
    result = copula.sample(
        num_rows,
        conditions={'column2': 'b', 'column3': 'f'},
    )

    assert list(result['column2']) == ['b'] * 5
    assert list(result['column3']) == ['f'] * 5
Ejemplo n.º 7
0
def test_conditional_sampling_dict():
    """Sample as many rows as the training data under a dict condition.

    The result must have the same shape as the training data and contain
    only the value ``'b'`` in ``column2``.
    """
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
    })

    copula = GaussianCopula()
    copula.fit(training_data)
    result = copula.sample(30, conditions={'column2': 'b'})

    assert result.shape == training_data.shape
    observed_values = set(result['column2'].unique())
    assert observed_values == {'b'}
Ejemplo n.º 8
0
def test_conditional_sampling_two_conditions():
    """Use ``sample_conditions`` with one ``Condition`` fixing two columns.

    The ``Condition`` requests five rows; every returned row must carry the
    requested value in both ``column2`` and ``column3``.
    """
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })
    two_column_condition = Condition(
        {'column2': 'b', 'column3': 'f'},
        num_rows=5,
    )

    copula = GaussianCopula()
    copula.fit(training_data)
    result = copula.sample_conditions(conditions=[two_column_condition])

    assert list(result['column2']) == ['b'] * 5
    assert list(result['column3']) == ['f'] * 5
Ejemplo n.º 9
0
def test_conditional_sampling_dataframe():
    """Fill in the remaining columns for a DataFrame of known values.

    ``sample_remaining_columns`` must return one row per known row, with
    ``column2`` matching the known values position by position.
    """
    known_values = ['b', 'b', 'b', 'c', 'c']
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
    })
    known_columns = pd.DataFrame({'column2': known_values})

    copula = GaussianCopula()
    copula.fit(training_data)
    result = copula.sample_remaining_columns(known_columns)

    assert result.shape[0] == len(known_columns['column2'])
    matches = result['column2'] == np.array(known_values)
    assert matches.all()
Ejemplo n.º 10
0
def test_conditional_sampling_dict():
    """Use ``sample_conditions`` to request 30 rows with ``column2 == 'b'``.

    The result must have the same shape as the training data and contain
    only the value ``'b'`` in ``column2``.
    """
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
    })
    single_condition = Condition({'column2': 'b'}, num_rows=30)

    copula = GaussianCopula()
    copula.fit(training_data)
    result = copula.sample_conditions(conditions=[single_condition])

    assert result.shape == training_data.shape
    observed_values = set(result['column2'].unique())
    assert observed_values == {'b'}
Ejemplo n.º 11
0
def test_conditional_sampling_numerical():
    """Sample five rows conditioned on a numerical column.

    Every sampled row must have ``column1`` equal to ``1.0``.
    """
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })

    copula = GaussianCopula()
    copula.fit(training_data)
    result = copula.sample(5, conditions={'column1': 1.0})

    assert list(result['column1']) == [1.0] * 5
Ejemplo n.º 12
0
def test_conditional_sampling_numerical():
    """Use ``sample_conditions`` with a ``Condition`` on a numerical column.

    The ``Condition`` requests five rows; every returned row must have
    ``column1`` equal to ``1.0``.
    """
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })
    numeric_condition = Condition({'column1': 1.0}, num_rows=5)

    copula = GaussianCopula()
    copula.fit(training_data)
    result = copula.sample_conditions(conditions=[numeric_condition])

    assert list(result['column1']) == [1.0] * 5
Ejemplo n.º 13
0
    def test__rebuild_correlation_matrix_outside(self):
        """Test ``_rebuild_correlation_matrix`` with out-of-range values.

        When the triangular input holds values outside of [-1, 1], the
        rebuilt matrix is expected to contain them scaled into that range.

        Input:
        - list of lists with values outside of -1 and 1

        Expected Output:
        - the square correlation matrix, compared against a nested list
        """
        # Setup
        lower_triangle = [[1.0], [2.0, 1.0]]
        expected = [
            [1.0, 0.5, 1.0],
            [0.5, 1.0, 0.5],
            [1.0, 0.5, 1.0],
        ]

        # Run
        rebuilt = GaussianCopula._rebuild_correlation_matrix(lower_triangle)

        # Assert
        assert expected == rebuilt
Ejemplo n.º 14
0
    def test__rebuild_correlation_matrix_valid(self):
        """Test ``_rebuild_correlation_matrix`` with in-range values.

        When every value of the triangular input lies between -1 and 1, the
        square matrix is expected to be rebuilt with those same values and
        ones on the diagonal.

        Input:
        - list of lists with values between -1 and 1

        Expected Output:
        - the square correlation matrix, compared against a nested list
        """
        # Setup
        lower_triangle = [[0.1], [0.2, 0.3]]
        expected = [
            [1.0, 0.1, 0.2],
            [0.1, 1.0, 0.3],
            [0.2, 0.3, 1.0],
        ]

        # Run
        rebuilt = GaussianCopula._rebuild_correlation_matrix(lower_triangle)

        # Assert
        assert expected == rebuilt
Ejemplo n.º 15
0
    def test__get_nearest_correlation_matrix_valid(self):
        """Test ``_get_nearest_correlation_matrix`` with a PSD input.

        A positive semi-definite matrix must come back untouched: same
        values and the very same object.

        Input:
        - matrix which is positive semi-definite.

        Expected Output:
        - the input, unmodified.
        """
        # Setup
        identity_rows = [
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1],
        ]
        psd_matrix = np.array(identity_rows)

        # Run
        result = GaussianCopula._get_nearest_correlation_matrix(psd_matrix)

        # Assert
        assert [[1, 0, 0], [0, 1, 0], [0, 0, 1]] == result.tolist()
        assert result is psd_matrix
Ejemplo n.º 16
0
    def test_sample_no_transformed_columns(self):
        """Test the ``sample`` method when the transformed conditions have no columns.

        ``GaussianCopula.sample`` is invoked unbound on a mock instance. When
        the transformed conditions DataFrame has no columns, sample is
        expected to pass ``None`` as the transformed condition when
        conditionally sampling.

        Setup:
            - Mock the ``_make_conditions_df`` method to return a dataframe representing
              the expected conditions, and the ``get_fields`` method to return metadata
              fields containing the expected conditioned column.
            - Mock the ``_metadata.transform`` method to return an empty transformed
              conditions dataframe (rows but no columns).
            - Mock the ``_conditionally_sample_rows`` method to return the expected
              sampled rows.
            - Mock the ``make_ids_unique`` method to return the expected sampled rows.
        Input:
            - number of rows
            - one set of conditions
        Output:
            - the expected sampled rows
        Side Effects:
            - Expect ``_conditionally_sample_rows`` to be called with the given condition
              and a transformed_condition of None.
        """
        # Setup
        gaussian_copula = Mock(spec_set=GaussianCopula)
        expected = pd.DataFrame(['a', 'a', 'a'])

        gaussian_copula._make_conditions_df.return_value = pd.DataFrame(
            {'a': ['a', 'a', 'a']})
        gaussian_copula._metadata.get_fields.return_value = ['a']
        # Three index entries and zero columns: an "empty" transform result.
        gaussian_copula._metadata.transform.return_value = pd.DataFrame(
            {}, index=[0, 1, 2])
        gaussian_copula._conditionally_sample_rows.return_value = pd.DataFrame(
            {
                'a': ['a', 'a', 'a'],
                COND_IDX: [0, 1, 2]
            })
        gaussian_copula._metadata.make_ids_unique.return_value = expected

        # Run
        out = GaussianCopula.sample(gaussian_copula,
                                    num_rows=3,
                                    conditions={'a': 'a'})

        # Asserts
        # NOTE(review): the positional values 100, 10, 0.01 and False are
        # presumably the defaults of ``sample`` forwarded unchanged -- confirm
        # against the ``_conditionally_sample_rows`` signature.
        gaussian_copula._conditionally_sample_rows.assert_called_once_with(
            DataFrameMatcher(
                pd.DataFrame({
                    COND_IDX: [0, 1, 2],
                    'a': ['a', 'a', 'a']
                })),
            100,
            10,
            {'a': 'a'},
            None,
            0.01,
            False,
        )
        pd.testing.assert_frame_equal(out, expected)
Ejemplo n.º 17
0
    def test_sample_conditions(self):
        """Test ``sample_conditions`` method.

        ``GaussianCopula.sample_conditions`` is invoked unbound on a mock
        instance, and the arguments it forwards to ``_sample_conditions`` are
        checked.

        Input:
            - valid conditions
            - batch_size, randomize_samples and output_file_path as keywords
        Output:
            - the return value of ``_sample_conditions``
        Side Effects:
            - The expected ``_sample_conditions`` call.
        """
        # Setup
        model = Mock(spec_set=GaussianCopula)
        condition = Condition(
            {'column1': 'b'},
            num_rows=5,
        )
        batch_size = 1
        randomize_samples = False
        output_file_path = 'test.csv'

        # Run
        # Pass the same local that the assertion below uses, so the call and
        # the expectation cannot drift apart if the value is ever changed.
        out = GaussianCopula.sample_conditions(
            model,
            [condition],
            batch_size=batch_size,
            randomize_samples=randomize_samples,
            output_file_path=output_file_path,
        )

        # Assert
        # NOTE(review): 100 is presumably the default of the argument that
        # follows the conditions -- confirm against ``sample_conditions``.
        model._sample_conditions.assert_called_once_with(
            [condition], 100, batch_size, randomize_samples, output_file_path)
        assert out == model._sample_conditions.return_value
Ejemplo n.º 18
0
    def test_sample_remaining_columns(self):
        """Test ``sample_remaining_columns`` method.

        ``GaussianCopula.sample_remaining_columns`` is invoked unbound on a
        mock instance, and the arguments it forwards to
        ``_sample_remaining_columns`` are checked.

        Input:
            - valid DataFrame
        Output:
            - the return value of ``_sample_remaining_columns``
        Side Effects:
            - The expected ``_sample_remaining_columns`` call.
        """
        # Setup
        copula_mock = Mock(spec_set=GaussianCopula)
        known_columns = pd.DataFrame([{'cola': 'a'}] * 5)
        sampling_options = {
            'batch_size': 1,
            'randomize_samples': False,
            'output_file_path': 'test.csv',
        }

        # Run
        result = GaussianCopula.sample_remaining_columns(
            copula_mock,
            known_columns,
            **sampling_options,
        )

        # Assert
        forwarded_call = copula_mock._sample_remaining_columns
        forwarded_call.assert_called_once_with(
            known_columns,
            100,
            sampling_options['batch_size'],
            sampling_options['randomize_samples'],
            sampling_options['output_file_path'],
        )
        assert result == forwarded_call.return_value
Ejemplo n.º 19
0
    def test__sample(self):
        """Test the ``GaussianCopula._sample`` method.

        ``_sample`` is invoked unbound on a mock instance and is expected to:
        - call ``self._model.sample`` with the given number of rows.
        - return whatever ``self._model.sample`` returned.

        Input:
        - Integer
        Expected Output:
        - ``self._model.sample.return_value``
        Side Effects:
        - ``self._model.sample`` is called once with the given integer
        """
        # Setup
        requested_rows = 2
        model_output = pd.DataFrame([1, 2, 3])
        copula_mock = Mock(spec_set=GaussianCopula)
        copula_mock._model.sample.return_value = model_output

        # Run
        result = GaussianCopula._sample(copula_mock, requested_rows)

        # Asserts
        copula_mock._model.sample.assert_called_once_with(requested_rows)
        assert model_output.equals(result)
Ejemplo n.º 20
0
    def test__get_nearest_correlation_matrix_invalid(self):
        """Test ``_get_nearest_correlation_matrix`` with a non-PSD input.

        A matrix with a negative eigenvalue must be replaced by one whose
        eigenvalues are all non-negative.

        Input:
        - matrix which is not positive semi-definite.

        Expected Output:
        - modified matrix which is positive semi-definite.
        """
        # Setup
        indefinite_matrix = np.array([
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, -1],
        ])

        # Run
        repaired = GaussianCopula._get_nearest_correlation_matrix(indefinite_matrix)

        # Assert
        assert [[1, 0, 0], [0, 1, 0], [0, 0, 1]] == repaired.tolist()

        # ``eigh`` returns the eigenvalues as its first element.
        eigenvalues_before = scipy.linalg.eigh(indefinite_matrix)[0]
        eigenvalues_after = scipy.linalg.eigh(repaired)[0]
        assert (eigenvalues_before < 0).any()
        assert (eigenvalues_after >= 0).all()
Ejemplo n.º 21
0
def test_fit_with_unique_constraint_on_data_subset():
    """Fit and sample with a ``Unique`` constraint on a subset of the rows.

    The subset is taken with ``iloc``, so its index has gaps relative to the
    full table. Fitting and sampling are expected to run without error and
    to return unique values in the constrained column.

    Setup:
    - The model receives a ``Unique`` constraint on ``test_column`` and the
    primary key column.
    - Only rows at positions 1, 3 and 4 of the full table are used.

    Input:
    - Subset of data, unique constraint

    Github Issue:
    - Tests that https://github.com/sdv-dev/SDV/issues/610 does not occur
    """
    # Setup
    full_table = pd.DataFrame({
        'key': [1, 2, 3, 4, 5],
        'test_column': ['A', 'B', 'C', 'D', 'E'],
    })
    unique_constraint = Unique(column_names=['test_column'])
    subset = full_table.iloc[[1, 3, 4]]
    copula = GaussianCopula(primary_key='key', constraints=[unique_constraint])

    # Run
    copula.fit(subset)
    sampled = copula.sample(2)

    # Assert
    assert len(sampled) == 2
    assert sampled['test_column'].is_unique
Ejemplo n.º 22
0
def test__sample_rows_previous_rows_appended_correctly():
    """Test ``_sample_rows`` when ``previous_rows`` are supplied.

    The previously sampled rows and the newly sampled rows are expected to
    be combined into a single frame with a fresh, consecutive index.

    Input:
    - num_rows is 5
    - previous_rows is a DataFrame of 3 existing rows.

    Output:
    - 5 sampled rows with index set to [0, 1, 2, 3, 4]
    """
    # Setup
    existing_rows = pd.DataFrame({
        'column1': [1, 2, 3],
        'column2': [4, 5, 6],
        'column3': [7, 8, 9],
    })
    fresh_rows = pd.DataFrame({
        'column1': [4, 5],
        'column2': [7, 8],
        'column3': [10, 11],
    })
    combined_rows = pd.DataFrame({
        'column1': [1, 2, 3, 4, 5],
        'column2': [4, 5, 6, 7, 8],
        'column3': [7, 8, 9, 10, 11],
    })

    copula = GaussianCopula()
    copula._metadata = Mock()
    copula._metadata.reverse_transform.return_value = fresh_rows
    # Identity filter: every row is treated as valid.
    copula._metadata.filter_valid = lambda x: x
    copula._sample = Mock(return_value=fresh_rows)

    # Run
    result, valid_count = copula._sample_rows(5, previous_rows=existing_rows)

    # Assert
    assert valid_count == 5
    pd.testing.assert_frame_equal(result, combined_rows)
Ejemplo n.º 23
0
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    The ``age`` column of the demo ``users`` table is declared categorical;
    both the original and the sampled column must keep the ``int64`` dtype.

    The origin of this test can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194
    """
    demo_tables = load_demo(metadata=False)
    users = demo_tables['users']
    age_as_categorical = {'age': {'type': 'categorical'}}

    copula = GaussianCopula(
        field_types=age_as_categorical,
        categorical_transformer='categorical',
    )
    copula.fit(users)
    sampled = copula.sample()

    for table in (users, sampled):
        assert table['age'].dtype == np.int64
Ejemplo n.º 24
0
def test_ids_only():
    """Ensure that tables containing nothing but id fields can be modeled.

    Sampling from a model fitted on two id columns must reproduce a table
    with the same shape and the same contents.
    """
    id_columns = ('id', 'other_id')
    ids_only = pd.DataFrame({name: range(10) for name in id_columns})
    id_field_types = {name: {'type': 'id'} for name in id_columns}

    copula = GaussianCopula(field_types=id_field_types)
    copula.fit(ids_only)
    sampled = copula.sample()

    assert sampled.shape == ids_only.shape
    assert ids_only.equals(sampled)
Ejemplo n.º 25
0
    def test__fit(self, mock_warnings, gm_mock):
        """Test the ``GaussianCopula._fit`` method.

        ``_fit`` is invoked unbound on a mock instance whose
        ``_field_distributions`` is preset. It is expected to:
        - Leave ``self._field_distributions`` unchanged.
        - Create a GaussianMultivariate object with the ``_field_distributions`` value.
        - Store the GaussianMultivariate instance in the self._model attribute.
        - Fit the GaussianMultivariate instance with the given table data, unmodified.
        - Call the _update_metadata method.

        Setup:
            - set ``_field_distributions`` on the mock to a distribution dict
            - mock warnings to ensure that during the model fit those are being ignored.

        Args:
            mock_warnings: mock standing in for the ``warnings`` module
                (presumably injected by a ``patch`` decorator that is not
                visible in this chunk -- TODO confirm).
            gm_mock: mock standing in for ``GaussianMultivariate`` (presumably
                injected the same way -- TODO confirm).

        Input:
            - pandas.DataFrame

        Expected Output:
            - None

        Side Effects:
            - GaussianMultivariate is called with ``_field_distributions`` as ``distribution``
            - GaussianMultivariate output is stored as self._model
            - self._model.fit is called with the input dataframe
            - self._update_metadata is called without arguments
            - warnings from the ``scipy`` module are filtered with ``'ignore'``
        """
        # Setup
        gaussian_copula = Mock(spec_set=GaussianCopula)
        gaussian_copula._field_distributions = {'a': 'a_distribution'}

        # Run
        data = pd.DataFrame({
            'a': [1, 2, 3]
        })
        out = GaussianCopula._fit(gaussian_copula, data)

        # Assert
        assert out is None
        assert gaussian_copula._field_distributions == {'a': 'a_distribution'}
        gm_mock.assert_called_once_with(distribution={'a': 'a_distribution'})

        assert gaussian_copula._model == gm_mock.return_value
        expected_data = pd.DataFrame({
            'a': [1, 2, 3]
        })
        # First recorded call -> positional args -> first argument.
        call_args = gaussian_copula._model.fit.call_args_list
        passed_table_data = call_args[0][0][0]

        pd.testing.assert_frame_equal(expected_data, passed_table_data)
        gaussian_copula._update_metadata.assert_called_once_with()
        mock_warnings.catch_warnings.assert_called_once()
        mock_warnings.filterwarnings.assert_called_once_with('ignore', module='scipy')
Ejemplo n.º 26
0
    def test___init__metadata_object(self):
        """Test ``__init__`` passing a ``Table`` object.

        The model's ``_metadata`` must expose the same fields and the same
        ``GaussianCopula`` model kwargs as the provided ``Table``, while not
        comparing equal to it.

        Input:
            - table_metadata
            - default_distribution

        Side Effects
            - ``field_distributions`` and ``categorical_transformer`` match
              the values stored in the provided metadata
            - the provided metadata does not gain a ``default_distribution``
            - ``instance._metadata`` is different than the object provided
        """
        # Setup
        copula_kwargs = {
            'field_distributions': {'a_field': 'gaussian'},
            'categorical_transformer': 'categorical_fuzzy',
        }
        table_metadata = Table.from_dict({
            'name': 'test',
            'fields': {'a_field': {'type': 'categorical'}},
            'model_kwargs': {'GaussianCopula': copula_kwargs},
        })

        # Run
        gc = GaussianCopula(
            table_metadata=table_metadata,
            default_distribution='gamma',
        )

        # Assert
        assert gc._metadata.get_fields() == table_metadata.get_fields()
        stored_kwargs = gc._metadata.get_model_kwargs('GaussianCopula')
        provided_kwargs = table_metadata.get_model_kwargs('GaussianCopula')
        for key in ('field_distributions', 'categorical_transformer'):
            assert stored_kwargs[key] == provided_kwargs[key]
        assert 'default_distribution' not in provided_kwargs
        assert gc._metadata != table_metadata
Ejemplo n.º 27
0
def test_sample_empty_transformed_conditions():
    """Test that None is passed to ``_sample_batch`` if transformed conditions are empty.

    The ``sample`` method is expected to:
    - Return sampled data and pass None to ``_sample_batch`` as the
    ``transformed_conditions``.

    Setup:
    - ``_sample_batch`` is replaced by a mock returning two fixed rows.
    - After fitting, ``_metadata`` is replaced by a mock whose ``transform``
    returns an empty DataFrame.

    Input:
    - Number of rows to sample
    - Conditions

    Output:
    - Sampled data
    """
    # Setup
    model = GaussianCopula()
    data = pd.DataFrame({
        'column1': list(range(100)),
        'column2': list(range(100)),
        'column3': list(range(100))
    })

    conditions = {'column1': 25}
    # The condition value repeated once per requested row.
    conditions_series = pd.Series([25, 25, 25, 25, 25], name='column1')
    model._sample_batch = Mock()
    sampled = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })
    model._sample_batch.return_value = sampled
    model.fit(data)
    # The metadata is swapped for a mock only after ``fit`` has run.
    model._metadata = Mock()
    model._metadata.get_fields.return_value = ['column1', 'column2', 'column3']
    model._metadata.transform.return_value = pd.DataFrame()
    model._metadata.make_ids_unique.side_effect = lambda x: x

    # Run
    output = model.sample(5,
                          conditions=conditions,
                          graceful_reject_sampling=True)

    # Assert
    expected_output = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })
    # ``mock_calls`` entries unpack as (name, positional args, keyword args).
    _, args, kwargs = model._metadata.transform.mock_calls[0]
    pd.testing.assert_series_equal(args[0]['column1'], conditions_series)
    assert kwargs['on_missing_column'] == 'drop'
    model._metadata.transform.assert_called_once()
    # NOTE(review): 100, 10 and 0.01 are presumably defaults of ``sample``
    # forwarded unchanged -- confirm against the ``_sample_batch`` signature.
    model._sample_batch.assert_called_with(5, 100, 10, conditions, None, 0.01)
    pd.testing.assert_frame_equal(output, expected_output)
Ejemplo n.º 28
0
    def test__validate_distribution_fqn(self):
        """Test the ``_validate_distribution`` method passing a FQN distribution name.

        When given the fully qualified name of a univariate distribution,
        the method is expected to return that same string.

        Input:
        - A univariate distribution FQN.

        Output:
        - The same FQN string.
        """
        distribution_fqn = 'copulas.univariate.GaussianUnivariate'

        validated = GaussianCopula._validate_distribution(distribution_fqn)

        assert validated == distribution_fqn
Ejemplo n.º 29
0
    def test_get_parameters_non_parametric(self):
        """Test the ``get_parameters`` method when the model is non-parametric.

        ``get_parameters`` is invoked unbound on a mock whose ``_model`` is a
        fitted ``GaussianMultivariate`` built with ``GaussianKDE``; a
        ``NonParametricError`` is expected.

        Setup:
        - ``self._model`` is set to a ``GaussianMultivariate`` that
          uses ``GaussianKDE`` as its ``distribution``.

        Side Effects:
        - A NonParametricError is raised.
        """
        # Setup
        kde_model = GaussianMultivariate(distribution=GaussianKDE())
        kde_model.fit(pd.DataFrame([1, 1, 1]))
        copula_stub = Mock()
        copula_stub._model = kde_model

        # Run, Assert
        with pytest.raises(NonParametricError):
            GaussianCopula.get_parameters(copula_stub)
Ejemplo n.º 30
0
    def test___init__metadata_dict(self, init_mock, from_dict_mock):
        """Test ``__init__`` passing a table_metadata dict.

        In this case, metadata will be loaded from the dict and passed
        to the parent.

        Args:
            init_mock: mock standing in for the parent ``__init__``
                (presumably injected by a ``patch`` decorator that is not
                visible in this chunk -- TODO confirm).
            from_dict_mock: mock whose ``return_value`` is expected as the
                ``table_metadata`` handed to the parent (presumably a patched
                ``Table.from_dict`` -- TODO confirm).

        Input:
            - table_metadata
            - distribution
            - categorical_transformer

        Side Effects
            - attributes are set to the right values
            - super().__init__ is called with the loaded metadata
        """
        table_metadata = {
            'fields': {
                'a_field': {
                    'type': 'categorical'
                },
            },
            'model_kwargs': {
                'GaussianCopula': {
                    'distribution': {
                        'a_field': 'gaussian',
                    },
                    'categorical_transformer': 'categorical_fuzzy',
                }
            }
        }
        gc = GaussianCopula(
            distribution={'a_field': 'gaussian'},
            categorical_transformer='categorical_fuzzy',
            table_metadata=table_metadata,
        )

        assert gc._distribution == {'a_field': 'gaussian'}
        assert gc._categorical_transformer == 'categorical_fuzzy'
        assert gc._DTYPE_TRANSFORMERS == {'O': 'categorical_fuzzy'}

        # The parent receives the loaded metadata object, not the raw dict.
        init_mock.assert_called_once_with(
            field_names=None,
            primary_key=None,
            field_types=None,
            field_transformers=None,
            anonymize_fields=None,
            constraints=None,
            table_metadata=from_dict_mock.return_value,
        )