Esempio n. 1
0
    def test__fit(self, gct_mock, ht_mock, ctgan_fit_mock):
        """Test the ``CopulaGAN._fit`` method.

        The ``_fit`` method is expected to:
        - Build transformers for all the non-categorical data columns based on the
          field distributions.
        - Create a HyperTransformer with all the transformers.
        - Fit and transform the data with the HyperTransformer.
        - Call CTGAN fit.

        Setup:
            - Mock ``_field_distributions`` and ``_default_distribution`` to return the
              desired distribution values.
            - Mock the metadata fields so that only column ``c`` is categorical.

        Input:
            - pandas.DataFrame

        Expected Output:
            - None

        Side Effects:
            - GaussianCopulaTransformer is called with the expected distributions.
            - HyperTransformer is called to create a hyper transformer object.
            - HyperTransformer fit_transform is called once with the input data.
            - CTGAN's fit method is called once.
        """
        # Setup
        # NOTE(review): ``gct_mock``, ``ht_mock`` and ``ctgan_fit_mock`` are presumably
        # injected by ``patch`` decorators that are not visible here -- confirm.
        model = Mock(spec_set=CopulaGAN)
        model._field_distributions = {'a': 'a_distribution'}
        model._default_distribution = 'default_distribution'
        model._metadata.get_fields.return_value = {
            'a': {},
            'b': {},
            'c': {
                'type': 'categorical'
            }
        }

        # Run
        data = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [5, 6, 7],
            'c': ['c', 'c', 'c'],
        })
        out = CopulaGAN._fit(model, data)

        # asserts
        assert out is None
        assert model._field_distributions == {'a': 'a_distribution'}
        gct_mock.assert_has_calls([
            call(distribution='a_distribution'),
            call(distribution='default_distribution'),
        ])
        assert gct_mock.call_count == 2

        assert model._ht == ht_mock.return_value
        # ``called_once_with`` is not an assertion method: on a Mock it is merely an
        # auto-created attribute (and raises AttributeError on Python 3.12+), so the
        # real ``assert_*`` methods are required for these checks to verify anything.
        ht_mock.return_value.fit_transform.assert_called_once_with(DataFrameMatcher(data))
        # NOTE(review): only the call count is asserted here. The argument received by
        # CTGAN's fit is presumably the output of the mocked ``fit_transform`` rather
        # than ``data`` itself -- confirm against ``CopulaGAN._fit`` before tightening.
        ctgan_fit_mock.assert_called_once()
Esempio n. 2
0
    def test__sample_conditions_with_multiple_conditions(self):
        """Test ``BaseTabularModel._sample_conditions`` with multiple conditions.

        ``_make_condition_dfs`` is mocked to return two condition dataframes, and
        ``_sample_with_conditions`` is expected to be called once per dataframe, with
        the concatenation of both results being returned.

        Input:
            - 2 conditions with different columns
        Output:
            - The expected sampled rows
        """
        # Setup
        first_values = {'cola': 'a'}
        second_values = {'colb': 1}
        first_condition = Condition(first_values, num_rows=2)
        second_condition = Condition(second_values, num_rows=3)

        first_sampled = pd.DataFrame({'a': ['a', 'a'], 'b': [1, 2]})
        second_sampled = pd.DataFrame({'a': ['b', 'c', 'a'], 'b': [1, 1, 1]})

        model = Mock(spec_set=CTGAN)
        model._validate_file_path.return_value = None
        model._make_condition_dfs.return_value = [
            pd.DataFrame([first_values] * 2),
            pd.DataFrame([second_values] * 3),
        ]
        # One sampled frame is handed back per call, in order.
        model._sample_with_conditions.side_effect = [first_sampled, second_sampled]

        # Run
        conditions = [first_condition, second_condition]
        out = BaseTabularModel._sample_conditions(model, conditions, 100, None, True, None)

        # Asserts
        expected_calls = [
            call(DataFrameMatcher(pd.DataFrame([values] * repeats)), 100, None, ANY, None)
            for values, repeats in ((first_values, 2), (second_values, 3))
        ]
        model._sample_with_conditions.assert_has_calls(expected_calls)

        expected = pd.DataFrame({
            'a': ['a', 'a', 'b', 'c', 'a'],
            'b': [1, 2, 1, 1, 1],
        })
        pd.testing.assert_frame_equal(out, expected)
Esempio n. 3
0
    def test_sample_no_transformed_columns(self):
        """Test ``GaussianCopula.sample`` when the transformed conditions are empty.

        When the transformed conditions DataFrame has no columns, expect that sample
        passes ``None`` as the transformed conditions when conditionally sampling.

        Setup:
            - Mock ``_make_conditions_df`` to return a dataframe representing the
              expected conditions, and ``get_fields`` to return metadata fields
              containing the conditioned column.
            - Mock ``_metadata.transform`` to return a transformed conditions
              dataframe that has an index but no columns.
            - Mock ``_conditionally_sample_rows`` to return the sampled rows.
            - Mock ``make_ids_unique`` to return the expected sampled rows.
        Input:
            - number of rows
            - one set of conditions
        Output:
            - the expected sampled rows
        Side Effects:
            - Expect ``_conditionally_sample_rows`` to be called with the given condition
              and a transformed_condition of None.
        """
        # Setup
        expected = pd.DataFrame(['a', 'a', 'a'])
        conditions_df = pd.DataFrame({'a': ['a', 'a', 'a']})
        empty_transformed = pd.DataFrame({}, index=[0, 1, 2])
        sampled_rows = pd.DataFrame({
            'a': ['a', 'a', 'a'],
            COND_IDX: [0, 1, 2]
        })

        gaussian_copula = Mock(spec_set=GaussianCopula)
        gaussian_copula._make_conditions_df.return_value = conditions_df
        gaussian_copula._conditionally_sample_rows.return_value = sampled_rows

        metadata = gaussian_copula._metadata
        metadata.get_fields.return_value = ['a']
        metadata.transform.return_value = empty_transformed
        metadata.make_ids_unique.return_value = expected

        # Run
        out = GaussianCopula.sample(gaussian_copula, num_rows=3, conditions={'a': 'a'})

        # Asserts
        expected_conditions = DataFrameMatcher(
            pd.DataFrame({
                COND_IDX: [0, 1, 2],
                'a': ['a', 'a', 'a']
            })
        )
        gaussian_copula._conditionally_sample_rows.assert_called_once_with(
            expected_conditions,
            100,
            10,
            {'a': 'a'},
            None,  # transformed condition: nothing survives the empty transform
            0.01,
            False,
        )
        pd.testing.assert_frame_equal(out, expected)
Esempio n. 4
0
    def test__sample_remaining_columns(self):
        """Test the ``BaseTabularModel._sample_remaining_columns`` method.

        When a valid DataFrame is given, expect ``_sample_with_conditions`` to be
        called with the input DataFrame and its result to be returned.

        Input:
            - DataFrame with condition column values populated.
        Output:
            - The expected sampled rows.
        Side Effects:
            - ``_sample_with_conditions`` is called once.
        """
        # Setup
        known_columns = pd.DataFrame([{'cola': 'a'}] * 5)
        sampled_rows = pd.DataFrame({
            'cola': ['a', 'a', 'a', 'a', 'a'],
            'colb': [1, 2, 1, 1, 1],
        })

        model = Mock(spec_set=CTGAN)
        model._validate_file_path.return_value = None
        model._sample_with_conditions.return_value = sampled_rows

        # Run
        out = BaseTabularModel._sample_remaining_columns(
            model,
            known_columns,
            100,
            None,
            True,
            None,
        )

        # Asserts
        model._sample_with_conditions.assert_called_once_with(
            DataFrameMatcher(known_columns),
            100,
            None,
            ANY,
            None,
        )
        pd.testing.assert_frame_equal(out, sampled_rows)
Esempio n. 5
0
    def test_fit_with_null_values(self):
        """Test the ``TabularPreset.fit`` method with null values.

        Expect that the model's fit method is called with the expected args, and that
        the null percentage is stored per column.

        Setup:
        - ``_null_column`` is ``False``
        - column ``a`` contains one null out of three values

        Input:
        - fit data

        Side Effects:
        - The model's ``fit`` method is called with the same data.
        - ``_null_percentages`` maps ``a`` to one third.
        """
        # Setup
        metadata = Mock(**{'to_dict.return_value': {'fields': {'a': {}}}})
        model = Mock(_metadata=metadata)
        preset = Mock(_model=model, _null_column=False, _null_percentages=None)

        raw_values = {'a': [1, 2, np.nan]}
        fit_data = pd.DataFrame(raw_values)

        # Run
        TabularPreset.fit(preset, fit_data)

        # Assert
        # The expectation is rebuilt from the raw values so it does not share the
        # object that was handed to ``fit``.
        expected_data = DataFrameMatcher(pd.DataFrame(raw_values))
        model.fit.assert_called_once_with(expected_data)
        assert preset._null_percentages == {'a': 1.0 / 3}
Esempio n. 6
0
    def test_fit_null_column_True(self):
        """Test the ``TabularPreset.fit`` method when null columns are modeled.

        Expect that the model's fit method is called with the expected args when
        ``_null_column`` is set to ``True``, and that no null percentages are stored.

        Setup:
        - _null_column is True

        Input:
        - fit data

        Side Effects:
        - The model's ``fit`` method is called with the same data.
        - ``_null_percentages`` is ``None``
        """
        # Setup
        metadata = Mock(**{'to_dict.return_value': {'fields': {}}})
        model = Mock(_metadata=metadata)
        preset = Mock(_model=model, _null_column=True, _null_percentages=None)
        fit_data = pd.DataFrame()

        # Run
        TabularPreset.fit(preset, fit_data)

        # Assert
        expected_data = DataFrameMatcher(pd.DataFrame())
        model.fit.assert_called_once_with(expected_data)
        assert preset._null_percentages is None
Esempio n. 7
0
    def test_fit(self):
        """Test the ``TabularPreset.fit`` method.

        Expect that the model's fit method is called with the expected args.

        Input:
        - fit data

        Side Effects:
        - The model's ``fit`` method is called with the same data.
        - ``_null_percentages`` is left as ``None``.
        """
        # Setup
        metadata = Mock(**{'to_dict.return_value': {'fields': {}}})
        model = Mock(_metadata=metadata)
        # NOTE(review): ``_null_column`` is never set here, so it is an auto-created
        # (truthy) Mock attribute -- confirm this is the intended configuration.
        preset = Mock(_model=model, _null_percentages=None)
        fit_data = pd.DataFrame()

        # Run
        TabularPreset.fit(preset, fit_data)

        # Assert
        expected_data = DataFrameMatcher(pd.DataFrame())
        model.fit.assert_called_once_with(expected_data)
        assert preset._null_percentages is None
Esempio n. 8
0
    def test__sample_with_conditions_no_transformed_columns(self):
        """Test ``BaseTabularModel._sample_with_conditions`` with no transformed columns.

        When the transformed conditions DataFrame has no columns, expect that
        ``None`` is passed as the transformed conditions when conditionally sampling.

        Setup:
            - Mock ``_make_condition_dfs`` to return a dataframe representing the
              expected conditions, and ``get_fields`` to return metadata fields
              containing the conditioned column.
            - Mock ``_metadata.transform`` to return a transformed conditions
              dataframe that has an index but no columns.
            - Mock ``_conditionally_sample_rows`` to return the sampled rows.
            - Mock ``make_ids_unique`` to return the expected sampled rows.
        Input:
            - conditions dataframe
            - max tries
            - batch size per try
        Output:
            - the expected sampled rows
        Side Effects:
            - Expect ``_conditionally_sample_rows`` to be called with the given condition
              and a transformed_condition of None.
        """
        # Setup
        expected = pd.DataFrame(['a', 'a', 'a'])
        condition_dataframe = pd.DataFrame({'a': ['a', 'a', 'a']})
        empty_transformed = pd.DataFrame({}, index=[0, 1, 2])
        sampled_rows = pd.DataFrame({
            'a': ['a', 'a', 'a'],
            COND_IDX: [0, 1, 2]
        })

        model = Mock(spec_set=CTGAN)
        model._make_condition_dfs.return_value = condition_dataframe
        model._conditionally_sample_rows.return_value = sampled_rows

        metadata = model._metadata
        metadata.get_fields.return_value = ['a']
        metadata.transform.return_value = empty_transformed
        metadata.make_ids_unique.return_value = expected

        # Run
        out = BaseTabularModel._sample_with_conditions(
            model,
            condition_dataframe,
            100,
            None,
        )

        # Asserts
        expected_conditions = DataFrameMatcher(
            pd.DataFrame({
                COND_IDX: [0, 1, 2],
                'a': ['a', 'a', 'a']
            })
        )
        model._conditionally_sample_rows.assert_called_once_with(
            expected_conditions,
            {'a': 'a'},
            None,  # transformed condition: nothing survives the empty transform
            100,
            None,
            progress_bar=None,
            output_file_path=None,
        )
        pd.testing.assert_frame_equal(out, expected)
Esempio n. 9
0
    def test__sample_batch_with_batch_size_per_try(self):
        """Test the ``BaseTabularModel._sample_batch`` method with ``batch_size_per_try``.

        Expect that the expected calls to ``_sample_rows`` are made.

        Setup:
            - Mock ``_sample_rows`` to return a ``(DataFrame, count)`` tuple on each
              call: first 5 rows with a count of 5, then 10 rows with a count of 10.
        Input:
            - num_rows = 10
            - batch_size_per_try = 5
        Output:
            - The requested number of sampled rows.
        Side Effect:
            - Call ``_sample_rows`` method twice with the expected number of rows.
        """
        # Setup
        model = Mock(spec_set=CTGAN)
        sampled_data = pd.DataFrame({
            'column1': [28, 28, 21, 1, 2],
            'column2': [37, 37, 1, 4, 5],
            'column3': [93, 93, 6, 4, 12],
        })
        # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` builds the
        # same stacked frame (the original index is kept, as with ignore_index=False).
        doubled_data = pd.concat([sampled_data, sampled_data], ignore_index=False)
        model._sample_rows.side_effect = [
            (sampled_data, 5),
            (doubled_data, 10),
        ]

        # Run
        output = BaseTabularModel._sample_batch(model,
                                                num_rows=10,
                                                batch_size_per_try=5)

        # Assert
        # ``has_calls`` is not a Mock assertion: it is an auto-created, always-truthy
        # attribute (and an AttributeError on Python 3.12+), so ``assert mock.has_calls``
        # could never fail. ``assert_has_calls`` performs the real verification.
        model._sample_rows.assert_has_calls([
            call(5, None, None, 0.01, DataFrameMatcher(pd.DataFrame())),
            call(5, None, None, 0.01, DataFrameMatcher(sampled_data)),
        ])
        assert len(output) == 10