Example No. 1
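These snippets are test methods taken from a larger test module, so their imports and decorators are not shown. The block below is a minimal sketch of what a standalone run would need; the module paths are assumptions based on the CTGAN project layout and may differ between versions. Examples No. 7 and No. 9 additionally receive a MockBGM argument, presumably injected by an unittest.mock.patch decorator over the BayesGMMTransformer, which is likewise not shown.

# Assumed imports for the snippets below; the paths follow the CTGAN layout
# (ctgan/data_transformer.py and ctgan/synthesizers/ctgan.py) but are not
# guaranteed to match every release.
from unittest.mock import Mock

import numpy as np
import pandas as pd
import torch

from ctgan.data_transformer import ColumnTransformInfo, DataTransformer, SpanInfo
from ctgan.synthesizers.ctgan import CTGANSynthesizer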
    def test_convert_column_name_value_to_id_multiple(self):
        ohe = Mock()
        ohe.transform.return_value = np.array(
            [[0, 1, 0]  # one hot encoding, second dimension
             ])
        transformer = DataTransformer()
        transformer._column_transform_info_list = [
            ColumnTransformInfo(
                column_name='x',
                column_type='continuous',
                transform=None,
                transform_aux=None,
                output_info=[SpanInfo(1, 'tanh'),
                             SpanInfo(3, 'softmax')],
                output_dimensions=1 + 3),
            ColumnTransformInfo(column_name='y',
                                column_type='discrete',
                                transform=ohe,
                                transform_aux=None,
                                output_info=[SpanInfo(2, 'softmax')],
                                output_dimensions=2),
            ColumnTransformInfo(column_name='z',
                                column_type='discrete',
                                transform=ohe,
                                transform_aux=None,
                                output_info=[SpanInfo(2, 'softmax')],
                                output_dimensions=2)
        ]

        result = transformer.convert_column_name_value_to_id('z', 'yes')
        assert result['column_id'] == 2  # this is the 3rd column
        assert result['discrete_column_id'] == 1  # this is the 2nd discrete column
        assert result['value_id'] == 1  # this is the 2nd dimension in the one hot encoding
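The ``value_id`` in the last assertion is simply the position of the 1 in the one-hot row returned by the mocked encoder, which is why ``ohe.transform`` is set to return ``[[0, 1, 0]]`` and the test expects ``1``. A small illustration of that relationship (not the library's internal code):

import numpy as np

one_hot_row = np.array([[0, 1, 0]])     # the mocked ohe.transform output
value_id = int(np.argmax(one_hot_row))  # -> 1, i.e. the second dimension
assert value_id == 1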
Example No. 2
    def test__apply_activate_(self):
        """Test `_apply_activate` for tables with both continuous and categorical columns.

        Check that every continuous dimension has all values between -1 and 1
        (since they pass through tanh), and that every categorical dimension
        stays between 0 and 1 (since they pass through softmax).

        Setup:
            - Mock `self._transformer.output_info_list`

        Input:
            - data = tensor of shape (N, data_dims)

        Output:
            - tensor = tensor of shape (N, data_dims)
        """
        model = CTGANSynthesizer()
        model._transformer = Mock()
        model._transformer.output_info_list = [
            [SpanInfo(3, 'softmax')],
            [SpanInfo(1, 'tanh'), SpanInfo(2, 'softmax')],
        ]

        data = torch.randn(100, 6)
        result = model._apply_activate(data)

        assert result.shape == (100, 6)
        _assert_is_between(result[:, 0:3], 0.0, 1.0)
        _assert_is_between(result[:, 3], -1.0, 1.0)  # the tanh span is column 3
        _assert_is_between(result[:, 4:6], 0.0, 1.0)
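The helper ``_assert_is_between`` is used here but presumably defined elsewhere in the test module. A plausible sketch, assuming it simply bounds-checks every element of a tensor (the real helper may differ):

def _assert_is_between(data, lower, upper):
    """Assert that every element of ``data`` lies inside [lower, upper]."""
    assert (data >= lower).all(), f'found values below {lower}'
    assert (data <= upper).all(), f'found values above {upper}'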
Example No. 3
    def test_fit(self):
        """Test 'fit' on a dataframe with one continuous and one discrete column.

        The 'fit' method should:
            - Set 'self.dataframe' to 'True'
            - Set 'self._column_raw_dtypes' to the appropriate dtypes
            - Use the appropriate '_fit' method for each column type
            - Update 'self.output_info_list', 'self.output_dimensions' and
            'self._column_transform_info_list' appropriately

        Setup:
            - Create DataTransformer
            - Mock _fit_discrete
            - Mock _fit_continuous

        Input:
            - raw_data = a table with one continuous and one discrete column.
            - discrete_columns = list with the name of the discrete column

        Output:
            - None

        Side Effects:
            - _fit_discrete and _fit_continuous should each be called once
            - Assigns 'self._column_raw_dtypes' the appropriate dtypes
            - Assigns 'self.output_info_list' the appropriate 'output_info'.
            - Assigns 'self.output_dimensions' the appropriate 'output_dimensions'.
            - Assigns 'self._column_transform_info_list' the appropriate 'column_transform_info'.
        """
        data = pd.DataFrame({
            "x": np.random.random(size=100),
            "y": np.random.choice(["yes", "no"], size=100)
        })

        transformer = DataTransformer()
        transformer._fit_continuous = Mock()
        transformer._fit_continuous.return_value = ColumnTransformInfo(
            column_name="x",
            column_type="continuous",
            transform=None,
            transform_aux=None,
            output_info=[SpanInfo(1, 'tanh'),
                         SpanInfo(3, 'softmax')],
            output_dimensions=1 + 3)

        transformer._fit_discrete = Mock()
        transformer._fit_discrete.return_value = ColumnTransformInfo(
            column_name="y",
            column_type="discrete",
            transform=None,
            transform_aux=None,
            output_info=[SpanInfo(2, 'softmax')],
            output_dimensions=2)

        transformer.fit(data, discrete_columns=["y"])

        transformer._fit_discrete.assert_called_once()
        transformer._fit_continuous.assert_called_once()
        assert transformer.output_dimensions == 6
Example No. 4
    def test_fit(self):
        """Test ``fit`` on a dataframe with one continuous and one discrete column.

        The ``fit`` method should:
            - Set ``self.dataframe`` to ``True``.
            - Set ``self._column_raw_dtypes`` to the appropriate dtypes.
            - Use the appropriate ``_fit`` method for each column type.
            - Update ``self.output_info_list``, ``self.output_dimensions`` and
            ``self._column_transform_info_list`` appropriately.

        Setup:
            - Create ``DataTransformer``.
            - Mock ``_fit_discrete``.
            - Mock ``_fit_continuous``.

        Input:
            - A table with one continuous and one discrete column.
            - A list with the name of the discrete column.

        Side Effects:
            - ``_fit_discrete`` and ``_fit_continuous`` should each be called once.
            - Assigns ``self._column_raw_dtypes`` the appropriate dtypes.
            - Assigns ``self.output_info_list`` the appropriate ``output_info``.
            - Assigns ``self.output_dimensions`` the appropriate ``output_dimensions``.
            - Assigns ``self._column_transform_info_list`` the appropriate
            ``column_transform_info``.
        """
        # Setup
        transformer = DataTransformer()
        transformer._fit_continuous = Mock()
        transformer._fit_continuous.return_value = ColumnTransformInfo(
            column_name='x',
            column_type='continuous',
            transform=None,
            output_info=[SpanInfo(1, 'tanh'),
                         SpanInfo(3, 'softmax')],
            output_dimensions=1 + 3)

        transformer._fit_discrete = Mock()
        transformer._fit_discrete.return_value = ColumnTransformInfo(
            column_name='y',
            column_type='discrete',
            transform=None,
            output_info=[SpanInfo(2, 'softmax')],
            output_dimensions=2)

        data = pd.DataFrame({
            'x': np.random.random(size=100),
            'y': np.random.choice(['yes', 'no'], size=100)
        })

        # Run
        transformer.fit(data, discrete_columns=['y'])

        # Assert
        transformer._fit_discrete.assert_called_once()
        transformer._fit_continuous.assert_called_once()
        assert transformer.output_dimensions == 6
Example No. 5
    def test__cond_loss(self):
        """Test `_cond_loss`.

        Test that the loss is purely a function of the target categorical.

        Setup:
            - mock transformer.output_info_list
            - create two categoricals, one continuous
            - compute the conditional loss, conditioned on the 1st categorical
            - compare the loss to the cross-entropy of the 1st categorical, manually computed

        Input:
            data - the synthetic data generated by the model
            c - a tensor with the same shape as the data but with only a specific one-hot vector
                corresponding to the target column filled in
            m - binary mask used to select the categorical column to condition on

        Output:
            loss scalar; this should only be affected by the target column

        Note:
            - even though the implementation of this is probably right, I'm not sure if the idea
              behind it is correct
        """
        model = CTGANSynthesizer()
        model._transformer = Mock()
        model._transformer.output_info_list = [
            [SpanInfo(1, 'tanh'), SpanInfo(2, 'softmax')],
            [SpanInfo(3, 'softmax')],  # the categorical column we are conditioning on
            [SpanInfo(2, 'softmax')],  # the categorical column we are not conditioning on
        ]

        data = torch.tensor([
            # first 3 dims ignored, next 3 dims are the prediction, last 2 dims are ignored
            [0.0, -1.0, 0.0, 0.05, 0.05, 0.9, 0.1, 0.4],
        ])

        c = torch.tensor([
            # first 3 dims are a one-hot for the categorical,
            # next 2 are for a different categorical that we are not conditioning on
            # (continuous values are not stored in this tensor)
            [0.0, 0.0, 1.0, 0.0, 0.0],
        ])

        # this indicates that we are conditioning on the first categorical
        m = torch.tensor([[1, 0]])

        result = model._cond_loss(data, c, m)
        expected = torch.nn.functional.cross_entropy(
            torch.tensor([
                [0.05, 0.05, 0.9],  # 3 categories, one hot
            ]),
            torch.tensor([2]))

        assert (result - expected).abs() < 1e-3
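For reference, the expected value can be reproduced by hand: ``cross_entropy`` treats the 3-dimensional slice as logits, applies ``log_softmax``, and returns the negative log-probability of the target class (index 2 here). A quick sketch of the manual computation:

import torch

logits = torch.tensor([0.05, 0.05, 0.9])
log_probs = logits - torch.logsumexp(logits, dim=0)  # log_softmax by hand
manual_loss = -log_probs[2]                          # target class is index 2

assert torch.isclose(
    manual_loss,
    torch.nn.functional.cross_entropy(logits.unsqueeze(0), torch.tensor([2])))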
Example No. 6
    def test_convert_column_name_value_to_id(self):
        """Test ``convert_column_name_value_to_id`` on a simple ``_column_transform_info_list``.

        Tests that the appropriate indexes are returned when a table of two columns,
        one continuous and one discrete, is passed as ``_column_transform_info_list``.

        Setup:
            - Mock ``_column_transform_info_list``.

        Input:
            - column_name = the name of a discrete column
            - value = the categorical value

        Output:
            - dictionary containing:
              - ``discrete_column_id`` = the index of the target column,
                when considering only discrete columns
              - ``column_id`` = the index of the target column
                (e.g. 2 = the third column in the data, since indexes are zero-based)
              - ``value_id`` = the index of the indicator value in the one-hot encoding
        """
        # Setup
        ohe = Mock()
        ohe.transform.return_value = pd.DataFrame(
            [[0, 1]  # one hot encoding, second dimension
             ])
        transformer = DataTransformer()
        transformer._column_transform_info_list = [
            ColumnTransformInfo(
                column_name='x',
                column_type='continuous',
                transform=None,
                output_info=[SpanInfo(1, 'tanh'),
                             SpanInfo(3, 'softmax')],
                output_dimensions=1 + 3),
            ColumnTransformInfo(column_name='y',
                                column_type='discrete',
                                transform=ohe,
                                output_info=[SpanInfo(2, 'softmax')],
                                output_dimensions=2)
        ]

        # Run
        result = transformer.convert_column_name_value_to_id('y', 'yes')

        # Assert
        assert result['column_id'] == 1  # this is the 2nd column
        assert result['discrete_column_id'] == 0  # this is the 1st discrete column
        assert result['value_id'] == 1  # this is the 2nd dimension in the one hot encoding
Example No. 7
    def test__transform_continuous(self, MockBGM):
        """Test ``_transform_continuous``.

        Setup:
            - Mock the ``BayesGMMTransformer`` with the transform method returning
            some dataframe.
            - Create ``DataTransformer``.

        Input:
            - ``ColumnTransformInfo`` object.
            - A dataframe containing a continuous column.

        Output:
            - A np.array where the first column contains the normalized part
            of the mocked transform, and the other columns are a one hot encoding
            representation of the component part of the mocked transform.
        """
        # Setup
        bgm_instance = MockBGM.return_value
        bgm_instance.transform.return_value = pd.DataFrame({
            'x.normalized': [0.1, 0.2, 0.3],
            'x.component': [0.0, 1.0, 1.0]
        })

        transformer = DataTransformer()
        data = pd.DataFrame({'x': np.array([0.1, 0.3, 0.5])})
        column_transform_info = ColumnTransformInfo(
            column_name='x',
            column_type='continuous',
            transform=bgm_instance,
            output_info=[SpanInfo(1, 'tanh'),
                         SpanInfo(3, 'softmax')],
            output_dimensions=1 + 3)

        # Run
        result = transformer._transform_continuous(column_transform_info, data)

        # Assert
        expected = np.array([
            [0.1, 1, 0, 0],
            [0.2, 0, 1, 0],
            [0.3, 0, 1, 0],
        ])
        np.testing.assert_array_equal(result, expected)
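The mocked transform hands back ``x.component`` as float labels, and ``_transform_continuous`` is expected to expand those labels into the one-hot block of the output. A minimal numpy sketch of that mapping (illustrative, not the library's actual implementation):

import numpy as np
import pandas as pd

transformed = pd.DataFrame({
    'x.normalized': [0.1, 0.2, 0.3],
    'x.component': [0.0, 1.0, 1.0],
})

num_components = 3  # matches SpanInfo(3, 'softmax') above
output = np.zeros((len(transformed), 1 + num_components))
output[:, 0] = transformed['x.normalized'].to_numpy()
components = transformed['x.component'].to_numpy().astype(int)
output[np.arange(len(transformed)), 1 + components] = 1.0
# output now equals the ``expected`` array asserted in the test above.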
Example No. 8
    def test_transform(self):
        """Test 'transform' on a dataframe with one continuous and one discrete column.

        It should use the appropriate '_transform' method for each column and should return
        them concatenated appropriately.

        Setup:
            - Mock _column_transform_info_list
            - Mock _transform_discrete
            - Mock _transform_continuous

        Input:
            - raw_data = a table with one continuous and one discrete column.

        Output:
            - numpy array containing the transformed two columns

        Side Effects:
            - _transform_discrete and _transform_continuous should each be called once.
        """
        data = pd.DataFrame({
            "x": np.array([0.1, 0.3, 0.5]),
            "y": np.array(["yes", "yes", "no"])
        })

        transformer = DataTransformer()
        transformer._column_transform_info_list = [
            ColumnTransformInfo(
                column_name="x",
                column_type="continuous",
                transform=None,
                transform_aux=None,
                output_info=[SpanInfo(1, 'tanh'),
                             SpanInfo(3, 'softmax')],
                output_dimensions=1 + 3),
            ColumnTransformInfo(column_name="y",
                                column_type="discrete",
                                transform=None,
                                transform_aux=None,
                                output_info=[SpanInfo(2, 'softmax')],
                                output_dimensions=2)
        ]

        transformer._transform_continuous = Mock()
        selected_normalized_value = np.array([[0.1], [0.3], [0.5]])
        selected_component_onehot = np.array([
            [1, 0, 0],
            [1, 0, 0],
            [1, 0, 0],
        ])
        return_value = (selected_normalized_value, selected_component_onehot)
        transformer._transform_continuous.return_value = return_value

        transformer._transform_discrete = Mock()
        transformer._transform_discrete.return_value = [
            np.array([
                [0, 1],
                [0, 1],
                [1, 0],
            ])
        ]

        result = transformer.transform(data)
        transformer._transform_continuous.assert_called_once()
        transformer._transform_discrete.assert_called_once()

        expected = np.array([
            [0.1, 1, 0, 0, 0, 1],
            [0.3, 1, 0, 0, 0, 1],
            [0.5, 1, 0, 0, 1, 0],
        ])

        assert result.shape == (3, 6)
        assert (result[:, 0] == expected[:, 0]).all(), "continuous-cdf"
        assert (result[:, 1:4] == expected[:, 1:4]).all(), "continuous-softmax"
        assert (result[:, 4:6] == expected[:, 4:6]).all(), "discrete"
Example No. 9
    def test__inverse_transform_continuous(self, MockBGM):
        """Test ``_inverse_transform_continuous``.

        Setup:
            - Create ``DataTransformer``.
            - Mock the ``BayesGMMTransformer`` where:
                - ``get_output_types`` returns the appropriate dictionary.
                - ``reverse_transform`` returns some dataframe.

        Input:
            - A ``ColumnTransformInfo`` object.
            - A np.ndarray where:
              - The first column contains the normalized value
              - The remaining columns correspond to the one-hot
            - sigmas = np.ndarray of floats
            - st = index of the sigmas ndarray

        Output:
            - Dataframe where the first column contains floats and the second is a label encoding.

        Side Effects:
            - The ``reverse_transform`` method should be called with a dataframe
            where the first column contains floats and the second is a label encoding.
        """
        # Setup
        bgm_instance = MockBGM.return_value
        bgm_instance.get_output_types.return_value = {
            'x.normalized': 'numerical',
            'x.component': 'numerical'
        }

        bgm_instance.reverse_transform.return_value = pd.DataFrame({
            'x.normalized': [0.1, 0.2, 0.3],
            'x.component': [0.0, 1.0, 1.0]
        })

        transformer = DataTransformer()
        column_data = np.array([
            [0.1, 1, 0, 0],
            [0.3, 0, 1, 0],
            [0.5, 0, 1, 0],
        ])

        column_transform_info = ColumnTransformInfo(
            column_name='x',
            column_type='continuous',
            transform=bgm_instance,
            output_info=[SpanInfo(1, 'tanh'),
                         SpanInfo(3, 'softmax')],
            output_dimensions=1 + 3)

        # Run
        result = transformer._inverse_transform_continuous(
            column_transform_info, column_data, None, None)

        # Assert
        expected = pd.DataFrame({
            'x.normalized': [0.1, 0.2, 0.3],
            'x.component': [0.0, 1.0, 1.0]
        })

        np.testing.assert_array_equal(result, expected)

        expected_data = pd.DataFrame({
            'x.normalized': [0.1, 0.3, 0.5],
            'x.component': [0, 1, 1]
        })

        pd.testing.assert_frame_equal(
            bgm_instance.reverse_transform.call_args[0][0], expected_data)
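Before calling ``reverse_transform``, ``_inverse_transform_continuous`` has to undo the one-hot encoding: keep the first column as the normalized value and collapse the remaining columns back into a component label with an argmax. A hedged sketch of that step (names are illustrative, not the library's internals):

import numpy as np
import pandas as pd

column_data = np.array([
    [0.1, 1, 0, 0],
    [0.3, 0, 1, 0],
    [0.5, 0, 1, 0],
])

recovered = pd.DataFrame({
    'x.normalized': column_data[:, 0],
    'x.component': np.argmax(column_data[:, 1:], axis=1),
})
# ``recovered`` matches the ``expected_data`` frame that the test checks
# against ``reverse_transform.call_args``.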
Example No. 10
    def test_transform(self):
        """Test ``transform`` on a dataframe with one continuous and one discrete column.

        It should use the appropriate ``_transform`` method for each column and should return
        them concatenated appropriately.

        Setup:
            - Initialize a ``DataTransformer`` with a ``column_transform_info`` detailing
            a continuous and a discrete columns.
            - Mock the ``_transform_discrete`` and ``_transform_continuous`` methods.

        Input:
            - A table with one continuous and one discrete column.

        Output:
            - np.array containing the transformed columns.

        Side Effects:
            - ``_transform_discrete`` and ``_transform_continuous`` should each be called once.
        """
        # Setup
        data = pd.DataFrame({
            'x': np.array([0.1, 0.3, 0.5]),
            'y': np.array(['yes', 'yes', 'no'])
        })

        transformer = DataTransformer()
        transformer._column_transform_info_list = [
            ColumnTransformInfo(
                column_name='x',
                column_type='continuous',
                transform=None,
                output_info=[SpanInfo(1, 'tanh'),
                             SpanInfo(3, 'softmax')],
                output_dimensions=1 + 3),
            ColumnTransformInfo(column_name='y',
                                column_type='discrete',
                                transform=None,
                                output_info=[SpanInfo(2, 'softmax')],
                                output_dimensions=2)
        ]

        transformer._transform_continuous = Mock()
        selected_normalized_value = np.array([[0.1], [0.3], [0.5]])
        selected_component_onehot = np.array([
            [1, 0, 0],
            [0, 1, 0],
            [0, 1, 0],
        ])
        return_value = np.concatenate(
            (selected_normalized_value, selected_component_onehot), axis=1)
        transformer._transform_continuous.return_value = return_value

        transformer._transform_discrete = Mock()
        transformer._transform_discrete.return_value = np.array([
            [0, 1],
            [0, 1],
            [1, 0],
        ])

        # Run
        result = transformer.transform(data)

        # Assert
        transformer._transform_continuous.assert_called_once()
        transformer._transform_discrete.assert_called_once()

        expected = np.array([
            [0.1, 1, 0, 0, 0, 1],
            [0.3, 0, 1, 0, 0, 1],
            [0.5, 0, 1, 0, 1, 0],
        ])
        assert result.shape == (3, 6)
        assert (result[:, 0] == expected[:, 0]).all(), 'continuous-cdf'
        assert (result[:, 1:4] == expected[:, 1:4]).all(), 'continuous-softmax'
        assert (result[:, 4:6] == expected[:, 4:6]).all(), 'discrete'
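``transform`` itself then only has to stitch the per-column outputs together in column order, which is why the mocked return values line up exactly with the expected array. A sketch of that final concatenation step:

import numpy as np

continuous_part = np.array([
    [0.1, 1, 0, 0],
    [0.3, 0, 1, 0],
    [0.5, 0, 1, 0],
])
discrete_part = np.array([
    [0, 1],
    [0, 1],
    [1, 0],
])

# 4 continuous output dimensions + 2 discrete ones -> shape (3, 6)
full = np.concatenate([continuous_part, discrete_part], axis=1)
assert full.shape == (3, 6)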