Python CTGANSynthesizer Beispiele, ctgan.synthesizers.ctgan.CTGANSynthesizer Python Beispiele

Beispiel #1

0

Datei anzeigen

def test_wrong_discrete_columns_numpy():
    """Check that `fit` raises `ValueError` for a bad discrete index on a numpy array."""
    frame = pd.DataFrame({'discrete': ['a', 'b']})
    # The converted array has a single column, so index 1 does not exist.
    column_indices = [0, 1]
    synthesizer = CTGANSynthesizer(epochs=1)

    with pytest.raises(ValueError):
        synthesizer.fit(frame.to_numpy(), column_indices)

Beispiel #2

0

Datei anzeigen

    def test__apply_activate_(self):
        """Test `_apply_activate` for tables with both continuous and categoricals.

        Check that the continuous (tanh) column has all values between -1 and 1
        and that every categorical (softmax) span has all values between 0 and 1.

        Setup:
            - Mock `self._transformer.output_info_list`

        Input:
            - data = tensor of shape (N, data_dims)

        Output:
            - tensor = tensor of shape (N, data_dims)
        """
        model = CTGANSynthesizer()
        model._transformer = Mock()
        # Resulting column layout: [0:3] softmax | [3] tanh | [4:6] softmax.
        model._transformer.output_info_list = [
            [SpanInfo(3, 'softmax')],
            [SpanInfo(1, 'tanh'), SpanInfo(2, 'softmax')],
        ]

        data = torch.randn(100, 6)
        result = model._apply_activate(data)

        assert result.shape == (100, 6)
        _assert_is_between(result[:, 0:3], 0.0, 1.0)
        # Slice the tanh COLUMN (index 3) over all rows. The previous
        # `result[:3]` selected the first three rows of every column instead,
        # so the continuous column was never checked in isolation.
        _assert_is_between(result[:, 3:4], -1.0, 1.0)
        _assert_is_between(result[:, 4:6], 0.0, 1.0)

Beispiel #3

0

Datei anzeigen

    def test__validate_discrete_columns(self):
        """Test that fitting with a non-existent discrete column raises an error.

        Only the `pandas.DataFrame` path is exercised here; the error is
        presumably produced by `_validate_discrete_columns`, reached through
        `fit` (to be confirmed against the synthesizer implementation).

        Setup:
            - Build a dataframe holding a single discrete column
            - Pick a `discrete_columns` entry that is not a column of that dataframe

        Input:
            - train_data = pandas.DataFrame
            - discrete_columns = list of strings

        Output:
            None

        Side Effects:
            - `ValueError` is raised because the discrete column is invalid.

        Note:
            - a separate test could cover the numpy array input
            - TODO: this is currently an integration test, it needs to become a proper unit test
        """
        frame = pd.DataFrame({'discrete': ['a', 'b']})
        unknown_columns = ['doesnt exist']
        synthesizer = CTGANSynthesizer(epochs=1)

        with pytest.raises(ValueError):
            synthesizer.fit(frame, unknown_columns)

Beispiel #4

0

Datei anzeigen

def test_wrong_discrete_columns_numpy():
    """Check the error raised when a numpy array is fit with a missing discrete index."""
    frame = pd.DataFrame({'discrete': ['a', 'b']})
    # The converted array has a single column, so only index 1 is invalid.
    column_indices = [0, 1]
    expected_message = r'Invalid columns found: \[1\]'
    synthesizer = CTGANSynthesizer(epochs=1)

    with pytest.raises(ValueError, match=expected_message):
        synthesizer.fit(frame.to_numpy(), column_indices)

Beispiel #5

0

Datei anzeigen

def test_wrong_discrete_columns_dataframe():
    """Check the error raised when a dataframe is fit with missing discrete column names."""
    frame = pd.DataFrame({'discrete': ['a', 'b']})
    # Neither name is a column of the dataframe.
    missing_names = ['b', 'c']
    # Wildcards stand in for the two names instead of spelling them out.
    expected_message = "Invalid columns found: {'.*', '.*'}"
    synthesizer = CTGANSynthesizer(epochs=1)

    with pytest.raises(ValueError, match=expected_message):
        synthesizer.fit(frame, missing_names)

Beispiel #6

0

Datei anzeigen

def test_synthesizer_sample():
    """Fit on one discrete column and check that `sample` returns a dataframe.

    `sample` is called with `'discrete'` and `'a'` as extra positional
    arguments, presumably the condition column and value (confirm against the
    `sample` signature).
    """
    categories = np.random.choice(['a', 'b', 'c'], 100)
    frame = pd.DataFrame({'discrete': categories})

    synthesizer = CTGANSynthesizer(epochs=1)
    synthesizer.fit(frame, ['discrete'])

    drawn = synthesizer.sample(1000, 'discrete', 'a')
    assert isinstance(drawn, pd.DataFrame)

Beispiel #7

0

Datei anzeigen

    def test__cond_loss(self):
        """Test `_cond_loss`.

        The loss is expected to depend only on the categorical column being
        conditioned on.

        Setup:
            - mock `transformer.output_info_list`
            - describe one continuous column and two categorical columns
            - compute the conditional loss, conditioned on the 1st categorical
            - compare it to a manually computed cross-entropy of the 1st categorical

        Input:
            data - the synthetic data generated by the model
            c - a tensor holding the one-hot vectors of the categorical columns, with only
                the one for the target column filled in
            m - binary mask used to select the categorical column to condition on

        Output:
            loss scalar; this should only be affected by the target column

        Note:
            - even though the implementation of this is probably right, I'm not sure if the idea
              behind it is correct
        """
        model = CTGANSynthesizer()
        model._transformer = Mock()

        continuous_column = [SpanInfo(1, 'tanh'), SpanInfo(2, 'softmax')]
        # this is the categorical column we are conditioning on
        target_categorical = [SpanInfo(3, 'softmax')]
        # this is the categorical column we are not conditioning on
        other_categorical = [SpanInfo(2, 'softmax')]
        model._transformer.output_info_list = [
            continuous_column,
            target_categorical,
            other_categorical,
        ]

        # first 3 dims ignored, next 3 dims are the prediction, last 2 dims are ignored
        generated = torch.tensor([[0.0, -1.0, 0.0, 0.05, 0.05, 0.9, 0.1, 0.4]])

        # first 3 dims are a one-hot for the target categorical,
        # next 2 are for the other categorical that we are not conditioning on
        # (continuous values are not stored in this tensor)
        condition = torch.tensor([[0.0, 0.0, 1.0, 0.0, 0.0]])

        # this indicates that we are conditioning on the first categorical
        mask = torch.tensor([[1, 0]])

        actual_loss = model._cond_loss(generated, condition, mask)

        # 3 categories for the target column; index 2 is the hot one in `condition`
        target_prediction = torch.tensor([[0.05, 0.05, 0.9]])
        target_index = torch.tensor([2])
        reference_loss = torch.nn.functional.cross_entropy(target_prediction, target_index)

        assert torch.abs(actual_loss - reference_loss) < 1e-3

Beispiel #8

0

Datei anzeigen

def test_ctgan_no_categoricals():
    """Fit on a single continuous column with no discrete columns and sample from it."""
    values = np.random.random(1000)
    frame = pd.DataFrame({'continuous': values})

    synthesizer = CTGANSynthesizer(epochs=1)
    synthesizer.fit(frame, [])

    drawn = synthesizer.sample(100)

    # The sample is a dataframe with the requested row count and the original column.
    assert isinstance(drawn, pd.DataFrame)
    assert drawn.shape == (100, 1)
    assert set(drawn.columns) == {'continuous'}

Beispiel #9

0

Datei anzeigen