Beispiel #1
0
def test_wrong_discrete_columns_numpy():
    """Check that ``fit`` rejects discrete column indices missing from a numpy array.

    The training array is built from a single-column dataframe, while the
    discrete column indices passed to ``fit`` are ``[0, 1]``; the call is
    expected to raise a ``ValueError``.
    """
    frame = pd.DataFrame({'discrete': ['a', 'b']})
    column_indices = [0, 1]
    synthesizer = CTGANSynthesizer(epochs=1)

    with pytest.raises(ValueError):
        synthesizer.fit(frame.to_numpy(), column_indices)
Beispiel #2
0
    def test__apply_activate_(self):
        """Test `_apply_activate` for tables with both continuous and categoricals.

        Check that the tanh span of the continuous column has all values between
        -1 and 1, and that every softmax span has all values between 0 and 1.

        Setup:
            - Mock `self._transformer.output_info_list` with one categorical column
              (softmax span of width 3) followed by one continuous column
              (tanh span of width 1 plus softmax span of width 2).

        Input:
            - data = tensor of shape (N, data_dims)

        Output:
            - tensor = tensor of shape (N, data_dims)
        """
        model = CTGANSynthesizer()
        model._transformer = Mock()
        model._transformer.output_info_list = [
            [SpanInfo(3, 'softmax')],
            [SpanInfo(1, 'tanh'), SpanInfo(2, 'softmax')],
        ]

        data = torch.randn(100, 6)
        result = model._apply_activate(data)

        assert result.shape == (100, 6)
        # Columns 0-2: softmax span of the first column.
        _assert_is_between(result[:, 0:3], 0.0, 1.0)
        # Column 3: tanh span of the second column. The slice is taken along the
        # column axis (and kept 2-D like the other slices); slicing `result[:3]`
        # would select the first three rows instead of the tanh column.
        _assert_is_between(result[:, 3:4], -1.0, 1.0)
        # Columns 4-5: softmax span of the second column.
        _assert_is_between(result[:, 4:6], 0.0, 1.0)
Beispiel #3
0
    def test__validate_discrete_columns(self):
        """Check that fitting with a non-existent discrete column raises an error.

        Setup:
            - Build a dataframe holding a single column named ``discrete``.
            - Declare a discrete column name that is not present in the dataframe.

        Input:
            - train_data = pandas.DataFrame
            - discrete_columns = list with one unknown column name

        Output:
            None

        Side Effects:
            - ``fit`` is expected to raise a ``ValueError``.

        Note:
            - Only the dataframe input is exercised here; a separate test could
              cover numpy arrays.
            - TODO: this currently goes through ``fit`` (integration style) and
              should become a proper unit test.
        """
        frame = pd.DataFrame({'discrete': ['a', 'b']})
        unknown_columns = ['doesnt exist']
        synthesizer = CTGANSynthesizer(epochs=1)

        with pytest.raises(ValueError):
            synthesizer.fit(frame, unknown_columns)
Beispiel #4
0
def test_wrong_discrete_columns_numpy():
    """Check that ``fit`` on a numpy array reports the invalid discrete column index.

    The array comes from a single-column dataframe and the discrete column
    indices are ``[0, 1]``; the raised ``ValueError`` must match the
    ``Invalid columns found`` message naming index ``1``.
    """
    frame = pd.DataFrame({'discrete': ['a', 'b']})
    column_indices = [0, 1]
    expected_message = r'Invalid columns found: \[1\]'
    synthesizer = CTGANSynthesizer(epochs=1)

    with pytest.raises(ValueError, match=expected_message):
        synthesizer.fit(frame.to_numpy(), column_indices)
Beispiel #5
0
def test_wrong_discrete_columns_dataframe():
    """Check that ``fit`` on a dataframe reports the invalid discrete column names.

    The dataframe only holds a ``discrete`` column while ``['b', 'c']`` is passed
    as the discrete columns; the raised ``ValueError`` must match the
    ``Invalid columns found`` message listing two names.
    """
    frame = pd.DataFrame({'discrete': ['a', 'b']})
    unknown_columns = ['b', 'c']
    expected_message = "Invalid columns found: {'.*', '.*'}"
    synthesizer = CTGANSynthesizer(epochs=1)

    with pytest.raises(ValueError, match=expected_message):
        synthesizer.fit(frame, unknown_columns)
Beispiel #6
0
def test_synthesizer_sample():
    """Check that conditional sampling returns a ``pandas.DataFrame``.

    A synthesizer is fitted for one epoch on a single discrete column and then
    asked for 1000 rows conditioned on ``discrete == 'a'``.
    """
    categories = np.random.choice(['a', 'b', 'c'], 100)
    frame = pd.DataFrame({'discrete': categories})

    synthesizer = CTGANSynthesizer(epochs=1)
    synthesizer.fit(frame, ['discrete'])

    conditioned = synthesizer.sample(1000, 'discrete', 'a')
    assert isinstance(conditioned, pd.DataFrame)
Beispiel #7
0
    def test__cond_loss(self):
        """Test `_cond_loss`.

        Verify that the loss matches the cross-entropy computed by hand on the
        span of the categorical column selected by the mask.

        Setup:
            - mock `_transformer.output_info_list` with one continuous column
              followed by two categorical columns
            - compute the conditional loss, conditioned on the 1st categorical
            - compare it with the manually computed cross-entropy of that categorical

        Input:
            data - the synthetic data generated by the model
            c - the conditional vector: one-hot spans of the categorical columns only,
                with the target column filled in
            m - binary mask used to select the categorical column to condition on

        Output:
            loss scalar; expected to equal the cross-entropy of the target column

        Note:
            - the original author flagged uncertainty about whether the idea behind
              this check is correct, even if the implementation probably is
        """
        synthesizer = CTGANSynthesizer()
        synthesizer._transformer = Mock()

        continuous_spans = [SpanInfo(1, 'tanh'), SpanInfo(2, 'softmax')]
        # the categorical column we are conditioning on
        target_spans = [SpanInfo(3, 'softmax')]
        # the categorical column we are NOT conditioning on (its mask entry is 0)
        other_spans = [SpanInfo(2, 'softmax')]
        synthesizer._transformer.output_info_list = [
            continuous_spans,
            target_spans,
            other_spans,
        ]

        # Layout: 3 dims ignored | 3 dims holding the prediction | 2 dims ignored.
        generated = torch.tensor([[0.0, -1.0, 0.0, 0.05, 0.05, 0.9, 0.1, 0.4]])

        # Layout: 3-dim one-hot of the target categorical | 2 dims of the other
        # categorical, which is not conditioned on. Continuous values are not
        # stored in this tensor.
        cond_vector = torch.tensor([[0.0, 0.0, 1.0, 0.0, 0.0]])

        # Selects the first categorical as the one being conditioned on.
        mask = torch.tensor([[1, 0]])

        loss = synthesizer._cond_loss(generated, cond_vector, mask)

        # Reference value: 3 categories, with the one-hot target at index 2.
        prediction = torch.tensor([[0.05, 0.05, 0.9]])
        target_index = torch.tensor([2])
        reference = torch.nn.functional.cross_entropy(prediction, target_index)

        assert (loss - reference).abs() < 1e-3
Beispiel #8
0
def test_ctgan_no_categoricals():
    """Check fitting and sampling when the data has no discrete columns.

    The synthesizer is fitted for one epoch on a single continuous column with
    an empty discrete column list; 100 sampled rows must come back as a
    one-column dataframe named ``continuous``.
    """
    frame = pd.DataFrame({'continuous': np.random.random(1000)})
    no_discrete_columns = []

    synthesizer = CTGANSynthesizer(epochs=1)
    synthesizer.fit(frame, no_discrete_columns)
    synthetic = synthesizer.sample(100)

    assert synthetic.shape == (100, 1)
    assert isinstance(synthetic, pd.DataFrame)
    assert set(synthetic.columns) == {'continuous'}
Beispiel #9
0
def test_ctgan_numpy():
    """Check fitting on a numpy array and sampling from the fitted model.

    The discrete column is identified by its positional index (1). The sampled
    output must be a ``(100, 2)`` numpy array whose second column contains
    exactly the categories ``a``, ``b`` and ``c``.
    """
    continuous_values = np.random.random(100)
    discrete_values = np.random.choice(['a', 'b', 'c'], 100)
    frame = pd.DataFrame({
        'continuous': continuous_values,
        'discrete': discrete_values,
    })

    synthesizer = CTGANSynthesizer(epochs=1)
    synthesizer.fit(frame.values, [1])
    synthetic = synthesizer.sample(100)

    assert synthetic.shape == (100, 2)
    assert isinstance(synthetic, np.ndarray)
    observed_categories = set(np.unique(synthetic[:, 1]))
    assert observed_categories == {'a', 'b', 'c'}
Beispiel #10
0
def test_ctgan_dataframe():
    """Check fitting on a dataframe and sampling from the fitted model.

    The sampled output must be a ``(100, 2)`` dataframe with the original
    column names, and its ``discrete`` column must contain exactly the
    categories ``a``, ``b`` and ``c``.
    """
    continuous_values = np.random.random(100)
    discrete_values = np.random.choice(['a', 'b', 'c'], 100)
    frame = pd.DataFrame({
        'continuous': continuous_values,
        'discrete': discrete_values,
    })

    synthesizer = CTGANSynthesizer(epochs=1)
    synthesizer.fit(frame, ['discrete'])
    synthetic = synthesizer.sample(100)

    assert synthetic.shape == (100, 2)
    assert isinstance(synthetic, pd.DataFrame)
    assert set(synthetic.columns) == {'continuous', 'discrete'}
    observed_categories = set(synthetic['discrete'].unique())
    assert observed_categories == {'a', 'b', 'c'}
Beispiel #11
0
def test_log_frequency():
    """Check the effect of the ``log_frequency`` option on an imbalanced category.

    The training data has 950 ``a`` rows and 25 rows each of ``b`` and ``c``.
    With default settings fewer than 6500 of 10000 sampled rows may be ``a``;
    with ``log_frequency=False`` more than 9000 must be ``a``.
    """
    frame = pd.DataFrame({
        'continuous': np.random.random(1000),
        'discrete': np.repeat(['a', 'b', 'c'], [950, 25, 25])
    })
    columns = ['discrete']

    def _sampled_a_count(**synthesizer_kwargs):
        # Train a fresh synthesizer and count the 'a' rows among 10000 samples.
        synthesizer = CTGANSynthesizer(epochs=100, **synthesizer_kwargs)
        synthesizer.fit(frame, columns)
        synthetic = synthesizer.sample(10000)
        return synthetic['discrete'].value_counts()['a']

    assert _sampled_a_count() < 6500
    assert _sampled_a_count(log_frequency=False) > 9000
Beispiel #12
0
def test_categorical_nan():
    """Check that NaN is handled as a category of a discrete column.

    The ``discrete`` column cycles through NaN, ``b`` and ``c``. The sampled
    dataframe must have shape ``(100, 2)``, keep the column names, and contain
    exactly three distinct discrete values: one null plus ``b`` and ``c``.
    """
    frame = pd.DataFrame({
        'continuous': np.random.random(30),
        # Keep this a plain list: inside a np.array the NaN would be cast to a string.
        'discrete': [np.nan, 'b', 'c'] * 10
    })

    synthesizer = CTGANSynthesizer(epochs=1)
    synthesizer.fit(frame, ['discrete'])
    synthetic = synthesizer.sample(100)

    assert synthetic.shape == (100, 2)
    assert isinstance(synthetic, pd.DataFrame)
    assert set(synthetic.columns) == {'continuous', 'discrete'}

    # np.nan != np.nan, so the set cannot be compared directly to a literal;
    # check its size, the presence of a null, and the remaining categories.
    categories = set(synthetic['discrete'].unique())
    assert len(categories) == 3
    assert any(pd.isnull(category) for category in categories)
    assert {"b", "c"} <= categories
Beispiel #13
0
def test_wrong_sampling_conditions():
    """Check that invalid sampling conditions raise ``ValueError``.

    Two cases are covered after fitting: a condition column that is not part of
    the training data, and a condition value that is not among the categories
    used to build the training data.
    """
    continuous_values = np.random.random(100)
    discrete_values = np.random.choice(['a', 'b', 'c'], 100)
    frame = pd.DataFrame({
        'continuous': continuous_values,
        'discrete': discrete_values,
    })

    synthesizer = CTGANSynthesizer(epochs=1)
    synthesizer.fit(frame, ['discrete'])

    # Unknown condition column.
    with pytest.raises(ValueError):
        synthesizer.sample(1, 'cardinal', "doesn't matter")

    # Known column, unknown category value.
    with pytest.raises(ValueError):
        synthesizer.sample(1, 'discrete', "d")
Beispiel #14
0
def test_save_load():
    """Check that a fitted synthesizer survives a save/load round trip.

    The model is fitted for one epoch, saved to a file inside a temporary
    directory, loaded back, and then sampled. The sampled dataframe must keep
    the column names and contain exactly the categories ``a``, ``b`` and ``c``.
    """
    import os

    data = pd.DataFrame({
        'continuous': np.random.random(100),
        'discrete': np.random.choice(['a', 'b', 'c'], 100)
    })
    discrete_columns = ['discrete']

    ctgan = CTGANSynthesizer(epochs=1)
    ctgan.fit(data, discrete_columns)

    # NOTE(review): `tf` is presumably `tempfile` imported under an alias - confirm.
    with tf.TemporaryDirectory() as temporary_directory:
        # Join with a path separator so the file is created INSIDE the temporary
        # directory; plain string concatenation would place it next to the
        # directory, where it is never cleaned up.
        model_path = os.path.join(temporary_directory, "test_tvae.pkl")
        ctgan.save(model_path)
        ctgan = CTGANSynthesizer.load(model_path)

    sampled = ctgan.sample(1000)
    assert set(sampled.columns) == {'continuous', 'discrete'}
    assert set(sampled['discrete'].unique()) == {'a', 'b', 'c'}
Beispiel #15
0
def test_wrong_sampling_conditions():
    """Check that invalid sampling conditions raise ``ValueError``.

    An unknown condition column must raise with the "doesn't exist in the data"
    message; an unknown category value must raise a ``ValueError`` as well.
    """
    continuous_values = np.random.random(100)
    discrete_values = np.random.choice(['a', 'b', 'c'], 100)
    frame = pd.DataFrame({
        'continuous': continuous_values,
        'discrete': discrete_values,
    })

    synthesizer = CTGANSynthesizer(epochs=1)
    synthesizer.fit(frame, ['discrete'])

    missing_column_message = "The column_name `cardinal` doesn't exist in the data."
    with pytest.raises(ValueError, match=missing_column_message):
        synthesizer.sample(1, 'cardinal', "doesn't matter")

    # No `match` here: per the original note, RDT currently (incorrectly) raises
    # a tuple instead of a string for this case.
    with pytest.raises(ValueError):
        synthesizer.sample(1, 'discrete', 'd')
Beispiel #16
0
def main():
    """Run the command line interface.

    Steps, in order:
        1. Parse the arguments with ``_parse_args``.
        2. Read the training data and discrete column list from TSV or CSV.
        3. Load a model from ``args.load`` or build a new ``CTGANSynthesizer``
           from the command line hyperparameters.
        4. Fit the model and, if ``args.save`` is given, save it.
        5. Sample ``args.num_samples`` rows (or as many rows as the training
           data when that is falsy), optionally conditioned on a column value.
        6. Write the sampled data as TSV or CSV to ``args.output``.

    Raises:
        ValueError: If a sample condition column is given without a
            condition value.
    """
    args = _parse_args()
    if args.tsv:
        data, discrete_columns = read_tsv(args.data, args.metadata)
    else:
        data, discrete_columns = read_csv(args.data, args.metadata,
                                          args.header, args.discrete)

    if args.load:
        model = CTGANSynthesizer.load(args.load)
    else:
        # Layer sizes arrive as comma-separated strings, e.g. "256,256".
        generator_dim = [int(x) for x in args.generator_dim.split(',')]
        discriminator_dim = [int(x) for x in args.discriminator_dim.split(',')]
        model = CTGANSynthesizer(embedding_dim=args.embedding_dim,
                                 generator_dim=generator_dim,
                                 discriminator_dim=discriminator_dim,
                                 generator_lr=args.generator_lr,
                                 generator_decay=args.generator_decay,
                                 discriminator_lr=args.discriminator_lr,
                                 discriminator_decay=args.discriminator_decay,
                                 batch_size=args.batch_size,
                                 epochs=args.epochs)
    # NOTE(review): fit runs even when the model was loaded from disk - confirm
    # that this is the intended behavior.
    model.fit(data, discrete_columns)

    if args.save is not None:
        model.save(args.save)

    num_samples = args.num_samples or len(data)

    # Explicit validation instead of `assert`, which is stripped under `python -O`.
    if (args.sample_condition_column is not None
            and args.sample_condition_column_value is None):
        raise ValueError(
            'A sample condition column value is required when a sample '
            'condition column is given.')

    sampled = model.sample(num_samples, args.sample_condition_column,
                           args.sample_condition_column_value)

    if args.tsv:
        write_tsv(sampled, args.metadata, args.output)
    else:
        sampled.to_csv(args.output, index=False)
Beispiel #17
0
def test_fixed_random_seed():
    """Check that resetting the random state reproduces the sampled data.

    After fitting, one batch is sampled without setting a random state. The
    random state is then set to 0 twice, drawing two batches after each reset.
    The unseeded batch must differ from both batches of the first seeded run,
    and the two seeded runs must match batch by batch.
    """
    # Setup
    frame = pd.DataFrame({
        'continuous': np.random.random(100),
        'discrete': np.random.choice(['a', 'b', 'c'], 100)
    })
    synthesizer = CTGANSynthesizer(epochs=1)

    # Run
    synthesizer.fit(frame, ['discrete'])
    unseeded = synthesizer.sample(10)

    synthesizer.set_random_state(0)
    first_run_a = synthesizer.sample(10)
    first_run_b = synthesizer.sample(10)

    synthesizer.set_random_state(0)
    second_run_a = synthesizer.sample(10)
    second_run_b = synthesizer.sample(10)

    # Assert
    assert not np.array_equal(unseeded, first_run_a)
    assert not np.array_equal(unseeded, first_run_b)
    np.testing.assert_array_equal(first_run_a, second_run_a)
    np.testing.assert_array_equal(first_run_b, second_run_b)