Exemple #1
0
def test_synthesizer_sample():
    """Fit TVAE on one discrete column and check ``sample`` returns a DataFrame."""
    train = pd.DataFrame({'discrete': np.random.choice(['a', 'b'], 100)})

    model = TVAESynthesizer(epochs=1)
    model.fit(train, ['discrete'])

    # Only the return type is checked here, not the content of the samples.
    assert isinstance(model.sample(1000), pd.DataFrame)
Exemple #2
0
def test_tvae_numpy():
    """Fit TVAE on a numpy array and check the samples come back as an array."""
    continuous = np.random.random(1000)
    discrete = np.random.choice(['a', 'b'], 1000)
    frame = pd.DataFrame({'continuous': continuous, 'discrete': discrete})

    model = TVAESynthesizer(epochs=10)
    # The array carries no column names, so the discrete column is passed
    # by its position (index 1 is the 'discrete' column of the frame).
    model.fit(frame.values, [1])

    output = model.sample(100)

    assert output.shape == (100, 2)
    assert isinstance(output, np.ndarray)
    # Both categories seen in training must show up in the sampled column.
    assert set(np.unique(output[:, 1])) == {'a', 'b'}
Exemple #3
0
def test_drop_last_false():
    """Train on two identical categorical columns and expect samples to agree.

    Both training columns hold the same 450 values, so at least 95 of the
    100 sampled rows are required to have equal values in both columns.
    """
    # NOTE(review): the test name suggests it guards a ``drop_last=False``
    # data-loading setting inside the synthesizer -- confirm against the model.
    train = pd.DataFrame({name: ['a', 'b', 'c'] * 150 for name in ('1', '2')})

    model = TVAESynthesizer(epochs=300)
    model.fit(train, ['1', '2'])

    sampled = model.sample(100)
    matches = sum(1 for _, row in sampled.iterrows() if row['1'] == row['2'])

    assert matches >= 95
Exemple #4
0
def test_tvae_dataframe():
    """Fit TVAE on a DataFrame and check shape, type, columns and categories."""
    continuous = np.random.random(1000)
    discrete = np.random.choice(['a', 'b'], 1000)
    train = pd.DataFrame({'continuous': continuous, 'discrete': discrete})

    model = TVAESynthesizer(epochs=10)
    model.fit(train, ['discrete'])

    output = model.sample(100)

    assert output.shape == (100, 2)
    assert isinstance(output, pd.DataFrame)
    assert set(output.columns) == {'continuous', 'discrete'}
    # Both categories seen in training must show up in the sampled column.
    assert set(output['discrete'].unique()) == {'a', 'b'}
Exemple #5
0
def test_loss_function():
    """Check sampled rows roughly keep the ``'2' == 2 * '1'`` training relation.

    The training data has column '2' exactly twice column '1'; the mean
    absolute deviation from that relation over the samples must stay below 400.
    """
    base = [float(i) for i in range(1000)]
    doubled = [float(2 * i) for i in range(1000)]
    train = pd.DataFrame({'1': base, '2': doubled})

    model = TVAESynthesizer(epochs=300)
    model.fit(train)

    num_samples = 1000
    sampled = model.sample(num_samples)

    total_error = sum(abs(2 * row['1'] - row['2']) for _, row in sampled.iterrows())

    assert total_error / num_samples < 400
Exemple #6
0
def test__loss_function():
    """Test the TVAESynthesizer keeps the linear relation of the training data.

    Column '2' is twice column '1' in the training set; the average absolute
    deviation from that relation in the sampled data must be below 400.
    """
    train = pd.DataFrame({
        '1': [float(value) for value in range(1000)],
        '2': [float(2 * value) for value in range(1000)],
    })

    model = TVAESynthesizer(epochs=300)
    model.fit(train)

    num_samples = 1000
    sampled = model.sample(num_samples)

    # Walk both columns in lockstep, accumulating in row order.
    deviations = (
        abs(2 * first - second)
        for first, second in zip(sampled['1'], sampled['2'])
    )
    avg_error = sum(deviations) / num_samples

    assert avg_error < 400
Exemple #7
0
def test_save_load():
    """Check a fitted TVAESynthesizer still samples after a save/load round trip.

    The model is saved to a file inside a temporary directory, loaded back,
    and the reloaded model must produce both columns and both categories.
    """
    import os

    data = pd.DataFrame({
        'continuous': np.random.random(100),
        'discrete': np.random.choice(['a', 'b'], 100)
    })
    discrete_columns = ['discrete']

    tvae = TVAESynthesizer(epochs=10)
    tvae.fit(data, discrete_columns)

    with tf.TemporaryDirectory() as temporary_directory:
        # Join with a path separator: plain string concatenation would create
        # the file next to the temporary directory instead of inside it, so it
        # would not be removed when the directory is cleaned up.
        path = os.path.join(temporary_directory, "test_tvae.pkl")
        tvae.save(path)
        tvae = TVAESynthesizer.load(path)

    sampled = tvae.sample(1000)
    assert set(sampled.columns) == {'continuous', 'discrete'}
    assert set(sampled['discrete'].unique()) == {'a', 'b'}
Exemple #8
0
def test_tvae(tmpdir):
    """Fit TVAE on the iris dataset, round-trip it through disk, then sample.

    Args:
        tmpdir: directory object supporting ``/`` path joining, used to hold
            the saved model file.
    """
    iris = datasets.load_iris()
    train = pd.DataFrame(iris.data, columns=iris.feature_names)
    # Replace the numeric targets with their class names.
    class_names = pd.Series(iris.target).map(iris.target_names.__getitem__)
    train['class'] = class_names

    model = TVAESynthesizer(epochs=10)
    model.fit(train, ['class'])

    model_path = str(tmpdir / 'test_tvae.pkl')
    model.save(model_path)
    model = TVAESynthesizer.load(model_path)

    output = model.sample(100)

    assert output.shape == (100, 5)
    assert isinstance(output, pd.DataFrame)
    assert set(output.columns) == set(train.columns)
    assert set(output.dtypes) == set(train.dtypes)
Exemple #9
0
def test_fixed_random_seed():
    """Test the TVAESynthesizer with a fixed seed.

    Resetting the random state to the same seed must reproduce the same
    sequence of samples, and those seeded samples must differ from the
    samples drawn before any seed was set.
    """
    # Setup
    continuous = np.random.random(100)
    discrete = np.random.choice(['a', 'b', 'c'], 100)
    train = pd.DataFrame({'continuous': continuous, 'discrete': discrete})

    model = TVAESynthesizer(epochs=1)

    # Run
    model.fit(train, ['discrete'])
    unseeded = model.sample(10)

    # Seed twice with the same value, drawing two batches after each reset.
    runs = []
    for _ in range(2):
        model.set_random_state(0)
        first_batch = model.sample(10)
        second_batch = model.sample(10)
        runs.append((first_batch, second_batch))

    (run_a_first, run_a_second), (run_b_first, run_b_second) = runs

    # Assert
    assert not np.array_equal(unseeded, run_a_first)
    assert not np.array_equal(unseeded, run_a_second)
    np.testing.assert_array_equal(run_a_first, run_b_first)
    np.testing.assert_array_equal(run_a_second, run_b_second)