def test_fixed_random_seed():
    """Check that re-seeding the TVAESynthesizer replays the same samples.

    After ``set_random_state(0)`` is called a second time, the two draws that
    follow must equal the two draws that followed the first call. The draw
    taken before any seed was set must differ from both seeded draws.
    """
    # Setup
    frame = pd.DataFrame({
        'continuous': np.random.random(100),
        'discrete': np.random.choice(['a', 'b', 'c'], 100),
    })
    model = TVAESynthesizer(epochs=1)

    # Run
    model.fit(frame, ['discrete'])
    unseeded = model.sample(10)

    # The order of the sample() calls matters: each one advances the random state.
    model.set_random_state(0)
    first_pass = [model.sample(10), model.sample(10)]

    model.set_random_state(0)
    second_pass = [model.sample(10), model.sample(10)]

    # Assert
    for seeded in first_pass:
        assert not np.array_equal(unseeded, seeded)
    for expected, replayed in zip(first_pass, second_pass):
        np.testing.assert_array_equal(expected, replayed)
def test_synthesizer_sample():
    """Fit on a single discrete column and check that ``sample`` returns a DataFrame."""
    frame = pd.DataFrame({'discrete': np.random.choice(['a', 'b'], 100)})

    model = TVAESynthesizer(epochs=1)
    model.fit(frame, ['discrete'])

    synthetic = model.sample(1000)
    assert isinstance(synthetic, pd.DataFrame)
def test_tvae_numpy():
    """Fit on a numpy array and check that the samples come back as an array.

    The discrete column is given by position (index 1) because the training
    data is passed as ``data.values`` rather than as a DataFrame.
    """
    frame = pd.DataFrame({
        'continuous': np.random.random(1000),
        'discrete': np.random.choice(['a', 'b'], 1000),
    })
    discrete_positions = [1]

    model = TVAESynthesizer(epochs=10)
    model.fit(frame.values, discrete_positions)
    synthetic = model.sample(100)

    assert synthetic.shape == (100, 2)
    assert isinstance(synthetic, np.ndarray)
    # Both categories seen during training must appear, and nothing else.
    observed_categories = set(np.unique(synthetic[:, 1]))
    assert observed_categories == {'a', 'b'}
def test_drop_last_false():
    """Train on two identical categorical columns and check that samples agree.

    Both columns hold the same 450 values, so after 300 epochs at least 95 of
    100 sampled rows are expected to have equal values in columns '1' and '2'.
    """
    pattern = ['a', 'b', 'c'] * 150
    frame = pd.DataFrame({'1': pattern, '2': pattern})

    model = TVAESynthesizer(epochs=300)
    model.fit(frame, ['1', '2'])
    synthetic = model.sample(100)

    # Count the rows whose two columns match.
    matching_rows = sum(row['1'] == row['2'] for _, row in synthetic.iterrows())
    assert matching_rows >= 95
def test_tvae_dataframe():
    """Fit on a DataFrame and check the shape, type, columns and categories of samples."""
    frame = pd.DataFrame({
        'continuous': np.random.random(1000),
        'discrete': np.random.choice(['a', 'b'], 1000),
    })

    model = TVAESynthesizer(epochs=10)
    model.fit(frame, ['discrete'])
    synthetic = model.sample(100)

    assert synthetic.shape == (100, 2)
    assert isinstance(synthetic, pd.DataFrame)
    assert set(synthetic.columns) == {'continuous', 'discrete'}
    # Both categories seen during training must appear, and nothing else.
    observed_categories = set(synthetic['discrete'].unique())
    assert observed_categories == {'a', 'b'}
def test_save_load():
    """Check that a fitted TVAESynthesizer survives a save/load round trip.

    The model is fitted, saved into a temporary directory, loaded back with
    ``TVAESynthesizer.load`` and then sampled. The samples must have the
    training columns, and the discrete column must contain exactly the
    categories 'a' and 'b'.
    """
    import os

    data = pd.DataFrame({
        'continuous': np.random.random(100),
        'discrete': np.random.choice(['a', 'b'], 100),
    })
    discrete_columns = ['discrete']

    tvae = TVAESynthesizer(epochs=10)
    tvae.fit(data, discrete_columns)

    with tf.TemporaryDirectory() as temporary_directory:
        # Join with a path separator so the file lands INSIDE the temporary
        # directory. Plain string concatenation put it beside the directory
        # (e.g. "/tmp/tmpabctest_tvae.pkl"), where it was never cleaned up.
        model_path = os.path.join(temporary_directory, "test_tvae.pkl")
        tvae.save(model_path)
        tvae = TVAESynthesizer.load(model_path)

    sampled = tvae.sample(1000)
    assert set(sampled.columns) == {'continuous', 'discrete'}
    assert set(sampled['discrete'].unique()) == {'a', 'b'}
def test_loss_function():
    """Check that samples keep the ``'2' == 2 * '1'`` relation of the training data.

    Training data has column '2' exactly twice column '1'. After 300 epochs,
    the mean of ``|2 * row['1'] - row['2']|`` over 1000 sampled rows must be
    below 400.
    """
    frame = pd.DataFrame({
        '1': list(map(float, range(1000))),
        '2': list(map(float, range(0, 2000, 2))),
    })

    model = TVAESynthesizer(epochs=300)
    model.fit(frame)

    num_samples = 1000
    synthetic = model.sample(num_samples)

    total_deviation = sum(
        abs(2 * row['1'] - row['2']) for _, row in synthetic.iterrows()
    )
    assert total_deviation / num_samples < 400
def test_tvae(tmpdir):
    """Fit on the iris dataset, round-trip through save/load, then check the samples.

    Args:
        tmpdir: Directory object joined with ``/`` to build the pickle path
            (presumably pytest's ``tmpdir`` fixture).
    """
    iris = datasets.load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    # Replace the integer targets with their class names to get a discrete column.
    frame['class'] = pd.Series(iris.target).map(lambda idx: iris.target_names[idx])

    model = TVAESynthesizer(epochs=10)
    model.fit(frame, ['class'])

    model_path = str(tmpdir / 'test_tvae.pkl')
    model.save(model_path)
    restored = TVAESynthesizer.load(model_path)

    synthetic = restored.sample(100)

    assert synthetic.shape == (100, 5)
    assert isinstance(synthetic, pd.DataFrame)
    assert set(synthetic.columns) == set(frame.columns)
    assert set(synthetic.dtypes) == set(frame.dtypes)
def test__loss_function():
    """Test that TVAESynthesizer samples stay close to the training relation.

    Column '2' of the training data is exactly twice column '1'. The average
    absolute gap between ``2 * row['1']`` and ``row['2']`` over the sampled
    rows must stay under 400.
    """
    training = pd.DataFrame({
        '1': [float(value) for value in range(1000)],
        '2': [float(2 * value) for value in range(1000)],
    })

    synthesizer = TVAESynthesizer(epochs=300)
    synthesizer.fit(training)

    num_samples = 1000
    generated = synthesizer.sample(num_samples)

    gaps = (abs(2 * row['1'] - row['2']) for _, row in generated.iterrows())
    mean_gap = sum(gaps) / num_samples
    assert mean_gap < 400