def test_synthesizer_sample():
    """Fit on a single discrete column and sample with extra arguments.

    Builds a 100-row frame of random ``"a"``/``"b"``/``"c"`` values, fits a
    ``CTGANSynthesizer`` for one epoch, then calls ``sample`` with a row
    count of 1000 plus the column name ``"discrete"`` and the value ``"a"``.
    The only check is that the result is a ``pandas.DataFrame``.
    """
    categories = ["a", "b", "c"]
    frame = pd.DataFrame({"discrete": np.random.choice(categories, 100)})

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(frame, ["discrete"], epochs=1)

    result = synthesizer.sample(1000, "discrete", "a")
    assert isinstance(result, pd.DataFrame)
def test_synthesizer_sample():
    """Check that ``sample`` with a column/value pair returns a DataFrame.

    The training frame holds one column, ``'discrete'``, with 100 random
    picks from ``'a'``, ``'b'`` and ``'c'``. The synthesizer is fitted for a
    single epoch and then asked for 1000 rows, with ``'discrete'`` and
    ``'a'`` passed as the additional ``sample`` arguments.
    """
    column = np.random.choice(['a', 'b', 'c'], 100)
    training_frame = pd.DataFrame({'discrete': column})
    discrete_names = ['discrete']

    model = CTGANSynthesizer()
    model.fit(training_frame, discrete_names, epochs=1)
    drawn = model.sample(1000, 'discrete', 'a')

    assert isinstance(drawn, pd.DataFrame)
def main():
    """Run the command-line workflow: read data, fit, optionally save, sample, write.

    Steps, driven by the parsed arguments:

    1. Read the input with ``read_tsv`` or ``read_csv`` depending on
       ``args.tsv``.
    2. Load a model from ``args.load`` when given, otherwise create a new
       ``CTGANSynthesizer``; then fit it for ``args.epochs``.
    3. Save the model to ``args.save`` when that is not ``None``.
    4. Sample ``args.num_samples`` rows (or as many rows as the input when
       that is falsy), passing the condition column and value through.
    5. Write the sample with ``write_tsv`` or ``DataFrame.to_csv``.

    Raises:
        ValueError: If ``args.sample_condition_column`` is set but
            ``args.sample_condition_column_value`` is ``None``.
    """
    args = _parse_args()

    if args.tsv:
        data, discrete_columns = read_tsv(args.data, args.metadata)
    else:
        data, discrete_columns = read_csv(
            args.data, args.metadata, args.header, args.discrete
        )

    if args.load:
        model = CTGANSynthesizer.load(args.load)
    else:
        model = CTGANSynthesizer()
    # NOTE(review): fitting runs for a loaded model as well as a new one --
    # confirm that continuing training on a loaded model is intended.
    model.fit(data, discrete_columns, args.epochs)

    if args.save is not None:
        model.save(args.save)

    num_samples = args.num_samples or len(data)

    # Explicit check instead of ``assert``: assertions are removed when Python
    # runs with -O, which would let an incomplete condition reach ``sample``.
    if (
        args.sample_condition_column is not None
        and args.sample_condition_column_value is None
    ):
        raise ValueError(
            'A sample condition column value is required when a sample '
            'condition column is given.'
        )

    sampled = model.sample(
        num_samples,
        args.sample_condition_column,
        args.sample_condition_column_value,
    )

    if args.tsv:
        write_tsv(sampled, args.metadata, args.output)
    else:
        sampled.to_csv(args.output, index=False)
def test_sample(self):
    """Train for one epoch with fixed seeds and compare the sampled values.

    NumPy and TensorFlow are both seeded with 0 before data generation. The
    values of ``model.sample(self._n_samples)`` must match
    ``[[0.4139329, 3.0]]`` to ``self._vars['decimal']`` decimal places.
    """
    np.random.seed(0)
    tf.random.set_seed(0)

    batch_size = self._vars['batch_size']
    data, discrete = generate_data(batch_size)

    model = CTGANSynthesizer(batch_size=batch_size, pac=self._vars['pac'])
    self.assertIsNotNone(model)
    model.train(data, discrete, epochs=1)

    sampled_frame = model.sample(self._n_samples)
    actual = sampled_frame.values
    reference = np.array([[0.4139329, 3.0]])
    np.testing.assert_almost_equal(
        actual, reference, decimal=self._vars['decimal'])
def test_ctgan_numpy():
    """Fit on a NumPy array and verify the sampled array.

    The training data is the ``.values`` array of a two-column frame, so
    the discrete column is identified by its position (index 1) rather
    than by name. The sample must be a ``(100, 2)`` ``numpy.ndarray`` whose
    second column contains exactly the values ``"a"``, ``"b"`` and ``"c"``.
    """
    continuous = np.random.random(100)
    discrete = np.random.choice(["a", "b", "c"], 100)
    matrix = pd.DataFrame({
        "continuous": continuous,
        "discrete": discrete,
    }).values

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(matrix, [1], epochs=1)
    generated = synthesizer.sample(100)

    assert generated.shape == (100, 2)
    assert isinstance(generated, np.ndarray)
    observed = set(np.unique(generated[:, 1]))
    assert observed == {"a", "b", "c"}
def test_ctgan_dataframe():
    """Fit on a DataFrame and verify the sampled DataFrame.

    Trains for one epoch on 100 rows with a random continuous column and a
    random ``"a"``/``"b"``/``"c"`` discrete column, then checks that 100
    sampled rows come back as a two-column ``DataFrame`` with the same
    column names and all three discrete values present.
    """
    continuous = np.random.random(100)
    discrete = np.random.choice(["a", "b", "c"], 100)
    frame = pd.DataFrame({"continuous": continuous, "discrete": discrete})

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(frame, ["discrete"], epochs=1)
    generated = synthesizer.sample(100)

    assert generated.shape == (100, 2)
    assert isinstance(generated, pd.DataFrame)
    assert set(generated.columns) == {"continuous", "discrete"}
    observed = set(generated["discrete"].unique())
    assert observed == {"a", "b", "c"}
def test_ctgan_dataframe():
    """Round-trip a mixed DataFrame through fit and sample.

    After a one-epoch fit on a 100-row frame (``'continuous'`` floats and
    ``'discrete'`` labels drawn from ``'a'``, ``'b'``, ``'c'``), a 100-row
    sample must be a ``DataFrame`` of shape ``(100, 2)`` with the original
    column names and exactly the three labels in ``'discrete'``.
    """
    labels = ['a', 'b', 'c']
    columns = {
        'continuous': np.random.random(100),
        'discrete': np.random.choice(labels, 100),
    }
    training_frame = pd.DataFrame(columns)

    model = CTGANSynthesizer()
    model.fit(training_frame, ['discrete'], epochs=1)
    output = model.sample(100)

    assert output.shape == (100, 2)
    assert isinstance(output, pd.DataFrame)
    assert set(output.columns) == {'continuous', 'discrete'}
    assert set(output['discrete'].unique()) == {'a', 'b', 'c'}
def test_model_to_disk(self):
    """Dump a trained model to disk, reload it, and compare the two.

    A model is trained for one epoch with fixed NumPy/TensorFlow seeds,
    written to ``model_test.joblib`` under ``self._current_dir``, and read
    back through ``CTGANSynthesizer(file_path=...)``. The reloaded model is
    compared with the original on:

    * every attribute name, plus the value of attributes whose type is
      exactly ``int``, ``float`` or ``tuple``;
    * the conditional generator's ``__dict__``;
    * every transformer attribute (Series, lists of tensors, or anything
      else, each with the matching comparison helper);
    * the generator weights.
    """
    np.random.seed(0)
    tf.random.set_seed(0)
    data, discrete = generate_data(self._vars['batch_size'])

    model = CTGANSynthesizer(batch_size=self._vars['batch_size'],
                             pac=self._vars['pac'])
    self.assertIsNotNone(model)
    model.train(data, discrete, epochs=1)

    model_path = os.path.join(self._current_dir, 'model_test.joblib')
    model.dump(model_path, overwrite=True)
    loaded_model = CTGANSynthesizer(file_path=model_path)
    self.assertIsNotNone(loaded_model)

    for attr, value in loaded_model.__dict__.items():
        # assertIn names the missing attribute in the failure message.
        self.assertIn(attr, model.__dict__)
        # Exact-type check kept as in the original: isinstance would also
        # admit bool and other subclasses.
        if type(value) in (int, float, tuple):
            self.assertEqual(value, model.__dict__[attr])

    np.testing.assert_equal(loaded_model._cond_generator.__dict__,
                            model._cond_generator.__dict__)

    original_state = model._transformer.__dict__
    for attr, value in loaded_model._transformer.__dict__.items():
        expected = original_state[attr]
        if isinstance(value, pd.Series):
            pd.testing.assert_series_equal(value, expected)
        elif (isinstance(value, list) and value
                and isinstance(value[0], tf.Tensor)):
            # The truthiness guard keeps an empty list from raising
            # IndexError on value[0]; empty lists fall through to the
            # generic comparison below.
            tf.assert_equal(value, expected)
        else:
            np.testing.assert_equal(value, expected)

    np.testing.assert_equal(loaded_model._generator.get_weights(),
                            model._generator.get_weights())
def main():
    """Read the input data, fit a new synthesizer, and write a sample.

    The input is read with ``read_tsv`` when ``args.tsv`` is truthy and with
    ``read_csv`` otherwise. A new ``CTGANSynthesizer`` is fitted for
    ``args.epochs``, and ``args.num_samples`` rows are sampled (falling back
    to the number of input rows when that is falsy). The sample is written
    with ``write_tsv`` or ``DataFrame.to_csv`` to match the input format.
    """
    args = _parse_args()

    if args.tsv:
        loaded = read_tsv(args.data, args.metadata)
    else:
        loaded = read_csv(args.data, args.metadata, args.header, args.discrete)
    data, discrete_columns = loaded

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(data, discrete_columns, args.epochs)

    row_count = args.num_samples or len(data)
    generated = synthesizer.sample(row_count)

    if not args.tsv:
        generated.to_csv(args.output, index=False)
        return
    write_tsv(generated, args.metadata, args.output)
def _assert_train_equal(self, data, discrete):
    """Train for one epoch and compare model state with stored expectations.

    After training, four groups of arrays are collected: the transformer's
    output tensors, its conditional tensors, the generator weights and the
    critic weights. Each array is compared with the entry at the same key
    and position in ``self._expected_values[idx]``, where ``idx`` is 1 when
    ``discrete`` is non-empty and 0 otherwise, to ``self._vars['decimal']``
    decimal places.

    Args:
        data: Training data passed to ``model.train``.
        discrete: Discrete-column specification passed to ``model.train``;
            its length selects the expected-value set.
    """
    model = CTGANSynthesizer(
        batch_size=self._vars['batch_size'], pac=self._vars['pac'])
    self.assertIsNotNone(model)
    model.train(data, discrete, epochs=1)

    transformer = model._transformer
    collected = {
        'output_tensor': [t.numpy() for t in transformer.output_tensor],
        'cond_tensor': [t.numpy() for t in transformer.cond_tensor],
        'gen_weights': model._generator.get_weights(),
        'crt_weights': model._critic.get_weights(),
    }

    has_discrete = int(len(discrete) > 0)
    for key, arrays in collected.items():
        for position, actual in enumerate(arrays):
            # Expected values are looked up per element, so nothing is
            # read from the fixtures when a group is empty.
            np.testing.assert_almost_equal(
                actual,
                self._expected_values[has_discrete][key][position],
                decimal=self._vars['decimal'])
def test_categorical_nan():
    """Fit on a discrete column containing NaN and check NaN is sampled.

    The discrete column cycles through ``NaN``, ``"b"`` and ``"c"`` over 30
    rows. After a one-epoch fit, a 100-row sample must be a two-column
    ``DataFrame`` whose discrete column has exactly three distinct values:
    one null plus ``"b"`` and ``"c"``.
    """
    continuous = np.random.random(30)
    # Keep this a plain list: building it as a np.array would cast NaN to
    # a string.
    discrete = [np.nan, "b", "c"] * 10
    frame = pd.DataFrame({"continuous": continuous, "discrete": discrete})

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(frame, ["discrete"], epochs=1)
    generated = synthesizer.sample(100)

    assert generated.shape == (100, 2)
    assert isinstance(generated, pd.DataFrame)
    assert set(generated.columns) == {"continuous", "discrete"}

    # NaN never compares equal to itself, so the set cannot be matched
    # against a literal; check its size and membership separately.
    distinct = set(generated["discrete"].unique())
    assert len(distinct) == 3
    assert any(pd.isnull(item) for item in distinct)
    assert {"b", "c"}.issubset(distinct)
def test_log_frequency():
    """Compare sampled class counts with and without ``log_frequency``.

    The training data has 950 ``"a"`` rows and 25 each of ``"b"`` and
    ``"c"``. With a default-constructed synthesizer, fewer than 6500 of
    10000 sampled rows may be ``"a"``; with ``log_frequency=False`` passed
    to the constructor, more than 9000 must be.
    """
    frame = pd.DataFrame({
        "continuous": np.random.random(1000),
        "discrete": np.repeat(["a", "b", "c"], [950, 25, 25]),
    })
    discrete_names = ["discrete"]

    def count_of_a(synthesizer):
        # Fit for 100 epochs, draw 10000 rows, and count the "a" rows.
        synthesizer.fit(frame, discrete_names, epochs=100)
        drawn = synthesizer.sample(10000)
        tallies = drawn["discrete"].value_counts()
        return tallies["a"]

    assert count_of_a(CTGANSynthesizer()) < 6500
    assert count_of_a(CTGANSynthesizer(log_frequency=False)) > 9000
def test_log_frequency():
    """Check how the ``log_frequency`` fit option shifts sampled counts.

    Training data: 1000 rows with 950 ``'a'``, 25 ``'b'`` and 25 ``'c'``.
    A default fit must yield fewer than 6500 ``'a'`` rows out of 10000
    sampled; a fit with ``log_frequency=False`` must yield more than 9000.
    """
    continuous = np.random.random(1000)
    discrete = np.repeat(['a', 'b', 'c'], [950, 25, 25])
    frame = pd.DataFrame({'continuous': continuous, 'discrete': discrete})
    discrete_names = ['discrete']

    # First run: default fit options.
    default_model = CTGANSynthesizer()
    default_model.fit(frame, discrete_names, epochs=100)
    default_counts = default_model.sample(10000)['discrete'].value_counts()
    assert default_counts['a'] < 6500

    # Second run: log_frequency switched off at fit time.
    plain_model = CTGANSynthesizer()
    plain_model.fit(frame, discrete_names, epochs=100, log_frequency=False)
    plain_counts = plain_model.sample(10000)['discrete'].value_counts()
    assert plain_counts['a'] > 9000
import numpy as np
import pandas as pd
import os
import sys
import tqdm
import pickle
import pathlib
from pathlib import Path


def get_domain_dims(DIR='us_import1'):
    """Load the pickled ``domain_dims`` object for a dataset directory.

    Args:
        DIR: Name of the sub-directory under ``./generated_data_v1`` that
            contains ``domain_dims.pkl``.

    Returns:
        The object stored in the pickle file. The caller in this script
        uses it as a mapping whose keys are column names.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open('./generated_data_v1/{}/domain_dims.pkl'.format(DIR), 'rb') as fh:
        domain_dims = pickle.load(fh)
    return domain_dims


def convert_np_to_pd(data_np, domain_dims):
    """Wrap a NumPy array in a DataFrame named after the ``domain_dims`` keys.

    Args:
        data_np: Array passed as the DataFrame's data.
        domain_dims: Mapping whose keys, in iteration order, become the
            column names.

    Returns:
        A ``(DataFrame, columns)`` tuple, where ``columns`` is the list of
        column names.
    """
    columns = list(domain_dims)
    df = pd.DataFrame(data=data_np, columns=columns)
    return df, columns


real_data = np.load('./generated_data_v1/us_import1/pos_data.npy')
domain_dims = get_domain_dims()
data_df, columns = convert_np_to_pd(real_data, domain_dims)

# NOTE(review): CTGANSynthesizer is not imported in the visible code --
# confirm it is brought into scope elsewhere.
ctgan_obj = CTGANSynthesizer()
# Fit on the converted DataFrame (the name ``data`` was never defined);
# every column is passed in the second argument.
ctgan_obj.fit(data_df, columns)
ctgan_obj.save('ctgan.pkl')