Ejemplo n.º 1
0
def fit_score_model(name, model_kwargs, train_data, test_data,
                    continuous_columns, sensitive_column, sample_rows,
                    store_samples):
    """Fit and score one TGAN model per kwargs dict in ``model_kwargs``.

    Args:
        name (str): experiment name; artifacts go under ``experiments/<name>``.
        model_kwargs (list[dict]): one dict of ``TGANModel`` keyword
            arguments per model; each dict is updated in place with the
            resulting score and p-rules.
        train_data: data used to fit each model.
        test_data: held-out data passed to ``evaluate_classification``.
        continuous_columns (list): indices of continuous columns.
        sensitive_column: column used for the fairness (p-rule) evaluation.
        sample_rows (int): number of synthetic rows to sample per model.
        store_samples (bool): when True, write each sample set to
            ``experiments/<name>/data/model_<i>.csv``.

    Returns:
        list[dict]: ``model_kwargs`` with ``score`` and p-rules added.
    """
    # NOTE: the parameter was previously misspelled ``semsitive_column``
    # while the body referenced ``sensitive_column`` — a guaranteed
    # NameError; the spelling used by the body is kept.
    for index, kwargs in enumerate(model_kwargs):
        logger.info('Training TGAN Model %d/%d', index + 1, len(model_kwargs))

        # Fresh graph and a per-model output directory for each run.
        tf.reset_default_graph()
        base_dir = os.path.join('experiments', name)
        output = os.path.join(base_dir, 'model_{}'.format(index))
        model = TGANModel(continuous_columns,
                          sensitive_column,
                          output=output,
                          **kwargs)
        model.fit(train_data)
        sampled_data = model.sample(sample_rows)

        if store_samples:
            dir_name = os.path.join(base_dir, 'data')
            # exist_ok avoids the isdir/mkdir race of the original code.
            os.makedirs(dir_name, exist_ok=True)

            file_name = os.path.join(dir_name, 'model_{}.csv'.format(index))
            sampled_data.to_csv(file_name, index=False, header=True)

        score, p_rules = evaluate_classification(sampled_data, test_data,
                                                 continuous_columns,
                                                 sensitive_column)
        model_kwargs[index]['score'] = score
        # Key previously had an unbalanced paren: 'p-rules (train/test/all'.
        model_kwargs[index]['p-rules (train/test/all)'] = p_rules

    return model_kwargs
Ejemplo n.º 2
0
def build_and_train(params):
    """Build a TGANModel from ``params`` and fit it on the dataset's train split.

    Args:
        params (dict): hyperparameters plus a ``'dataset'`` entry whose
            ``info`` provides ``'continuous_columns'`` and whose ``train``
            attribute holds the training frame.

    Returns:
        The fitted TGANModel instance.
    """
    tf.reset_default_graph()

    # The layer-size lists are printed for inspection only; TGANModel takes
    # the raw counts/sizes below.
    generator_layers = int(params['gen_num_layers']) * [
        int(params['gen_layer_sizes'])]
    print(generator_layers)
    critic_layers = int(params['crit_num_layers']) * [
        int(params['crit_layer_sizes'])]
    print(critic_layers)

    dataset = params.get('dataset')
    continuous_columns = dataset.info.get('continuous_columns')
    print('Batch Size:' + str(params.get('batch_size')))

    # Random output directory name so concurrent runs do not collide.
    save_dir = str(np.random.randint(1, 999999))
    tgan_model = TGANModel(
        continuous_columns=continuous_columns,
        batch_size=int(params.get('batch_size')),
        z_dim=int(params.get('embedding_dim')),
        learning_rate=params.get('learning_rate'),
        num_gen_rnn=int(params.get('gen_num_layers')),
        num_gen_feature=int(params.get('gen_layer_sizes')),
        num_dis_layers=int(params.get('crit_num_layers')),
        num_dis_hidden=int(params.get('crit_layer_sizes')),
        max_epoch=EPOCHS,
        steps_per_epoch=50,
        restore_session=False,
        output=save_dir,
    )
    print('Fitting a TGAN model for {0} epochs...'.format(EPOCHS))
    # Fit on a copy so the dataset's train frame is left untouched.
    tgan_model.fit(dataset.train.copy())
    print('Successfully fitted a TGAN model')

    return tgan_model
Ejemplo n.º 3
0
def augment_tgan(csvfile):
    """Fit a TGAN on ``csvfile`` and print 10% of its row count as samples.

    Every column except the last is passed to TGAN as continuous (by
    positional index); the last column is presumably a label — verify
    against the data.

    Args:
        csvfile (str): path of the CSV file to model.
    """
    data = pd.read_csv(csvfile)
    # All column indices but the last one (range replaces the original
    # manual index-append loop).
    cols_num = list(range(data.shape[1] - 1))
    tgan = TGANModel(cols_num)
    tgan.fit(data)

    # now create number of samples (10%)
    num_samples = int(0.10 * len(data))
    samples = tgan.sample(num_samples)

    print(samples)
Ejemplo n.º 4
0
    def test___init__(self):
        """On init, arguments are set as attributes."""
        # Setup: an empty continuous-columns list is a valid argument.
        continuous_columns = []

        # Run: construct with every other argument left at its default.
        model = TGANModel(continuous_columns)

        # Check: the argument and the documented defaults landed on the
        # instance unchanged.
        assert model.continuous_columns == continuous_columns
        assert model.log_dir == 'output/logs'
        assert model.model_dir == 'output/model'
        assert model.max_epoch == 5
        assert model.steps_per_epoch == 10000
        assert model.batch_size == 200
        assert model.z_dim == 200
        assert model.gpu is None
        assert model.save_checkpoints is True
        assert model.restore_session is True
Ejemplo n.º 5
0
            if (number > index):
                index = number + 1

    return path + "/" + name + "_" + str(index) + ".csv"


# Dynamically generate a free file name for the synthetic dataset
# (e.g. "synthetic_adult_1", "synthetic_adult_2") so earlier runs
# are never overwritten. getSavePath is defined earlier in the file.
pathToSave = getSavePath("Synthetic_data", "synthetic_adult")

# number of synthetic samples we want to generate
num_samples = 400

# trained model location (pickled TGAN model)
model_path = 'models/Adult_2.pkl'

# load a TGAN model that was previously trained
tgan = TGANModel.load(model_path)

# after fitting, we can sample some new synthetic data which is a pandas.DataFrame
samples = tgan.sample(num_samples)

print(pathToSave)
print(samples)

samples.head()

# save the generated data as a CSV file without the index column
samples.to_csv(pathToSave, index=False)

# save the model back to the same path; force=True overwrites it
tgan.save(model_path, force=True)
Ejemplo n.º 6
0
# Treat every column with more than 4 distinct values as continuous;
# the remaining low-cardinality columns are handled by TGAN as
# categorical. (Comprehension replaces the original append loop.)
continuous = [col for col in ori_data.columns if ori_data[col].nunique() > 4]

continuous_columns = continuous

tgan = TGANModel(continuous_columns=continuous,
                 output='2) synthetic data generation/tGAN/bioresponse/0/',
                 gpu=0,
                 max_epoch=1,
                 steps_per_epoch=6000,
                 save_checkpoints=True,
                 restore_session=False,
                 batch_size=256,
                 z_dim=200,
                 noise=0.2,
                 l2norm=0.00001,
                 learning_rate=0.001,
                 num_gen_rnn=100,
                 num_gen_feature=100,
                 num_dis_layers=1,
                 num_dis_hidden=100,
                 optimizer='AdamOptimizer')

# NOTE(review): the model is fitted on fraud_data although the column
# scan above ran on ori_data — confirm the two frames share a schema.
tgan.fit(fraud_data)
model_path = '2) synthetic data generation/tGAN/bioresponse/0/tGAN_bio_0_model.pkl'
tgan.save(model_path, force=True)  #force=True to overwrite

# Reload the just-saved model (round-trip check / later reuse).
model_path = '2) synthetic data generation/tGAN/bioresponse/0/tGAN_bio_0_model.pkl'
loaded_tgan = TGANModel.load(model_path)
Ejemplo n.º 7
0
def main(params=None, optim=True):
    """Train or load a TGAN model, sample from it, and optionally evaluate.

    Args:
        params (dict | None): experiment configuration; when None a
            built-in default parameter set is used.
        optim (bool): when True, overwrite the NN hyperparameters with the
            ``space`` search variant and use the ``*_optimized`` model path;
            otherwise use the ``*_default`` path.
    """
    if params is None:
        params = {
            # Regular parameters
            'training_set': 'ln',
            'eval': 'all',
            # NN Hyperparameters
            'embedding_dim': 128,
            'gen_num_layers': 2,
            'gen_layer_sizes': 256,
            'crit_num_layers': 2,
            'crit_layer_sizes': 256,
            'learning_rate': 10**-6,
            'batch_size': 500,
            'training_iter': 1
        }

    if optim:
        params.update(
            space
        )  # Overwrite NN hyperparameters with stochastic variant from top of file

    print('Starting TGAN main script with following parameters:')
    for key in params:
        print(key, params[key])
    params['model'] = 'tgan'

    # Load dataset
    dataset = load_data(params.get('training_set'))
    params['dataset'] = dataset
    print('Successfully loaded dataset {0}'.format(params.get('training_set')))

    if params['model'] in dataset.samples:
        #  If we are here, we have already generated samples for this test setup (identifier/dataset/model)
        samples = dataset.samples.get(params['model'])
    else:
        # Train model and Generate samples
        if optim:
            # Optimize or load TGAN model
            filename = os.path.join(RESULT_DIR, params.get('training_set'),
                                    params.get('model') + '_optimized')
            if os.path.isfile(filename):
                my_tgan = TGANModel.load(filename)
                print('Successfully loaded old optimized TGAN model from {0}'.
                      format(filename))
            else:
                best, trials = optimize(params, filename + '.json')
                best['dataset'] = dataset
                my_tgan = build_and_train(best)
                my_tgan.save(filename)
                print('Saved the optimized TGAN model at {0}'.format(filename))
        else:
            # Train or load TGAN model with default hyperparameters
            filename = os.path.join(RESULT_DIR, params.get('training_set'),
                                    params.get('model') + '_default')
            if os.path.isfile(filename):
                # BUGFIX: this load was commented out, leaving my_tgan
                # unbound and crashing at sampler() whenever the default
                # model file already existed.
                my_tgan = TGANModel.load(filename)
                print('Successfully loaded old TGAN model from {0}'.format(
                    filename))
            else:
                my_tgan = build_and_train(params=params)
                # BUGFIX: the save was commented out although the message
                # below claims the model was saved.
                my_tgan.save(filename)
                print('Saved the TGAN model at {0}'.format(filename))

        # Sample from model
        print('Sampling from the TGAN model...')
        samples = sampler(my_tgan, params)
        save_samples(samples,
                     params['training_set'],
                     model=params.get('model'),
                     force=True)
        print('Saved the TGAN samples')

    # Evaluate fitted model
    if params['eval'] == 'all':
        print('Starting MLE evaluation on samples...')
        discrete_columns, continuous_columns = dataset.get_columns()
        plot_predictions_by_dimension(real=dataset.train,
                                      samples=samples,
                                      data_test=dataset.test,
                                      discrete_columns=discrete_columns,
                                      continuous_columns=continuous_columns,
                                      dataset=params.get('training_set'),
                                      model=params.get('model'))
        print('Plotting marginals of real and sample data...')
        plot_marginals(dataset.train, samples, params.get('training_set'),
                       params.get('model'))
        print('Plotting association matrices...')
        diff = plot_association(dataset, samples, params.get('training_set'),
                                params.get('model'))
        print(diff)
        # NOTE: 'dataset' is re-bound below from the Dataset object to the
        # dataset *name* string; the object is not used past this point.
        alist = params.get('training_set').split(sep='-', maxsplit=1)
        dataset = alist[0]
        basepath = os.path.join(RESULT_DIR, *alist, params.get('model'))
        filepath = os.path.join(
            basepath, '{0}_{1}_c_marginals.png'.format(dataset,
                                                       params.get('model')))

        save_json(diff, filepath)
Ejemplo n.º 8
0
import pandas as pd

# Berka banking dataset, semicolon separated; drop the two partner
# columns, which are not modelled here.
d = pd.read_csv('../data/berka/berka_cat.csv', sep=';')
d = d.drop(['trans_bank_partner', 'trans_account_partner'], axis=1)
# Positional indices of the continuous columns in the remaining frame.
continuous_columns = [0, 1, 2, 3, 9]
from tgan.model import TGANModel

# Train a fresh model (restore_session=False) for 50 epochs.
tgan = TGANModel(continuous_columns,
                 restore_session=False,
                 max_epoch=50,
                 steps_per_epoch=1000,
                 batch_size=1000)
tgan.fit(d)

model_path = 'demo/my_model'

# Persist the fitted model; without force=True this fails if the path exists.
tgan.save(model_path)
Ejemplo n.º 9
0
# Train a TGAN model from a JSON config file given as the first CLI
# argument, report the wall-clock fit time, and save the model.
import json
import sys
import time

import pandas as pd

from tgan.model import TGANModel

# BUGFIX: sys and time were used below but never imported in this script.
with open(str(sys.argv[1]), 'r') as f:
    config = json.load(f)

# Training frame and the list of continuous column identifiers come
# from the config file.
df_train = pd.read_pickle(config['df_train'])
cont_columns = config['continuous_cols']

tgan = TGANModel(cont_columns,
                 batch_size=config['batch_size'],
                 z_dim=config['z_dim'],
                 num_gen_rnn=config['num_gen_rnn'],
                 num_gen_feature=config['num_gen_feature'],
                 num_dis_layers=config['num_dis_layers'],
                 num_dis_hidden=config['num_dis_hidden'],
                 learning_rate=config['learning_rate'],
                 noise=config['noise'],
                 max_epoch=config['max_epoch'],
                 steps_per_epoch=config['steps_per_epoch'])

model_path = config['model_path']

start_time = time.time()
# fit the TGAN
tgan.fit(df_train)
print("--- %s seconds ---" % (time.time() - start_time))

tgan.save(model_path, force=True)
Ejemplo n.º 10
0
# model save location
model_path = 'models/Adult_2.pkl'

# the TGAN model needs to know which dataset columns are continuous,
# given by positional index. Presumably these match the Adult census
# dataset's numeric columns — verify against the loaded data.
continuous_columns = [0, 2, 4, 10, 11, 12]

# set nn parameters, like epoch, batch size and loss function
tgan = TGANModel(continuous_columns,
                 output='output',
                 max_epoch=10,
                 steps_per_epoch=400,
                 save_checkpoints=True,
                 restore_session=False,
                 batch_size=200,
                 z_dim=200,
                 noise=0.2,
                 l2norm=0.00001,
                 learning_rate=0.001,
                 num_gen_rnn=100,
                 num_gen_feature=100,
                 num_dis_layers=1,
                 num_dis_hidden=100,
                 optimizer='AdamOptimizer')

# train phase; `data` must be a DataFrame defined earlier in the file.
tgan.fit(data)

print("Fitted model!!!")

# after fitting, we can sample some new synthetic data which is a pandas.DataFrame
# (`num_samples` is expected to be defined earlier in the file).
samples = tgan.sample(num_samples)