def extend_gan_train(x_train, y_train, x_test, cat_cols, gen_x_times=1.2, epochs=300):
    """
    Extends train by generating new data by GAN

    :param x_train: train dataframe
    :param y_train: target for train dataframe
    :param x_test: test dataframe
    :param cat_cols: list of categorical columns
    :param gen_x_times: factor by which the initial dataframe should be increased
    :param epochs: maximum number of epochs to train the GAN
    :return: extended train with target
    """
    if gen_x_times == 0:
        raise ValueError("Passed gen_x_times with value 0!")
    x_train["target"] = y_train
    x_test_bigger = int(1.1 * x_test.shape[0] / x_train.shape[0])
    ctgan = CTGANSynthesizer()
    ctgan.fit(x_train, cat_cols, epochs=epochs)
    generated_df = ctgan.sample(x_test_bigger * x_train.shape[0])
    # cast generated columns back to the original dtypes
    data_dtype = x_train.dtypes.values
    for i in range(len(generated_df.columns)):
        generated_df[generated_df.columns[i]] = generated_df[
            generated_df.columns[i]
        ].astype(data_dtype[i])
    generated_df = pd.concat(
        [
            x_train.sample(frac=x_test_bigger, replace=True, random_state=42),
            generated_df,
        ]
    ).reset_index(drop=True)
    num_cols = []
    for col in x_train.columns:
        if "num" in col:
            num_cols.append(col)
    # drop generated rows whose numeric values fall outside the 2%-98%
    # quantile range observed in test
    for num_col in num_cols:
        min_val = x_test[num_col].quantile(0.02)
        max_val = x_test[num_col].quantile(0.98)
        generated_df = generated_df.loc[
            (generated_df[num_col] >= min_val) & (generated_df[num_col] <= max_val)
        ]
    generated_df = generated_df.reset_index(drop=True)
    # keep only the generated rows an adversarial model scores as most test-like
    ad_model = adversarial_test(x_test, generated_df.drop("target", axis=1), cat_cols)
    generated_df["test_similarity"] = ad_model.predict(
        generated_df.drop("target", axis=1), return_shape=False
    )
    generated_df.sort_values("test_similarity", ascending=False, inplace=True)
    generated_df = generated_df.head(int(gen_x_times * x_train.shape[0]))
    x_train = pd.concat(
        [x_train, generated_df.drop("test_similarity", axis=1)], axis=0
    ).reset_index(drop=True)
    del generated_df
    gc.collect()
    return x_train.drop("target", axis=1), x_train["target"]
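# A minimal usage sketch for extend_gan_train. The toy frames and column names
# are assumptions for illustration; numeric columns must contain "num" in their
# name to be quantile-filtered, and CTGANSynthesizer / adversarial_test / pd /
# gc are assumed to be in scope as in the function above.
import pandas as pd

x_train = pd.DataFrame({"num_age": [25, 32, 47, 51], "city": ["A", "B", "A", "B"]})
y_train = pd.Series([0, 1, 0, 1])
x_test = pd.DataFrame({"num_age": [29, 44], "city": ["B", "A"]})

x_train_ext, y_train_ext = extend_gan_train(
    x_train, y_train, x_test, cat_cols=["city"], gen_x_times=1.2, epochs=10
)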
def baseline_ctgan(args, df_naive):
    from ctgan import CTGANSynthesizer

    ctgan = CTGANSynthesizer()
    ctgan.fit(df_naive)
    ctgan_samples = ctgan.sample(args.n_gen_samples)
    # print(ctgan_samples)
    return ctgan_samples
def augment_ctgan_classification(csvfile):
    data = pd.read_csv(csvfile)
    ctgan = CTGANSynthesizer()
    ctgan.fit(data, epochs=10)  # 15
    percent_generated = 1
    df_gen = ctgan.sample(int(len(data) * percent_generated))
    # snap generated class values to the nearest valid class label
    df_gen['class_'] = df_gen['class_'].apply(np.floor)
    values = list(set(list(data['class_'])))
    newclass = df_gen['class_']
    newclass2 = list()
    for i in range(len(newclass)):
        if newclass[i] not in values:
            newvalue = find_nearestval(newclass[i], values)
            newclass2.append(newvalue)
        else:
            newclass2.append(newclass[i])
    df_gen['class_'] = newclass2
    # now count each value and balance
    classcol = list(df_gen['class_'])
    unique_classes = list(set(df_gen['class_']))
    counts = list()
    for i in range(len(unique_classes)):
        counts.append(classcol.count(unique_classes[i]))
    minval = min(counts)
    print(minval)
    # now balance out the classes by removing rows down to the minimum count
    for i in range(len(unique_classes)):
        print(unique_classes[i])
        index_pos_list = get_index_positions(classcol, unique_classes[i])
        while len(index_pos_list) > minval:  # was >=, which dropped one row too many per class
            random_ind = random.choice(index_pos_list)
            df_gen = df_gen.drop(df_gen.index[random_ind])
            classcol = list(df_gen['class_'])
            index_pos_list = get_index_positions(classcol, unique_classes[i])
    print('augmented with %s samples' % (str(len(unique_classes) * minval)))
    print(df_gen)
    # now add both together to make new .CSV file
    newfile1 = 'augmented_' + csvfile
    df_gen.to_csv(newfile1, index=0)
    # now combine augmented and regular dataset
    data2 = pd.read_csv('augmented_' + csvfile)
    frames = [data, data2]
    result = pd.concat(frames)
    newfile2 = 'augmented_combined_' + csvfile  # was `newfile`, but `newfile2` is returned
    result.to_csv(newfile2, index=0)
    return [csvfile, newfile1, newfile2]
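# augment_ctgan_classification relies on two helpers that are not shown above.
# These are minimal sketches of what they presumably do, not the original
# implementations:

def find_nearestval(value, values):
    # return the valid class label closest to the generated value
    return min(values, key=lambda v: abs(v - value))


def get_index_positions(items, element):
    # return every positional index at which element occurs in the list
    return [i for i, item in enumerate(items) if item == element]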
class CTGAN(GenerativeModel):
    """A generative adversarial network for tabular data"""

    def __init__(self, metadata,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500,
                 epochs=300):
        self.synthesiser = CTGANSynthesizer(embedding_dim, gen_dim, dis_dim,
                                            l2scale, batch_size, epochs)
        self.metadata = metadata
        self.datatype = DataFrame
        self.trained = False
        self.__name__ = 'CTGAN'

    def fit(self, rawTrain):
        """
        Fit a generative model of the training data distribution.
        See <https://github.com/sdv-dev/CTGAN> for details.

        :param rawTrain: DataFrame or ndarray: Training set
        """
        assert isinstance(
            rawTrain, self.datatype
        ), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(rawTrain)}'
        logger.debug(
            f'Start fitting {self.__class__.__name__} to data of shape {rawTrain.shape}...'
        )
        self.synthesiser.fit(rawTrain, self.metadata)
        logger.debug('Finished fitting')
        self.trained = True

    def generate_samples(self, nsamples):
        """
        Samples synthetic data records from the fitted generative distribution

        :param nsamples: int: Number of synthetic records to generate
        :return: synData: DataFrame: A synthetic dataset
        """
        assert self.trained, "Model must first be fitted to some data."
        logger.debug(f'Generate synthetic dataset of size {nsamples}')
        synData = self.synthesiser.sample(nsamples)
        return synData
def run_cgan(X_S, x_T, n_samples):
    X_train = np.vstack([X_S, x_T])
    ctgan = CTGANSynthesizer(embedding_dim=128,
                             gen_dim=(256, 256),
                             dis_dim=(256, 256),
                             l2scale=1e-6,
                             batch_size=500)
    ts = time.time()
    ctgan.fit(X_train, epochs=300)
    Z = ctgan.sample(n_samples)
    # print(Z[0].tolist())
    # return None
    run_train = time.time() - ts
    return Z, run_train
def build_and_train(params):
    gen_layers = [int(params['gen_layer_sizes'])] * int(params['gen_num_layers'])
    print(gen_layers)
    crit_layers = [int(params['crit_layer_sizes'])] * int(params['crit_num_layers'])
    print(crit_layers)
    my_ctgan = CTGANSynthesizer(embedding_dim=int(params['embedding_dim']),
                                gen_dim=gen_layers,
                                dis_dim=crit_layers,
                                batch_size=int(params['batch_size']),
                                l2scale=params['l2scale'])
    print('Fitting a CTGAN model for {0} epochs...'.format(EPOCHS))
    d = params.get('dataset')
    my_ctgan.fit(d.train, d.info.get('discrete_columns'), epochs=EPOCHS)
    print('Successfully fitted a CTGAN model')
    return my_ctgan
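# A hypothetical hyperparameter dictionary for build_and_train. The key names
# come from the function above; the values, the EPOCHS constant, and the
# `dataset` object are assumptions for illustration.
params = {
    'gen_layer_sizes': 256,
    'gen_num_layers': 2,
    'crit_layer_sizes': 256,
    'crit_num_layers': 2,
    'embedding_dim': 128,
    'batch_size': 500,
    'l2scale': 1e-6,
    'dataset': dataset,  # object exposing .train and .info['discrete_columns']
}
my_ctgan = build_and_train(params)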
def augment_ctgan_regression(csvfile):
    data = pd.read_csv(csvfile)
    ctgan = CTGANSynthesizer()
    ctgan.fit(data, epochs=10)  # 15
    percent_generated = 1
    df_gen = ctgan.sample(int(len(data) * percent_generated))
    print('augmented with %s samples' % (str(len(df_gen))))
    print(df_gen)
    # now add both together to make new .CSV file
    newfile1 = 'augmented_' + csvfile
    df_gen.to_csv(newfile1, index=0)
    # now combine augmented and regular dataset
    data2 = pd.read_csv('augmented_' + csvfile)
    frames = [data, data2]
    result = pd.concat(frames)
    newfile2 = 'augmented_combined_' + csvfile
    result.to_csv(newfile2, index=0)
    return [csvfile, newfile1, newfile2]
class CTGAN(GenerativeModel):
    """A conditional generative adversarial network for tabular data"""

    def __init__(self, metadata,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500,
                 epochs=300,
                 multiprocess=False):
        self.synthesiser = CTGANSynthesizer(embedding_dim, gen_dim, dis_dim,
                                            l2scale, batch_size, epochs)
        self.metadata = metadata
        self.datatype = DataFrame
        self.multiprocess = bool(multiprocess)
        self.infer_ranges = True
        self.trained = False
        self.__name__ = 'CTGAN'

    def fit(self, data):
        """Train a generative adversarial network on tabular data.

        Input data is assumed to be of shape (n_samples, n_features).
        See https://github.com/DAI-Lab/SDGym for details."""
        assert isinstance(data, self.datatype), \
            f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}'
        LOGGER.debug(f'Start fitting {self.__class__.__name__} to data of shape {data.shape}...')
        self.synthesiser.fit(data, self.metadata)
        LOGGER.debug('Finished fitting')
        self.trained = True

    def generate_samples(self, nsamples):
        """Generate random samples from the fitted generative model"""
        assert self.trained, "Model must first be fitted to some data."
        LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')
        synthetic_data = self.synthesiser.sample(nsamples)
        return synthetic_data
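# A minimal usage sketch for the CTGAN wrapper above. It assumes `metadata` is
# whatever CTGANSynthesizer.fit accepts as its second argument (here, a list of
# discrete column names); the toy DataFrame is for illustration only.
from pandas import DataFrame

train = DataFrame({'age': [25, 32, 47, 51], 'sex': ['F', 'M', 'M', 'F']})
model = CTGAN(metadata=['sex'], epochs=10)
model.fit(train)
synthetic = model.generate_samples(100)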
from ctgan import CTGANSynthesizer
import pandas as pd
import pandas_profiling
import torch

data_kidney = pd.read_csv("../../data/kidney/raw.csv")
good_data_kidney = data_kidney.dropna()
#good_data_kidney.profile_report().to_file("data_kidney.html")
#print(data_kidney)
#print(data_kidney.columns.tolist())

ctgan = CTGANSynthesizer()
ctgan.fit(good_data_kidney, data_kidney.columns.tolist())
torch.save(ctgan, '../../models/kidney/ctgan-kidney')

samples = ctgan.sample(300)
#samples.profile_report().to_file("sample.html")
print(samples)
def main(_):
    data, meta = read_data(FLAGS.data, FLAGS.meta)
    model = CTGANSynthesizer(epochs=FLAGS.max_epoch)
    model.fit(data, meta['discrete_columns'], tuple())
    data_syn = model.sample(FLAGS.sample)
    write_data(data_syn, meta, FLAGS.output)
def add_synthetic(self, method='random', apply={}):
    ################# random
    if method == 'random':
        sys.stdout.write("\r")
        print('\n###########\nadding random synthetic samples ..\n###########\n')
        print('\n{}\n'.format(apply))
        for i in tqdm(apply.keys()):
            noise_factor = self.X_train[self.y_train_l == i].mean() * 0.001
            totalcount = 0
            max_shape = self.X_train[self.y_train_l == i].shape[0] + apply[i]
            for xx in range(3):
                set_shape = self.X_train[self.y_train_l == i].shape[0]
                if set_shape < max_shape:
                    howManyTimes = round(math.log(max_shape / set_shape)) + 1
                    for j in range(howManyTimes):
                        totalcount += 1
                        rareEventX = self.X_train[self.y_train_l == i].copy()
                        rareEventY = self.y_train_l[self.y_train_l == i].copy()
                        noisyRareEvent = rareEventX + noise_factor * np.random.normal(
                            loc=0.0, scale=1.0, size=rareEventX.shape)
                        # trim the noisy copies so the class never exceeds max_shape
                        if rareEventX.shape[0] + noisyRareEvent.shape[0] > max_shape:
                            will_be_subtracted = (rareEventX.shape[0] + noisyRareEvent.shape[0]) - max_shape
                            new_shape = noisyRareEvent.shape[0] - will_be_subtracted
                            self.X_train = np.concatenate((self.X_train, noisyRareEvent[:new_shape]), axis=0)
                            self.y_train_l = np.concatenate((self.y_train_l, rareEventY[:new_shape]), axis=0)
                        else:
                            self.X_train = np.concatenate((self.X_train, noisyRareEvent), axis=0)
                            self.y_train_l = np.concatenate((self.y_train_l, rareEventY), axis=0)
                        self.y_train = tf.keras.utils.to_categorical(self.y_train_l)
                print(self.all_labels[i], self.X_train[self.y_train_l == i].shape,
                      " {}.th generation with {} noise".format(totalcount, noise_factor))
                noise_factor *= 0.5
    ################# smoteenn
    elif method == 'smoteenn':
        print('\n###########\nadding smoteenn synthetic samples ..\n###########\n')
        print('\n{}\n'.format(apply))
        competitors = apply.keys()
        sampling_strategy = {}
        for i in competitors:
            sampling_strategy[i] = int(self.y_train_l[self.y_train_l == i].shape[0] + apply[i] * 1.5)
        smote_enn = SMOTEENN(random_state=0, n_jobs=64,
                             sampling_strategy=sampling_strategy,
                             enn=imblearn.under_sampling.EditedNearestNeighbours(
                                 sampling_strategy='auto', n_neighbors=3))
        filters = [(self.y_train_l == i) for i in competitors]
        X_resampled, y_resampled = smote_enn.fit_resample(
            self.X_train_l[np.logical_or.reduce(filters)],
            self.y_train_l[np.logical_or.reduce(filters)])
        # remove selected classes from dataset
        filters = [(self.y_train_l != i) for i in apply.keys()]
        previous_shapes = {}
        for i in apply.keys():
            previous_shapes[i] = self.y_train_l[self.y_train_l == i].shape[0]
        self.X_train_l = self.X_train_l[np.logical_and.reduce(filters)]
        self.y_train_l = self.y_train_l[np.logical_and.reduce(filters)]
        for i in apply.keys():
            self.X_train_l = np.concatenate([
                self.X_train_l,
                X_resampled[y_resampled == i][:previous_shapes[i] + apply[i]]
            ], axis=0)
            self.y_train_l = np.concatenate([
                self.y_train_l,
                y_resampled[y_resampled == i][:previous_shapes[i] + apply[i]]
            ], axis=0)
        self.X_train = self.X_train_l.reshape(self.X_train_l.shape[0], 1, self.X_train_l.shape[1])
        self.y_train = tf.keras.utils.to_categorical(self.y_train_l)
    ################# ctgan
    elif method == 'ctgan':
        print('\n###########\nadding ctgan synthetic samples ..\n###########\n')
        print('\n{}\n'.format(apply))
        for i in tqdm(apply.keys()):
            # check the larger threshold first; the original tested > 100
            # before > 300, so the 300-sample branch could never be reached
            gan_batch_size = 10
            if self.y_train_l[self.y_train_l == i].shape[0] > 300:
                gan_batch_size = 50
            elif self.y_train_l[self.y_train_l == i].shape[0] > 100:
                gan_batch_size = 20
            gan = CTGANSynthesizer(batch_size=gan_batch_size)
            gan.fit(self.X_train_l[self.y_train_l == i], epochs=100)
            # generate samples
            generated = gan.sample(apply[i])
            self.X_train_l = np.concatenate([self.X_train_l, generated], axis=0)
            self.X_train = self.X_train_l.reshape(self.X_train_l.shape[0], 1, self.X_train_l.shape[1])
            self.y_train_l = np.concatenate([self.y_train_l, np.ones(shape=(apply[i],)) * i], axis=0)
            self.y_train = tf.keras.utils.to_categorical(self.y_train_l)
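# A hypothetical call illustrating the `apply` argument of add_synthetic: a
# mapping from class label to the number of synthetic samples to add. The
# `trainer` object and the class labels are assumptions.
trainer.add_synthetic(method='ctgan', apply={0: 200, 3: 500})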
original_dir = str(cwd + '/original_data/')
# original data folder should only contain 1 file at a time, we define the file location here
fname1 = os.listdir(original_dir)[0]
fname2 = str(original_dir + fname1)

# read original data file into pandas memory then remove the file
data = pd.read_csv(fname2)
os.remove(fname2)
discrete_columns = list(data.select_dtypes(include=['object']).columns)
print()
print()

ctgan = CTGANSynthesizer()

security_check = True
'''
In the rare case that multiple users upload files at the same time, there is a
chance that since we grab the first file in fname1, we might accidentally grab
another user's file. By asking a simple security question we can verify that
the file we selected in fname1 is the same file that the user intended to use
'''
while security_check:
    security = input('Please re-confirm the file name you wish to fit: ')
    if security == fname1:
        # match is found between file uploaded and the file pulled from the 'original_data' folder
        print("Data Integrity check PASSED")
        security_check = False
    else:
        # the original snippet ends here; presumably the loop just re-prompts
        print("Data Integrity check FAILED, please try again")
import pandas as pd

data = pd.read_csv("jobs.csv")
discrete_columns = ['Job']

from ctgan import CTGANSynthesizer

ctgan = CTGANSynthesizer()
ctgan.fit(data, discrete_columns, epochs=10)
ctgan.save('job.pkl')
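# Reloading the saved synthesizer and sampling from it. Whether load is a
# classmethod or an instance method depends on the installed ctgan version,
# so treat this as a sketch rather than the definitive API.
loaded = CTGANSynthesizer.load('job.pkl')
new_jobs = loaded.sample(500)
print(new_jobs.head())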
from ctgan import CTGANSynthesizer
import pandas as pd
import pandas_profiling
import torch

#print(data_cancer)
#print(data_cancer.columns.tolist())
data_cancer = pd.read_csv("../../data/diabetes/raw.csv")
data_diab = data_cancer[50000:100000]
good_data_cancer = data_cancer.dropna()
# note: exit() stops the script before the CTGAN fit below;
# remove it (and the profiling line) to train the model
good_data_cancer.profile_report().to_file("data_diab.html")
exit()

ctgan = CTGANSynthesizer()
ctgan.fit(good_data_cancer, [
    'race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty',
    'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
    'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
    'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'
])
torch.save(ctgan, '../../models/diabetes/ctgan-diabetes-50k-1')

samples = ctgan.sample(300)
import pandas as pd
from ctgan import CTGANSynthesizer

data = pd.read_csv('cleaned_data.csv')  # tabularised dataset
# 20 bins; these are column indices, which ctgan accepts for ndarray input --
# with a DataFrame they may need to be mapped to column names
discrete_columns = list(range(100, 120))

ctgan = CTGANSynthesizer(epochs=10)
ctgan.fit(data, discrete_columns)  # was `df`, which is undefined

# create synthetic data for 1000000 rows (1000000 products)
samples = ctgan.sample(1000000)
# Important Variables
STATE = "Maharashtra"
DISTRICTS = {"Pune": 3132143}
#DISTRICTS = {"Bellary": 2452595}

import time
from ctgan import CTGANSynthesizer
import pandas as pd
from math import ceil
import places
from opencage.geocoder import OpenCageGeocode
import numpy as np
import gc

ctgan_model = CTGANSynthesizer()
print("Loading pretrained model...")
ctgan_gen = ctgan_model.load("{}/{}.pkl".format(STATE, STATE))

print("Loading state details...")
state_details = pd.read_csv("{}/{}_Details.csv".format(STATE, STATE))
state_details = state_details[26:30]
#state_details.astype({'DD_Code': 'int32', 'Population': 'int32'}).dtypes
#DISTRICTS = dict(zip(state_details.District, state_details.Population))

for district, population_counts in DISTRICTS.items():
    gc.collect()
    print("Generating {} of population for {}...".format(population_counts, district))
from ctgan import CTGANSynthesizer
import pandas as pd
import pandas_profiling
import torch

data_cancer = pd.read_csv("../../data/cancer/raw.csv")
good_data_cancer = data_cancer.dropna()
#good_data_cancer.profile_report().to_file("data_breast.html")
#print(data_cancer)
#print(data_cancer.columns.tolist())

ctgan = CTGANSynthesizer()
ctgan.fit(good_data_cancer, ['id', 'diagnosis'])
#ctgan.fit(good_data_cancer, data_cancer.columns.tolist())
torch.save(ctgan, '../../models/breast/ctgan-breast')

samples = ctgan.sample(300)
print(samples)
print(samples.describe())
#samples.profile_report().to_file("sample_cancer.html")
#print(samples)
def main():
    dataset = 'diabetes'
    epochs = 300
    train_size = 0.7

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)

    D = get_dataset(dataset, path_dataset, normalize=None)
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    # n_classes = D['n_classes']
    n_features = D['n_features']
    feature_names = D['feature_names']
    class_name = D['class_name']

    le = LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)

    Xy_train = np.hstack((X_train, y_train.reshape(-1, 1)))

    print(datetime.datetime.now(), 'Training CTGAN')
    ctgan = CTGANSynthesizer(embedding_dim=128, gen_dim=(256, 256),
                             dis_dim=(256, 256), l2scale=1e-6, batch_size=500)
    ts = time.time()
    # the label is the last column of Xy_train, i.e. index n_features
    # (the original passed n_features + 1, which is out of range)
    ctgan.fit(Xy_train, epochs=epochs, discrete_columns=[n_features])
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(Xy_train)

    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    Xy_fake = ctgan.sample(n_fake_instances)
    cgan_gen_time = time.time() - ts

    # print('F 0', np.mean(Xy_fake[:, 0]), np.min(Xy_fake[:, 0]), np.max(Xy_fake[:, 0]))
    # print('F 1', np.mean(Xy_fake[:, 1]), np.min(Xy_fake[:, 1]), np.max(Xy_fake[:, 1]))
    #
    # print('R 0', np.mean(X_train[:, 0]), np.min(X_train[:, 0]), np.max(X_train[:, 0]))
    # print('R 1', np.mean(X_train[:, 1]), np.min(X_train[:, 1]), np.max(X_train[:, 1]))
    # return -1

    print(datetime.datetime.now(), 'Storing synthetic data')
    df = pd.DataFrame(data=Xy_fake, columns=feature_names + [class_name])
    df.to_csv(path_syht_dataset + '%s.csv' % dataset, index=False)

    # train discriminators to distinguish real from synthetic records
    X_fake = Xy_fake[:, :-1]
    X_real = X_train
    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))
    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])
    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    res_dict = dict()
    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))
        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(), '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'tabular.json')
print("Starting DPGAN...") dpgan = PytorchDPSynthesizer(DPGAN(), GeneralTransformer(), epsilon=1) dpgan.fit(df, categorical_columns=['sex','educ','race','married'], verbose=True) synth_data = dpgan.sample(df.size) s = synth_data.corr() d = df.corr() print("Save and reload...") dpgan.save(os.path.join(git_root_dir, os.path.join("saved_models","dpgan.ckpt"))) newInstance = PytorchDPSynthesizer(DPGAN(), GeneralTransformer(), epsilon=1) newInstance.load(os.path.join(git_root_dir, os.path.join("saved_models","dpgan.ckpt"))) newInstance.fit(df,categorical_columns=['sex','educ','race','married'], update_epsilon=2, verbose=True) synth_data = newInstance.sample(df.size) s = synth_data.corr() d = df.corr() a2 = d.subtract(s) print("Starting CTGAN...") from ctgan import CTGANSynthesizer ctgan = CTGANSynthesizer() ctgan.fit(df, ['sex','educ','race','married'], epochs=10) synth_data = ctgan.sample(df.size) s = synth_data.corr() d = df.corr() #print(d.subtract(s))