Example #1
import gc

import pandas as pd
from ctgan import CTGANSynthesizer


def extend_gan_train(x_train, y_train, x_test, cat_cols, gen_x_times=1.2, epochs=300):
    """
    Extends the train set by generating new data with a GAN.
    :param x_train: train dataframe
    :param y_train: target for the train dataframe
    :param x_test: test dataframe
    :param cat_cols: list of categorical columns
    :param gen_x_times: factor by which the initial dataframe should be increased
    :param epochs: maximum number of epochs to train the GAN
    :return: extended train with target
    """

    if gen_x_times == 0:
        raise ValueError("Passed gen_x_times with value 0!")
    x_train["target"] = y_train
    # integer factor: how many train-sized batches cover ~1.1x the test set
    x_test_bigger = int(1.1 * x_test.shape[0] / x_train.shape[0])
    ctgan = CTGANSynthesizer()
    ctgan.fit(x_train, cat_cols, epochs=epochs)
    generated_df = ctgan.sample((x_test_bigger) * x_train.shape[0])
    data_dtype = x_train.dtypes.values

    # cast the generated columns back to the original train dtypes
    for i in range(len(generated_df.columns)):
        generated_df[generated_df.columns[i]] = generated_df[
            generated_df.columns[i]
        ].astype(data_dtype[i])

    generated_df = pd.concat(
        [
            x_train.sample(frac=(x_test_bigger), replace=True, random_state=42),
            generated_df,
        ]
    ).reset_index(drop=True)

    # columns whose names contain "num" are treated as numerical
    num_cols = []
    for col in x_train.columns:
        if "num" in col:
            num_cols.append(col)

    # drop generated rows whose numerical values fall outside the
    # 2nd-98th percentile range of the test set
    for num_col in num_cols:
        min_val = x_test[num_col].quantile(0.02)
        max_val = x_test[num_col].quantile(0.98)
        generated_df = generated_df.loc[
            (generated_df[num_col] >= min_val) & (generated_df[num_col] <= max_val)
        ]
    generated_df = generated_df.reset_index(drop=True)
    # adversarial_test is a project-specific helper (not shown): presumably it
    # fits a model to distinguish real test rows from generated rows
    ad_model = adversarial_test(x_test, generated_df.drop("target", axis=1), cat_cols)

    generated_df["test_similarity"] = ad_model.predict(
        generated_df.drop("target", axis=1), return_shape=False
    )
    # keep the generated rows that look most similar to the test set
    generated_df.sort_values("test_similarity", ascending=False, inplace=True)
    generated_df = generated_df.head(int(gen_x_times * x_train.shape[0]))
    x_train = pd.concat(
        [x_train, generated_df.drop("test_similarity", axis=1)], axis=0
    ).reset_index(drop=True)
    del generated_df
    gc.collect()
    return x_train.drop("target", axis=1), x_train["target"]
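A toy usage sketch; the column names are assumptions, and `adversarial_test` must be defined as in the source project:

import numpy as np

train_df = pd.DataFrame({
    "num_feature": np.random.rand(50),
    "cat_feature": ["a", "b"] * 25,
})
target = pd.Series([0, 1] * 25)
test_df = pd.DataFrame({
    "num_feature": np.random.rand(100),
    "cat_feature": ["a", "b"] * 50,
})
x_big, y_big = extend_gan_train(train_df, target, test_df, ["cat_feature"])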
Example #2
def baseline_ctgan(args, df_naive):
    from ctgan import CTGANSynthesizer

    ctgan = CTGANSynthesizer()
    ctgan.fit(df_naive)
    ctgan_samples = ctgan.sample(args.n_gen_samples)
    # print(ctgan_samples)
    return ctgan_samples
Example #3
import random

import numpy as np
import pandas as pd
from ctgan import CTGANSynthesizer


def augment_ctgan_classification(csvfile):
    # find_nearestval and get_index_positions are project helpers (sketched below)
    data = pd.read_csv(csvfile)

    ctgan = CTGANSynthesizer()
    ctgan.fit(data, epochs=10)  # 15

    percent_generated = 1
    df_gen = ctgan.sample(int(len(data) * percent_generated))
    df_gen['class_'] = df_gen['class_'].apply(np.floor)

    values = list(set(list(data['class_'])))
    newclass = df_gen['class_']
    newclass2 = list()

    for i in range(len(newclass)):
        if newclass[i] not in values:
            newvalue = find_nearestval(newclass[i], values)
            newclass2.append(newvalue)
        else:
            newclass2.append(newclass[i])

    df_gen['class_'] = newclass2

    # now count each value and balance
    classcol = list(df_gen['class_'])
    unique_classes = list(set(df_gen['class_']))
    counts = list()
    for i in range(len(unique_classes)):
        counts.append(classcol.count(unique_classes[i]))
    minval = min(counts)
    print(minval)

    # now balance the classes by removing rows down to the minimum count
    for i in range(len(unique_classes)):
        print(unique_classes[i])
        index_pos_list = get_index_positions(classcol, unique_classes[i])
        while len(index_pos_list) > minval:
            random_ind = random.choice(index_pos_list)
            df_gen = df_gen.drop(df_gen.index[random_ind])
            classcol = list(df_gen['class_'])
            index_pos_list = get_index_positions(classcol, unique_classes[i])

    print('augmented with %s samples' % (str(len(unique_classes) * minval)))
    print(df_gen)
    # now write the augmented data to a new CSV file
    newfile1 = 'augmented_' + csvfile
    df_gen.to_csv(newfile1, index=False)

    # now combine augmented and regular dataset
    data2 = pd.read_csv(newfile1)
    frames = [data, data2]
    result = pd.concat(frames)
    newfile2 = 'augmented_combined_' + csvfile
    result.to_csv(newfile2, index=False)
    return [csvfile, newfile1, newfile2]
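The snippet relies on two helpers that are not shown. A minimal sketch of plausible implementations (the names come from the calls above; the bodies are assumptions):

def find_nearestval(value, values):
    # return the element of `values` closest to `value` (assumed behaviour)
    return min(values, key=lambda v: abs(v - value))


def get_index_positions(elements, value):
    # return every positional index at which `value` occurs in `elements` (assumed behaviour)
    return [i for i, el in enumerate(elements) if el == value]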
Example #4
class CTGAN(GenerativeModel):
    """
    A generative adversarial network for tabular data
    """
    def __init__(self,
                 metadata,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500,
                 epochs=300):

        self.synthesiser = CTGANSynthesizer(embedding_dim, gen_dim, dis_dim,
                                            l2scale, batch_size, epochs)
        self.metadata = metadata
        self.datatype = DataFrame

        self.trained = False

        self.__name__ = 'CTGAN'

    def fit(self, rawTrain):
        """
        Fit a generative model of the training data distribution.
        See <https://github.com/sdv-dev/CTGAN> for details.

        :param rawTrain: DataFrame or ndarray: Training set
        """
        assert isinstance(
            rawTrain, self.datatype
        ), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(rawTrain)}'

        logger.debug(
            f'Start fitting {self.__class__.__name__} to data of shape {rawTrain.shape}...'
        )
        self.synthesiser.fit(rawTrain, self.metadata)

        logger.debug('Finished fitting')
        self.trained = True

    def generate_samples(self, nsamples):
        """
        Samples synthetic data records from the fitted generative distribution

        :param nsamples: int: Number of synthetic records to generate
        :return: synData: DataFrame: A synthetic dataset
        """
        assert self.trained, "Model must first be fitted to some data."

        logger.debug(f'Generate synthetic dataset of size {nsamples}')
        synData = self.synthesiser.sample(nsamples)

        return synData
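A minimal usage sketch for this wrapper; the expected `metadata` format depends on the underlying CTGANSynthesizer version, and `df` is assumed to be a pandas DataFrame:

model = CTGAN(metadata)
model.fit(df)
synthetic = model.generate_samples(1000)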
Example #5
import time

import numpy as np
from ctgan import CTGANSynthesizer


def run_cgan(X_S, x_T, n_samples):
    # stack source and target samples into a single training matrix
    X_train = np.vstack([X_S, x_T])
    ctgan = CTGANSynthesizer(embedding_dim=128,
                             gen_dim=(256, 256),
                             dis_dim=(256, 256),
                             l2scale=1e-6,
                             batch_size=500)
    ts = time.time()
    ctgan.fit(X_train, epochs=300)
    Z = ctgan.sample(n_samples)
    # print(Z[0].tolist())
    # return None
    run_train = time.time() - ts

    return Z, run_train
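A quick usage sketch with random data (the array shapes are illustrative assumptions):

X_S = np.random.rand(500, 8)   # source-domain samples (hypothetical shape)
x_T = np.random.rand(100, 8)   # target-domain samples (hypothetical shape)
Z, train_seconds = run_cgan(X_S, x_T, n_samples=1000)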
Example #6
import pandas as pd
from ctgan import CTGANSynthesizer


def augment_ctgan_regression(csvfile):
    data = pd.read_csv(csvfile)
    ctgan = CTGANSynthesizer()
    ctgan.fit(data, epochs=10)  # 15
    percent_generated = 1
    df_gen = ctgan.sample(int(len(data) * percent_generated))
    print('augmented with %s samples' % (str(len(df_gen))))
    print(df_gen)
    # now write the augmented data to a new CSV file
    newfile1 = 'augmented_' + csvfile
    df_gen.to_csv(newfile1, index=False)
    # now combine augmented and regular dataset
    data2 = pd.read_csv(newfile1)
    frames = [data, data2]
    result = pd.concat(frames)
    newfile2 = 'augmented_combined_' + csvfile
    result.to_csv(newfile2, index=False)
    return [csvfile, newfile1, newfile2]
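Calling it returns the three file names (the input file name is a placeholder):

original, augmented, combined = augment_ctgan_regression('data.csv')  # 'data.csv' is hypothetical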
Example #7
class CTGAN(GenerativeModel):
    """A conditional generative adversarial network for tabular data"""
    def __init__(self, metadata,
                 embedding_dim=128, gen_dim=(256, 256),
                 dis_dim=(256, 256), l2scale=1e-6,
                 batch_size=500, epochs=300,
                 multiprocess=False):

        self.synthesiser = CTGANSynthesizer(embedding_dim, gen_dim, dis_dim,
                                            l2scale, batch_size, epochs)

        self.metadata = metadata
        self.datatype = DataFrame

        self.multiprocess = bool(multiprocess)

        self.infer_ranges = True
        self.trained = False

        self.__name__ = 'CTGAN'

    def fit(self, data):
        """Train a generative adversarial network on tabular data.
        Input data is assumed to be of shape (n_samples, n_features)
        See https://github.com/DAI-Lab/SDGym for details"""
        assert isinstance(data, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}'

        LOGGER.debug(f'Start fitting {self.__class__.__name__} to data of shape {data.shape}...')
        self.synthesiser.fit(data, self.metadata)

        LOGGER.debug('Finished fitting')
        self.trained = True

    def generate_samples(self, nsamples):
        """Generate random samples from the fitted Gaussian distribution"""
        assert self.trained, "Model must first be fitted to some data."

        LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')
        synthetic_data = self.synthesiser.sample(nsamples)

        return synthetic_data
Example #8
def main():

    dataset = 'diabetes'
    epochs = 300
    train_size = 0.7

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)
    D = get_dataset(dataset, path_dataset, normalize=None)

    X_train, y_train = D['X_train'], D['y_train']
    X_test, y_test = D['X_test'], D['y_test']
    # n_classes = D['n_classes']
    n_features = D['n_features']
    feature_names = D['feature_names']
    class_name = D['class_name']

    le = LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)

    Xy_train = np.hstack((X_train, y_train.reshape(-1, 1)))

    print(datetime.datetime.now(), 'Training CTGAN')
    ctgan = CTGANSynthesizer(embedding_dim=128,
                             gen_dim=(256, 256),
                             dis_dim=(256, 256),
                             l2scale=1e-6,
                             batch_size=500)
    ts = time.time()
    # the appended label is the last column, i.e. index n_features (0-based)
    ctgan.fit(Xy_train, epochs=epochs, discrete_columns=[n_features])
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(Xy_train)

    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    Xy_fake = ctgan.sample(n_fake_instances)
    cgan_gen_time = time.time() - ts

    # print('F 0', np.mean(Xy_fake[:, 0]), np.min(Xy_fake[:,0]), np.max(Xy_fake[:,0]))
    # print('F 1', np.mean(Xy_fake[:, 1]), np.min(Xy_fake[:, 1]), np.max(Xy_fake[:, 1]))
    #
    # print('R 0', np.mean(X_train[:, 0]), np.min(X_train[:, 0]), np.max(X_train[:, 0]))
    # print('R 1', np.mean(X_train[:, 1]), np.min(X_train[:, 1]), np.max(X_train[:, 1]))
    # return -1

    print(datetime.datetime.now(), 'Storing synthetic data')
    df = pd.DataFrame(data=Xy_fake, columns=feature_names + [class_name])
    df.to_csv(path_syht_dataset + '%s.csv' % dataset, index=False)

    X_fake = Xy_fake[:, :-1]
    X_real = X_train

    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))

    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])

    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    res_dict = dict()

    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(
            clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))

        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(),
              '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'tabular.json')
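`clf_list`, the `path_*` variables, and `store_result` are module-level globals not shown in this example. A plausible sketch of `clf_list` (the concrete models are an assumption):

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

clf_list = {
    'RF': RandomForestClassifier(),
    'LR': LogisticRegression(max_iter=1000),
}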
Example #9
def main(_):
    data, meta = read_data(FLAGS.data, FLAGS.meta)
    model = CTGANSynthesizer(epochs=FLAGS.max_epoch)
    model.fit(data, meta['discrete_columns'], tuple())
    data_syn = model.sample(FLAGS.sample)
    write_data(data_syn, meta, FLAGS.output)
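`FLAGS` and the `read_data`/`write_data` helpers are defined elsewhere. A sketch of the flag definitions this entry point implies, assuming the standard `absl` flags module (defaults are illustrative):

from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('data', None, 'Path to the training data')
flags.DEFINE_string('meta', None, 'Path to the metadata file')
flags.DEFINE_string('output', None, 'Path for the synthetic output')
flags.DEFINE_integer('max_epoch', 300, 'Maximum number of training epochs')
flags.DEFINE_integer('sample', 1000, 'Number of synthetic rows to generate')

if __name__ == '__main__':
    app.run(main)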
Example #10
import pandas as pd
import pandas_profiling  # registers DataFrame.profile_report (assumed dependency)
import torch
from ctgan import CTGANSynthesizer

data_cancer = pd.read_csv("../../data/diabetes/raw.csv")

data_diab = data_cancer[50000:100000]  # (unused in the rest of the script)
good_data_cancer = data_cancer.dropna()

good_data_cancer.profile_report().to_file("data_diab.html")
# exit()  # NOTE: in the original script this early exit made everything below unreachable
ctgan = CTGANSynthesizer()
ctgan.fit(good_data_cancer, [
    'race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty',
    'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
    'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
    'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
    'diabetesMed', 'readmitted'
])

torch.save(ctgan, '../../models/diabetes/ctgan-diabetes-50k-1')

samples = ctgan.sample(300)

print(samples)

print(samples.describe())

#samples.profile_report().to_file("sample_cancer.html")

#print(samples)
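Because the fitted synthesizer is persisted with `torch.save` above, it can later be restored with the matching `torch.load` call:

ctgan = torch.load('../../models/diabetes/ctgan-diabetes-50k-1')
more_samples = ctgan.sample(300)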
Example #11
    def add_synthetic(self, method='random', apply={}):
        # `apply` maps class label -> number of synthetic samples to add per class

        ################# random
        if method == 'random':
            sys.stdout.write("\r")
            print('\n###########\nadding random synthetic samples ..\n###########\n')
            print('\n{}\n'.format(apply))
            
            for i in tqdm(apply.keys()):
                noise_factor     = self.X_train[self.y_train_l == i].mean() * 0.001
                totalcount       = 0
                max_shape        = self.X_train[self.y_train_l == i].shape[0] + apply[i]
                for xx in range(3):
                    set_shape = self.X_train[self.y_train_l == i].shape[0]
                    if set_shape < max_shape:
                        howManyTimes = round(math.log(max_shape / set_shape)) + 1
                        for j in range(howManyTimes):
                            totalcount += 1
                            rareEventX   = self.X_train[self.y_train_l == i].copy()
                            rareEventY   = self.y_train_l[self.y_train_l == i].copy()

                            noisyRareEvent = rareEventX + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=rareEventX.shape)

                            if rareEventX.shape[0] + noisyRareEvent.shape[0] > max_shape:
                                will_be_subtracted = (rareEventX.shape[0] + noisyRareEvent.shape[0]) - max_shape
                                new_shape = noisyRareEvent.shape[0] - will_be_subtracted

                                self.X_train         = np.concatenate((self.X_train, noisyRareEvent[:new_shape]), axis=0)
                                self.y_train_l       = np.concatenate((self.y_train_l, rareEventY[:new_shape]), axis=0)
                            else:
                                self.X_train         = np.concatenate((self.X_train, noisyRareEvent), axis=0)
                                self.y_train_l       = np.concatenate((self.y_train_l, rareEventY), axis=0)
                                
                            self.y_train = tf.keras.utils.to_categorical(self.y_train_l)

                            print(self.all_labels[i], self.X_train[self.y_train_l == i].shape, " {}.th generation with {} noise".format(totalcount, noise_factor))
                            noise_factor *= 0.5
                        
        ################# smoteenn
        elif method == 'smoteenn':
            print('\n###########\nadding smoteenn synthetic samples ..\n###########\n')
            print('\n{}\n'.format(apply))
            
            competitors = apply.keys()
            sampling_strategy = {}
            for i in competitors:
                sampling_strategy[i] = int(self.y_train_l[self.y_train_l == i].shape[0] + apply[i]*1.5) 
            
            smote_enn = SMOTEENN(random_state=0, n_jobs=64, 
                                 sampling_strategy=sampling_strategy,
                                 enn=imblearn.under_sampling.EditedNearestNeighbours(sampling_strategy='auto', n_neighbors=3)
                                )
            

            filters = [(self.y_train_l == i) for i in competitors]
                
            X_resampled, y_resampled = smote_enn.fit_resample(self.X_train_l[np.logical_or.reduce(filters)], 
                                                              self.y_train_l[np.logical_or.reduce(filters)])            
            
            # remove selected classes from dataset
            filters = [(self.y_train_l != i) for i in apply.keys()]
            
            previous_shapes = {}
            for i in apply.keys():
                previous_shapes[i] = self.y_train_l[self.y_train_l == i].shape[0]
            
            self.X_train_l = self.X_train_l[np.logical_and.reduce(filters)]
            self.y_train_l = self.y_train_l[np.logical_and.reduce(filters)]
            
            
            for i in apply.keys():
                self.X_train_l = np.concatenate([
                    self.X_train_l, 
                    X_resampled[y_resampled == i][:previous_shapes[i] + apply[i]]
                ], axis=0)
                self.y_train_l = np.concatenate([
                    self.y_train_l, 
                    y_resampled[y_resampled == i][:previous_shapes[i] + apply[i]]
                ], axis=0)

                
            self.X_train                  = self.X_train_l.reshape(self.X_train_l.shape[0], 1, self.X_train_l.shape[1])
            self.y_train                  = tf.keras.utils.to_categorical(self.y_train_l)
            
        elif method == 'ctgan':
            print('\n###########\nadding ctgan synthetic samples ..\n###########\n')
            print('\n{}\n'.format(apply))
            for i in tqdm(apply.keys()):
                # scale the GAN batch size with the class size; check the larger
                # threshold first (in the original the >300 branch was unreachable)
                n_class = self.y_train_l[self.y_train_l == i].shape[0]
                gan_batch_size = 10
                if n_class > 300:
                    gan_batch_size = 50
                elif n_class > 100:
                    gan_batch_size = 20
                
                gan = CTGANSynthesizer(batch_size=gan_batch_size)
                gan.fit(self.X_train_l[self.y_train_l == i], epochs=100)
                
                # generate samples
                generated = gan.sample(apply[i])
                
                self.X_train_l = np.concatenate([self.X_train_l, generated], axis=0)
                self.X_train   = self.X_train_l.reshape(self.X_train_l.shape[0], 1, self.X_train_l.shape[1])
                
                self.y_train_l = np.concatenate([self.y_train_l, np.ones(shape=(apply[i],)) * i], axis=0)
                
                self.y_train = tf.keras.utils.to_categorical(self.y_train_l)
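A usage sketch for this method, assuming it belongs to a trainer-style object that already holds `X_train`, `y_train_l`, and friends (the class labels and counts are illustrative):

# add 500 CTGAN-generated samples for class 2 and 250 for class 7 (hypothetical labels)
trainer.add_synthetic(method='ctgan', apply={2: 500, 7: 250})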
Example #12
    # (fragment: `samples`, `district`, and `population_counts` come from earlier, unshown code)
    # Split combined caste religion into caste and religion
    caste_religion = pd.DataFrame(samples.CasteReligion.str.split(' ',
                                                                  1).tolist(),
                                  columns=['Religion', 'Caste'])
    samples = pd.concat([samples, caste_religion], axis=1)
    del samples['CasteReligion']
    gc.collect()

    # Add district name in the generated data
    samples['District'] = district
    print("ADDING JOB COLUMNS...")
    # Generate job columns
    job_generator = CTGANSynthesizer()
    job_generator = job_generator.load("job.pkl")
    jobs = job_generator.sample(population_counts)

    # Split Job into JobLabel and JobID
    job_label_id = pd.DataFrame(jobs.Job.str.rsplit(' ', 1).tolist(),
                                columns=['JobLabel', 'JobID'])
    samples = pd.concat([samples, job_label_id], axis=1)
    gc.collect()

    #samples['Job']

    # Essential worker columns list
    print("Adding essential workers...")
    essential_list = [
        'Police', 'Sweepers', 'Sales, shop', 'Shopkeepers', 'Boilermen',
        'Nursing', 'Journalists', 'Electrical', 'Food', 'Physicians',
        'Mail distributors', 'Loaders', 'Village officials', 'Govt officials',
Example #13
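This snippet assumes that `df`, `git_root_dir`, and the DP synthesizer classes are already in scope; plausible scaffolding (the synthesizer classes ship with the smartnoise SDK, whose import paths vary by release, so treat them as an assumption):

import os

import pandas as pd

# PytorchDPSynthesizer, DPGAN, and GeneralTransformer come from the
# smartnoise SDK; the exact import path depends on the installed release.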

print("Starting DPGAN...")
dpgan = PytorchDPSynthesizer(DPGAN(), GeneralTransformer(), epsilon=1)
dpgan.fit(df, categorical_columns=['sex','educ','race','married'], verbose=True)
synth_data = dpgan.sample(len(df))  # df.size counts cells (rows * cols); len(df) is the row count
s = synth_data.corr()
d = df.corr()

print("Save and reload...")
dpgan.save(os.path.join(git_root_dir, os.path.join("saved_models","dpgan.ckpt")))

newInstance = PytorchDPSynthesizer(DPGAN(), GeneralTransformer(), epsilon=1)
newInstance.load(os.path.join(git_root_dir, os.path.join("saved_models","dpgan.ckpt")))

newInstance.fit(df,categorical_columns=['sex','educ','race','married'], update_epsilon=2, verbose=True)
synth_data = newInstance.sample(len(df))
s = synth_data.corr()
d = df.corr()
a2 = d.subtract(s)

print("Starting CTGAN...")
from ctgan import CTGANSynthesizer
ctgan = CTGANSynthesizer()
ctgan.fit(df, ['sex','educ','race','married'], epochs=10)
synth_data = ctgan.sample(len(df))
s = synth_data.corr()
d = df.corr()
#print(d.subtract(s))
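To reduce the two correlation matrices to a single similarity score, one might summarize the absolute differences (a sketch, not part of the original script):

diff = d.subtract(s)
print(diff.abs().mean().mean())  # mean absolute difference between real and synthetic correlations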