def main():
    dataset = 'italypower'
    train_size = 1000

    D = get_dataset(dataset, path_dataset)
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    data_type = D['data_type']

    if data_type == 'txt':
        X_train, X_test = D['X_train_txt'], D['X_test_txt']

    # Subsample the training set to at most `train_size` records.
    if len(X_train) > train_size:
        idx = np.random.choice(len(X_train), size=train_size, replace=False)
        X_train = X_train[idx]

    # Wrap the raw data in the record type matching the data modality.
    idx = 0
    if data_type == 'tab':
        X_train_dt = [TabularRecord(x) for x in X_train]
        x_dt = TabularRecord(X_test[idx])
    elif data_type == 'ts':
        window_shape = D['window_sizes'][0]
        step = D['window_steps'][0]
        X_train_dt = [TimeSeriesRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = TimeSeriesRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'img':
        window_shape = D['window_sizes'][2]
        step = D['window_steps'][2][0]
        X_train_dt = [ImageRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = ImageRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'txt':
        window_shape = 3
        step = 1
        X_train_dt = [TextRecord(x, window_shape=window_shape, step=step, text_length=100)
                      for x in X_train]
        x_dt = TextRecord(X_test[idx], window_shape=window_shape, step=step, text_length=100)
    else:
        raise ValueError('Unknown data type %s' % data_type)

    Z = dang_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5,
                                     neighgen_op=neighgen_operators, base=None)
    # Z = rand_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5)
    # Z = supp_neighborhood_generation(x_dt, 0.0, n_samples=1000, indpb=0.5)
    # Z = norm_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5)

    # Compare the training mean against the neighborhood mean (Z holds record
    # objects, so average their underlying arrays rather than the records).
    print(len(Z), np.mean(X_train), np.mean([z.data for z in Z]))

    # Plot the instance to explain (thick line) against a few neighbors.
    plt.plot(x_dt.data.tolist(), lw=5)
    for i in range(10):
        plt.plot(Z[i * 2].data.tolist())
    plt.show()
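# Hedged illustration of what `window_shape` and `step` presumably control in
# TimeSeriesRecord: a plain sliding-window split using numpy's own helper, not
# the repo's actual implementation.
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

series = np.arange(24, dtype=float)
windows = sliding_window_view(series, window_shape=5)[::2]  # window 5, step 2
print(windows.shape)  # (10, 5)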
def main():
    method = 'dang'
    for dataset in tab_datasets:
        print(datetime.datetime.now(), 'Dataset: %s' % dataset)
        D = get_dataset(dataset, path_dataset, normalize)
        run_experiment(D, dataset, method)
        print('')
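# Minimal entry-point guard, assumed rather than taken from the original file,
# so the driver above runs when the script is executed directly:
if __name__ == '__main__':
    main()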
def main():
    dataset = 'diabetes'
    epochs = 300
    train_size = 0.7

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)
    D = get_dataset(dataset, path_dataset, normalize=None)
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    # n_classes = D['n_classes']
    n_features = D['n_features']
    feature_names = D['feature_names']
    class_name = D['class_name']

    le = LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)

    # Append the encoded label as the last column (index n_features).
    Xy_train = np.hstack((X_train, y_train.reshape(-1, 1)))

    print(datetime.datetime.now(), 'Training CTGAN')
    ctgan = CTGANSynthesizer(embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256),
                             l2scale=1e-6, batch_size=500)
    ts = time.time()
    # The class label sits at column index n_features; the original code passed
    # n_features + 1, which points one past the last column.
    ctgan.fit(Xy_train, epochs=epochs, discrete_columns=[n_features])
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(Xy_train)
    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    Xy_fake = ctgan.sample(n_fake_instances)
    cgan_gen_time = time.time() - ts

    # Debug: compare fake vs. real column statistics.
    # print('F 0', np.mean(Xy_fake[:, 0]), np.min(Xy_fake[:, 0]), np.max(Xy_fake[:, 0]))
    # print('R 0', np.mean(X_train[:, 0]), np.min(X_train[:, 0]), np.max(X_train[:, 0]))

    print(datetime.datetime.now(), 'Storing synthetic data')
    df = pd.DataFrame(data=Xy_fake, columns=feature_names + [class_name])
    df.to_csv(path_syht_dataset + '%s.csv' % dataset, index=False)

    # Build a real-vs-fake discrimination task: label 1 for real rows, 0 for fake.
    X_fake = Xy_fake[:, :-1]
    X_real = X_train
    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))
    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])
    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    res_dict = dict()
    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))

        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(), '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'tabular.json')
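# `clf_list` is defined elsewhere in the repo; a hypothetical configuration for
# the real-vs-fake discriminators, assuming scikit-learn estimators:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

clf_list_example = {
    'RF': RandomForestClassifier(n_estimators=100),
    'LR': LogisticRegression(max_iter=1000),
}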
def main():
    dataset = 'wdbc'
    train_size = 1000

    D = get_dataset(dataset, path_dataset, normalize)
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    data_type = D['data_type']

    if data_type == 'txt':
        X_train, X_test = D['X_train_txt'], D['X_test_txt']

    # Subsample the training set to at most `train_size` records.
    if len(X_train) > train_size:
        idx = np.random.choice(len(X_train), size=train_size, replace=False)
        X_train = X_train[idx]

    print(dataset, X_train.shape, data_type)

    # Wrap the raw data in the record type matching the data modality.
    idx = 0
    if data_type == 'tab':
        X_train_dt = [TabularRecord(x) for x in X_train]
        x_dt = TabularRecord(X_test[idx])
    elif data_type == 'ts':
        window_shape = D['window_sizes'][0]
        step = D['window_steps'][0]
        X_train_dt = [TimeSeriesRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = TimeSeriesRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'img':
        window_shape = D['window_sizes'][2]
        step = D['window_steps'][2][0]
        X_train_dt = [ImageRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = ImageRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'txt':
        window_shape = 3
        step = 1
        X_train_dt = [TextRecord(x, window_shape=window_shape, step=step, text_length=100)
                      for x in X_train]
        x_dt = TextRecord(X_test[idx], window_shape=window_shape, step=step, text_length=100)
    else:
        raise ValueError('Unknown data type %s' % data_type)

    Z = dang_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5,
                                     neighgen_op=neighgen_operators, base=None)

    print(x_dt.data)
    print('----')
    for i in range(10):
        print(Z[i])
        print('----')
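# A quick, hedged sanity check of a generated neighborhood: how far the mean of
# the neighbors drifts from the training mean. `neighborhood_drift` is a
# hypothetical helper, assuming each record exposes a numeric `.data` array.
def neighborhood_drift(Z, X_train):
    z_mean = np.mean([np.asarray(z.data, dtype=float).mean() for z in Z])
    return abs(z_mean - np.asarray(X_train, dtype=float).mean())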
def main():
    # dataset = 'wdbc'
    # dataset = 'italypower'
    # dataset = 'mnist'
    dataset = '20newsgroups'

    D = get_dataset(dataset, path_dataset, normalize)
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    n_classes = D['n_classes']
    data_type = D['data_type']
    print(X_train.shape, data_type)

    if data_type == 'txt':
        X_train, X_test = D['X_train_txt'], D['X_test_txt']

    # Wrap the raw data in the record type matching the data modality.
    idx = 0
    if data_type == 'tab':
        X_train_dt = [TabularRecord(x) for x in X_train]
        x_dt = TabularRecord(X_test[idx])
    elif data_type == 'ts':
        print(data_type, D['window_sizes'][0], D['window_steps'][0])
        window_shape = D['window_sizes'][0]
        step = D['window_steps'][0]
        X_train_dt = [TimeSeriesRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = TimeSeriesRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'img':
        print(data_type)
        window_shape = (14, 14)  # D['window_sizes'][2]
        step = 7  # D['window_steps'][2][0]
        X_train_dt = [ImageRecord(x, window_shape=window_shape, step=step)
                      for x in X_train[:1000]]
        x_dt = ImageRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'txt':
        print(data_type)
        window_shape = 3
        step = 1
        X_train_dt = [TextRecord(x, window_shape=window_shape, step=step, text_length=100)
                      for x in X_train[:1000]]
        x_dt = TextRecord(X_test[idx], window_shape=window_shape, step=step, text_length=100)
    else:
        raise ValueError('Unknown data type %s' % data_type)

    # print(x_dt)
    Z = dang_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5,
                                     neighgen_op=neighgen_operators, base=None)
    print(len(Z))

    # Debug / visualization alternatives for the other modalities:
    # print(Z[3].data.shape)
    # plt.imshow(x_dt[0].data, cmap='gray'); plt.show()  # images
    # plt.imshow(Z[1].data, cmap='gray'); plt.show()
    # plt.plot(x_dt.data.tolist(), lw=5)                 # time series
    # for i in range(10):
    #     plt.plot(Z[i * 2].data.tolist())
    # plt.show()

    print(' '.join([term for term in x_dt.data if len(term) > 0]))
    print('----')
    for i in range(10):
        print(' '.join([term for term in Z[i].data if len(term) > 0]))
        print('----')
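# The "join the non-empty terms" idiom above recurs in these text demos; a
# small hypothetical helper that does the same thing:
def render_text(record):
    return ' '.join(term for term in record.data if len(term) > 0)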
def main():
    print(K.tensorflow_backend._get_available_gpus())

    dataset = 'arrowhead'
    epochs = 10100
    train_size = 0.7
    latent_dim = 32
    window = 3

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)
    D = get_dataset(dataset, path_dataset, normalize='standard')
    X_train, y_train, _, _ = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    n_timestamps = D['n_timestamps']
    n_classes = D['n_classes']

    le = LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    # y_test = le.transform(y_test)

    print(datetime.datetime.now(), 'Training CGAN')
    cigan = CGAN(n_timestamps, n_classes, latent_dim, window=window,
                 img_path=path_cgan_images + '/ts/%s_' % dataset, verbose=True)
    ts = time.time()
    cigan.fit(X_train, y_train, epochs=epochs, batch_size=32, sample_interval=100)
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(X_train)
    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    X_fake = cigan.sample(n_fake_instances)
    cgan_gen_time = time.time() - ts

    print(datetime.datetime.now(), 'Storing synthetic data')
    np.save(path_syht_dataset + '%s' % dataset, X_fake)

    X_real = X_train
    # Debug: compare fake vs. real value ranges.
    # print('F', np.mean(X_fake), np.min(X_fake), np.max(X_fake))
    # print('R', np.mean(X_real), np.min(X_real), np.max(X_real))

    # Real-vs-fake discrimination task: label 1 for real series, 0 for generated.
    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))
    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])
    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    res_dict = dict()
    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))

        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(), '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'time_series.json')
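# `normalize='standard'` presumably z-scores each timestamp; the scikit-learn
# equivalent, shown here only as a reference sketch on stand-in data:
import numpy as np
from sklearn.preprocessing import StandardScaler

X_demo = np.random.rand(100, 50)                 # stand-in (n_series, n_timestamps)
X_std = StandardScaler().fit_transform(X_demo)   # zero mean, unit variance per column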
def main():
    dataset = '20newsgroups'
    train_size = 1000
    text_length = 1000

    D = get_dataset(dataset, path_dataset, categories=['alt.atheism', 'talk.religion.misc'])
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    data_type = D['data_type']

    if data_type == 'txt':
        X_train, X_test = D['X_train_txt'], D['X_test_txt']

    # Subsample the training set to at most `train_size` records (X_train is a
    # list here, so use a set for O(1) membership tests).
    if len(X_train) > train_size:
        idx = set(np.random.choice(len(X_train), size=train_size, replace=False))
        X_train = [x for i, x in enumerate(X_train) if i in idx]

    # Wrap the raw data in the record type matching the data modality.
    idx = 0
    if data_type == 'tab':
        X_train_dt = [TabularRecord(x) for x in X_train]
        x_dt = TabularRecord(X_test[idx])
    elif data_type == 'ts':
        window_shape = D['window_sizes'][0]
        step = D['window_steps'][0]
        X_train_dt = [TimeSeriesRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = TimeSeriesRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'img':
        window_shape = D['window_sizes'][2]
        step = D['window_steps'][2][0]
        X_train_dt = [ImageRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = ImageRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'txt':
        window_shape = 3
        step = 3
        X_train_dt = [TextRecord(x, window_shape=window_shape, step=step, text_length=text_length)
                      for x in X_train]
        x_dt = TextRecord(X_test[idx], window_shape=window_shape, step=step,
                          text_length=text_length)
    else:
        raise ValueError('Unknown data type %s' % data_type)

    Z = dang_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5,
                                     neighgen_op=neighgen_operators, base=None)
    # Z = rand_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5)
    # Z = supp_neighborhood_generation(x_dt, ' ', n_samples=1000, indpb=0.5)
    # Z = mode_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5)

    print(len(Z))
    print(' '.join([term for term in x_dt.data if len(term) > 0]))
    print('----O------')
    for i in range(10):
        print(' '.join([term for term in Z[i].data if len(term) > 0]))
        print('----')
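# Hedged illustration of token windows with window_shape=3 and step=3
# (non-overlapping trigrams), mirroring what TextRecord presumably does:
tokens = 'the quick brown fox jumps over the lazy dog'.split()
trigrams = [tokens[i:i + 3] for i in range(0, len(tokens) - 2, 3)]
print(trigrams)  # [['the', 'quick', 'brown'], ['fox', 'jumps', 'over'], ['the', 'lazy', 'dog']]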
def main():
    print(K.tensorflow_backend._get_available_gpus())

    dataset = 'imdb'
    epochs = 100
    train_size = 0.7
    maxlen = 3
    step = 1

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)
    if dataset == '20newsgroups':
        categories = ['alt.atheism', 'talk.religion.misc']
    else:
        categories = None

    D = get_dataset(dataset, path_dataset, categories=categories)
    # X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    # n_classes = D['n_classes']
    # class_name = D['class_name']
    X_train_txt = D['X_train_txt']
    y_train = D['y_train']

    if dataset == 'imdb':
        X_train_txt, _, y_train, _ = train_test_split(X_train_txt, y_train,
                                                      train_size=1000, stratify=y_train)

    X_train_txt = clean_texts(X_train_txt, min_chars=10)
    text_lengths = [len(x) for x in X_train_txt]
    # X_test_txt = clean_texts(D['X_test_txt'], min_chars=10)

    # Turn the corpus into (context, next word) training pairs.
    X, y, words, words_indices, indices_words = texts2texts_nextword(
        X_train_txt, maxlen=maxlen, step=step)
    nbr_terms = len(words)

    # Seed texts used both for training snapshots and for sampling.
    start_texts_idx = np.random.choice(len(X_train_txt), min(1000, len(X_train_txt)),
                                       replace=False)
    start_texts = [X_train_txt[i] for i in start_texts_idx]

    print(datetime.datetime.now(), 'Training GAN')
    gan = GAN(nbr_terms, maxlen, latent_dim=100, term_indices=words_indices,
              indices_term=indices_words, txt_path=path_cgan_images + '/txt/%s_' % dataset,
              verbose=False, start_texts=start_texts, text_lengths=text_lengths)
    ts = time.time()
    gan.fit(X, y, epochs=epochs, batch_size=128)
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(X_train_txt)
    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    X_fake_txt = gan.sample(n=n_fake_instances, t=20, start_texts=start_texts,
                            diversity_list=[0.2, 0.5, 0.7, 1.0, 1.2])
    cgan_gen_time = time.time() - ts

    # Debug: inspect a few fake vs. real texts.
    # for f in X_fake_txt[:5]: print(f)
    # for r in X_train_txt[:5]: print(r)

    print(datetime.datetime.now(), 'Storing synthetic data')
    fout = open(path_syht_dataset + '%s.csv' % dataset, 'w')
    for txt in X_fake_txt:
        fout.write(txt + '\n')
    fout.close()

    # Real-vs-fake discrimination task: label 1 for real texts, 0 for generated.
    X_fake = X_fake_txt
    X_real = X_train_txt
    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))
    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])
    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    # Texts must be vectorized before the discriminators can be fit.
    vectorizer = TfidfVectorizer(max_features=1000, stop_words=stopwords.words('english'))
    X_rf_train = vectorizer.fit_transform(X_rf_train).toarray()
    X_rf_test = vectorizer.transform(X_rf_test).toarray()

    res_dict = dict()
    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))

        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(), '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'text.json')
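# Hedged sketch of the next-word training pairs texts2texts_nextword presumably
# builds from each text (maxlen=3, step=1): each X row is maxlen consecutive
# words and y is the word that follows. `make_nextword_pairs` is hypothetical,
# not the repo's actual helper.
def make_nextword_pairs(tokens, maxlen=3, step=1):
    X, y = [], []
    for i in range(0, len(tokens) - maxlen, step):
        X.append(tokens[i:i + maxlen])
        y.append(tokens[i + maxlen])
    return X, y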
def main():
    print(K.tensorflow_backend._get_available_gpus())

    dataset = 'cifar10'
    epochs = 20000
    train_size = 0.7
    categories = None
    latent_dim = 100

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)
    D = get_dataset(dataset, path_dataset, categories=categories)
    X_train, y_train, _, _ = D['X_train'], D['y_train'], D['X_test'], D['y_test']

    # Rescale pixels from [0, 1] to [-1, 1] for the tanh generator output.
    X_train = X_train * 255.0
    X_train = (X_train - 127.5) / 127.5

    n_classes = D['n_classes']
    # n_features = D['n_features']
    # feature_names = D['feature_names']
    # class_name = D['class_name']
    img_rows, img_cols = D['w'], D['h']
    channels = 1  # assumes get_dataset returns single-channel (grayscale) images

    le = LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    # y_test = le.transform(y_test)

    print(datetime.datetime.now(), 'Training CGAN')
    cigan = CGAN(img_rows, img_cols, channels, n_classes, latent_dim,
                 img_path=path_cgan_images + '/imgs/%s_' % dataset, verbose=True)
    ts = time.time()
    cigan.fit(X_train, y_train, epochs=epochs, batch_size=32, sample_interval=100)
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(X_train)
    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    X_fake = cigan.sample(n_fake_instances)
    cgan_gen_time = time.time() - ts

    print(datetime.datetime.now(), 'Storing synthetic data')
    np.save(path_syht_dataset + '%s' % dataset, X_fake)

    # Flatten images so the discriminators below can consume them.
    s0, s1, s2 = X_fake.shape
    X_fake = X_fake.reshape(s0, s1 * s2)
    X_real = X_train.reshape(s0, s1 * s2)
    # print(np.mean(X_fake[0]), np.min(X_fake[0]), np.max(X_fake[0]))
    # print(np.mean(X_real[0]), np.min(X_real[0]), np.max(X_real[0]))

    # Real-vs-fake discrimination task: label 1 for real images, 0 for generated.
    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))
    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])
    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    res_dict = dict()
    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))

        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(), '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'images.json')
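# The two-step rescale above (multiply by 255, then shift/scale by 127.5) maps
# [0, 1] pixels onto [-1, 1], the usual range for a tanh generator output; it
# collapses algebraically to a single line:
import numpy as np

X01 = np.random.rand(4, 32, 32)   # stand-in for pixels in [0, 1]
X_scaled = X01 * 2.0 - 1.0        # identical to (X01 * 255.0 - 127.5) / 127.5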