def sampling_data_for_dynamic(x_ben, x_van, sampling_ratio, neg_label):
    """Split benign data into train/test and build a balanced, labelled test set.

    :param x_ben: benign samples (array-like supporting ``len`` and slicing).
    :param x_van: second-class samples; must expose ``.shape[0]`` and hold at
        least as many rows as the test split needs.
    :param sampling_ratio: fraction of ``x_ben`` used for training.
    :param neg_label: label written for the ``x_van`` half of the test set.
    :return: ``(x_train, x_test, y_test_GAN)`` where ``x_test`` is the benign
        test rows followed by the same number of ``x_van`` rows, and
        ``y_test_GAN`` is 1 for the first half and ``neg_label`` for the second.
    """
    n_samples_train = int(len(x_ben) * sampling_ratio)
    n_samples_test = len(x_ben) - n_samples_train
    assert n_samples_test <= x_van.shape[0]

    # Shuffle order (benign first, then the second class) is kept as-is.
    x_ben, x_van = sample_shuffle(x_ben), sample_shuffle(x_van)
    x_train = x_ben[0:n_samples_train]

    # Take the last n_samples_test rows through explicit start indices:
    # ``arr[-0:]`` would return the WHOLE array when the test split is empty,
    # leaving x_test out of sync with the (empty) label vector.
    ben_start = max(len(x_ben) - n_samples_test, 0)
    van_start = max(len(x_van) - n_samples_test, 0)
    x_test = np.array(x_ben[ben_start:].tolist() + x_van[van_start:].tolist())

    y_test_GAN = np.ones(2 * n_samples_test)
    y_test_GAN[n_samples_test:] = neg_label
    return x_train, x_test, y_test_GAN
def sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len):
    """Build a mixed, shuffled LSTM-AE training set and pad every sequence set.

    An equal number of rows is drawn from a shuffled copy of each class,
    merged, and shuffled again to form the training set.

    :param x_ben: benign sequences (needs ``.shape[0]``).
    :param x_van: second-class sequences (needs ``.shape[0]``).
    :param train_ratio: fraction of ``x_ben`` drawn per class; must be < 1.
    :param max_len: target length handed to ``seq_padding``.
    :return: ``(train, ben, van, weights)``; the first three are padded with
        ``'pre'``, the weights with ``'post'``. ``ben`` and ``van`` are the
        inputs as passed in (not the shuffled copies).
    """
    assert train_ratio < 1.
    per_class = int(x_ben.shape[0] * train_ratio)
    assert per_class <= x_van.shape[0]

    ben_rows = sample_shuffle(x_ben)[0:per_class].tolist()
    van_rows = sample_shuffle(x_van)[0:per_class].tolist()
    mixed_train = sample_shuffle(np.array(ben_rows + van_rows))

    # Sample weights for the LSTM-AE output.
    train_weights = get_sample_weights(mixed_train)

    padded_train = seq_padding(mixed_train, max_len, 'pre')
    padded_ben = seq_padding(x_ben, max_len, 'pre')
    padded_van = seq_padding(x_van, max_len, 'pre')
    # The weight sequences are the only ones padded at the end ('post').
    padded_weights = seq_padding(train_weights, max_len, 'post')
    return padded_train, padded_ben, padded_van, padded_weights
def sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len):
    """Shuffle both classes, take a benign-only training split, and pad everything.

    :param x_ben: benign sequences (needs ``.shape[0]``).
    :param x_van: second-class sequences.
    :param train_ratio: fraction of ``x_ben`` used as the training set.
    :param max_len: target length handed to ``seq_padding``.
    :return: a 6-tuple ``(train, ben, van, weights, ben_lengths, van_lengths)``.
        ``train``, ``ben`` and ``van`` are padded with ``'pre'``, ``weights``
        with ``'post'``. ``ben`` and ``van`` are the SHUFFLED sets, and the two
        length lists give the unpadded length of each of their sequences in
        that same shuffled order.
    """
    n_samples_train = int(x_ben.shape[0] * train_ratio)

    # Shuffle first so the training prefix is a random sample of x_ben.
    x_ben = sample_shuffle(x_ben)
    x_van = sample_shuffle(x_van)
    x_train = x_ben[0:n_samples_train]

    # Sample weights for the LSTM-AE output.
    weights = get_sample_weights(x_train)

    # Lengths are returned as real lists: on Python 3 ``map`` yields a one-shot
    # iterator that cannot be indexed or walked twice, whereas a list behaves
    # the same on Python 2 and 3.
    return seq_padding(x_train, max_len, 'pre'), \
        seq_padding(x_ben, max_len, 'pre'), \
        seq_padding(x_van, max_len, 'pre'), \
        seq_padding(weights, max_len, 'post'), \
        [len(seq) for seq in x_ben], \
        [len(seq) for seq in x_van]
def sampling_data_for_OCC(x_ben, x_van, sampling_ratio, neg_label1,
                          neg_label2, en_ae):
    """Split data for one-class classification and build two test label vectors.

    :param x_ben: benign samples.
    :param x_van: second-class samples.
    :param sampling_ratio: fraction of ``x_ben`` used for training.
    :param neg_label1: label of the ``x_van`` half in ``y_test_OCC``.
    :param neg_label2: label of the ``x_van`` half in ``y_test_GAN``.
    :param en_ae: when equal to 1 the test size is the benign remainder
        (``len(x_ben) - n_train``); otherwise it is ``len(x_van)``.
    :return: ``(x_train, x_test, y_train_OCC, y_test_OCC, y_test_GAN)``.
        ``y_train_OCC`` is all ones; both test label vectors are 1 for the
        first ``n_samples_test`` entries and the given negative label after.
    """
    n_samples_train = int(len(x_ben) * sampling_ratio)
    if en_ae == 1:
        n_samples_test = len(x_ben) - n_samples_train
    else:
        n_samples_test = len(x_van)

    x_ben, x_van = sample_shuffle(x_ben), sample_shuffle(x_van)
    x_train = x_ben[0:n_samples_train]

    # Take the last n_samples_test rows through explicit start indices:
    # ``arr[-0:]`` would return the WHOLE array when n_samples_test is 0,
    # leaving x_test out of sync with the (empty) label vectors.
    # NOTE(review): nothing checks that each class holds n_samples_test rows;
    # if one is shorter, all of its rows are used and the labels below no
    # longer line up with x_test -- confirm callers guarantee the sizes.
    ben_start = max(len(x_ben) - n_samples_test, 0)
    van_start = max(len(x_van) - n_samples_test, 0)
    x_test = np.array(x_ben[ben_start:].tolist() + x_van[van_start:].tolist())

    y_train_OCC = np.ones(n_samples_train)

    y_test_OCC = np.ones(2 * n_samples_test)
    y_test_OCC[n_samples_test:] = neg_label1

    y_test_GAN = np.ones(2 * n_samples_test)
    y_test_GAN[n_samples_test:] = neg_label2

    return x_train, x_test, \
        y_train_OCC, y_test_OCC, y_test_GAN
# try_num = sys.argv[1:]

# ---- Experiment configuration and data loading ----
# Dataset selector:
#   1 -> wiki
#   2 -> credit card with encoding
#   3 -> credit card without encoding
en_ae = 1
# Observe the training process along epochs, or run training then test it.
dra_tra_pro = True

if en_ae == 1:
    input_dim = 8
    hid_dim = [200]
    d_in = [200]
    epochs = 150
    f_ben, f_van = "X_v8_4_50_Ben", "X_v8_4_50_Van"
    samples_path = os.getcwd() + "\\..\\..\\sampleData\\"
    x_ben, x_van = load_data(samples_path, f_ben, f_van)
else:
    # Both credit-card settings share the data source and the 2000-row
    # benign subsample; they differ only in the layer sizes below.
    x_ben, x_van = getDataCCFD("creditcard.csv.zip")
    x_ben = sample_shuffle(x_ben)[0:2000]
    input_dim = 30
    epochs = 200
    if en_ae == 2:
        # With autoencoding.
        hid_dim = [100]
        d_in = [50]
    else:
        # Without autoencoding; hid_dim is intentionally left undefined here,
        # as in the original script.
        d_in = [input_dim]

# Settings common to every dataset.
train_ratio = .7
max_len = 50
time_step = max_len
g_in = [50]
# Script: cluster benign, second-class and generated ("fake") hidden
# representations together.
# NOTE(review): `np` is used below but no numpy import is visible in these
# lines -- presumably imported elsewhere in the file; confirm.
from sklearn.preprocessing import StandardScaler
from numpy.random import multivariate_normal
from representation_libs import db_span, get_eps, cluster_analyis, DB_statistics
import json
from utils import sample_shuffle
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from scipy.spatial import distance
from matplotlib.axes import Axes

# Load the saved representations from the current working directory.
x_ben = np.load("ben_hid_repre.npy")
x_van = np.load("van_hid_repre.npy")
# Shuffle the generated samples and keep only as many as there are x_van rows.
x_fake = sample_shuffle(np.load("x_fake.npy"))[0:len(x_van)]

# Stack the three groups; labels are 1 for x_ben, 0 for x_van, 2 for x_fake.
X = np.concatenate((x_ben, x_van, x_fake))
y = np.concatenate(
    (np.ones(len(x_ben)), np.zeros(len(x_van)), np.ones(len(x_fake)) + 1))

# eps_X is computed but the active db_span call below uses the hard-coded
# value 1.4305 instead (the eps_X-based call is commented out).
eps_X = get_eps(X)
# presumably db_span(data, eps, min_samples) is a DBSCAN-style clustering --
# confirm against representation_libs.
clusters, outlier = db_span(X, 1.4305, 180)
# clusters, outlier = db_span(X, eps_X*.48, 180)
# print "eps: ", eps_X*.48

# Per-cluster accumulators filled by the loop below.
cluster_X = list()
cluster_y = list()
cluster_c = list()
# `clusters` is iterated as a mapping of cluster id -> class ids.
for cluster_id, class_ids in clusters.items():
def _stack_first_layers(autoencoder, n_layers):
    """Return a Sequential model made of the autoencoder's first n_layers layers."""
    hid_model = Sequential()
    for layer_idx in range(n_layers):
        hid_model.add(autoencoder.model.layers[layer_idx])
    return hid_model


def _variable_length_hid_repre(x_ben, x_van, n_train, timesteps, fea_dim,
                               hid_dim):
    """Train the masking LSTM-AE on padded benign data and encode both classes."""
    x_ben = sample_shuffle(x_ben)
    x_van = sample_shuffle(x_van)
    train_ben = x_ben[0:n_train]

    # One weight of 1 per real time step of every training sequence.
    sample_weights = [np.ones(len(seq)) for seq in train_ben]

    train_ben_p = pad_sequences(train_ben, maxlen=timesteps, dtype='float')
    x_ben_p = pad_sequences(x_ben, maxlen=timesteps, dtype='float')
    x_van_p = pad_sequences(x_van, maxlen=timesteps, dtype='float')
    # Decoding sequence is reversed, hence the weights are padded 'post'
    # while the inputs above use pad_sequences' default padding.
    sample_weights = pad_sequences(sample_weights, maxlen=timesteps,
                                   dtype='float', padding='post')

    autoencoder = Autoencoder()
    autoencoder.modelMasking('lstm', [timesteps, fea_dim], hid_dim)
    autoencoder.compile('temporal')
    autoencoder.fit(train_ben_p, 'rev', sample_weights)

    # The masking model contributes one extra leading layer, so the encoder
    # is taken as the first three layers instead of two.
    hid_model = _stack_first_layers(autoencoder, 3)
    return hid_model.predict(x_ben_p), hid_model.predict(x_van_p)


def gen_hid_repre(fea_dim, hid_dim, fix_or_var, step_length):
    """
    Train an LSTM autoencoder on benign sequences and encode both classes.

    :param fea_dim: input dimension of LSTM-AE model (also selects the sample
        files to load).
    :param hid_dim: output dimension of hidden representation; indexed as
        ``hid_dim[0]`` when naming the saved files.
    :param fix_or_var: editing sequence is fixed-length (1) or
        variant-length (0).
    :param step_length: maximum sequence length for the variant-length case;
        only 20 and 50 are supported. Ignored when ``fix_or_var`` is 1.
    :return: ``(ben_hid_emd, van_hid_emd)``, the fixed-length hidden
        representation of the benign and second-class editing sequences.
    :raises ValueError: if ``fix_or_var`` or ``step_length`` is unsupported.
        Previously such values fell through to the ``return`` and failed with
        an ``UnboundLocalError`` after the output directory was created.
    """
    # Fail fast on unsupported settings before touching the file system.
    if fix_or_var not in (0, 1):
        raise ValueError(
            "fix_or_var must be 0 (variant-length) or 1 (fixed-length), "
            "got %r" % (fix_or_var,))
    if fix_or_var == 0 and step_length not in (20, 50):
        raise ValueError(
            "step_length must be 20 or 50 for variant-length sequences, "
            "got %r" % (step_length,))

    base_path = os.getcwd()
    samples_path = base_path + "\\sampleData\\"
    repre_path = base_path + "\\hidden_representation\\"
    if not os.path.exists(repre_path):
        os.makedirs(repre_path)

    if fix_or_var == 1:
        # Load data
        x_ben = np.load(samples_path + "X_%s_1_20_Ben.npy" % fea_dim)
        x_van = np.load(samples_path + "X_%s_1_20_Van.npy" % fea_dim)
        x_ben = sample_shuffle(x_ben)[0:6000]
        x_van = sample_shuffle(x_van)[0:3000]
        train_ben = x_ben[0:3000]

        # Fit Model
        timesteps = 20
        autoencoder = Autoencoder()
        autoencoder.model('lstm', [timesteps, fea_dim], hid_dim)
        autoencoder.compile()
        autoencoder.fit(train_ben, "rev")

        hid_model = _stack_first_layers(autoencoder, 2)
        ben_hid_emd = hid_model.predict(x_ben)
        van_hid_emd = hid_model.predict(x_van)

        # Store data (only the fixed-length case persists its embeddings).
        np.save(repre_path + "ben_hid_emd_20_%s_%s" % (fea_dim, hid_dim[0]),
                ben_hid_emd)
        np.save(repre_path + "van_hid_emd_20_%s_%s" % (fea_dim, hid_dim[0]),
                van_hid_emd)
    elif step_length == 20:
        x_ben = np.load(samples_path + "X_%s_1_20_Ben.npy" % fea_dim)
        x_van = np.load(samples_path + "X_%s_1_20_Van.npy" % fea_dim)
        ben_hid_emd, van_hid_emd = _variable_length_hid_repre(
            x_ben, x_van, 10000, 20, fea_dim, hid_dim)
    else:
        # step_length == 50 (validated above).
        x_ben = np.load(samples_path + "X_v%s_4_50_Ben.npy" % fea_dim)
        x_van = np.load(samples_path + "X_v%s_4_50_Van.npy" % fea_dim)
        ben_hid_emd, van_hid_emd = _variable_length_hid_repre(
            x_ben, x_van, 7000, 50, fea_dim, hid_dim)

    return ben_hid_emd, van_hid_emd