Example 1
import numpy as np

def sampling_data_for_dynamic(x_ben, x_van, sampling_ratio, neg_label):
    n_samples_train = int(len(x_ben) * sampling_ratio)
    n_samples_test = len(x_ben) - n_samples_train
    assert n_samples_test <= x_van.shape[0]
    x_ben, x_van = sample_shuffle(x_ben), sample_shuffle(x_van)
    x_train = x_ben[0:n_samples_train]
    x_test = np.array(x_ben[-n_samples_test:].tolist() +
                      x_van[-n_samples_test:].tolist())
    y_test_GAN = np.ones(2 * n_samples_test)  # benign half labelled 1
    y_test_GAN[n_samples_test:] = neg_label   # vandal half labelled neg_label
    return x_train, x_test, y_test_GAN
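
A minimal usage sketch with hypothetical data; sample_shuffle is assumed to be a simple random permutation, as in the project's utils module:

import numpy as np

def sample_shuffle(x):
    # Assumed behaviour: return the samples in a random order.
    return np.asarray(x)[np.random.permutation(len(x))]

x_ben = np.random.rand(100, 8)  # hypothetical benign feature vectors
x_van = np.random.rand(60, 8)   # hypothetical vandal feature vectors

x_train, x_test, y_test_GAN = sampling_data_for_dynamic(
    x_ben, x_van, sampling_ratio=0.7, neg_label=0)
print(x_train.shape)        # (70, 8): benign-only training set
print(x_test.shape)         # (60, 8): 30 benign followed by 30 vandal samples
print(np.unique(y_test_GAN))  # [0. 1.]
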
Example 2
import numpy as np

def sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len):

    assert train_ratio < 1.
    n_samples_train = int(x_ben.shape[0] * train_ratio)

    assert n_samples_train <= x_van.shape[0]
    x_train = sample_shuffle(x_ben)[0:n_samples_train].tolist() + \
              sample_shuffle(x_van)[0:n_samples_train].tolist()
    x_train = sample_shuffle(np.array(x_train))
    # Construct the per-timestep sample weights for the LSTM-AE output.
    weights = get_sample_weights(x_train)
    return seq_padding(x_train, max_len, 'pre'), \
           seq_padding(x_ben, max_len, 'pre'), \
           seq_padding(x_van, max_len, 'pre'), \
           seq_padding(weights, max_len, 'post') # 'post' for weights sequence
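
get_sample_weights and seq_padding are project-specific helpers; judging only from how they are called, a plausible sketch of both (the implementations below are assumptions, not the repository's code):

import numpy as np
from keras.preprocessing.sequence import pad_sequences

def get_sample_weights(x_train):
    # Assumed: weight 1.0 per real timestep, so padded steps can later be
    # masked out of the LSTM-AE reconstruction loss.
    return [np.ones(len(seq)) for seq in x_train]

def seq_padding(x, max_len, mode):
    # Assumed: thin wrapper over Keras padding; 'pre' pads at the front
    # of each sequence, 'post' at the end.
    return pad_sequences(x, maxlen=max_len, dtype='float', padding=mode)

This matches Example 7 below, where the inputs are padded 'pre' while the sample weights are padded 'post' via pad_sequences directly.
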
Example 3
def sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len):
    n_samples_train = int(x_ben.shape[0] * train_ratio)
    # x_train = sample_shuffle(x_ben)[0:n_samples_train]   # shuffle and sampling data
    x_ben = sample_shuffle(x_ben)
    x_van = sample_shuffle(x_van)
    x_train = x_ben[0:n_samples_train]
    # Construct the per-timestep sample weights for the LSTM-AE output.
    weights = get_sample_weights(x_train)

    # List comprehensions keep Python 2's eager map() behaviour under Python 3.
    return seq_padding(x_train, max_len, 'pre'), \
           seq_padding(x_ben, max_len, 'pre'), \
           seq_padding(x_van, max_len, 'pre'), \
           seq_padding(weights, max_len, 'post'), \
           [len(x) for x in x_ben], \
           [len(x) for x in x_van]  # original (unpadded) sequence lengths
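
For intuition about the 'pre'/'post' padding modes, here is a toy run of Keras's pad_sequences (which Example 7 below also calls directly):

from keras.preprocessing.sequence import pad_sequences

batch = [[1, 2, 3], [4, 5]]
print(pad_sequences(batch, maxlen=4, padding='pre'))
# [[0 1 2 3]
#  [0 0 4 5]]
print(pad_sequences(batch, maxlen=4, padding='post'))
# [[1 2 3 0]
#  [4 5 0 0]]
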
Example 4
import numpy as np

def sampling_data_for_OCC(x_ben, x_van, sampling_ratio, neg_label1, neg_label2,
                          en_ae):
    n_samples_train = int(len(x_ben) * sampling_ratio)
    if en_ae == 1:
        n_samples_test = len(x_ben) - n_samples_train
    else:
        n_samples_test = len(x_van)
    # n_samples_train = int(x_ben.shape[0] * sampling_ratio)
    # n_samples_test = x_ben.shape[0] - n_samples_train
    # assert n_samples_test <= x_van.shape[0]
    # assert n_samples_test <= len(x_van)
    x_ben, x_van = sample_shuffle(x_ben), sample_shuffle(x_van)
    x_train = x_ben[0:n_samples_train]
    x_test = np.array(x_ben[-n_samples_test:].tolist() +
                      x_van[-n_samples_test:].tolist())
    y_train_OCC = np.ones(n_samples_train)    # one-class training: all benign
    y_test_OCC = np.ones(2 * n_samples_test)
    y_test_OCC[n_samples_test:] = neg_label1  # vandal half, OCC label convention
    y_test_GAN = np.ones(2 * n_samples_test)
    y_test_GAN[n_samples_test:] = neg_label2  # vandal half, GAN label convention
    return x_train, x_test, \
           y_train_OCC, y_test_OCC, y_test_GAN
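
A usage sketch with hypothetical data (sample_shuffle as assumed in Example 1; en_ae=1 selects the wiki-style split also used in Example 5):

import numpy as np

x_ben = np.random.rand(100, 8)
x_van = np.random.rand(50, 8)

x_train, x_test, y_train_OCC, y_test_OCC, y_test_GAN = \
    sampling_data_for_OCC(x_ben, x_van, sampling_ratio=0.7,
                          neg_label1=-1, neg_label2=0, en_ae=1)
print(y_train_OCC.shape)      # (70,): benign-only one-class training labels
print(np.unique(y_test_OCC))  # [-1.  1.]
print(np.unique(y_test_GAN))  # [0. 1.]
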
Example 5
# try_num = sys.argv[1:]
# Load data and preprocess.
# (load_data, getDataCCFD and sample_shuffle are project helper functions.)
import os
import numpy as np

en_ae = 1  # 1: wiki; 2: credit card with autoencoding; 3: credit card without autoencoding.
dra_tra_pro = True  # True: observe the training process across epochs; False: train once, then test.

if en_ae == 1:
    samples_path = os.getcwd() + "\\..\\..\\sampleData\\"
    f_ben, f_van = "X_v8_4_50_Ben", "X_v8_4_50_Van"
    x_ben, x_van = load_data(samples_path, f_ben, f_van)
    input_dim = 8
    hid_dim = [200]
    d_in = [200]
    epochs = 150
elif en_ae == 2:
    x_ben, x_van = getDataCCFD("creditcard.csv.zip")
    x_ben = sample_shuffle(x_ben)[0:2000]
    input_dim = 30
    hid_dim = [100]
    d_in = [50]  # with autoencoding.
    epochs = 200
else:
    x_ben, x_van = getDataCCFD("creditcard.csv.zip")
    x_ben = sample_shuffle(x_ben)[0:2000]
    input_dim = 30
    d_in = [input_dim]  # without autoencoding.
    epochs = 200

train_ratio = .7
max_len = 50
time_step = max_len
g_in = [50]
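
The hard-coded "\\" separators in the samples_path assignment above make the script Windows-only; a portable equivalent (keeping the trailing separator, since load_data presumably concatenates path and file name) might be:

import os

samples_path = os.path.join(os.getcwd(), "..", "..", "sampleData") + os.sep
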
Example 6

import numpy as np
from sklearn.preprocessing import StandardScaler
from numpy.random import multivariate_normal
from representation_libs import db_span, get_eps, cluster_analyis, DB_statistics
import json
from utils import sample_shuffle
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from scipy.spatial import distance

from matplotlib.axes import Axes

x_ben = np.load("ben_hid_repre.npy")
x_van = np.load("van_hid_repre.npy")

x_fake = sample_shuffle(np.load("x_fake.npy"))[0:len(x_van)]

X = np.concatenate((x_ben, x_van, x_fake))
y = np.concatenate(
    (np.ones(len(x_ben)), np.zeros(len(x_van)), np.ones(len(x_fake)) + 1))
# class labels: 1 = benign, 0 = vandal, 2 = generated (fake)
eps_X = get_eps(X)

clusters, outlier = db_span(X, 1.4305, 180)

# clusters, outlier = db_span(X, eps_X * .48, 180)
# print("eps:", eps_X * .48)

cluster_X = list()
cluster_y = list()
cluster_c = list()
for cluster_id, class_ids in clusters.items():
    pass  # loop body truncated in this excerpt
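
db_span is a project helper; assuming it behaves like standard DBSCAN with (eps, min_samples) and returns a cluster-id -> sample-indices mapping plus the outliers, an equivalent scan with scikit-learn would look like this:

import numpy as np
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=1.4305, min_samples=180).fit(X)
labels = db.labels_  # -1 marks outliers
clusters = {cid: np.where(labels == cid)[0]
            for cid in set(labels) if cid != -1}
outlier = np.where(labels == -1)[0]
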
Example 7
import os
import numpy as np
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from utils import sample_shuffle
# Autoencoder is the project's LSTM autoencoder wrapper class.

def gen_hid_repre(fea_dim, hid_dim, fix_or_var, step_length):
    """
	:param fea_dim: input dimension of LSTM-AE model
	:param hid_dim: output dimension of hidden representation
	:param fix_or_var:  editing sequence is fixed-length or variant-length.
	:return: fixed-length hidden representation of editing sequence.
	"""
    base_path = os.getcwd()
    samples_path = base_path + "\\sampleData\\"
    repre_path = base_path + "\\hidden_representation\\"
    if not os.path.exists(repre_path):
        os.makedirs(repre_path)

    if fix_or_var == 1:
        # Load data
        x_ben = np.load(samples_path + "X_%s_1_20_Ben.npy" % fea_dim)
        x_van = np.load(samples_path + "X_%s_1_20_Van.npy" % fea_dim)
        # print(x_ben.shape, x_van.shape)
        # exit(0)
        x_ben = sample_shuffle(x_ben)[0:6000]
        x_van = sample_shuffle(x_van)[0:3000]
        train_ben = x_ben[0:3000]

        # Fit Model
        timesteps = 20
        input_dim = fea_dim

        autoencoder = Autoencoder()
        autoencoder.model('lstm', [timesteps, input_dim], hid_dim)
        autoencoder.compile()
        autoencoder.fit(train_ben, "rev")

        hidModel = Sequential()
        hidModel.add(autoencoder.model.layers[0])
        hidModel.add(autoencoder.model.layers[1])

        ben_hid_emd = hidModel.predict(x_ben)
        van_hid_emd = hidModel.predict(x_van)

        # store data
        np.save(repre_path + "ben_hid_emd_20_%s_%s" % (fea_dim, hid_dim[0]),
                ben_hid_emd)
        np.save(repre_path + "van_hid_emd_20_%s_%s" % (fea_dim, hid_dim[0]),
                van_hid_emd)

    elif fix_or_var == 0:
        if step_length == 20:
            x_ben = np.load(samples_path + "X_%s_1_20_Ben.npy" % fea_dim)
            x_van = np.load(samples_path + "X_%s_1_20_Van.npy" % fea_dim)
            x_ben = sample_shuffle(x_ben)  # 16496
            x_van = sample_shuffle(x_van)  # 17015
            # train_ben = np.concatenate((x_ben[0:10000], x_van[0:10000])) # mix samples for baseline 'latent representation.'
            train_ben = x_ben[0:10000]

            sampleWeights = list()
            for e in train_ben:
                sampleWeights.append(np.ones(len(e)))

            train_ben_P = pad_sequences(train_ben, maxlen=20, dtype='float')
            x_ben_P = pad_sequences(x_ben, maxlen=20, dtype='float')
            x_van_P = pad_sequences(x_van, maxlen=20, dtype='float')

            # decoding sequence is reversed
            sampleWeights = pad_sequences(sampleWeights,
                                          maxlen=20,
                                          dtype='float',
                                          padding='post')

            timesteps = 20
            input_dim = fea_dim
            autoencoder = Autoencoder()
            autoencoder.modelMasking('lstm', [timesteps, input_dim], hid_dim)
            autoencoder.compile('temporal')
            autoencoder.fit(train_ben_P, 'rev', sampleWeights)

            hidModel = Sequential()
            hidModel.add(autoencoder.model.layers[0])
            hidModel.add(autoencoder.model.layers[1])
            hidModel.add(autoencoder.model.layers[2])

            ben_hid_emd = hidModel.predict(x_ben_P)
            van_hid_emd = hidModel.predict(x_van_P)

            # store data
            # np.save(repre_path + "ben_hid_emd_mix_1_20_%s_%s" % (fea_dim, hid_dim[0]), ben_hid_emd)
            # np.save(repre_path + "val_hid_emd_mix_1_20_%s_%s" % (fea_dim, hid_dim[0]), van_hid_emd)

        elif step_length == 50:

            x_ben = np.load(samples_path + "X_v%s_4_50_Ben.npy" % fea_dim)
            x_van = np.load(samples_path + "X_v%s_4_50_Van.npy" % fea_dim)
            x_ben = sample_shuffle(x_ben)
            x_van = sample_shuffle(x_van)
            train_ben = x_ben[0:7000]

            sampleWeights = list()
            for e in train_ben:
                sampleWeights.append(np.ones(len(e)))

            train_ben_P = pad_sequences(train_ben, maxlen=50, dtype='float')
            x_ben_P = pad_sequences(x_ben, maxlen=50, dtype='float')
            x_van_P = pad_sequences(x_van, maxlen=50, dtype='float')

            # decoding sequence is reversed
            sampleWeights = pad_sequences(sampleWeights,
                                          maxlen=50,
                                          dtype='float',
                                          padding='post')

            timesteps = 50
            input_dim = fea_dim
            autoencoder = Autoencoder()
            autoencoder.modelMasking('lstm', [timesteps, input_dim], hid_dim)
            autoencoder.compile('temporal')
            autoencoder.fit(train_ben_P, 'rev', sampleWeights)

            hidModel = Sequential()
            hidModel.add(autoencoder.model.layers[0])
            hidModel.add(autoencoder.model.layers[1])
            hidModel.add(autoencoder.model.layers[2])

            ben_hid_emd = hidModel.predict(x_ben_P)
            van_hid_emd = hidModel.predict(x_van_P)

    return ben_hid_emd, van_hid_emd
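
A usage sketch matching the en_ae == 1 configuration of Example 5 (8 input features, a 200-unit hidden layer, variable-length sequences padded to 50 timesteps; assumes the .npy sample files exist under ./sampleData/):

ben_hid_emd, van_hid_emd = gen_hid_repre(
    fea_dim=8, hid_dim=[200], fix_or_var=0, step_length=50)
print(ben_hid_emd.shape, van_hid_emd.shape)
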