def initialize_dataset(self):
        labeled_data = pre.load_data('./labeled_data/hotel_corpus.tsv')
        labeled_data.extend(
            pre.load_data('./labeled_data/schedule_corpus.tsv'))
        labeled_data_matrix = [re.split('[\t\n]', s) for s in labeled_data]
        target_list = []
        morph_list = []
        only_target_list = []
        for row in labeled_data_matrix:
            if len(row) <= 1:
                target_list.append('None')
                continue
            elif row[0] == 'Speaker':
                target_list.append('None')
                continue
            if len(row) == 3:
                # Unlabeled utterance: log it and skip it so that morph_list
                # stays aligned with only_target_list for the split below.
                print(row[1])
                continue
            elif len(row) == 4:
                target_list.append(row[-1])
                only_target_list.append(row[-1])
                morph_list.append(pre.morph_data(row[1]))

        #print((morph_list[:15]))
        #all_target_dict = pre.making_all_dict(list(set(target_list)))      # dictionary of targets
        #all_target_1dlist = pre.making_target_list(target_list, all_target_dict)
        all_target_dict = pre.making_all_dict(list(
            set(only_target_list)))  # dictionary of targets
        all_target_1dlist = pre.making_target_list(only_target_list,
                                                   all_target_dict)

        #none_idx = int(all_target_dict['None'])
        target_list = all_target_1dlist
        #while target_list.count(none_idx) != 0:
        #    target_list.remove(none_idx)

        self.target_dict = pre.making_all_dict(list(set(target_list)))
        #self.target_dict = pre.making_all_dict(list(set(only_target_list)))

        #print('len of target dict', len(self.target_dict))
        self.inv_target_dict = {v: k for k, v in self.target_dict.items()}
        #for key in self.target_dict:
        #    print (key)
        #print(len(target_list), len(morph_list))
        #print(morph_list[:10])
        Train_X, Test_X, Train_Y, Test_Y = train_test_split(morph_list,
                                                            target_list,
                                                            test_size=0.2,
                                                            random_state=42)
        self.baseTrain_X = Train_X
        self.baseTrain_Y = Train_Y
        self.Test_X = Test_X
        self.Test_Y = Test_Y
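For reference, a minimal standalone sketch of the label-encoding pattern used above (what pre.making_all_dict and pre.making_target_list appear to do); the real helpers in pre may differ in detail:

# Hypothetical toy labels; not the project's corpus.
labels = ['hotel', 'schedule', 'hotel', 'greeting']

# Map each distinct label to an integer index (sorted for determinism).
label_dict = {label: idx for idx, label in enumerate(sorted(set(labels)))}
index_list = [label_dict[label] for label in labels]

# Inverse mapping, analogous to self.inv_target_dict above.
inv_label_dict = {idx: label for label, idx in label_dict.items()}

print(label_dict)         # {'greeting': 0, 'hotel': 1, 'schedule': 2}
print(index_list)         # [1, 2, 1, 0]
print(inv_label_dict[1])  # hotel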
def chatbot(txt):
    #chatbot code here 
    # Importing the dataset
    metadata, idx_q, idx_a = data_preprocessing.load_data(PATH = './')
    # Splitting the dataset into the Training set and the Test set
    (trainX, trainY), (testX, testY), (validX, validY) = data_utils_1.split_dataset(idx_q, idx_a)
    # Embedding
    xseq_len = trainX.shape[-1]
    yseq_len = trainY.shape[-1]
    batch_size = 16
    vocab_twit = metadata['idx2w']
    xvocab_size = len(metadata['idx2w'])  
    yvocab_size = xvocab_size
    emb_dim = 1024
    idx2w, w2idx, limit = data_utils_2.get_metadata()
    # Building the seq2seq model
    model = seq2seq_wrapper.Seq2Seq(xseq_len = xseq_len,
                                yseq_len = yseq_len,
                                xvocab_size = xvocab_size,
                                yvocab_size = yvocab_size,
                                ckpt_path = './weights',
                                emb_dim = emb_dim,
                                num_layers = 3)
    # Loading the weights and Running the session
    session = model.restore_last_session()
    # Getting the ChatBot predicted answer
    def respond(question):
        encoded_question = data_utils_2.encode(question, w2idx, limit['maxq'])
        answer = model.predict(session, encoded_question)[0]
        return data_utils_2.decode(answer, idx2w) 
    # Setting up the chat. A voice-driven loop (pyttsx3 + speech_recognition)
    # was sketched here originally; it is left disabled because this function
    # answers a single text query passed in as `txt`:
    #
    # while True:
    #     engine = pyttsx3.init()
    #     engine.runAndWait()
    #     try:
    #         r = sr.Recognizer()
    #         mic = sr.Microphone()
    #         with mic as source:
    #             r.adjust_for_ambient_noise(source)
    #             audio = r.listen(source)
    #             print('You :')
    #             x = r.recognize_google(audio)
    #             print(x)
    #     except Exception:
    #         continue
    #     question = x.lower()
    #     # or: question = input("You: ")
    #     if question == 'good bye':
    #         print('Ok Bye')
    #         break
    #     answer = respond(question)
    question = txt
    return respond(question)
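A minimal usage sketch for the function above, assuming the preprocessing metadata and the trained weights under ./weights are already in place:

# Single-turn query; the model and session are (re)loaded on every call.
reply = chatbot("hello, how are you?")
print("Bot:", reply)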
Example #3
def run(num_classes,learning_rate,width,depth,mini_batch_size):

	precision = accuracy = recall = f_score = np.array([])


	X_train,X_test,y_train,y_test,unknown_data = dp.load_data()
	X_train,X_test,y_train,y_test,unknown_data,dtype = dp.prepare_data(X_train,X_test,y_train,y_test,unknown_data)


	for _ in range(1):

		model = NN.Net1(num_classes,depth=depth,width=width).type(dtype)
		opt = optim.SGD(params=model.parameters(),lr=learning_rate,momentum=rp.m,nesterov=True)
		train_losses,test_losses = model.train_validate(X_train,y_train,X_test,y_test,opt,mini_batch_size,dtype)

		model = torch.load("Models/Best_Model.pkl")

		y_pred,_ = model.test(X_test)

		# Calculate metrics
		y_true = y_test.data.cpu().numpy()
		y_pred = y_pred.data.cpu().numpy()
		a,p,r,f = m.compute_metrics(y_true,y_pred)

		accuracy = np.append(accuracy,a)
		precision = np.append(precision,p)
		recall = np.append(recall,r)
		f_score = np.append(f_score,f)


	accuracy = np.mean(accuracy)
	precision = np.mean(precision)
	recall = np.mean(recall)
	f_score = np.mean(f_score)

	m.show_results(accuracy,precision,recall,f_score,num_classes,train_losses,test_losses)
	
	#g.generate_graph(model,X_train)
	
	fw.create_data_csv(learning_rate,depth,width,mini_batch_size,rp.m,len(test_losses)-10,accuracy)

	# Store unknown_data prediction 
	y_pred,_ = model.test(unknown_data)
	fw.store_prediction(y_pred.data.cpu().numpy())
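An illustrative invocation of run(); the hyperparameter values below are placeholders, not tuned settings from the project:

# Hypothetical hyperparameters for a 10-class problem.
run(num_classes=10, learning_rate=0.01, width=64, depth=3, mini_batch_size=128)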
Example #4
def load_data():
    # Importing the dataset
    metadata, idx_q, idx_a = data_preprocessing.load_data(PATH='./')

    # Splitting the dataset into the Training set and the Test set
    (trainX, trainY), (testX, testY), (validX, validY) = data_utils_1.split_dataset(idx_q, idx_a)

    # Embedding
    xseq_len = trainX.shape[-1]
    yseq_len = trainY.shape[-1]
    batch_size = 16
    vocab_twit = metadata['idx2w']
    xvocab_size = len(metadata['idx2w'])
    yvocab_size = xvocab_size
    emb_dim = 1024
    idx2w, w2idx, limit = data_utils_2.get_metadata()

    return (xseq_len, yseq_len, xvocab_size, yvocab_size, emb_dim)
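A usage sketch showing how the returned tuple might be consumed; the actual values depend on the prepared corpus:

xseq_len, yseq_len, xvocab_size, yvocab_size, emb_dim = load_data()
print(xseq_len, yseq_len, xvocab_size, yvocab_size, emb_dim)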
Example #5
def generate_anomalies():
    """
    Run data generation process end-to-end.

    Parameters
    ----------
    None.

    Returns
    -------
    None.
    """

    device = 'cpu'
    if torch.cuda.device_count() > 0 and torch.cuda.is_available():
        print("Cuda installed. Running on GPU")
        device = 'cuda'
    else:
        print("No GPU available!")
        # Fall through and run on the CPU.

    # Importing and cleaning data
    full_data_frame = dp.load_data("data/full_dataset.csv")
    print("Raw dataset loaded into memory")

    # Only tags that we need
    lpc_data = full_data_frame[lpc_tags]

    # Remove data corresponding to offline behaviour
    online_data = dp.remove_offline_data(lpc_data,
                                         min_temp=80,
                                         max_ctrl_pres=58,
                                         window=24)
    print("Offline data removed")

    # Remove features that are deemed invalid - anti-const condition enforced
    clean_data = dp.remove_invalid_features(online_data,
                                            max_gap_len=12,
                                            max_nan_percent=1,
                                            min_unique_vals=2000,
                                            min_variance=1,
                                            max_const_len=144)
    print("Invalid features removed")

    # Interpolate dataframe
    clean_data.interpolate(method='linear', axis=0, inplace=True)
    print("Missing data interpolated")

    # Find deltas from interpolated data
    delta_data = dp.calculate_deltas(clean_data)
    print("Deltas calculated")

    # Normalise (Standardise dataset to ~N(0, 1))
    normalised_data = dp.normalise_dataset(delta_data)
    print("Data normalised")

    # Save final dataset
    dp.save_data(normalised_data, "./generation_data.csv")
    print("Data pre-processing complete")

    data_frame = dp.load_data("./generation_data.csv")
    print("Data loaded into memory")

    dataset = data_frame.to_numpy()
    tag = 27
    dataset = dataset[:, [tag, -1]]
    clipped_data = gsf.clip_distribution(dataset)

    # Ordered indexes of trips in dataset
    anomaly_indexes = np.array(
        [10634, 36136, 57280, 57618, 60545, 63144, 118665, 128524, 131118])
    anomaly_indexes = gsf.convert_indexes(anomaly_indexes, dataset)

    seq_length = 144
    stride = 72

    real_loader = gsf.create_static_loader(clipped_data, 32)

    anomaly_loader, normal_loader, train_loader, valid_loader, train_valid_loader, \
        test_loader, valid_index, test_index  = \
        gsf.create_mode_loaders(dataset, anomaly_indexes, seq_length, stride)

    latent_size = 20
    num_cycles = 1000

    leakyrelu_gain = init.calculate_gain('leaky_relu')

    D = gml.Discriminator(seq_length).to(device)
    gml.init_weights(D, leakyrelu_gain)

    G = gml.Generator(seq_length, latent_size).to(device)
    gml.init_weights(G, leakyrelu_gain)

    gml.train_lsgan(D, G, real_loader, 72, stride, latent_size, num_cycles,
                    tag)
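As a rough standalone illustration of the "deltas then standardise" idea above, using plain pandas; the project's dp.calculate_deltas and dp.normalise_dataset may work differently:

import pandas as pd

# Toy single-feature frame standing in for the cleaned sensor data.
frame = pd.DataFrame({'pressure': [10.0, 10.5, 11.5, 11.0]})

deltas = frame.diff().dropna()                        # first differences between samples
normalised = (deltas - deltas.mean()) / deltas.std()  # standardise each column to ~N(0, 1)
print(normalised)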
Example #6
File: main.py Project: rjgsousa/TEM
def TL(
        source, target = None,
        path = '../gen_patches/dataset_noisy/', retrain = False, retrain_ft_layers = [1,1,1,1,1,1],
        outputfolder='backup',
        outputfolderres='backup_res',
        batchsize = 1000,
        sourcemodelspath = './'
):
    
    """
    TO DO: FINISH DOCUMENTATION
    """

    options = {
        'sourcemodelspath'  : sourcemodelspath,
        'outputfolder'      : outputfolder,
        'outputfolderres'   : outputfolderres,
        'verbose'           : 0,
        'viewdata'          : False,
        'trainsize'         : 0.6,
        'patchsize'         : 20,
        'measure'           : 'acc',
        'weight'            : 200,
        'datanormalize'     : True,
        # ---------- one-class learning
        'replicate'         : False,
        'oneclass'          : False,
        # ---------- source problem params
        'database_source'   : 'db2',
        'resolution_source' : source,
        'nclasses_source'   : 2, # TODO: do this automatically
        # ---------- target problem params
        'database_target'   : 'db2',
        'resolution_target' : target,
        # ---------- TL hyperparams
        'retrain'           : retrain,
        'retrain_ft_layers' : retrain_ft_layers,
        # ---------- hyperparams
        'nruns'             : 20,
        'folds'             : 3,
        'hlayers'           : [len(retrain_ft_layers) / 2],    # X hidden + 1 log layer
        'nneurons'          : [ 1000],     # range(500, 1001, 250),
        'pretraining_epochs': [ 1000],     # [200]
        'training_epochs'   : [ 3000],    # [100]
        'pretrain_lr'       : [ 0.01, 0.001],   #[ 0.01, 0.001],
        'finetune_lr'       : [ 0.1 , 0.01],  #[ 0.1, 0.01],
        'threshold'         : [0.8], #[ 0.5 , 0.6, 0.8], #numpy.arange(.5,1.01,.1),
        'batchsize'         : [ batchsize], #[100] or [1000] depending on the size of the dataset. 
        # ---------- end of hyperparams
        'corruptlevels'     : [0.1], #numpy.arange(0.1, 0.4, 0.1)
    }
    
    print >> sys.stderr, (options), "\n"

    # -------------------------------------------------------------------------------
    datasetpath = path
    # print argv
    # print datasetpath
    # print retrain_ft_layers
    # alaallala

    # -------------------------------------------------------------------------------
    # load dataset
    if options['retrain'] == 1:
        options['database']   = options['database_target']
        options['resolution'] = options['resolution_target']
    else:
        options['database']   = options['database_source']
        options['resolution'] = options['resolution_source']

    (dataset, ndim, nclasses)   = load_data( datasetpath, options )
    options['ndim']     = ndim
    options['nclasses'] = nclasses

    # --------------------------------------------------------------------------------------------
    for nrun in range(1,options['nruns']+1):
        print >> sys.stderr, ("### {0:03d} of {1:03d}".format(nrun,options['nruns']))
        options['numpy_rng']  = numpy.random.RandomState(nrun)
        options['theano_rng'] = RandomStreams(seed=nrun)

        # --------------
        # generate folds
        folds = gen_folds( dataset, options, nrun )    
        # continue
        
        if options['retrain'] == 1:
            filename = "{0:s}/{1:05d}_{2:03d}_model.pkl.gz".format(options['sourcemodelspath'], nrun,
                                                              string.atoi(options['resolution_source']))
            print >> sys.stderr, ":: Loading model {0:s}...\n".format(filename)
            sda_reuse_model = load_savedgzdata ( filename )

            #print sda_reuse_model.logLayer.W.get_value()
            #print sda_reuse_model.logLayer.W.get_value()
            #kkk
            
        else:
            sda_reuse_model = None

        # ----------------------------------------------------------------------------
        results = do_experiment( folds, options, nrun, sda_reuse_model )
        # ----------------------------------------------------------------------------
    
        # --------------------------------------------------
        filename = '{0:s}/res_{1:05d}_{2:03d}.pkl.gz'.format(options['outputfolderres'],nrun,string.atoi(options['resolution']))
        save_results(filename,results)
Example #7
def train_model(num_epochs, batch_size, learning_rate):
    """Trains a neural network for image classification from the SVHN
    dataset, and creates a plot giving training/testing accuracy as a
    function of batch number.

    Parameters:
    num_epochs -- Number of training epochs
    batch_size -- Number of examples in each training batch
    learning_rate -- Initial learning rate
    """
    # Get data:
    train_X_orig, train_y_orig, _, _ = data_preprocessing.load_data()
    train_X_norm = data_preprocessing.normalize(train_X_orig)
    train_X, valid_X, train_y, valid_y = data_preprocessing.split(
        train_X_norm, train_y_orig)
    # One-hot encode so they can be used for input/validation:
    train_y_cat = keras.utils.to_categorical(train_y, num_classes=10)
    valid_y_cat = keras.utils.to_categorical(valid_y, num_classes=10)
    
    # Build & compile model:
    model = graph_construction.get_keras_model()
    sgd = SGD(lr=learning_rate, decay=1e-5, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    # Train:
    history = BatchHistory(valid_X, valid_y_cat)    
    model.fit(train_X,
              train_y_cat,
              epochs=num_epochs,
              batch_size=batch_size,
              callbacks=[history],
              validation_data=(valid_X, valid_y_cat))
    # The callback slows things down a bit, but I'm not sure of a good
    # way around it.  If I were testing only on specific batches of
    # validation data, it might be less of an issue.

    fname_base = "model_{0}_{1}_{2}_rgb".format(learning_rate, num_epochs,
                                            batch_size)
    model.save_weights("{0}.h5".format(fname_base))

    # Plot training & validation accuracy, and loss (not called for,
    # but useful):
    b = [i["batch"] for i in history.history]
    plt.plot(b, [i["acc"] for i in history.history])
    plt.plot(b, [i["val_acc"] for i in history.history])
    plt.ylabel('Accuracy')
    plt.xlabel('Batch')
    plt.legend(['Training', 'Validation'], loc='lower right')
    plt.savefig("{0}_accuracy.png".format(fname_base))
    plt.show()
    plt.close()
    
    plt.plot(b, [i["loss"] for i in history.history])
    plt.plot(b, [i["val_loss"] for i in history.history])
    plt.ylabel('Loss (categorical cross-entropy)')
    plt.xlabel('Batch')
    plt.legend(['Training', 'Validation'], loc='lower right')
    plt.savefig("{0}_loss.png".format(fname_base))
    plt.close()
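An illustrative call; the hyperparameters are placeholders rather than values from the original experiments:

train_model(num_epochs=10, batch_size=128, learning_rate=0.01)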
Example #8
def run_training():
    """Train net model."""
    #Get the sets of data
    data_set = dp.extract_data(FLAGS.data_dir, FLAGS.label_dir, FLAGS.neighbor)
    train_data, train_label, test_data, test_label = dp.load_data(data_set, FLAGS.ratio)
    print('original train label:-----------------------------------------')
    print(train_label)
    print('original test label:------------------------------------------')
    print(test_label)
    print('train label length: ' + str(len(train_label)) + ', train data length: ' + str(len(train_data)))
    print('test label length:' + str(len(test_label)) + ', test data length: ' + str(len(test_data)))
    #transform int label into one-hot values
    print('train: ')
    train_label = dp.onehot_label(train_label, NUM_CLASSES)
    print('test: ')
    test_label = dp.onehot_label(test_label, NUM_CLASSES)
    print('one hot train label:-----------------------------------------')
    print(train_label)
    print('one hot test label:------------------------------------------')
    print(test_label)
    print('max train_data: ' + str(np.max(train_data)))
    print('min train_data: ' + str(np.min(train_data)))

    with tf.Graph().as_default():
        #Generate placeholders
        data_placeholder, label_placeholder = placeholder_inputs(FLAGS.batch_size)
        #Build a Graph that computes predictions from the inference model
        softmax, fc_weights, softmax_weights, fc = inference(data_placeholder, FLAGS.fc_uints)
        #Add to the Graph the Ops for loss calculation
        loss_entroy = loss(softmax, label_placeholder)
        #Add to the Graph the Ops that calculate and apply gradients
        train_op = training(loss_entroy, FLAGS.learning_rate)
        #Add the Op to compare the predictions to the labels
        correct = acc(softmax, label_placeholder)
        #Build the summary operation based on the TF collection of Summaries
        summary_op = tf.summary.merge_all()
        #Add the variable initializer Op
        init = tf.global_variables_initializer()
        #Create a saver for writing training checkpoints
        saver = tf.train.Saver()

        #Create a session for training
        sess = tf.Session()
        #Instantiate a SummaryWriter to output summaries and the Graph
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        #Run the Op to initialize the variables
        sess.run(init)

        #Start the training loop
        for step in range(FLAGS.max_steps):
            start_time = time.time()

            feed_dict = fill_feed_dict(step, train_data, train_label, data_placeholder, label_placeholder)

            #Run one step of the model
            _, loss_value, softmax_value, fc_weights_value, softmax_weights_value, fc_output = sess.run([train_op, loss_entroy, softmax, fc_weights, softmax_weights, fc], feed_dict = feed_dict)

            #print('input ************************************')
            #print(train_data[1000])
            #print('label ************************************')
            #print(train_label)
            #print('fc weights *******************************')
            #print(fc_weights_value[0])
            #print('fc output ********************************')
            #print(fc_output[0])
            #print('softmax weights **************************')
            #print(softmax_weights_value[0])
            #print('softmax ****************************')
            #print(softmax_value[0])

            duration = time.time() - start_time

            #Write the summaries and print an overview fairly often
            if step % 100 == 0:
                #Print status to stdout
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                #Update the events file
                summary_str = sess.run(summary_op, feed_dict = feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            #Save a checkpoint and evaluate the model periodically
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.train_dir, 'checkpoint')
                saver.save(sess, checkpoint_file, global_step = step)
                #Evaluate against the data set
                print('Training Data Eval:')
                do_eval(sess, correct, data_placeholder, label_placeholder, train_data, train_label)
                print('Test Data Eval:')
                do_eval(sess, correct, data_placeholder, label_placeholder, test_data, test_label)
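A minimal standalone sketch of the int-label to one-hot conversion that dp.onehot_label performs above, assuming 0-indexed integer labels; the real helper may differ:

import numpy as np

labels = np.array([0, 2, 1])   # toy labels
one_hot = np.eye(3)[labels]    # one row per label, one column per class
print(one_hot)
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]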
Example #9
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import data_preprocessing
import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense
data = data_preprocessing.load_data(download=False)
new_data = data_preprocessing.convert2onehot(data)

#convert DataFrame to Numpy
new_data = new_data.values.astype(np.float32)

print(new_data.shape)
# the data needs to be shuffled before training
np.random.shuffle(new_data)

# after shuffling, split all of the data
# set the index at which to split
sep = int(0.7 * len(new_data))
train_data = new_data[:sep]  #70% training data
test_data = new_data[sep:]  #30% test data
print(new_data.shape)
x_train = train_data[:, :21]
y_train = train_data[:, 21:]
x_test, y_test = test_data[:, :21], test_data[:, 21:]
print(x_train.shape)
print(y_train.shape)
print(x_train.shape[0:1])
print("1111111111111111", x_train.shape[0:1])
model = Sequential()
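The snippet stops after creating the Sequential model. One plausible way to finish it (not the original author's architecture): a small fully connected classifier over the 21 input features, with one softmax unit per one-hot label column:

model.add(Dense(128, input_dim=21, activation='relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=32,
          validation_data=(x_test, y_test))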
Example #10
batch_size = 512  # mini-batch
keep_prob = 1  # dropout keep probability (1 = no dropout)
l2reg = 0  # l2
lstm_sizes = [256]  # lstm dimension
fc_size = 256  # layer_size
max_epochs = 50

# ---- other parameters
max_sent_len = 30
class_num = 2
show_step = 20
data_path = './data/'

# ================== data prepare =================
#loading data from data_preprocessing
X_final, y_final, X_final_dev, y_final_dev, X_final_test, y_final_test = dp.load_data(
    data_path + "creditcard.csv")
# print(len(train_x))

encoder = LabelBinarizer()

one_hot_train_y = encoder.fit_transform(y_final)
one_hot_dev_y = encoder.fit_transform(y_final_dev)
one_hot_final_test_y = encoder.fit_transform(y_final_test)
# one_hot_train_y = list(one_hot_train_y)
# print(len(one_hot_train_y))
# print(type(one_hot_train_y))
# print(one_hot_train_y)

change_train_y = []
change_dev_y = []
change_final_test_y = []
Example #11
"""
@author: Md Rashad Al Hasan Rony

"""

from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from data_preprocessing import load_data, process_data, split_data

# Parameters
MAX_DOC_LENGTH = 25
BATCH_SIZE = 256
EPOCHS = 5

# Loading train and test dataset
x_train, y_train = load_data("train_data.csv", sample_ratio=0.1)
x_test, y_test = load_data("test_data.csv", sample_ratio=0.1)

# Data preprocessing
x_train, x_test, _, n_vocab = process_data(x_train, x_test, MAX_DOC_LENGTH)

# Splitting dataset
x_test, x_val, y_test, y_val, _, test_size = split_data(x_test, y_test, 0.1)

# Model for training
model = Sequential()
model.add(Embedding((n_vocab + 1), 15, input_length=MAX_DOC_LENGTH))
model.add(Bidirectional(LSTM(15)))
model.add(Dropout(0.5))
model.add(Dense(15, activation='sigmoid'))
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
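A typical way this example might continue (not part of the original snippet): fit on the prepared arrays and report test accuracy, assuming the label arrays returned by load_data/split_data line up with the 15-unit sigmoid output layer:

model.fit(x_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=(x_val, y_val))
loss, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print("test accuracy:", acc)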
Example #12
import data_preprocessing
import model
import tensorflow as tf
import numpy as np



# data loading (pickle)
dataset_detection_video, classlbl_to_classid = data_preprocessing.load_data()



# ====== GRID SEARCH TRAINING=========

frame_batch = [15]
lstm = [32]
relu = [16]

for i in lstm:
	for j in relu:
		for k in frame_batch:

			print(str(i)+'-'+str(j)+'-'+str(k))
			
			#features
			coo, feat_type = data_preprocessing.cooccurrence(dataset_detection_video, k)

			coint, feat_type = data_preprocessing.cointersection(dataset_detection_video, k)

			for index, video in enumerate(coint):
				video['sequence'] = np.concatenate((video['sequence'], coo[index]['sequence']),axis=1)
Example #13
    prediction = clf.predict(testing_set[list(dataset.columns[1:])])
    return clf, prediction, testing_set["Severity"]


# Train and return a svm model, predictions, and actual values
def train_svm(data):
    training_set = data.copy()
    testing_set = pd.DataFrame(columns=list(data.columns))
    for i in range(int(training_set.index.size / 10000)):
        j = random.randrange(0, training_set.index.size)
        testing_set = testing_set.append(training_set.loc[j])
        testing_set = testing_set.reset_index(drop=True)
        training_set = training_set.drop(j)
        training_set = training_set.reset_index(drop=True)
    samples = training_set[list(training_set.columns[1:])]
    targets = training_set["Severity"]

    svm_model = svm.SVC()
    svm_model.fit(samples.iloc[0:100], targets.iloc[0:100])
    return svm_model, svm_model.predict(testing_set[list(dataset.columns[1:])]), testing_set["Severity"]


dataset = dp.load_data("modified1.csv")
features = list(dataset.columns[1:])
classes = dataset["Severity"]
attributes = dataset[features]
pca_scatter(classes, attributes)
display_histogram(attributes)
display_histogram(pd.DataFrame(classes))
train_decision_tree(dataset)
Example #14
def train_model(num_epochs, batch_size):
    """Trains model, outputs train and validation accuracy during training, has a early stopping
	   if validation error increases, outputs test accuracy on unseen dataset.

	   params:

	   :num_epochs: This is the number of cycles the model should be trained on

	   :batch_size: Batch training is being used to perform optimization. Therefore the size of the batch
	   must be specified. It cannot be too large or there will be an out-of-memory error. Generally 
	   20-50 is a good size for the current setup

	   return: nothing.

	   """

    #load data
    num_classes = 10
    images_train, labels_train = load_data('train_32x32.mat')
    tr_data = normalize(images_train)
    tr_labels = one_hot_encode_labels(labels_train, num_classes)

    images_test, labels_test = load_data('test_32x32.mat')
    te_data = normalize(images_test)
    te_labels = one_hot_encode_labels(labels_test, num_classes)

    #split test set into validation set and test set 50/50
    num_feat_val_set = 500
    val_data, val_labels = te_data[:num_feat_val_set], te_labels[:num_feat_val_set]
    testing_data_and_labels = generate_data(te_data[num_feat_val_set:],
                                            te_labels[num_feat_val_set:])

    x, y_, model, train_op, accuracy, keep_prob = classifier()

    #save model functionality
    saver = tf.train.Saver()

    #variable used to determine whether to exit the training phase due to overfitting
    exit = False

    #start interactive matplotlib session
    plt.ion()

    with tf.Session() as sess:
        sess.run(model)

        count = 0
        #training
        #get sample length
        sample_length = tr_data.shape[0]
        #get number of batches
        num_batches = int(sample_length / batch_size)

        #make the containers to hold test and train accuracy
        train_accuracy = np.zeros([num_epochs * num_batches, 1])
        val_accuracy = np.zeros([num_epochs * num_batches, 1])
        test_accuracy = 0

        for epoch in range(num_epochs):

            #get shuffled index
            shuffled_indexes = np.arange(sample_length)
            np.random.shuffle(shuffled_indexes)
            #shuffle data
            shuffled_train_data = tr_data[shuffled_indexes]
            shuffled_train_labels = tr_labels[shuffled_indexes]

            for i in range(num_batches):
                #gather batches
                start = i * batch_size
                end = (i + 1) * batch_size

                #make sure we don't access past the end of the array (a small overlap with the previous batch can occur - an artifact of an uneven dataset size and a fixed batch size)
                if end > sample_length:
                    end = sample_length
                if start > sample_length - batch_size:
                    start = sample_length - batch_size

                batch_x, batch_y = shuffled_train_data[
                    start:end][:], shuffled_train_labels[start:end][:]

                _, train_accuracy[count] = sess.run([train_op, accuracy],
                                                    feed_dict={
                                                        x: batch_x,
                                                        y_: batch_y,
                                                        keep_prob: 0.5
                                                    })

                val_accuracy[count] = sess.run(accuracy,
                                               feed_dict={
                                                   x: val_data,
                                                   y_: val_labels,
                                                   keep_prob: 1.0
                                               })

                #prints the accuracy roughly seven times per epoch (interval derived from num_batches)
                print_out = int(num_batches / 7)
                if i % print_out == 0:
                    print("epoch: %d, train_step %d, training accuracy %g" %
                          (epoch, i, train_accuracy[count]))
                    print("epoch: %d, test accuracy: %g" %
                          (epoch, val_accuracy[count]))

                    #plotting
                    train_line, = plt.plot(count,
                                           train_accuracy[count],
                                           'bo',
                                           label='train_accuracy')
                    val_line, = plt.plot(count,
                                         val_accuracy[count],
                                         'ro',
                                         label='validation_accuracy')
                    plt.xlabel('Batch Number')
                    plt.ylabel('Accuracy')
                    plt.title('Validation & train accuracy vs Batch Number')
                    plt.pause(0.05)

                #exit training if there is increased validation error (decrease in test accuracy)
                if val_accuracy[count] < (val_accuracy[count - 1] - 0.1):
                    exit = True
                    break

                count += 1

            #exit training if there is increased validation error (decrease in test accuracy)
            if exit:
                print("Overfitting occuring: exiting training phase")
                break
            print(
                "--------------------------------------------------------------"
            )

        #saves model
        saver_path = saver.save(sess, './trained_model.ckpt')
        print("Saved model: ", saver_path)

        #calculate test accuracy on unseen test set
        for _ in range(te_data.shape[0] - num_feat_val_set):
            batch = next(testing_data_and_labels)
            test_accuracy += sess.run(accuracy,
                                      feed_dict={
                                          x:
                                          np.reshape(batch[0], (1, 32, 32, 3)),
                                          y_: np.reshape(batch[1], (1, 10)),
                                          keep_prob: 1.0
                                      })

    print("Testing Accuracy: %g" %
          (float(test_accuracy) / float(te_data.shape[0] - num_feat_val_set)))
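An illustrative call, using a batch size in the 20-50 range recommended by the docstring:

train_model(num_epochs=5, batch_size=50)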
Example #15
def anomaly_detection():
    """
    Run anomaly detection process end-to-end.

    Parameters
    ----------
    None.

    Returns
    -------
    None.
    """

    device = 'cpu'
    if torch.cuda.device_count() > 0 and torch.cuda.is_available():
        print("Cuda installed. Running on GPU")
        device = 'cuda'
    else:
        print("No GPU available!")
        # Fall through and run on the CPU.

    # Importing and cleaning data
    full_data_frame = dp.load_data("data/full_data.csv")
    print("Raw dataset loaded into memory")

    # Only tags that we need
    lpc_data= full_data_frame[lpc_tags]

    # Remove data corresponding to offline behaviour
    online_data = dp.remove_offline_data(lpc_data, min_temp=80, max_ctrl_pres=58,
                                         window=24)
    print("Offline data removed")

    # Remove features that are deemed invalid - no anti-const condition enforced
    clean_data = dp.remove_invalid_features(online_data, max_gap_len=12,
                                            max_nan_percent=1, min_unique_vals=2000,
                                            min_variance=1, max_const_len=-1)
    print("Invalid features removed")

    # Interpolate dataframe
    clean_data.interpolate(method='linear', axis=0, inplace=True)
    print("Missing data interpolated")

    # Find deltas from interpolated data
    delta_data = dp.calculate_deltas(clean_data)
    print("Deltas calculated")

    # Normalise (Standardise dataset to ~N(0, 1))
    normalised_data = dp.normalise_dataset(delta_data)
    print("Data normalised")

    # Save final dataset
    dp.save_data(normalised_data, "./detection_data.csv")
    print("Data pre-processing complete")

    # Load dataset (with indexes saved)
    data_frame = dp.load_data("./detection_data.csv")
    dataset = data_frame.to_numpy()
    print("Data loaded")

    # Ordered indexes of trips in dataset
    anomaly_indexes = np.array([10634, 36136, 57280, 57618, 60545, 63144, 118665,
                                128524, 131118])
    anomaly_indexes = dsf.convert_indexes(anomaly_indexes, dataset)

    num_features = dataset.shape[1] - 1
    seq_length = 144
    stride = 72

    # Create data loaders for all purposes
    anomaly_loader, normal_loader, train_loader, valid_loader, train_valid_loader, \
    test_loader, valid_index, test_index  = \
    dsf.create_mode_loaders(dataset, anomaly_indexes, seq_length, stride)
    print("Data loaders created")

    # Initialise and train model
    model = dml.Autoencoder(num_features, 60).to(device)

    dml.train_autoencoder(model, train_valid_loader, test_loader, 120, False)
    print("Model training done, best model saved")

    # Classify all data by mean reconstruction error
    anomalies = dsf.assess_autoencoder(model, normal_loader, valid_index,
                                       test_index, False, anomaly_loader,
                                       seq_length, num_features)
    print("Model assessment done")

    # Save sorted anomalies with their time indexes
    pd.DataFrame(anomalies).to_csv("./detected_anomalies.csv")
    print("Detected anomalies saved")
    print("Process complete")

    return
Example #16
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 01 22:49:56 2018

@author: xuc
"""

from data_preprocessing import load_data

dataset = load_data('data')

print(dataset)
Example #17
def TL(source,
       target=None,
       path='../gen_patches/dataset_noisy/',
       retrain=False,
       retrain_ft_layers=[1, 1, 1, 1, 1, 1],
       outputfolder='backup',
       outputfolderres='backup_res',
       batchsize=1000,
       sourcemodelspath='./'):
    """
    TO DO: FINISH DOCUMENTATION
    """

    options = {
        'sourcemodelspath': sourcemodelspath,
        'outputfolder': outputfolder,
        'outputfolderres': outputfolderres,
        'verbose': 0,
        'viewdata': False,
        'trainsize': 0.6,
        'patchsize': 20,
        'measure': 'acc',
        'weight': 200,
        'datanormalize': True,
        # ---------- one-class learning
        'replicate': False,
        'oneclass': False,
        # ---------- source problem params
        'database_source': 'db2',
        'resolution_source': source,
        'nclasses_source': 2,  # TODO: do this automatically
        # ---------- target problem params
        'database_target': 'db2',
        'resolution_target': target,
        # ---------- TL hyperparams
        'retrain': retrain,
        'retrain_ft_layers': retrain_ft_layers,
        # ---------- hyperparams
        'nruns': 20,
        'folds': 3,
        'hlayers': [len(retrain_ft_layers) / 2],  # X hidden + 1 log layer
        'nneurons': [1000],  # range(500, 1001, 250),
        'pretraining_epochs': [1000],  # [200]
        'training_epochs': [3000],  # [100]
        'pretrain_lr': [0.01, 0.001],  #[ 0.01, 0.001],
        'finetune_lr': [0.1, 0.01],  #[ 0.1, 0.01],
        'threshold': [0.8],  #[ 0.5 , 0.6, 0.8], #numpy.arange(.5,1.01,.1),
        'batchsize':
        [batchsize],  #[100] or [1000] depending on the size of the dataset. 
        # ---------- end of hyperparams
        'corruptlevels': [0.1],  #numpy.arange(0.1, 0.4, 0.1)
    }

    print >> sys.stderr, (options), "\n"

    # -------------------------------------------------------------------------------
    datasetpath = path
    # print argv
    # print datasetpath
    # print retrain_ft_layers
    # alaallala

    # -------------------------------------------------------------------------------
    # load dataset
    if options['retrain'] == 1:
        options['database'] = options['database_target']
        options['resolution'] = options['resolution_target']
    else:
        options['database'] = options['database_source']
        options['resolution'] = options['resolution_source']

    (dataset, ndim, nclasses) = load_data(datasetpath, options)
    options['ndim'] = ndim
    options['nclasses'] = nclasses

    # --------------------------------------------------------------------------------------------
    for nrun in range(1, options['nruns'] + 1):
        print >> sys.stderr, ("### {0:03d} of {1:03d}".format(
            nrun, options['nruns']))
        options['numpy_rng'] = numpy.random.RandomState(nrun)
        options['theano_rng'] = RandomStreams(seed=nrun)

        # --------------
        # generate folds
        folds = gen_folds(dataset, options, nrun)
        # continue

        if options['retrain'] == 1:
            filename = "{0:s}/{1:05d}_{2:03d}_model.pkl.gz".format(
                options['sourcemodelspath'], nrun,
                string.atoi(options['resolution_source']))
            print >> sys.stderr, ":: Loading model {0:s}...\n".format(filename)
            sda_reuse_model = load_savedgzdata(filename)

            #print sda_reuse_model.logLayer.W.get_value()
            #print sda_reuse_model.logLayer.W.get_value()
            #kkk

        else:
            sda_reuse_model = None

        # ----------------------------------------------------------------------------
        results = do_experiment(folds, options, nrun, sda_reuse_model)
        # ----------------------------------------------------------------------------

        # --------------------------------------------------
        filename = '{0:s}/res_{1:05d}_{2:03d}.pkl.gz'.format(
            options['outputfolderres'], nrun,
            string.atoi(options['resolution']))
        save_results(filename, results)
Example #18
def train_model(learn_model, num_epochs, batch_size, learning_rate):
	"""takes a some parameters, trains a specified model,
	   calculates accuracy during training and prints it out. Prints out
	   final accuracy on a test set as well"""

	#load data
	num_classes = 6
	tr_data = normalize(load_data('X_train.txt'))
	tr_labels = one_hot_encode_labels(load_data('y_train.txt'),num_classes)
	te_data = normalize(load_data('X_test.txt'))
	te_labels = one_hot_encode_labels(load_data('y_test.txt'),num_classes)

	#determine which learn method to run
	if learn_model =='logistic' or learn_model == '2-layer':

		if learn_model=='logistic':
			x,y_,model,train_op,accuracy = logistic_classifier(learning_rate)
		else:
			x,y_,model,train_op,accuracy = two_layer_net(learning_rate)

		with tf.Session() as sess:
			sess.run(model)

			count = 0
			#get sample length
			sample_length = tr_data.shape[0]
			#get number of batches
			num_batches = int(sample_length/batch_size)

			#make the containers to hold test and train accuracy
			train_accuracy = np.zeros([num_epochs*num_batches,1])
			test_accuracy = np.zeros([num_epochs,1])

			for epoch in range(num_epochs):
				
				#get shuffled index
				shuffled_indexes = np.arange(sample_length)
				np.random.shuffle(shuffled_indexes)
				#shuffle data
				shuffled_train_data = tr_data[shuffled_indexes]
				shuffled_train_labels = tr_labels[shuffled_indexes]

				for i in range(num_batches):
				
					#gather batches
					start = i*batch_size
					end = (i+1)*batch_size

					#make sure we don't access past the end of the array (a small overlap with the previous batch can occur - an artifact of an uneven dataset size and a fixed batch size)
					if end > sample_length:
						end = sample_length
					if start > sample_length-batch_size:
						start = sample_length-batch_size

					batch_x,batch_y = shuffled_train_data[start:end][:],shuffled_train_labels[start:end][:]
					_,train_accuracy[count] = sess.run([train_op, accuracy], feed_dict={x:batch_x,y_:batch_y})

					#prints the accuracy roughly seven times per epoch (interval derived from num_batches)
					print_out = int(num_batches/7)
					if i%print_out==0:
						print("epoch%d, train_step %d, training accuracy %g"%(epoch, i, train_accuracy[count]))
				
					count +=1

				test_accuracy[epoch] = sess.run(accuracy, feed_dict={x: te_data, y_: te_labels})	
				print("epoch: %d, test accuracy: %g"%(epoch, test_accuracy[epoch]))
				print("--------------------------------------------------------------")

		#calculates average accuracy at five points
		
		#plots train accuracy
		train_line, = plt.plot(train_accuracy,'r.', label = 'train accuracy') 
		test_line, = plt.plot([e*num_batches for e in range(num_epochs)], test_accuracy,'b-', label = 'test accuracy')

		plt.legend(loc = 'lower right')
		plt.xlabel('Batch Number')
		plt.ylabel('Accuracy')
		plt.title('Prediction Accuracy vs Batch Number')
		#plt.legend(handles=[])
		plt.show()

	if learn_model =='knn':

		#get input from user
		raw_k_val = input("Please enter odd k value (recommend 7) or enter 0 to run a list of k values: ")

		if int(raw_k_val)==0:
			#ran multiple k's to decide which was best for this data set (Keep odd so that there are no ties)
			num_neighbours = [1,3,5,7,9,11,13,15]
			#num_neighbours = [1,3]
		else:
			num_neighbours = [int(raw_k_val)]

		#will contain the accuracy for each value of k
		train_accuracy = np.zeros((len(num_neighbours)))

		for index, k in enumerate(num_neighbours):
			#retrieve object for graph_constructor
			x,y_,xtest, ytest,accuracy, model,train_op = knn(k)
			with tf.Session() as sess:
				sess.run(model)
				# loop over test data
				for i in range(len(te_data)):
					# Get nearest neighbor to each row of test data which represents one multi dimensional data point
					train_accuracy[index] += sess.run(accuracy, feed_dict={x: tr_data, y_: tr_labels, xtest: te_data[i, :], ytest: te_labels[i]})

					if i%200==0:
						print(str(i) + ' out of ' + str(len(te_data)) + ' have been tested')

			print("k = {}, Accuracy: {} ".format(k, train_accuracy[index]/len(te_data)))

		#only plot if there is more than one k value
		if int(raw_k_val) == 0: 
			plt.plot(num_neighbours, train_accuracy/len(te_data), 'ro')
			plt.xlabel('K - value')
			plt.ylabel('Accuracy')
			plt.title('Prediction Accuracy vs K-value')
			plt.show()
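An illustrative call for the logistic-regression branch; the values are placeholders, not tuned settings:

train_model('logistic', num_epochs=10, batch_size=50, learning_rate=0.01)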
Example #19
from data_preprocessing import extract_swell_dataset, extract_dreamer_dataset, extract_amigos_dataset, extract_wesad_dataset, load_data
from data_preprocessing import swell_prepare_for_10fold, wesad_prepare_for_10fold, amigos_prepare_for_10fold, dreamer_prepare_for_10fold, current_time, makedirs, one_hot_encoding
from send_emails import send_email

#data_folder = Path("D:\\Code Repo\\Working dir\\Self supervised task\\NEW_WORK_DIR\\data_folder\\full_data_filtered\\")
extract_data = 0
""" for the first time run this """
#if extract_data == 1:
#    _       = extract_swell_dataset(overlap_pct= 1, window_size_sec= 10, data_save_path= data_folder, save= 1)
#    _       = extract_dreamer_dataset(overlap_pct= 1, window_size_sec= 10, data_save_path= data_folder, save= 1)
#    _       = extract_amigos_dataset(overlap_pct= 1, window_size_sec= 10, data_save_path= data_folder, save=1)
#    _       = extract_wesad_dataset(overlap_pct=1, window_size_sec=10, data_save_path= data_folder, save=1)

dirname = Path("/home/pritam/self_supervised_learning/")
swell_data = load_data(dirname /
                       "data_folder/full_data_filtered/swell_dict.npy")
dreamer_data = load_data(dirname /
                         "data_folder/full_data_filtered/dreamer_dict.npy")
amigos_data = load_data(dirname /
                        "data_folder/full_data_filtered/amigos_dict.npy")
wesad_data = load_data(dirname /
                       "data_folder/full_data_filtered/wesad_dict.npy")

swell_data = swell_prepare_for_10fold(
    swell_data)  #person, y_input_stress, y_arousal, y_valence,
wesad_data = wesad_prepare_for_10fold(wesad_data)  # person, y_stress
amigos_data = amigos_prepare_for_10fold(
    amigos_data)  # person, y_arousal, y_valence, y_dominance
dreamer_data = dreamer_prepare_for_10fold(
    dreamer_data)  # person, y_arousal, y_valence, y_dominance
Example #20
# Importing the libraries
import seq2seq_wrapper
import importlib
importlib.reload(seq2seq_wrapper)
import data_preprocessing
import data_utils_1
import data_utils_2



########## PART 1 - DATA PREPROCESSING ##########



# Importing the dataset
metadata, idx_q, idx_a = data_preprocessing.load_data(PATH = 'C:/Users/LENOVO/Documents/ChatBot/The Best ChatBot/')

# Splitting the dataset into the Training set and the Test set
(trainX, trainY), (testX, testY), (validX, validY) = data_utils_1.split_dataset(idx_q, idx_a)

# Embedding
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
batch_size = 16
vocab_twit = metadata['idx2w']
xvocab_size = len(metadata['idx2w'])  
yvocab_size = xvocab_size
emb_dim = 1024
idx2w, w2idx, limit = data_utils_2.get_metadata()

Example #21
import importlib
import imp
import seq2seq_wrapper
#importlib.reload(seq2seq_wrapper)
imp.reload(seq2seq_wrapper)
import data_preprocessing
import data_utils_1
import data_utils_2
from flask import Flask, jsonify, render_template, request


########## PART 1 - DATA PREPROCESSING ##########



# Importing the dataset
metadata, idx_q, idx_a = data_preprocessing.load_data(PATH = './')

# Splitting the dataset into the Training set and the Test set
(trainX, trainY), (testX, testY), (validX, validY) = data_utils_1.split_dataset(idx_q, idx_a)

# Embedding
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
batch_size = 16
vocab_twit = metadata['idx2w']
xvocab_size = len(metadata['idx2w'])  
yvocab_size = xvocab_size
emb_dim = 1024
idx2w, w2idx, limit = data_utils_2.get_metadata()

Example #22
import numpy as np
import theano as theano
import theano.tensor as T
import time
import operator
from data_preprocessing import load_data, load_model_parameters_theano, generate_sentences
from gru_theano import *
import sys


# Load data (this may take a few minutes)
VOCABULARY_SIZE = 8000
X_train, y_train, word_to_index, index_to_word = load_data("data/reddit-comments-2015.csv", VOCABULARY_SIZE)

# Load parameters of pre-trained model
model = load_model_parameters_theano('./data/pretrained.npz')

generate_sentences(model, 100, index_to_word, word_to_index)