def initialize_dataset(self):
    """Build the train/test split from the two labelled TSV corpora.

    Loads the hotel and schedule corpora through ``pre.load_data``, splits
    every entry on tabs/newlines and keeps only the entries that yield
    exactly four fields: field 1 is passed to ``pre.morph_data`` and the
    last field is taken as the label.  Entries with at most one field, and
    entries whose first field is ``'Speaker'`` (presumably header rows),
    are ignored.  For three-field entries field 1 is only printed.

    Side effects: sets ``self.target_dict``, ``self.inv_target_dict``,
    ``self.baseTrain_X``, ``self.baseTrain_Y``, ``self.Test_X`` and
    ``self.Test_Y``.  Returns ``None``.
    """
    corpus_lines = pre.load_data('./labeled_data/hotel_corpus.tsv')
    corpus_lines.extend(
        pre.load_data('./labeled_data/schedule_corpus.tsv'))
    rows = [re.split('[\t\n]', line) for line in corpus_lines]

    morphs = []
    labels = []
    for fields in rows:
        if len(fields) <= 1 or fields[0] == 'Speaker':
            continue
        if len(fields) == 3:
            # Unlabelled entry: only echoed, never added to the dataset.
            print(fields[1])
        elif len(fields) == 4:
            labels.append(fields[-1])
            morphs.append(pre.morph_data(fields[1]))

    # Encode the labels with the project helpers, then build the lookup
    # tables from the *encoded* values (as the original code did).
    label_lookup = pre.making_all_dict(list(set(labels)))
    encoded_labels = pre.making_target_list(labels, label_lookup)
    self.target_dict = pre.making_all_dict(list(set(encoded_labels)))
    self.inv_target_dict = {
        value: key for key, value in self.target_dict.items()
    }

    (self.baseTrain_X, self.Test_X,
     self.baseTrain_Y, self.Test_Y) = train_test_split(
         morphs, encoded_labels, test_size=0.2, random_state=42)
def chatbot(txt):
    """Return the seq2seq model's answer to the question ``txt``.

    Every call reloads the dataset metadata, rebuilds the ``Seq2Seq``
    wrapper with the sequence lengths and vocabulary size derived from it,
    restores the last saved session from ``./weights`` and then decodes the
    prediction for ``txt``.

    Parameters:
    txt -- the question text; it is passed unchanged to
           ``data_utils_2.encode``.
    """
    # Dataset is only needed to derive the model dimensions.
    metadata, idx_q, idx_a = data_preprocessing.load_data(PATH='./')
    (train_x, train_y), (_test_x, _test_y), (_valid_x, _valid_y) = \
        data_utils_1.split_dataset(idx_q, idx_a)

    question_len = train_x.shape[-1]
    answer_len = train_y.shape[-1]
    vocab_size = len(metadata['idx2w'])
    idx2w, w2idx, limit = data_utils_2.get_metadata()

    # Build the seq2seq model and load its trained weights.
    model = seq2seq_wrapper.Seq2Seq(xseq_len=question_len,
                                    yseq_len=answer_len,
                                    xvocab_size=vocab_size,
                                    yvocab_size=vocab_size,
                                    ckpt_path='./weights',
                                    emb_dim=1024,
                                    num_layers=3)
    session = model.restore_last_session()

    # Encode the question, predict, and decode the first answer.
    encoded_question = data_utils_2.encode(txt, w2idx, limit['maxq'])
    predicted = model.predict(session, encoded_question)[0]
    return data_utils_2.decode(predicted, idx2w)
def run(num_classes, learning_rate, width, depth, mini_batch_size):
    """Train one network, report its metrics and store predictions.

    Loads and prepares the data through ``dp``, trains ``NN.Net1`` with
    Nesterov SGD (momentum taken from ``rp.m``), reloads the model saved at
    ``Models/Best_Model.pkl``, scores it on the test split, shows and logs
    the averaged metrics, and finally stores the predictions for the
    unknown data via ``fw.store_prediction``.

    Parameters:
    num_classes     -- forwarded to ``NN.Net1`` and ``m.show_results``
    learning_rate   -- SGD learning rate
    width, depth    -- forwarded to ``NN.Net1``
    mini_batch_size -- forwarded to ``model.train_validate``
    """
    accuracy = np.array([])
    precision = np.array([])
    recall = np.array([])
    f_score = np.array([])

    X_train, X_test, y_train, y_test, unknown_data = dp.load_data()
    X_train, X_test, y_train, y_test, unknown_data, dtype = dp.prepare_data(
        X_train, X_test, y_train, y_test, unknown_data)

    for _ in range(1):
        net = NN.Net1(num_classes, depth=depth, width=width).type(dtype)
        optimiser = optim.SGD(params=net.parameters(), lr=learning_rate,
                              momentum=rp.m, nesterov=True)
        train_losses, test_losses = net.train_validate(
            X_train, y_train, X_test, y_test, optimiser, mini_batch_size,
            dtype)

        # Score the checkpoint written during training, not the last state.
        model = torch.load("Models/Best_Model.pkl")
        predicted, _ = model.test(X_test)

        # Calculate metrics
        scores = m.compute_metrics(y_test.data.cpu().numpy(),
                                   predicted.data.cpu().numpy())
        accuracy = np.append(accuracy, scores[0])
        precision = np.append(precision, scores[1])
        recall = np.append(recall, scores[2])
        f_score = np.append(f_score, scores[3])

    accuracy, precision, recall, f_score = (
        np.mean(accuracy), np.mean(precision), np.mean(recall),
        np.mean(f_score))

    m.show_results(accuracy, precision, recall, f_score, num_classes,
                   train_losses, test_losses)
    fw.create_data_csv(learning_rate, depth, width, mini_batch_size, rp.m,
                       len(test_losses) - 10, accuracy)

    # Store unknown_data prediction
    unknown_pred, _ = model.test(unknown_data)
    fw.store_prediction(unknown_pred.data.cpu().numpy())
def load_data():
    """Return the dimensions needed to build the seq2seq model.

    Returns a tuple ``(xseq_len, yseq_len, xvocab_size, yvocab_size,
    emb_dim)`` where the sequence lengths are the last axis of the training
    questions/answers, both vocabulary sizes equal ``len(metadata['idx2w'])``
    and ``emb_dim`` is fixed at 1024.
    """
    # Importing the dataset
    metadata, idx_q, idx_a = data_preprocessing.load_data(PATH='./')

    # Splitting the dataset; only the training split is inspected.
    (train_x, train_y), (_test_x, _test_y), (_valid_x, _valid_y) = \
        data_utils_1.split_dataset(idx_q, idx_a)

    question_len = train_x.shape[-1]
    answer_len = train_y.shape[-1]
    vocab_size = len(metadata['idx2w'])

    # The original also loaded this metadata without using it; the call is
    # kept so any failure it raises still surfaces here.
    _idx2w, _w2idx, _limit = data_utils_2.get_metadata()

    return (question_len, answer_len, vocab_size, vocab_size, 1024)
def generate_anomalies():
    """
    Run data generation process end-to-end.

    Pre-processes the raw dataset, writes it to ``./generation_data.csv``,
    reloads it, and trains an LSGAN (discriminator + generator from
    ``gml``) on one selected tag.

    Parameters
    ----------
    None.

    Returns
    -------
    None.

    """
    if torch.cuda.device_count() > 0 and torch.cuda.is_available():
        print("Cuda installed. Running on GPU")
        device = 'cuda'
    else:
        # The original had a bare ``exit`` here, which is a no-op
        # expression: execution always continued on the CPU.  That
        # fallback is kept and made explicit.
        print("No GPU available!")
        device = 'cpu'

    # Importing and cleaning data
    full_data_frame = dp.load_data("data/full_dataset.csv")
    print("Raw dataset loaded into memory")

    # Only tags that we need
    lpc_data = full_data_frame[lpc_tags]

    # Remove data corresponding to offline behaviour
    online_data = dp.remove_offline_data(lpc_data, min_temp=80,
                                         max_ctrl_pres=58, window=24)
    print("Offline data removed")

    # Remove features that are deemed invalid - anti-const condition enforced
    clean_data = dp.remove_invalid_features(online_data, max_gap_len=12,
                                            max_nan_percent=1,
                                            min_unique_vals=2000,
                                            min_variance=1,
                                            max_const_len=144)
    print("Invalid features removed")

    # Interpolate dataframe
    clean_data.interpolate(method='linear', axis=0, inplace=True)
    print("Missing data interpolated")

    # Find deltas from interpolated data
    delta_data = dp.calculate_deltas(clean_data)
    print("Deltas calculated")

    # Normalise (Standardise dataset to ~N(0, 1))
    normalised_data = dp.normalise_dataset(delta_data)
    print("Data normalised")

    # Save final dataset
    dp.save_data(normalised_data, "./generation_data.csv")
    print("Data pre-processing complete")

    data_frame = dp.load_data("./generation_data.csv")
    print("Data loaded into memory")
    dataset = data_frame.to_numpy()

    # Keep only the selected tag column and the last column.
    tag = 27
    dataset = dataset[:, [tag, -1]]
    clipped_data = gsf.clip_distribution(dataset)

    # Ordered indexes of trips in dataset
    anomaly_indexes = np.array(
        [10634, 36136, 57280, 57618, 60545, 63144, 118665, 128524, 131118])
    anomaly_indexes = gsf.convert_indexes(anomaly_indexes, dataset)

    seq_length = 144
    stride = 72
    real_loader = gsf.create_static_loader(clipped_data, 32)

    # None of the returned loaders/indexes are used below; the call itself
    # is kept in case it has side effects the training relies on.
    gsf.create_mode_loaders(dataset, anomaly_indexes, seq_length, stride)

    latent_size = 20
    num_cycles = 1000
    leakyrelu_gain = init.calculate_gain('leaky_relu')

    D = gml.Discriminator(seq_length).to(device)
    gml.init_weights(D, leakyrelu_gain)
    G = gml.Generator(seq_length, latent_size).to(device)
    gml.init_weights(G, leakyrelu_gain)

    # NOTE(review): the literal 72 is passed here rather than seq_length
    # (144) -- confirm against train_lsgan's signature.
    gml.train_lsgan(D, G, real_loader, 72, stride, latent_size, num_cycles,
                    tag)
def TL(source,
       target=None,
       path='../gen_patches/dataset_noisy/',
       retrain=False,
       retrain_ft_layers=None,
       outputfolder='backup',
       outputfolderres='backup_res',
       batchsize=1000,
       sourcemodelspath='./'):
    """Run the transfer-learning experiment for every configured run.

    Builds the ``options`` dictionary, loads the dataset for the source
    problem (or for the target problem when ``retrain`` is set), and for
    each of ``options['nruns']`` runs generates folds, optionally loads the
    saved source model, calls ``do_experiment`` and saves its results under
    ``outputfolderres``.

    Parameters:
    source            -- resolution of the source problem; converted with
                         ``int()`` when building file names
    target            -- resolution of the target problem (used when
                         ``retrain`` is set)
    path              -- dataset path handed to ``load_data``
    retrain           -- when equal to 1/True, train on the target problem
                         and reuse the saved source model of each run
    retrain_ft_layers -- per-layer flags forwarded in ``options``; half its
                         length is used as the number of hidden layers.
                         Defaults to ``[1, 1, 1, 1, 1, 1]``.
    outputfolder      -- forwarded in ``options``
    outputfolderres   -- folder the per-run result files are written to
    batchsize         -- single batch-size value for the hyperparameter grid
    sourcemodelspath  -- folder the source models are loaded from

    Returns nothing.
    """
    # A fresh list per call: the former mutable default was shared between
    # calls and handed to downstream code through ``options``.
    if retrain_ft_layers is None:
        retrain_ft_layers = [1, 1, 1, 1, 1, 1]

    options = {
        'sourcemodelspath': sourcemodelspath,
        'outputfolder': outputfolder,
        'outputfolderres': outputfolderres,
        'verbose': 0,
        'viewdata': False,
        'trainsize': 0.6,
        'patchsize': 20,
        'measure': 'acc',
        'weight': 200,
        'datanormalize': True,
        # ---------- one-class learning
        'replicate': False,
        'oneclass': False,
        # ---------- source problem params
        'database_source': 'db2',
        'resolution_source': source,
        'nclasses_source': 2,  # TODO: do this automatically
        # ---------- target problem params
        'database_target': 'db2',
        'resolution_target': target,
        # ---------- TL hyperparams
        'retrain': retrain,
        'retrain_ft_layers': retrain_ft_layers,
        # ---------- hyperparams
        'nruns': 20,
        'folds': 3,
        # X hidden + 1 log layer; ``//`` keeps the integer result that
        # Python 2's ``/`` produced.
        'hlayers': [len(retrain_ft_layers) // 2],
        'nneurons': [1000],  # range(500, 1001, 250),
        'pretraining_epochs': [1000],  # [200]
        'training_epochs': [3000],  # [100]
        'pretrain_lr': [0.01, 0.001],
        'finetune_lr': [0.1, 0.01],
        'threshold': [0.8],  # [0.5, 0.6, 0.8], numpy.arange(.5, 1.01, .1)
        # [100] or [1000] depending on the size of the dataset.
        'batchsize': [batchsize],
        # ---------- end of hyperparams
        'corruptlevels': [0.1],  # numpy.arange(0.1, 0.4, 0.1)
    }

    # sys.stderr.write() reproduces the bytes the old ``print >>``
    # statements emitted and also runs on Python 3.
    sys.stderr.write("{0} \n\n".format(options))

    datasetpath = path

    # load dataset
    if options['retrain'] == 1:
        options['database'] = options['database_target']
        options['resolution'] = options['resolution_target']
    else:
        options['database'] = options['database_source']
        options['resolution'] = options['resolution_source']

    (dataset, ndim, nclasses) = load_data(datasetpath, options)
    options['ndim'] = ndim
    options['nclasses'] = nclasses

    for nrun in range(1, options['nruns'] + 1):
        sys.stderr.write(
            "### {0:03d} of {1:03d}\n".format(nrun, options['nruns']))

        # The run number seeds both generators.
        options['numpy_rng'] = numpy.random.RandomState(nrun)
        options['theano_rng'] = RandomStreams(seed=nrun)

        # generate folds
        folds = gen_folds(dataset, options, nrun)

        if options['retrain'] == 1:
            # int() replaces string.atoi(), which no longer exists on
            # Python 3.
            filename = "{0:s}/{1:05d}_{2:03d}_model.pkl.gz".format(
                options['sourcemodelspath'], nrun,
                int(options['resolution_source']))
            sys.stderr.write(
                ":: Loading model {0:s}...\n\n".format(filename))
            sda_reuse_model = load_savedgzdata(filename)
        else:
            sda_reuse_model = None

        results = do_experiment(folds, options, nrun, sda_reuse_model)

        filename = '{0:s}/res_{1:05d}_{2:03d}.pkl.gz'.format(
            options['outputfolderres'], nrun, int(options['resolution']))
        save_results(filename, results)
def train_model(num_epochs, batch_size, learning_rate):
    """Train the Keras image classifier and plot per-batch metrics.

    The training data is loaded, normalized and split into training and
    validation parts; the model from ``graph_construction`` is trained with
    Nesterov SGD; the weights are saved to ``<base>.h5`` and two figures
    (accuracy and loss against batch number) are written next to them.

    Parameters:
    num_epochs -- Number of training epochs
    batch_size -- Number of examples in each training batch
    learning_rate -- Initial learning rate
    """
    # Data: load, normalize, split.
    raw_X, raw_y, _, _ = data_preprocessing.load_data()
    train_X, valid_X, train_y, valid_y = data_preprocessing.split(
        data_preprocessing.normalize(raw_X), raw_y)

    # Labels are one-hot encoded for the categorical cross-entropy loss.
    train_y_cat = keras.utils.to_categorical(train_y, num_classes=10)
    valid_y_cat = keras.utils.to_categorical(valid_y, num_classes=10)

    # Model.
    model = graph_construction.get_keras_model()
    optimizer = SGD(lr=learning_rate, decay=1e-5, momentum=0.9,
                    nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])

    # Training; the callback records metrics per batch, which slows
    # training down somewhat.
    history = BatchHistory(valid_X, valid_y_cat)
    model.fit(train_X, train_y_cat,
              epochs=num_epochs,
              batch_size=batch_size,
              callbacks=[history],
              validation_data=(valid_X, valid_y_cat))

    fname_base = "model_{0}_{1}_{2}_rgb".format(learning_rate, num_epochs,
                                                batch_size)
    model.save_weights("{0}.h5".format(fname_base))

    def column(key):
        """Collect one recorded metric across all history entries."""
        return [entry[key] for entry in history.history]

    batches = column("batch")

    # Accuracy figure (saved, then shown).
    plt.plot(batches, column("acc"))
    plt.plot(batches, column("val_acc"))
    plt.ylabel('Accuracy')
    plt.xlabel('Batch')
    plt.legend(['Training', 'Validation'], loc='lower right')
    plt.savefig("{0}_accuracy.png".format(fname_base))
    plt.show()
    plt.close()

    # Loss figure (saved only).
    plt.plot(batches, column("loss"))
    plt.plot(batches, column("val_loss"))
    plt.ylabel('Loss (categorical cross-entropy)')
    plt.xlabel('Batch')
    plt.legend(['Training', 'Validation'], loc='lower right')
    plt.savefig("{0}_loss.png".format(fname_base))
    plt.close()
def run_training():
    """Train net model.

    Extracts and splits the data through ``dp``, one-hot encodes the
    labels, builds the graph (inference, loss, training and accuracy ops),
    then runs ``FLAGS.max_steps`` training steps.  Every 100 steps the loss
    is printed and a summary is written; every 1000 steps and at the final
    step a checkpoint is saved and the model is evaluated on the training
    and test data.

    The session and the summary writer are closed when training finishes
    or fails.
    """
    # Get the sets of data
    data_set = dp.extract_data(FLAGS.data_dir, FLAGS.label_dir,
                               FLAGS.neighbor)
    train_data, train_label, test_data, test_label = dp.load_data(
        data_set, FLAGS.ratio)
    print('original train label:-----------------------------------------')
    print(train_label)
    print('original test label:------------------------------------------')
    print(test_label)
    print('train label length: ' + str(len(train_label)) +
          ', train data length: ' + str(len(train_data)))
    print('test label length:' + str(len(test_label)) +
          ', test data length: ' + str(len(test_data)))

    # Transform int label into one-hot values
    print('train: ')
    train_label = dp.onehot_label(train_label, NUM_CLASSES)
    print('test: ')
    test_label = dp.onehot_label(test_label, NUM_CLASSES)
    print('one hot train label:-----------------------------------------')
    print(train_label)
    print('one hot test label:------------------------------------------')
    print(test_label)
    print('max train_data: ' + str(np.max(train_data)))
    print('min train_data: ' + str(np.min(train_data)))

    with tf.Graph().as_default():
        # Generate placeholders
        data_placeholder, label_placeholder = placeholder_inputs(
            FLAGS.batch_size)
        # Build a Graph that computes predictions from the inference model
        softmax, fc_weights, softmax_weights, fc = inference(
            data_placeholder, FLAGS.fc_uints)
        # Add to the Graph the Ops for loss calculation
        loss_entroy = loss(softmax, label_placeholder)
        # Add to the Graph the Ops that calculate and apply gradients
        train_op = training(loss_entroy, FLAGS.learning_rate)
        # Add the Op to compare the predictions to the labels
        correct = acc(softmax, label_placeholder)
        # Build the summary operation based on the TF collection of Summaries
        summary_op = tf.summary.merge_all()
        # Add the variable initializer Op
        init = tf.global_variables_initializer()
        # Create a saver for writing training checkpoints
        saver = tf.train.Saver()

        # The context manager closes the session even when a step raises.
        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the Graph
            summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                                   sess.graph)
            try:
                # Run the Op to initialize the variables
                sess.run(init)

                # Start the training loop
                for step in range(FLAGS.max_steps):
                    start_time = time.time()
                    feed_dict = fill_feed_dict(step, train_data, train_label,
                                               data_placeholder,
                                               label_placeholder)
                    # Run one step of the model.  Only the loss is needed;
                    # the softmax/weight tensors the original also fetched
                    # were used only by commented-out debug prints.
                    _, loss_value = sess.run([train_op, loss_entroy],
                                             feed_dict=feed_dict)
                    duration = time.time() - start_time

                    # Write the summaries and print an overview fairly often
                    if step % 100 == 0:
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                        # Update the events file
                        summary_str = sess.run(summary_op,
                                               feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()

                    # Save a checkpoint and evaluate the model periodically
                    if (step + 1) % 1000 == 0 or \
                            (step + 1) == FLAGS.max_steps:
                        checkpoint_file = os.path.join(FLAGS.train_dir,
                                                       'checkpoint')
                        saver.save(sess, checkpoint_file, global_step=step)
                        # Evaluate against the data set
                        print('Training Data Eval:')
                        do_eval(sess, correct, data_placeholder,
                                label_placeholder, train_data, train_label)
                        print('Test Data Eval:')
                        do_eval(sess, correct, data_placeholder,
                                label_placeholder, test_data, test_label)
            finally:
                summary_writer.close()
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import data_preprocessing
import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense

# Load the raw data (no download) and one-hot encode it.
data = data_preprocessing.load_data(download=False)
new_data = data_preprocessing.convert2onehot(data)

# Convert the DataFrame to a float32 NumPy array.
new_data = new_data.values.astype(np.float32)
print(new_data.shape)

# The rows must be shuffled before training.
np.random.shuffle(new_data)

# After shuffling, split all the data at this index:
# the first 70% is training data, the remaining 30% is test data.
sep = int(0.7 * len(new_data))
train_data, test_data = new_data[:sep], new_data[sep:]
print(new_data.shape)

# The first 21 columns are the inputs, the rest are the targets.
x_train, y_train = train_data[:, :21], train_data[:, 21:]
x_test, y_test = test_data[:, :21], test_data[:, 21:]

print(x_train.shape)
print(y_train.shape)
print(x_train.shape[0:1])
print("1111111111111111", x_train.shape[0:1])

model = Sequential()
# ---- model / training parameters
batch_size = 512  # mini-batch
keep_prob = 1  # drop out
l2reg = 0  # l2
lstm_sizes = [256]  # lstm dimension
fc_size = 256  # layer_size
max_epochs = 50

# ---- other parameters
max_sent_len = 30
class_num = 2
show_step = 20
data_path = './data/'

# ================== data prepare =================
# Loading data from data_preprocessing
X_final, y_final, X_final_dev, y_final_dev, X_final_test, y_final_test = \
    dp.load_data(data_path + "creditcard.csv")

# Fit the label encoder on the training labels only, then reuse that
# fitted encoding for the dev and test labels.  Re-fitting on each split
# (as the code did before) can yield a different encoding whenever a split
# does not contain every class.
encoder = LabelBinarizer()
one_hot_train_y = encoder.fit_transform(y_final)
one_hot_dev_y = encoder.transform(y_final_dev)
one_hot_final_test_y = encoder.transform(y_final_test)

change_train_y = []
change_dev_y = []
change_final_test_y = []
"""
@author: Md Rashad Al Hasan Rony
"""
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from data_preprocessing import load_data, process_data, split_data

# Parameters
MAX_DOC_LENGTH = 25
BATCH_SIZE = 256
EPOCHS = 5

# Load a 10% sample of the train and test datasets.
x_train, y_train = load_data("train_data.csv", sample_ratio=0.1)
x_test, y_test = load_data("test_data.csv", sample_ratio=0.1)

# Preprocess both text sets together; n_vocab sizes the embedding below.
x_train, x_test, _, n_vocab = process_data(x_train, x_test, MAX_DOC_LENGTH)

# Carve a validation set out of the test set.
x_test, x_val, y_test, y_val, _, test_size = split_data(x_test, y_test, 0.1)

# Model for training: embedding -> bidirectional LSTM -> dropout -> dense.
model = Sequential()
for layer in (
        Embedding((n_vocab + 1), 15, input_length=MAX_DOC_LENGTH),
        Bidirectional(LSTM(15)),
        Dropout(0.5),
        Dense(15, activation='sigmoid'),
):
    model.add(layer)
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
import data_preprocessing
import model
import tensorflow as tf
import numpy as np

# Data loading (pickle)
dataset_detection_video, classlbl_to_classid = data_preprocessing.load_data()

# ====== GRID SEARCH TRAINING =========
# One candidate value per hyperparameter for now.
frame_batch = [15]
lstm = [32]
relu = [16]

for i in lstm:
    for j in relu:
        for k in frame_batch:
            # Announce the current (lstm, relu, frame_batch) combination.
            print('-'.join([str(i), str(j), str(k)]))

            # Features: co-occurrence and co-intersection sequences, both
            # computed with the current frame batch size.
            coo, feat_type = data_preprocessing.cooccurrence(
                dataset_detection_video, k)
            coint, feat_type = data_preprocessing.cointersection(
                dataset_detection_video, k)

            # Append each video's co-occurrence features to its
            # co-intersection features along axis 1 (in place).
            for index, video in enumerate(coint):
                sequence_pair = (video['sequence'], coo[index]['sequence'])
                video['sequence'] = np.concatenate(sequence_pair, axis=1)
prediction = clf.predict(testing_set[list(dataset.columns[1:])]) return clf, prediction, testing_set["Severity"] # Train and return a svm model, predictions, and actual values def train_svm(data): training_set = data.copy() testing_set = pd.DataFrame(columns=list(data.columns)) for i in range(int(training_set.index.size / 10000)): j = random.randrange(0, training_set.index.size) testing_set = testing_set.append(training_set.loc[j]) testing_set = testing_set.reset_index(drop=True) training_set = training_set.drop(j) training_set = training_set.reset_index(drop=True) samples = training_set[list(training_set.columns[1:])] targets = training_set["Severity"] svm_model = svm.SVC() svm_model.fit(samples.iloc[0:100], targets.iloc[0:100]) return svm_model, svm_model.predict(testing_set[list(dataset.columns[1:])]), testing_set["Severity"] dataset = dp.load_data("modified1.csv") features = list(dataset.columns[1:]) classes = dataset["Severity"] attributes = dataset[features] pca_scatter(classes, attributes) display_histogram(attributes) display_histogram(pd.DataFrame(classes)) train_decision_tree(dataset)
def train_model(num_epochs, batch_size):
    """Train the classifier with early stopping and report test accuracy.

    Training and validation accuracy are recorded for every batch.
    Training stops early as soon as the validation accuracy drops by more
    than 0.1 from one batch to the next.  Afterwards the model is saved to
    ``./trained_model.ckpt`` and the accuracy on the held-out part of the
    test set is printed.

    params:
        :num_epochs: number of passes over the training data
        :batch_size: number of examples per training batch; it must be
                     small enough to fit in memory (20-50 is a good size
                     for the current setup)
    return:
        nothing.
    """
    # Load data
    num_classes = 10
    images_train, labels_train = load_data('train_32x32.mat')
    tr_data = normalize(images_train)
    tr_labels = one_hot_encode_labels(labels_train, num_classes)

    images_test, labels_test = load_data('test_32x32.mat')
    te_data = normalize(images_test)
    te_labels = one_hot_encode_labels(labels_test, num_classes)

    # The first 500 test examples form the validation set; the remainder
    # is the unseen test set.
    num_feat_val_set = 500
    val_data = te_data[:num_feat_val_set]
    val_labels = te_labels[:num_feat_val_set]
    testing_data_and_labels = generate_data(te_data[num_feat_val_set:],
                                            te_labels[num_feat_val_set:])
    num_test_examples = te_data.shape[0] - num_feat_val_set

    x, y_, model, train_op, accuracy, keep_prob = classifier()

    # Save model functionality
    saver = tf.train.Saver()

    # Set when training must stop because of overfitting (named so it does
    # not shadow the builtin ``exit``).
    stop_training = False

    # Start interactive matplotlib session
    plt.ion()

    with tf.Session() as sess:
        sess.run(model)
        count = 0

        sample_length = tr_data.shape[0]
        num_batches = int(sample_length / batch_size)

        # Report roughly 7 times per epoch.  Clamped to at least 1 so that
        # fewer than 7 batches no longer causes a modulo-by-zero.
        print_out = max(1, int(num_batches / 7))

        # Containers for the per-batch accuracies
        train_accuracy = np.zeros([num_epochs * num_batches, 1])
        val_accuracy = np.zeros([num_epochs * num_batches, 1])
        test_accuracy = 0

        for epoch in range(num_epochs):
            # Shuffle data and labels with the same permutation
            shuffled_indexes = np.arange(sample_length)
            np.random.shuffle(shuffled_indexes)
            shuffled_train_data = tr_data[shuffled_indexes]
            shuffled_train_labels = tr_labels[shuffled_indexes]

            for i in range(num_batches):
                # num_batches is rounded down, so every [start, end) slice
                # lies inside the data and no clamping is needed.
                start = i * batch_size
                end = start + batch_size
                batch_x = shuffled_train_data[start:end]
                batch_y = shuffled_train_labels[start:end]

                _, train_accuracy[count] = sess.run(
                    [train_op, accuracy],
                    feed_dict={x: batch_x, y_: batch_y, keep_prob: 0.5})
                val_accuracy[count] = sess.run(
                    accuracy,
                    feed_dict={x: val_data, y_: val_labels, keep_prob: 1.0})

                if i % print_out == 0:
                    print("epoch: %d, train_step %d, training accuracy %g" %
                          (epoch, i, train_accuracy[count]))
                    print("epoch: %d, test accuracy: %g" %
                          (epoch, val_accuracy[count]))
                    # Plotting
                    # NOTE(review): the original indentation was lost;
                    # confirm the plot is meant to update only at report
                    # time rather than on every batch.
                    train_line, = plt.plot(count, train_accuracy[count],
                                           'bo', label='train_accuracy')
                    val_line, = plt.plot(count, val_accuracy[count], 'ro',
                                         label='validation_accuracy')
                    plt.xlabel('Batch Number')
                    plt.ylabel('Accuracy')
                    plt.title('Validation & train accuracy vs Batch Number')
                    plt.pause(0.05)

                # Exit training if the validation accuracy dropped by more
                # than 0.1 since the previous batch.  There is no previous
                # batch at count == 0.
                if count > 0 and \
                        val_accuracy[count] < (val_accuracy[count - 1] - 0.1):
                    stop_training = True
                    break
                count += 1

            if stop_training:
                print("Overfitting occuring: exiting training phase")
                break
            print(
                "--------------------------------------------------------------"
            )

        # Save model
        saver_path = saver.save(sess, './trained_model.ckpt')
        print("Saved model: ", saver_path)

        # Calculate test accuracy on the unseen test set, one example at a
        # time.
        for _ in range(num_test_examples):
            batch = next(testing_data_and_labels)
            test_accuracy += sess.run(
                accuracy,
                feed_dict={
                    x: np.reshape(batch[0], (1, 32, 32, 3)),
                    y_: np.reshape(batch[1], (1, 10)),
                    keep_prob: 1.0
                })
        print("Testing Accuracy: %g" %
              (float(test_accuracy) / float(num_test_examples)))
def anomaly_detection():
    """
    Run anomaly detection process end-to-end.

    Pre-processes the raw dataset, writes it to ``./detection_data.csv``,
    reloads it, trains an autoencoder and saves the anomalies it reports
    to ``./detected_anomalies.csv``.

    Parameters
    ----------
    None.

    Returns
    -------
    None.

    """
    if torch.cuda.device_count() > 0 and torch.cuda.is_available():
        print("Cuda installed. Running on GPU")
        device = 'cuda'
    else:
        # The original had a bare ``exit`` here, which is a no-op
        # expression: execution always continued on the CPU.  That
        # fallback is kept and made explicit.
        print("No GPU available!")
        device = 'cpu'

    # Importing and cleaning data
    full_data_frame = dp.load_data("data/full_data.csv")
    print("Raw dataset loaded into memory")

    # Only tags that we need
    lpc_data = full_data_frame[lpc_tags]

    # Remove data corresponding to offline behaviour
    online_data = dp.remove_offline_data(lpc_data, min_temp=80,
                                         max_ctrl_pres=58, window=24)
    print("Offline data removed")

    # Remove features that are deemed invalid - no anti-const condition
    # enforced
    clean_data = dp.remove_invalid_features(online_data, max_gap_len=12,
                                            max_nan_percent=1,
                                            min_unique_vals=2000,
                                            min_variance=1,
                                            max_const_len=-1)
    print("Invalid features removed")

    # Interpolate dataframe
    clean_data.interpolate(method='linear', axis=0, inplace=True)
    print("Missing data interpolated")

    # Find deltas from interpolated data
    delta_data = dp.calculate_deltas(clean_data)
    print("Deltas calculated")

    # Normalise (Standardise dataset to ~N(0, 1))
    normalised_data = dp.normalise_dataset(delta_data)
    print("Data normalised")

    # Save final dataset
    dp.save_data(normalised_data, "./detection_data.csv")
    print("Data pre-processing complete")

    # Load dataset (with indexes saved)
    data_frame = dp.load_data("./detection_data.csv")
    dataset = data_frame.to_numpy()
    print("Data loaded")

    # Ordered indexes of trips in dataset
    anomaly_indexes = np.array([10634, 36136, 57280, 57618, 60545, 63144,
                                118665, 128524, 131118])
    anomaly_indexes = dsf.convert_indexes(anomaly_indexes, dataset)

    # Every column except the last one is counted as a feature.
    num_features = dataset.shape[1] - 1
    seq_length = 144
    stride = 72

    # Create data loaders for all purposes; the separate train and valid
    # loaders are not used below.
    (anomaly_loader, normal_loader, _train_loader, _valid_loader,
     train_valid_loader, test_loader, valid_index, test_index) = \
        dsf.create_mode_loaders(dataset, anomaly_indexes, seq_length, stride)
    print("Data loaders created")

    # Initialise and train model
    model = dml.Autoencoder(num_features, 60).to(device)
    dml.train_autoencoder(model, train_valid_loader, test_loader, 120, False)
    print("Model training done, best model saved")

    # Classify all data by mean reconstruction error
    anomalies = dsf.assess_autoencoder(model, normal_loader, valid_index,
                                       test_index, False, anomaly_loader,
                                       seq_length, num_features)
    print("Model assessment done")

    # Save sorted anomalies with their time indexes
    pd.DataFrame(anomalies).to_csv("./detected_anomalies.csv")
    print("Detected anomalies saved")
    print("Process complete")
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 01 22:49:56 2018

@author: xuc
"""
from data_preprocessing import load_data

# Load the dataset from the 'data' location and dump it to stdout.
dataset = load_data('data')
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(dataset)
def TL(source,
       target=None,
       path='../gen_patches/dataset_noisy/',
       retrain=False,
       retrain_ft_layers=None,
       outputfolder='backup',
       outputfolderres='backup_res',
       batchsize=1000,
       sourcemodelspath='./'):
    """Run the transfer-learning experiment for every configured run.

    Builds the ``options`` dictionary, loads the dataset for the source
    problem (or for the target problem when ``retrain`` is set), and for
    each of ``options['nruns']`` runs generates folds, optionally loads the
    saved source model, calls ``do_experiment`` and saves its results under
    ``outputfolderres``.

    Parameters:
    source            -- resolution of the source problem; converted with
                         ``int()`` when building file names
    target            -- resolution of the target problem (used when
                         ``retrain`` is set)
    path              -- dataset path handed to ``load_data``
    retrain           -- when equal to 1/True, train on the target problem
                         and reuse the saved source model of each run
    retrain_ft_layers -- per-layer flags forwarded in ``options``; half its
                         length is used as the number of hidden layers.
                         Defaults to ``[1, 1, 1, 1, 1, 1]``.
    outputfolder      -- forwarded in ``options``
    outputfolderres   -- folder the per-run result files are written to
    batchsize         -- single batch-size value for the hyperparameter grid
    sourcemodelspath  -- folder the source models are loaded from

    Returns nothing.
    """
    # A fresh list per call: the former mutable default was shared between
    # calls and handed to downstream code through ``options``.
    if retrain_ft_layers is None:
        retrain_ft_layers = [1, 1, 1, 1, 1, 1]

    options = {
        'sourcemodelspath': sourcemodelspath,
        'outputfolder': outputfolder,
        'outputfolderres': outputfolderres,
        'verbose': 0,
        'viewdata': False,
        'trainsize': 0.6,
        'patchsize': 20,
        'measure': 'acc',
        'weight': 200,
        'datanormalize': True,
        # ---------- one-class learning
        'replicate': False,
        'oneclass': False,
        # ---------- source problem params
        'database_source': 'db2',
        'resolution_source': source,
        'nclasses_source': 2,  # TODO: do this automatically
        # ---------- target problem params
        'database_target': 'db2',
        'resolution_target': target,
        # ---------- TL hyperparams
        'retrain': retrain,
        'retrain_ft_layers': retrain_ft_layers,
        # ---------- hyperparams
        'nruns': 20,
        'folds': 3,
        # X hidden + 1 log layer; ``//`` keeps the integer result that
        # Python 2's ``/`` produced.
        'hlayers': [len(retrain_ft_layers) // 2],
        'nneurons': [1000],  # range(500, 1001, 250),
        'pretraining_epochs': [1000],  # [200]
        'training_epochs': [3000],  # [100]
        'pretrain_lr': [0.01, 0.001],
        'finetune_lr': [0.1, 0.01],
        'threshold': [0.8],  # [0.5, 0.6, 0.8], numpy.arange(.5, 1.01, .1)
        # [100] or [1000] depending on the size of the dataset.
        'batchsize': [batchsize],
        # ---------- end of hyperparams
        'corruptlevels': [0.1],  # numpy.arange(0.1, 0.4, 0.1)
    }

    # sys.stderr.write() reproduces the bytes the old ``print >>``
    # statements emitted and also runs on Python 3.
    sys.stderr.write("{0} \n\n".format(options))

    datasetpath = path

    # load dataset
    if options['retrain'] == 1:
        options['database'] = options['database_target']
        options['resolution'] = options['resolution_target']
    else:
        options['database'] = options['database_source']
        options['resolution'] = options['resolution_source']

    (dataset, ndim, nclasses) = load_data(datasetpath, options)
    options['ndim'] = ndim
    options['nclasses'] = nclasses

    for nrun in range(1, options['nruns'] + 1):
        sys.stderr.write(
            "### {0:03d} of {1:03d}\n".format(nrun, options['nruns']))

        # The run number seeds both generators.
        options['numpy_rng'] = numpy.random.RandomState(nrun)
        options['theano_rng'] = RandomStreams(seed=nrun)

        # generate folds
        folds = gen_folds(dataset, options, nrun)

        if options['retrain'] == 1:
            # int() replaces string.atoi(), which no longer exists on
            # Python 3.
            filename = "{0:s}/{1:05d}_{2:03d}_model.pkl.gz".format(
                options['sourcemodelspath'], nrun,
                int(options['resolution_source']))
            sys.stderr.write(
                ":: Loading model {0:s}...\n\n".format(filename))
            sda_reuse_model = load_savedgzdata(filename)
        else:
            sda_reuse_model = None

        results = do_experiment(folds, options, nrun, sda_reuse_model)

        filename = '{0:s}/res_{1:05d}_{2:03d}.pkl.gz'.format(
            options['outputfolderres'], nrun, int(options['resolution']))
        save_results(filename, results)
def train_model(learn_model, num_epochs, batch_size, learning_rate):
    """Train (or evaluate) the selected model and report its accuracy.

    The train/test data are read from ``X_train.txt``, ``y_train.txt``,
    ``X_test.txt`` and ``y_test.txt``; features go through ``normalize``
    and labels through ``one_hot_encode_labels`` with 6 classes.

    Args:
        learn_model: ``'logistic'`` or ``'2-layer'`` runs mini-batch
            training and plots train/test accuracy; ``'knn'`` asks for a k
            value on stdin and evaluates the k-NN graph on every test row.
            Any other value loads the data and does nothing else.
        num_epochs: Number of passes over the shuffled training data
            (unused for ``'knn'``).
        batch_size: Rows per mini-batch (unused for ``'knn'``).
        learning_rate: Forwarded to ``logistic_classifier`` or
            ``two_layer_net`` (unused for ``'knn'``).

    Returns:
        None. Results are printed and plotted.
    """
    # load data
    num_classes = 6
    tr_data = normalize(load_data('X_train.txt'))
    tr_labels = one_hot_encode_labels(load_data('y_train.txt'), num_classes)
    te_data = normalize(load_data('X_test.txt'))
    te_labels = one_hot_encode_labels(load_data('y_test.txt'), num_classes)

    if learn_model == 'logistic' or learn_model == '2-layer':
        if learn_model == 'logistic':
            x, y_, model, train_op, accuracy = logistic_classifier(learning_rate)
        else:
            x, y_, model, train_op, accuracy = two_layer_net(learning_rate)

        sample_length = tr_data.shape[0]
        # Only whole batches are used; the remainder rows of each epoch's
        # shuffle are skipped, so a slice can never run past the data.
        num_batches = sample_length // batch_size
        # Report roughly 7 times per epoch. Clamp to 1 so that fewer than
        # 7 batches does not cause a modulo-by-zero.
        print_every = max(1, num_batches // 7)

        # 1-D containers so indexing yields scalars for assignment/printing.
        train_accuracy = np.zeros(num_epochs * num_batches)
        test_accuracy = np.zeros(num_epochs)

        with tf.Session() as sess:
            sess.run(model)
            count = 0
            for epoch in range(num_epochs):
                # Reshuffle data and labels together each epoch.
                shuffled_indexes = np.arange(sample_length)
                np.random.shuffle(shuffled_indexes)
                shuffled_train_data = tr_data[shuffled_indexes]
                shuffled_train_labels = tr_labels[shuffled_indexes]

                for i in range(num_batches):
                    start = i * batch_size
                    end = start + batch_size
                    batch_x = shuffled_train_data[start:end]
                    batch_y = shuffled_train_labels[start:end]
                    _, train_accuracy[count] = sess.run(
                        [train_op, accuracy],
                        feed_dict={x: batch_x, y_: batch_y})
                    if i % print_every == 0:
                        print("epoch%d, train_step %d, training accuracy %g"
                              % (epoch, i, train_accuracy[count]))
                    count += 1

                test_accuracy[epoch] = sess.run(
                    accuracy, feed_dict={x: te_data, y_: te_labels})
                print("epoch: %d, test accuracy: %g"
                      % (epoch, test_accuracy[epoch]))
                print("--------------------------------------------------------------")

        # Train accuracy per batch; test accuracy is placed at the batch
        # index where each epoch starts.
        plt.plot(train_accuracy, 'r.', label='train accuracy')
        plt.plot([e * num_batches for e in range(num_epochs)],
                 test_accuracy, 'b-', label='test accuracy')
        plt.legend(loc='lower right')
        plt.xlabel('Batch Number')
        plt.ylabel('Accuracy')
        plt.title('Prediction Accuracy vs Batch Number')
        plt.show()

    if learn_model == 'knn':
        raw_k_val = input("Please enter odd k value (recommend 7) or enter 0 to run a list of k values: ")
        k_choice = int(raw_k_val)
        run_sweep = k_choice == 0
        if run_sweep:
            # Odd values only, so that there are no ties.
            num_neighbours = [1, 3, 5, 7, 9, 11, 13, 15]
        else:
            num_neighbours = [k_choice]

        n_test = len(te_data)
        # Sum of the per-row accuracy outputs for each k; divided by
        # n_test when reported.
        accuracy_sums = np.zeros(len(num_neighbours))

        for index, k in enumerate(num_neighbours):
            x, y_, xtest, ytest, accuracy, model, _ = knn(k)
            with tf.Session() as sess:
                sess.run(model)
                # One test row at a time against the full training set.
                for i in range(n_test):
                    accuracy_sums[index] += sess.run(
                        accuracy,
                        feed_dict={x: tr_data, y_: tr_labels,
                                   xtest: te_data[i, :], ytest: te_labels[i]})
                    if i % 200 == 0:
                        print(str(i) + ' out of ' + str(n_test) + ' have been tested')
                print("k = {}, Accuracy: {} ".format(
                    k, accuracy_sums[index] / n_test))

        # Only plot if there is more than one k value.
        if run_sweep:
            plt.plot(num_neighbours, accuracy_sums / n_test, 'ro')
            plt.xlabel('K - value')
            plt.ylabel('Accuracy')
            # Title corrected: this axis is k, not the batch number.
            plt.title('Prediction Accuracy vs K Value')
            plt.show()
from data_preprocessing import (
    extract_swell_dataset,
    extract_dreamer_dataset,
    extract_amigos_dataset,
    extract_wesad_dataset,
    load_data,
)
from data_preprocessing import (
    swell_prepare_for_10fold,
    wesad_prepare_for_10fold,
    amigos_prepare_for_10fold,
    dreamer_prepare_for_10fold,
    current_time,
    makedirs,
    one_hot_encoding,
)
from send_emails import send_email

#data_folder = Path("D:\\Code Repo\\Working dir\\Self supervised task\\NEW_WORK_DIR\\data_folder\\full_data_filtered\\")

# Flag for the (currently commented-out) one-time extraction step below.
extract_data = 0

# For the first time run this:
#if extract_data == 1:
#    _ = extract_swell_dataset(overlap_pct= 1, window_size_sec= 10, data_save_path= data_folder, save= 1)
#    _ = extract_dreamer_dataset(overlap_pct= 1, window_size_sec= 10, data_save_path= data_folder, save= 1)
#    _ = extract_amigos_dataset(overlap_pct= 1, window_size_sec= 10, data_save_path= data_folder, save=1)
#    _ = extract_wesad_dataset(overlap_pct=1, window_size_sec=10, data_save_path= data_folder, save=1)

dirname = Path("/home/pritam/self_supervised_learning/")

# All four saved dictionaries live in the same sub-folder of dirname.
_filtered_dir = dirname / "data_folder" / "full_data_filtered"

# Load every dataset first, in the original order.
swell_data = load_data(_filtered_dir / "swell_dict.npy")
dreamer_data = load_data(_filtered_dir / "dreamer_dict.npy")
amigos_data = load_data(_filtered_dir / "amigos_dict.npy")
wesad_data = load_data(_filtered_dir / "wesad_dict.npy")

# Then run each dataset through its 10-fold preparation helper. The
# trailing notes are the original author's description of the columns.
swell_data = swell_prepare_for_10fold(swell_data)        # person, y_input_stress, y_arousal, y_valence,
wesad_data = wesad_prepare_for_10fold(wesad_data)        # person, y_stress
amigos_data = amigos_prepare_for_10fold(amigos_data)     # person, y_arousal, y_valence, y_dominance
dreamer_data = dreamer_prepare_for_10fold(dreamer_data)  # person, y_arousal, y_valence, y_dominance
# Importing the libraries
import seq2seq_wrapper
import importlib
# Re-execute seq2seq_wrapper in case it was already imported in this session.
importlib.reload(seq2seq_wrapper)
import data_preprocessing
import data_utils_1
import data_utils_2


########## PART 1 - DATA PREPROCESSING ##########

# Importing the dataset
# Backslashes are doubled: written with single backslashes, the literal
# contained "\U" (an invalid unicode escape) and ended in "\'" (which
# escapes the closing quote), so the file could not even be parsed.
# The resulting string value is the originally intended Windows path.
metadata, idx_q, idx_a = data_preprocessing.load_data(
    PATH='C:\\Users\\LENOVO\\Documents\\ChatBot\\The Best ChatBot\\')

# Splitting the dataset into the Training set and the Test set
(trainX, trainY), (testX, testY), (validX, validY) = data_utils_1.split_dataset(idx_q, idx_a)

# Embedding
# Sequence lengths are taken from the last axis of the training arrays.
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
batch_size = 16
vocab_twit = metadata['idx2w']
# Input and output sides share one vocabulary size.
xvocab_size = len(metadata['idx2w'])
yvocab_size = xvocab_size
emb_dim = 1024
idx2w, w2idx, limit = data_utils_2.get_metadata()
import importlib

# Re-execute the seq2seq_wrapper module.
# NOTE(review): seq2seq_wrapper is expected to be imported above this point.
# importlib.reload is preferred because the "imp" module was removed in
# Python 3.12; "imp" is only imported as a fallback on interpreters whose
# importlib has no reload().
try:
    _reload_module = importlib.reload
except AttributeError:
    import imp
    _reload_module = imp.reload
_reload_module(seq2seq_wrapper)

import data_preprocessing
import data_utils_1
import data_utils_2
from flask import Flask, jsonify, render_template, request


########## PART 1 - DATA PREPROCESSING ##########

# Importing the dataset
metadata, idx_q, idx_a = data_preprocessing.load_data(PATH = './')

# Splitting the dataset into the Training set and the Test set
(trainX, trainY), (testX, testY), (validX, validY) = data_utils_1.split_dataset(idx_q, idx_a)

# Embedding
# Sequence lengths are taken from the last axis of the training arrays.
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
batch_size = 16
vocab_twit = metadata['idx2w']
# Input and output sides share one vocabulary size.
xvocab_size = len(metadata['idx2w'])
yvocab_size = xvocab_size
emb_dim = 1024
idx2w, w2idx, limit = data_utils_2.get_metadata()
# Import order is kept as-is: the star import below may rebind names, and
# "sys" is deliberately imported after it.
import numpy as np
import theano
import theano.tensor as T
import time
import operator
from data_preprocessing import (
    load_data,
    load_model_parameters_theano,
    generate_sentences,
)
from gru_theano import *
import sys

# Vocabulary size passed to load_data.
VOCABULARY_SIZE = 8000

# Load data (this may take a few minutes)
X_train, y_train, word_to_index, index_to_word = load_data(
    "data/reddit-comments-2015.csv",
    VOCABULARY_SIZE,
)

# Load parameters of pre-trained model
model = load_model_parameters_theano('./data/pretrained.npz')

# NOTE(review): 100 is presumably the number of sentences to generate;
# confirm against generate_sentences.
generate_sentences(model, 100, index_to_word, word_to_index)