def main(): data = DataHelper("data/Twitter.csv") # data.MinMaxScale() rp = Representation(data.getdata()) # rp.cal_mean_by_class() # rp.cal_cosdist_by_eigenvect() _ = rp.cal_KLdivergence()
def run_blstm(
        dim_proj=256,         # LSTM number of hidden units
        dim_frame=4096,       # feature dimension of each image frame in the video
        output_dim=4096,
        v_length=24,          # video length (number of frames)
        max_iter=100,         # maximum number of epochs to run
        l2_decay=0.0001,      # weight decay for model params
        lrate=0.0001,         # learning rate for SGD / Adam
        lamb=0.2,
        optimizer='SGD',      # 'SGD' and 'Adam' are available
        saveto='pairwise-blstm_model.npz',  # the best model will be saved there
        dispFreq=2,           # display the training progress every N updates
        validFreq=20,         # compute the validation error after this number of updates
        saveFreq=2,           # save the parameters after every saveFreq updates
        batch_size=256,       # batch size during training
        valid_batch_size=20,  # batch size used for the validation/test set
        test_batch_size=1024,
        weights=[1. / 3., 1. / 3., 1. / 3.],  # weights for forward, backward and mean-value reconstruction
        pairwise_weight=0.999,
        reload_model=False,   # whether to reload the model from saveto
        is_train=False,
        test_step=1,
        hiera_step=2,
        train_data_file_path='/mnt/data2/lixiangpeng/dataset/features/FCV/fcv/',
        test_data_file_path='/mnt/data2/lixiangpeng/dataset/features/FCV/fcv/',
        # train_data_file_path='./',
        # test_data_file_path='./',
        train_splits_num=1,
        test_splits_num=1,
        record_path='./records.txt',
        SS_path='/mnt/data2/lixiangpeng/dataset/features/FCV/SimilarityInfo/Sim_K1_10_K2_5_fcv.pkl'):
    # locals().copy() as the first statement captures every keyword argument
    # above as the options dict.
    model_options = locals().copy()
    if reload_model:
        print "Reloading model options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)
    print "model options", model_options

    test_data = DataHelper.DataHelper(model_options['v_length'],
                                      model_options['valid_batch_size'],
                                      model_options['dim_frame'],
                                      data_file='./data/fcv_test_demo.h5',
                                      train=False)
    model = build_model(model_options)
    if reload_model:
        model.reload_params(saveto)
    model.compile(model_options)
    if is_train:
        model.train(model_options['train_data_file_path'], test_data, model_options)
    else:
        model.test(test_data, model_options)
def main():
    data = DataHelper('data/Twitter.csv')
    data.MinMaxScale()
    data = data.getdata()

    traindata = data[:4]
    # traindata = oversamplingSMOTE(traindata, read=True)
    traindata = np.concatenate(traindata)
    testdata = data[4:]

    # method = 'KNN'            # runs in about 40s,   acc 96.75%, f-measure 0.65
    # method = 'SVM'            # runs in about 30s,   acc 95%,    f-measure overfits (nan)
    # method = 'Random Forest'  # runs in about 0.44s, acc 98.28%, f-measure 0.79
    method = 'XGBoost'

    start_time = time.time()  # renamed: 'start' was shadowed by the drift-window index below
    clf = train(method, traindata)
    acc = []
    imbalanced = []

    # Drift intervals (in raw chunk indices) reported by each detector.
    MDDT = [[5, 9], [11, 14], [16, 18], [25, 29], [31, 37]]
    CUSUM = [[13, 16], [23, 24], [25, 26], [28, 30], [35, 38]]
    PH = [[7, 8], [13, 14], [25, 26]]

    for i in range(len(testdata) // 4):
        print(i)
        testbatch = np.concatenate(testdata[i * 4:(i + 1) * 4])
        try:
            acc.append(test(clf, testbatch, criteria='acc'))
            imbalanced.append(test(clf, testbatch, criteria='confusion matrix'))
        except Exception:
            acc.append(0)
            imbalanced.append([0, 0, 0])
        testbatch = np.concatenate(testdata[i * 4:(i + 1) * 4])
        clf.fit(testbatch[:, :-1], testbatch[:, -1])
        # If this batch is where a detected drift interval starts, refit the
        # classifier on the chunks inside that interval.
        for t in MDDT:
            t_start = np.floor(t[0] / 4.0) - 1
            t_end = np.ceil(t[1] / 4.0) - 1
            if t_start == i:
                try:
                    testbatch = np.concatenate(testdata[t[0] - 4:t[1] - 4])
                    clf.fit(testbatch[:, :-1], testbatch[:, -1])
                except Exception:
                    continue
                break

    df = pd.DataFrame(acc)
    # df.to_csv("MDDT_ACC_XGB.csv", header=None, index=None)
    df = pd.DataFrame(imbalanced)
    # df.to_csv("MDDT_CONFUSION_XGB.csv", header=None, index=None)
    print("time:", time.time() - start_time)
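# ---------------------------------------------------------------------------
# Aside (not part of the original file): the floor/ceil arithmetic above maps
# raw chunk indices to test-batch indices. Chunks 0-3 form the training set,
# so test batch i covers raw chunks 4*i + 4 .. 4*i + 7, and raw chunk c lands
# in batch floor(c / 4) - 1. A quick check with the first MDDT interval [5, 9]:
#     floor(5 / 4.0) - 1 == 0, and batch 0 indeed covers raw chunks 4..7.
# ---------------------------------------------------------------------------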
def parallelize(self):
    try:
        # Spawn one worker per index pair; each worker reports its progress
        # through the shared queue.
        for index, pair in enumerate(self.index_pairs):
            proc = Process(target=DataHelper.DataHelper(self.queue,
                                                        self.xmls[pair[0]:pair[1] + 1]).run,
                           args=(pair[0], pair[1]))
            self.procs.append(proc)
            proc.start()
        for proc in self.procs:
            proc.join()
    except KeyboardInterrupt:
        # Poll until every worker process has terminated (join with a timeout
        # would also work here).
        while True:
            alive_proc = False
            for proc in self.procs:
                if proc.is_alive():
                    alive_proc = True
            if not alive_proc:
                break
        print("\nKEYBOARD INTERRUPT !!\n")
        # Save a checkpoint: drain one progress record per worker from the queue.
        check_message = ''
        pairs = [self.queue.get() for i in range(core_number)]
        pairs.sort()
        for pair in pairs:
            check_message += str(pair[0]) + ' / ' + str(pair[1]) + ' , '
            self.progress += int(pair[2])
        check_message += str(self.progress) + ' / ' + str(self.total_number) + '\n'
        print('{}/{} : {:.2f}% Complete..\n'.format(self.progress, self.total_number,
                                                    self.progress / self.total_number * 100))
        self.file_checkpoint.write(check_message)
        self.file_checkpoint.close()
        return
    print('Processing of {} XML records completed; results stored in the database!'.format(self.total_number))
    self.file_checkpoint.close()
import tensorflow as tf
import numpy as np

from DataHelper import *
from Model import *

data_helper = DataHelper(pos_data_file="rt-polaritydata/rt-polarity.pos",
                         neg_data_file="rt-polaritydata/rt-polarity.neg",
                         training_data_proportion=.9)

with tf.Session() as session:
    model = Model(sent_len=data_helper.max_sent_len,
                  category_num=2,
                  vocab_size=data_helper.vocab_size,
                  emb_dim=300,
                  filter_widths=[3, 4, 5, 6, 7],
                  filter_num=100,
                  l2_reg_lambda=100.)
    model.train(session=session,
                data_helper=data_helper,
                dropout_keep_prob=.5,
                max_norm_constraint=3.,
                batch_size=50,
                epoch_num=50,
                step_num_between_validations=50)
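# ---------------------------------------------------------------------------
# Aside (not part of the original file): Model.train() above accepts a
# max_norm_constraint argument; its internals are not shown here. Such a
# constraint is commonly enforced by re-projecting a weight tensor after each
# gradient step, e.g. (illustrative sketch; 'some_weight' is an assumed name):
#
#   clipped_w = tf.clip_by_norm(some_weight, clip_norm=3.0, axes=[0])
#   apply_max_norm = tf.assign(some_weight, clipped_w)
#   # then session.run(apply_max_norm) after each optimizer step
# ---------------------------------------------------------------------------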
def test(_encoder, options, uidx):
    print 'loading test data...'
    hashcode_array = Array()
    h_array = Array()
    label_array = Array()
    lines_num = 0
    for i in range(1, options['test_splits_num'] + 1):
        file_name = 'fcv_test_feats.h5'
        labels_name = 'fcv_test_labels.mat'
        print 'loading ', file_name
        test_data = DataHelper.DataHelper(options['v_length'],
                                          options['batch_size'],
                                          options['dim_frame'],
                                          data_file=options['test_data_file_path'] + file_name,
                                          train=True)
        labels = sio.loadmat(options['test_data_file_path'] + labels_name)['labels']
        lines_num += test_data.data_size_
        if i == 1:
            label_array.setmatrcs(labels)
        else:
            label_array.concate_v(labels)
        print 'data_size: ', test_data.data_size_
        print 'batch_size: ', test_data.batch_size_

        batch_num = test_data.data_size_ / options['test_batch_size']
        if test_data.data_size_ % options['test_batch_size'] != 0:
            batch_num += 1
        for batch_idx in range(batch_num):
            print 'batch_idx: ', batch_idx
            time1 = time.time()
            if batch_idx == (batch_num - 1):
                # Last batch: zero-pad up to a full test batch.
                X = test_data.data_[batch_idx * options['test_batch_size']:][:, :options['v_length'], :]
                X = np.row_stack((X, np.float32(np.zeros((options['test_batch_size'] - X.shape[0],
                                                          options['v_length'],
                                                          options['dim_frame'])))))
            else:
                X = test_data.data_[batch_idx * options['test_batch_size']:
                                    (batch_idx + 1) * options['test_batch_size']][:, :options['v_length'], :]
            time2 = time.time()
            print 'fetching data costs: ', time2 - time1
            print 'batch data shape: ', X.shape
            my_H = _encoder(X,
                            np.zeros((X.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((X.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((X.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((X.shape[0], options['dim_proj']), dtype=np.float32))
            time3 = time.time()
            print 'forward costs: ', time3 - time2
            print 'my_H: ', my_H.shape
            BinaryCode = sign(my_H)
            if i == 1 and batch_idx == 0:
                hashcode_array.setmatrcs(BinaryCode)
                h_array.setmatrcs(my_H)
            else:
                hashcode_array.concate_v(BinaryCode)
                h_array.concate_v(my_H)

    # Drop the zero-padded rows appended to the last batch. Note: the original
    # code truncated hashcode_array into h_array here, which would overwrite
    # the real-valued codes with binary ones; fixed to truncate h_array itself.
    hashcode_array.setmatrcs(hashcode_array.getmatrics()[:lines_num])
    h_array.setmatrcs(h_array.getmatrics()[:lines_num])
    print 'hashcode shape:', hashcode_array.getmatrics().shape
    # sio.savemat(str(options['dim_proj']) + '_' + 'hashcode_' + str(uidx) + '.mat',
    #             {'hashcode': hashcode_array.getmatrics()})
    # sio.savemat(str(options['dim_proj']) + '_' + 'h_' + str(uidx) + '.mat',
    #             {'h': h_array.getmatrics()})

    test_hashcode = hashcode_array.getmatrics()
    print 'test_hashcode: ', test_hashcode.shape
    test_hashcode = np.matrix(test_hashcode)

    time1 = time.time()
    # For codes in {-1, +1}^K, Hamming distance d_H(a, b) = (K - <a, b>) / 2.
    Hamming_distance = 0.5 * (-np.dot(test_hashcode, test_hashcode.transpose()) + options['dim_proj'])
    time2 = time.time()
    print 'hamming distance computation costs: ', time2 - time1
    HammingRank = np.argsort(Hamming_distance, axis=0)
    time3 = time.time()
    print 'hamming ranking costs: ', time3 - time2

    labels = label_array.getmatrics()
    print 'labels shape: ', labels.shape
    sim_labels = np.dot(labels, labels.transpose())
    time6 = time.time()
    print 'similarity labels generation costs: ', time6 - time3

    records = open('map.txt', 'w+')
    maps = []
    for topK in range(5, 105, 5):
        map_k = tools.mAP(sim_labels, HammingRank, topK)
        maps.append(map_k)
        records.write('epoch: ' + str(uidx) + '\ttopK: ' + str(topK) + '\tmap: ' + str(map_k) + '\n')
        print 'topK: ', topK, ' map: ', map_k, '\n'
    time7 = time.time()
    records.close()
    print 'computing processing costs: ', time7 - time6
    return maps
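# ---------------------------------------------------------------------------
# Aside (not part of the original file): a quick numerical check of the
# Hamming-via-inner-product identity used above. For +/-1 codes of length K,
# <a, b> = (#agreements - #disagreements) = K - 2 * d_H(a, b), hence
# d_H(a, b) = 0.5 * (K - <a, b>):
#
#   import numpy as np
#   K = 8
#   a = np.sign(np.random.randn(K))
#   b = np.sign(np.random.randn(K))
#   assert (a != b).sum() == 0.5 * (K - np.dot(a, b))
# ---------------------------------------------------------------------------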
pos_len = len(pos)
neg_len = len(neg)
length = pos_len + neg_len
labels = [[1, 0]] * pos_len
labels += [[0, 1]] * neg_len
labels = np.array(labels)
data = np.concatenate((pos, neg), axis=0)

# shuffle data
indices = np.random.permutation(length)
data = data[indices]
labels = labels[indices]

# 90/10 train/test split; the slice index must be an int, not a float
split = int(0.9 * length)
test_data = data[split:]
test_labels = labels[split:]
data = data[:split]
labels = labels[:split]

d = DataHelper(50, data.shape[0], data, word_vec, word_indice, labels)
t_d = DataHelper(50, test_data.shape[0], test_data, word_vec, word_indice, test_labels)

optm = tf.train.AdamOptimizer(1e-3)
train_ops = optm.apply_gradients(optm.compute_gradients(cross_entropy))
sess = tf.Session()
sess.run(tf.initialize_all_variables())


def train_step(x, y):
    feed_dicts = {x_placeholder: x, y_placeholder: y, prob: 0.5}
    loss = sess.run([train_ops], feed_dict=feed_dicts)


with sess.as_default():
with sess.as_default():
    # Load the saved meta graph and restore variables
    saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    saver.restore(sess, checkpoint_file)

    # Get the placeholders from the graph by name
    input_x = graph.get_operation_by_name("input_x").outputs[0]
    # Get the dropout operation from the graph by name
    dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
    # Tensors we want to evaluate
    prediction_op = graph.get_operation_by_name("output/predictions").outputs[0]

    test_dm = DataHelper(test_data)

    # Generate a padding dataframe and prepend it so the first sentences still
    # have a full history window. The index must cover all
    # history_size1 + history_size2 padding rows (the original sized it with
    # history_size1 only, leaving the later assignments without a matching row).
    padding = pd.DataFrame(columns=['sentence'],
                           index=range(FLAGS.history_size1 + FLAGS.history_size2))
    for i in range(FLAGS.history_size1 + FLAGS.history_size2):
        padding['sentence'][i] = np.array([0] * max_sentence_length)
    padding = padding['sentence']

    unshuffled_test_data = test_dm.get_test_contents()

    # Collect the predictions here
    all_predictions = []
    for x_test in unshuffled_test_data:
        x_test = padding.append(x_test, ignore_index=True)
        for i in range(len(x_test) - FLAGS.history_size1 - 1):
            x_test_batch = x_test[i:i + FLAGS.history_size1 + FLAGS.history_size2 + 1].values.tolist()
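# ---------------------------------------------------------------------------
# Aside (not part of the original file): each evaluation example is a sliding
# window of history_size1 past sentences, the current sentence, and
# history_size2 following sentences. A toy check with history_size1 = 2 and
# history_size2 = 1 (window length 2 + 1 + 1 = 4):
#
#   seq = list('abcdefg')
#   windows = [seq[i:i + 4] for i in range(len(seq) - 3)]
#   # [['a','b','c','d'], ['b','c','d','e'], ['c','d','e','f'], ['d','e','f','g']]
# ---------------------------------------------------------------------------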
summaries, loss, accuracy, precision, recall = sess.run([
    dev_summary_op, model.loss, model.accuracy, dev_precision_op, dev_recall_op
], feed_dict)
if validation_step % 1000 == 0:
    time_str = datetime.datetime.now().isoformat()
    print("dev> {}: step {}, loss {:g}, acc {:g}, pre {:g}, recall {:g}".format(
        time_str, validation_step, loss, accuracy, precision, recall))
if writer:
    writer.add_summary(summaries, validation_step)

train_dm = DataHelper(whole_train_data)
dev_dm = DataHelper(whole_dev_data)

# Zero padding for the first sentences that do not have history sentences;
# as in the test script, the index must cover all padding rows.
padding = pd.DataFrame(columns=['sentence'],
                       index=range(FLAGS.history_size1 + FLAGS.history_size2))
for i in range(FLAGS.history_size1 + FLAGS.history_size2):
    padding['sentence'][i] = np.array([0] * max_sentence_length)
padding = padding['sentence']

validation_step = 0
# Generate batches
for epoch_i in range(FLAGS.num_epochs):
    train_data = train_dm.get_contents(shuffle=True)
    for x_batch, y_batch in train_data:
        x_batch = padding.append(x_batch, ignore_index=True)
    })
    print('Validation-time loss = {}'.format(test_loss))

    if save_model:
        saver.save(sess, save_path)
        print('Saved trained model to disk!')

    # optional plot
    print('Plotting predictions...')
    plt.figure(figsize=(12, 7))
    plt.plot(test_predictions[:, -1, :], color='green', alpha=0.9, label='Predicted Value')
    plt.plot(y_test, color='red', alpha=0.5, label='Ground truth')
    ax = plt.gca()
    ax.grid(color='black', alpha=0.12)
    plt.xlabel('Timestep')
    plt.ylabel('Value')
    plt.title('Predictions vs. Ground truth')
    plt.legend()
    plt.show()
    print('Process complete!')


if __name__ == '__main__':
    helper = DataHelper('../data/PFE.csv')
    tf.reset_default_graph()
    predictor = PricePredictor(learning_rate=0.0001)
    predictor.run(helper, 100, save_model=True, save_path='../model/model.ckpt')
labels = np.array(labels)
data = np.concatenate((pos, neg), axis=0)

# shuffle data
indices = np.random.permutation(length)
data = data[indices]
labels = labels[indices]

# hold out 10% of the data for validation
data_length = len(data)
dev_indice = int(0.1 * data_length)
print dev_indice
train_data = data[dev_indice:]
train_labels = labels[dev_indice:]
dev_data = data[:dev_indice]
dev_labels = labels[:dev_indice]

d = DataHelper(50, len(train_data), train_data, word_vec, word_indice, train_labels)
dev_d = DataHelper(50, len(dev_data), dev_data, word_vec, word_indice, dev_labels)

cnn = CNNNet(4, [3, 4, 5], 50, 300, 56, 2)
optm = tf.train.AdamOptimizer(1e-3)
train_ops = optm.apply_gradients(optm.compute_gradients(cnn.loss))

tf.summary.scalar('accuracy', cnn.accuracy)
tf.summary.scalar('loss', cnn.loss)
tf.summary.histogram('full_w', cnn.full_w)
summary = tf.merge_all_summaries()

sess = tf.Session()
summaryWriter = tf.train.SummaryWriter('./log/summary/', sess.graph)
sess.run(tf.initialize_all_variables())
def train(self, train_data_path, test_data, options):
    validFreq = options['validFreq']
    saveFreq = options['saveFreq']
    dispFreq = options['dispFreq']
    max_iter = options['max_iter']
    saveto = options['saveto']
    train_loss_his = []
    test_loss_his = []
    start_time = time.time()
    # test_loss_ = self.test_loss(self._test, test_data, options)
    # test_loss_his.append(test_loss_)
    # print 'Valid cost:', test_loss_
    train_loss = 0.
    records_file = open(options['record_path'], 'w+')

    file_name = options['train_data_file_path'] + 'fcv_train_feats.h5'
    train_data = DataHelper.DataHelper(options['v_length'], options['batch_size'],
                                       options['dim_frame'], data_file=file_name, train=True)
    H = np.zeros([train_data.data_size_, options['dim_proj']], dtype=np.float32)
    try:
        for uidx in xrange(1, max_iter + 1):
            # iterate over the splits of an epoch
            for eidx in xrange(1, options['train_splits_num'] + 1):
                # for YFCC:
                # file_name = options['train_data_file_path'] + 'yfcc_train_feats_' + str(eidx) + '.h5'
                # for FCV:
                file_name = options['train_data_file_path'] + 'fcv_train_feats.h5'
                train_data = DataHelper.DataHelper(options['v_length'], options['batch_size'],
                                                   options['dim_frame'], data_file=file_name,
                                                   train=True)
                print 'loading data: ' + file_name

                # number of batches in this split
                m = train_data.data_size_ / train_data.batch_size_
                if train_data.data_size_ % train_data.batch_size_ != 0:
                    m += 1
                print 'm: ', m
                for i in range(0, m):
                    if i == (m - 1):
                        idxs = train_data.idx_[i * options['batch_size']:]
                    else:
                        idxs = train_data.idx_[i * options['batch_size']:(i + 1) * options['batch_size']]
                    x = indexContent(train_data, idxs)
                    # the ten initial forward/backward LSTM states are all zeros
                    zero_states = [np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32)
                                   for _ in range(10)]
                    [H, train_loss, loss_pairwise, reconstruction_loss] = self._train(x, idxs, H, *zero_states)
                    if i % 10 == 0:
                        print 'Epoch: ', uidx, '\tPart: ', eidx, '\tBatch: ', i, '\tCost: ', train_loss, \
                            '\tpairwise_loss: ', loss_pairwise, '\treconstruction_loss: ', reconstruction_loss
                        records_file.write('Epoch: ' + str(uidx) + '\tPart: ' + str(eidx) +
                                           '\tBatch: ' + str(i) + '\tCost: ' + str(train_loss) +
                                           '\tpairwise_loss: ' + str(loss_pairwise) +
                                           '\treconstruction_loss: ' + str(reconstruction_loss) + '\n')

            if uidx % options['validFreq'] == 0:
                print 'start testing...'
                maps = evaluation.test(self._encoder, options, uidx)

            if np.isnan(train_loss) or np.isinf(train_loss):
                print 'bad cost detected: ', train_loss

            if np.mod(uidx, dispFreq) == 0 or uidx == 1:
                train_loss = train_loss / (x.shape[0] * x.shape[1])
                train_loss_his.append(train_loss)
                print 'Step ', uidx, 'Train cost:', train_loss

            if saveto and np.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                params_to_save = self.get_params_value()
                updates_value = self.get_updates_value()
                np.savez(saveto, params=params_to_save, updates_v=updates_value,
                         train_loss_his=train_loss_his)
                pkl.dump(options, open('%s.pkl' % saveto, 'wb'), -1)
                print 'Save Done'
    except KeyboardInterrupt:
        print "Training interrupted"
        print 'Saving records!'
    records_file.close()
    if saveto:
        print 'Saving...',
        params_to_save = self.get_params_value()
        updates_value = self.get_updates_value()
        np.savez(saveto, params=params_to_save, updates_v=updates_value,
                 train_loss_his=train_loss_his, test_loss_his=test_loss_his)
        pkl.dump(options, open('%s.pkl' % saveto, 'wb'), -1)
        print 'Save Done'

    end_time = time.time()
    print('Training took %.1fs' % (end_time - start_time))
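# ---------------------------------------------------------------------------
# Aside (not part of the original file): the checkpoint written above is a
# plain NumPy archive plus a pickled options dict, so it can be inspected
# without the model code. A minimal sketch, assuming the default saveto name:
#
#   import numpy as np
#   import cPickle as pkl
#   arch = np.load('pairwise-blstm_model.npz')  # pass allow_pickle=True on NumPy >= 1.16.3
#   print arch.files   # e.g. ['params', 'updates_v', 'train_loss_his', 'test_loss_his']
#   options = pkl.load(open('pairwise-blstm_model.npz.pkl', 'rb'))
# ---------------------------------------------------------------------------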