def test_fun(var_file, pool_size, num_feature, stp, parse_type):
    # create result directory
    res_dir = '/'.join(var_file.split('/')[:-1]) + '/results/'
    if not os.path.isdir(res_dir):
        os.mkdir(res_dir)
    # initializing model
    nn, isize = init_model(var_file)
    isize = 50  # overrides the size returned by init_model
    wvect = Word_vector(isize, vtype='msrp')
    # a.optimum.opt_var.g = g
    # getting data that is processed for testing purpose
    train, train_label, test, test_label = get_msrp_data(stp)
    # preprocessing for train and test set
    if os.path.isfile('/'.join(var_file.split('/')[:-1]) + '/vectors.pickle'):
        wvect.load_vector('/'.join(var_file.split('/')[:-1]) + '/vectors.pickle')
    data_processing = preprocess(parsing_type=parse_type, structure_type='h', stopword=stp, wvect=wvect)
    train_set, _ = data_processing.process_words_data(train)
    test_set, _ = data_processing.process_words_data(test)
    # generating fixed size phrase vector for train and test set
    otrain = generate_fixed_vector(nn, train_set, num_feature, pool_size)
    otest = generate_fixed_vector(nn, test_set, num_feature, pool_size)
    # classifier definition
    # clf = MLPClassifier(activation='logistic', solver='adam', alpha=0.0001, batch_size='auto',
    #                     learning_rate='adaptive', max_iter=10000, tol=1e-5, verbose=0)
    clf = svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.00001, C=1.0,
                        multi_class='ovr', fit_intercept=True, intercept_scaling=1,
                        class_weight=None, verbose=0, random_state=None, max_iter=10000)
    clf.fit(otrain, train_label)
    score = clf.predict(otest)
    # nn classifier
    # train_label = [[0.0, 1.0] if i == 1 else [1.0, 0.0] for i in train_label]
    # i_size, o_size = len(otrain[0]), len(train_label[0])
    # clf = NN(i_size, 2 * i_size, o_size, batch_size=50, epoch=200, neta=.0001, op_method='adam', errtol=0.0001)
    # clf.train(zip(otrain, train_label))
    # score = clf.pridect(otest)
    # score = [np.argmax(i) for i in score]
    # getting results
    tp, tn, fp, fn, acc, f1 = get_results(score, test_label)
    result = ('\npool size : %d,\tnumber feature : %d,\t stopword : %d\n'
              '\tTrue positive : %d\n\tTrue negative : %d\n\tFalse positive : %d\n'
              '\tFalse negative : %d\n\taccuracy : %f\n\tf1 score : %f\n'
              % (pool_size, num_feature, stp, tp, tn, fp, fn, acc, f1))
    print result
    # logging result in file
    open(res_dir + 'res.txt', 'a').write(result)
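# Illustrative sweep over test_fun's settings (a sketch, not part of the
# original module): the variable-file path and the value grids below are
# assumptions chosen for illustration; parse_type 'syn' matches the
# parsing_type used elsewhere in this file.
def sweep_test_fun(var_file='weights/variables.pickle'):
    for stp in [0, 1]:                     # with and without stopword removal
        for pool_size in [5, 10, 15]:      # hypothetical pooling sizes
            for num_feature in [50, 100]:  # hypothetical feature counts
                test_fun(var_file, pool_size, num_feature, stp, 'syn')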
def main_train(args):
    print_setting(args)
    wfname, log = log_data(args)
    # small dataset used later for gradient checking
    words_data_check = pickle.load(open("/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/110.pickle", 'rb'))
    # words_data = pickle.load(open("/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/110.pickle", 'rb'))
    words_data = pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/msr_paraphrase_train.pickle', 'rb')) \
        + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/bbc.pickle', 'rb')) \
        + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/1all-news.pickle', 'rb'))
    # + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/2all-news.pickle', 'rb'))
    # + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/3all-news.pickle', 'rb'))
    # + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/4all-news.pickle', 'rb'))
    # + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/5all-news.pickle', 'rb'))
    wvect = Word_vector(args['v_size'])
    data_processing = preprocess(parsing_type=args['parse-type'], structure_type=args['type'],
                                 stopword=args['stp'], wvect=wvect)
    data, wpresent = data_processing.process_words_data(words_data)
    nn = create_model(wfname, args['wload'], args['model'], args['method'], args['v_size'],
                      args['h_size'], neta=args['neta'], wpresent=wpresent, logg=log['rae'], vector=wvect)
    nn.train(xs=data, epoch=args['epoch'], batch_size=args['batch_size'])
    nn.save_variables(wfname)
    print "model variables saved."
    data, wpresent = data_processing.process_words_data(words_data_check)
    nn.gradient_check(data, sorted(wpresent))
    # wvect.save_vector('/'.join(wfname.split('/')[:-1]) + '/vectors.pickle')
    # print "vector variables saved."
    return
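# Illustrative args dict for main_train (a sketch, not the original argument
# parser): the keys below are exactly the ones main_train reads; every value
# is an assumed example, with 'syn'/'h' and the rmsprop/learning-rate choices
# taken from the preprocess and Optimization calls used elsewhere in this file.
example_args = {
    'v_size': 50,         # word vector size (isize is 50 elsewhere in this file)
    'h_size': 100,        # hidden size, assumed
    'parse-type': 'syn',  # parsing type, 'syn' or 'chk' as used elsewhere
    'type': 'h',          # structure type
    'stp': 1,             # stopword flag
    'wload': False,       # whether to load saved weights, assumed semantics
    'model': 'RAE',       # model name, assumed
    'method': 'rmsprop',  # optimization method, as used in msrp_train_test
    'neta': 0.0001,       # learning rate, matching msrp_train_test
    'epoch': 100,         # assumed
    'batch_size': 50,     # matching msrp_train_test
}
# main_train(example_args)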
def test_fun(var_file, pool_size, num_feature, stp, parse_type):
    # create result directory and classifier cache file
    clfFile = 'weights/classifier_' + str(num_feature) + str(stp) + '.pkl'
    res_dir = '/'.join(var_file.split('/')[:-1]) + '/results/'
    if not os.path.isdir(res_dir):
        os.mkdir(res_dir)
    # getting data that is processed for testing purpose
    train, train_label, test, test_label = get_msrp_data(stp)
    # initializing model
    isize, hsize, w, b, g = pickle.load(open(var_file, 'rb'))
    nn = stack_RAE(input_size=isize, hidden_size=hsize)
    nn.w = w
    nn.b = b
    # a.optimum.opt_var.g = g
    # preprocessing for train and test set
    wvect = Word_vector(isize)
    data_processing = preprocess(parsing_type=parse_type, structure_type='h', stopword=stp, wvect=wvect)
    train_set, _ = data_processing.process_words_data(train)
    test_set, _ = data_processing.process_words_data(test)
    # generating fixed size phrase vector for train and test set
    # (built before the cache check so otest is available in both branches)
    otrain = generate_fixed_vector(nn, train_set, num_feature, pool_size)
    otest = generate_fixed_vector(nn, test_set, num_feature, pool_size)
    if os.path.isfile(clfFile):
        # reuse a previously trained classifier
        clf = pickle.load(open(clfFile, 'rb'))
    else:
        # classifier definition
        # clf = MLPClassifier(activation='logistic', solver='adam', alpha=0.0001, batch_size='auto',
        #                     learning_rate='adaptive', max_iter=10000, tol=1e-5, verbose=0)
        # clf = svm.LinearSVC(penalty='l1', tol=0.001, C=1.0, loss='hinge', fit_intercept=True,
        #                     intercept_scaling=1.0, dual=True, verbose=0, random_state=7, max_iter=100000)
        clf = svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.00001, C=1.0,
                            multi_class='ovr', fit_intercept=True, intercept_scaling=1,
                            class_weight=None, verbose=0, random_state=None, max_iter=1000)
        # performing classifier training
        clf.fit(otrain, train_label)
        pickle.dump(clf, open(clfFile, 'wb'))
    # performing prediction
    score = clf.predict(otest)
    # getting results
    tp, tn, fp, fn, acc, f1 = get_results(score, test_label)
    print acc, f1
    result = ('\npool size : %d,\tnumber feature : %d,\t stopword : %d\n'
              '\tTrue positive : %d\n\tTrue negative : %d\n\tFalse positive : %d\n'
              '\tFalse negative : %d\n\taccuracy : %f\n\tf1 score : %f\n'
              % (pool_size, num_feature, stp, tp, tn, fp, fn, acc, f1))
    print result
    # logging result in file
    open(res_dir + 'res.txt', 'a').write(result)
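# get_results is defined elsewhere; as a reference for the six-tuple it
# returns, a minimal sketch of confusion-matrix counts plus accuracy and F1
# (assuming label 1 is the positive class) could look like this:
def get_results_sketch(score, label):
    tp = sum(1 for s, l in zip(score, label) if s == 1 and l == 1)
    tn = sum(1 for s, l in zip(score, label) if s == 0 and l == 0)
    fp = sum(1 for s, l in zip(score, label) if s == 1 and l == 0)
    fn = sum(1 for s, l in zip(score, label) if s == 0 and l == 1)
    acc = float(tp + tn) / max(len(label), 1)
    precision = float(tp) / max(tp + fp, 1)
    recall = float(tp) / max(tp + fn, 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-12)
    return tp, tn, fp, fn, acc, f1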
from util.Preprocessing import preprocess
import pickle

# quick visual comparison of the two preprocessing modes: prints the word
# groupings produced for each sentence and waits for a keypress between examples
word_data = pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/msr_paraphrase_train.pickle', 'rb'))
data_processing = preprocess(parsing_type='syn', structure_type='nh', stopword=1)
data1, _ = data_processing.process_words_data(word_data)
data_processing = preprocess(parsing_type='chk', structure_type='h', stopword=0)
data2, _ = data_processing.process_words_data(word_data)
for i in range(len(data1)):
    words = data1[i]['words']
    wlen = len(words)
    for j in data1[i]['h_vect']:
        print [words[k] for k in j],
        # store the joined phrase at a new index so later groups can refer to it
        words[wlen] = ' '.join([words[k] for k in j])
        wlen += 1
    print
    words = data2[i]['words']
    for j in data2[i]['h_vect']:
        print [words[k] for k in j],
    print
    raw_input()
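# For reference, the loop above assumes each processed entry carries an
# index-addressable 'words' map and an 'h_vect' list of child-index groups,
# roughly like this (values made up for illustration):
#   data1[0] == {
#       'words': {0: 'the', 1: 'cat', 2: 'sat'},  # tokens, later extended with
#                                                 # joined phrases at indices
#                                                 # wlen, wlen + 1, ...
#       'h_vect': [[0, 1], [3, 2]],               # indices combined at each step
#   }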
def msrp_train_test(var_file, stp, parse_type):
    # create result directory
    res_dir = '/'.join(var_file.split('/')[:-1]) + '/results/'
    log_file = '/'.join(var_file.split('/')[:-1]) + '/msrp_train_log.txt'
    if not os.path.isdir(res_dir):
        os.mkdir(res_dir)
    # getting data that is processed for testing purpose
    train, train_label, test, test_label = get_msrp_data(stp)
    # initializing model
    isize = 50
    osize = 2
    wvect = Word_vector(isize, vtype='msrp')
    # a.optimum.opt_var.g = g
    # preprocessing for train and test set
    data_processing = preprocess(parsing_type=parse_type, structure_type='h', stopword=stp, wvect=wvect)
    train_set, _ = data_processing.process_words_data(train)
    test_set, _ = data_processing.process_words_data(test)
    train_label = [np.array([[0.0], [1.0]]) if i == 1 else np.array([[1.0], [0.0]]) for i in train_label]
    # pair up consecutive sentences: each MSRP example is a sentence pair
    train = [[train_set[i], train_set[i + 1]] for i in range(0, len(train_set), 2)]
    test = [[test_set[i], test_set[i + 1]] for i in range(0, len(test_set), 2)]
    # model layers initialization
    isize, hsize, w, b, g = pickle.load(open(var_file, 'rb'))
    opt_var = Optimization_variable(method='rmsprop', isize=isize, osize=hsize,
                                    model_type='RAE', option={'wpresent': []})
    opt = Optimization(optimization_variable=opt_var, learning_rate=0.0001)
    rae_layer = stack_RAE(input_size=isize, hidden_size=hsize, optimization=opt,
                          hidden_activation=tanh, hidden_deactivation=dtanh)
    rae_layer.w = w
    rae_layer.b = b
    rae_layer.optimum.opt_var.g = g
    rae_layer.init_dw(w, b)
    nn_layer = NN(2 * hsize, osize)
    nn_layer.optimum = Optimization(
        optimization_variable=Optimization_variable(method='rmsprop', isize=2 * hsize, osize=osize),
        learning_rate=0.0001)
    logg = logger('text', log_file)
    # training of model
    epoch = 10000
    batch_size = 50
    for ep in range(epoch):
        batches = mini_batch(zip(train, train_label), len(train_set), batch_size)
        cost = 0.0
        ecount = 0
        for batch in range(len(batches)):
            for data in batches[batch]:
                # forward pass of RAE
                vect1, v1 = rae_layer.encoding(data[0][0])
                vect2, v2 = rae_layer.encoding(data[0][1])
                data[0][0]['vects'] = vect1
                data[0][1]['vects'] = vect2
                # forward pass of NN on the concatenated root vectors
                o, vnn = nn_layer.forward_pass(
                    np.concatenate((vect1[len(vect1) - 1], vect2[len(vect2) - 1]), axis=0))
                # cost calculation
                cost += ce_erro(o, data[1])
                ecount += 1
                grad = o - data[1]
                # backward pass of NN
                nngrad = nn_layer.backward_pass(grad, vnn)
                # backward pass of RAE
                nngrad1, nngrad2 = np.split(nngrad, [hsize], axis=0)
                grad1 = rae_layer.encoding_back(nngrad1, v1, data[0][0])
                grad2 = rae_layer.encoding_back(nngrad2, v2, data[0][1])
                # calculating weight update
                nn_layer.calc_dw(grad, np.concatenate(
                    (vect1[len(vect1) - 1], vect2[len(vect2) - 1]), axis=0))
                rae_layer.calc_dw(grad1, data[0][0])
                rae_layer.calc_dw(grad2, data[0][1])
            # updating weights
            nn_layer.update_weights()
            rae_layer.update_weights()
        if (ep + 1) % 50 == 0:
            score = msrp_test([rae_layer, nn_layer], test)
            tp, tn, fp, fn, acc, f1 = get_results(score, test_label)
            msg = ('stopword : %d, Tp : %d, Tn : %d, Fp : %d, Fn : %d, acc : %f, f1 score : %f'
                   % (stp, tp, tn, fp, fn, acc, f1))
            print msg
            logg.log_text(msg)
            pickle.dump([rae_layer.w, rae_layer.b, nn_layer.w],
                        open('/'.join(var_file.split('/')[:-1]) + '/results/weights' + str(ep) + '.pickle', 'wb'))
        print "%d/%d epoch completed ....\nerror : %f" % (ep + 1, epoch, cost / ecount)
        logg.log_text("%d/%d epoch completed ....\n" % (ep + 1, epoch))
        if cost / ecount < 0.01:
            break
    # getting results
    score = msrp_test([rae_layer, nn_layer], test)
    tp, tn, fp, fn, acc, f1 = get_results(score, test_label)
    result = ('\nstopword : %d\n\tTrue positive : %d\n\tTrue negative : %d\n\tFalse positive : %d\n'
              '\tFalse negative : %d\n\taccuracy : %f\n\tf1 score : %f\n'
              % (stp, tp, tn, fp, fn, acc, f1))
    print result
    # logging result in file
    open(res_dir + 'res.txt', 'a').write(result)