import os
import pickle
from contextlib import closing

import numpy
import numpy as np
import theano
import theano.tensor as T

# NOTE: the project-local helpers used below (LookUpTrain, getParams, Adam,
# shared_floatx, get_input_from_files, add_padding) are assumed to be imported
# elsewhere in this module.


def unsupervised_training(learning_rate, decay_rate, epochs, repo, output_dico, database_name):
    dwin = 9
    with closing(open(os.path.join(repo, output_dico), 'rb')) as f:
        dico = pickle.load(f)
    n_mot = [len(dico[i]) for i in dico.keys()]
    vect_size = [20, 10, 5, 5]
    n_hidden = 100

    x = T.itensor3('x')
    xc = T.itensor3('xc')
    y = T.ivector('y')

    t_nlp = LookUpTrain(dwin, n_mot, vect_size, n_hidden)
    t_nlp.initialize()
    cost = T.mean(t_nlp.cost(x, y))
    error = T.mean(t_nlp.errors(x, y))
    params = getParams(t_nlp, x)
    for p, i in zip(params, range(len(params))):
        p.name += '_' + str(i)

    # gradient updates with RMSProp:
    #   cache <- decay_rate * cache + (1 - decay_rate) * grad**2
    #   param <- param - learning_rate * grad / sqrt(cache + eps)
    updates = []
    caches = {}
    grad_params = T.grad(cost, params)
    for param, grad_param in zip(params, grad_params):
        if not caches.has_key(param.name):
            caches[param.name] = shared_floatx(param.get_value() * 0., "cache_" + param.name)
        update_cache = decay_rate * caches[param.name] \
            + (1 - decay_rate) * grad_param ** 2
        update_param = param - learning_rate * grad_param / T.sqrt(update_cache + 1e-8)
        updates.append((caches[param.name], update_cache))
        updates.append((param, update_param))

    train_model = theano.function(inputs=[x, y], outputs=cost, updates=updates,
                                  allow_input_downcast=True)
    valid_model = theano.function(inputs=[x, y], outputs=cost, allow_input_downcast=True)
    test_model = theano.function(inputs=[x, y], outputs=error, allow_input_downcast=True)

    data_path = os.path.join(repo, database_name)
    with closing(open(data_path, 'rb')) as f:
        data, data_c = pickle.load(f)
    data = numpy.asarray(data).astype(int)      # correct sentences
    data_c = numpy.asarray(data_c).astype(int)  # corrupted sentences
    # (optionally slice data / data_c here to run a quick test on a subset)

    # reading by minibatch
    batch_size = 15
    n_sample = data.shape[0] / batch_size
    # 80% of the data will go into the training set
    n_train = (int)(n_sample * 0.8)
    # each minibatch is [corrupted sentences (label 0); correct sentences (label 1)]
    y_value = numpy.zeros((2 * batch_size), dtype=int)
    y_value[batch_size:] = 1 + y_value[batch_size:]

    index_filename = 0
    saving = "params_savings_bis_v4_"
    #t_nlp.load(repo, (saving+str(95)))
    #index_filename = 96
    #saving = "params_savings_bis"
    for epoch in range(epochs):
        train_cost = []
        valid_cost = []
        index_valid = n_train
        for minibatch_index in range(n_train):
            correct_sentences = data[minibatch_index * batch_size:(minibatch_index + 1) * batch_size, :, :]
            incorrect_sentences = data_c[minibatch_index * batch_size:(minibatch_index + 1) * batch_size, :, :]
            sentences = numpy.concatenate([incorrect_sentences, correct_sentences], axis=0)
            train_value = train_model(sentences, y_value)
            if minibatch_index % 10 == 0:
                # monitor the training cost and the error on the held-out minibatches
                train_cost = []
                for minibatch_train in range(n_train):
                    correct_sentences = data[minibatch_train * batch_size:(minibatch_train + 1) * batch_size, :, :]
                    incorrect_sentences = data_c[minibatch_train * batch_size:(minibatch_train + 1) * batch_size, :, :]
                    sentences = numpy.concatenate([incorrect_sentences, correct_sentences], axis=0)
                    train_value = valid_model(sentences, y_value)
                    train_cost.append(train_value)
                print "Train : " + str(numpy.mean(train_cost) * 100)
                valid_cost = []
                for minibatch_valid in range(n_train, n_sample):
                    correct_sentences = data[minibatch_valid * batch_size:(minibatch_valid + 1) * batch_size, :, :]
                    incorrect_sentences = data_c[minibatch_valid * batch_size:(minibatch_valid + 1) * batch_size, :, :]
                    sentences = numpy.concatenate([incorrect_sentences, correct_sentences], axis=0)
                    valid_value = test_model(sentences, y_value)
                    valid_cost.append(valid_value)
                print "Valid : " + str(numpy.mean(valid_cost) * 100) + " in : " + (saving + str(index_filename))
                t_nlp.save(repo, (saving + str(index_filename)))
                index_filename += 1
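
# Illustrative call (a sketch, not part of the original pipeline): the file names
# below are placeholders for the pickled dictionary and the pickled database of
# (correct, corrupted) sentence pairs that this function expects to find in `repo`.
#
#     unsupervised_training(learning_rate=1e-3, decay_rate=0.9, epochs=4,
#                           repo='data', output_dico='dico.pkl',
#                           database_name='sentence_database.pkl')
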
def training_Hollande(repo, output_dico, learning_rate, decay_rate, filenames):
    #########
    # MODEL #
    #########
    dwin = 20
    with closing(open(os.path.join(repo, output_dico), 'rb')) as f:
        dico = pickle.load(f)
    n_mot = [len(dico[i]) for i in dico.keys()]
    vect_size = [100, 10, 5, 5]
    n_hidden = [100, 50]
    t_nlp = LookUpTrain(dwin, n_mot, vect_size, n_hidden, n_out=2)
    t_nlp.initialize()
    #t_nlp.load(repo, filename_load)

    x = T.itensor3('x')
    y = T.ivector('y')
    cost = T.mean(t_nlp.cost(x, y))
    error = T.mean(t_nlp.errors(x, y))
    params = getParams(t_nlp, x)
    updates, _ = Adam(cost, params, learning_rate)
    """
    # alternative: gradient updates with RMSProp
    for p, i in zip(params, range(len(params))):
        p.name += '_' + str(i)
    updates = []
    caches = {}
    grad_params = T.grad(cost, params)
    for param, grad_param in zip(params, grad_params):
        if not caches.has_key(param.name):
            caches[param.name] = shared_floatx(param.get_value() * 0., "cache_" + param.name)
        update_cache = decay_rate * caches[param.name] \
            + (1 - decay_rate) * grad_param ** 2
        update_param = param - learning_rate * grad_param / T.sqrt(update_cache + 1e-8)
        updates.append((caches[param.name], update_cache))
        updates.append((param, update_param))
    """
    train_model = theano.function(inputs=[x, y], outputs=cost, updates=updates,
                                  allow_input_downcast=True)
    valid_model = theano.function(inputs=[x, y], outputs=cost, allow_input_downcast=True)
    test_model = theano.function(inputs=[x, y], outputs=error, allow_input_downcast=True)
    predict = theano.function(inputs=[x], outputs=t_nlp.predict(x), allow_input_downcast=True)
    predict_confidency = theano.function(inputs=[x], outputs=t_nlp.predict_confidency(x)[0],
                                         allow_input_downcast=True)

    # build the labelled corpus: lines of the first file get label 0, all other files label 1
    index = 0
    y_value = []
    x_value = []
    with closing(open(os.path.join(repo, output_dico), 'rb')) as f:
        dico = pickle.load(f)
    for filename in filenames:
        lines, _ = get_input_from_files(repo, [filename], dico)
        for line in lines:
            x_value.append(line)
            y_value.append(index)
        if index == 0:
            index += 1
    y_value = np.asarray(y_value, dtype=int)

    # balance the samples: keep all label-0 lines and randomly subsample as many label-1 lines
    x_value_0 = [x_value[i] for i in range(np.argmax(y_value))]
    y_value_0 = [y_value[i] for i in range(np.argmax(y_value))]
    indexes = np.random.permutation(y_value.shape[0] - np.argmax(y_value))[:np.argmax(y_value)]
    x_value_1 = [x_value[i + np.argmax(y_value)] for i in indexes]
    y_value_1 = [y_value[i + np.argmax(y_value)] for i in indexes]

    # 80% train, 10% valid, 10% test for each class
    pos_percentage = (int)(len(y_value_0) * 0.8)
    neg_percentage = (int)(len(y_value_1) * 0.8)
    other_pos_percentage = (len(y_value_0) - pos_percentage) / 2
    other_neg_percentage = (len(y_value_1) - neg_percentage) / 2
    pos_permut = np.random.permutation(len(y_value_0))
    neg_permut = np.random.permutation(len(y_value_1))
    x_train = [x_value_0[i] for i in pos_permut[:pos_percentage]] + \
        [x_value_1[i] for i in neg_permut[:neg_percentage]]
    x_valid = [x_value_0[i] for i in pos_permut[pos_percentage:pos_percentage + other_pos_percentage]] + \
        [x_value_1[i] for i in neg_permut[neg_percentage:neg_percentage + other_neg_percentage]]
    x_test = [x_value_0[i] for i in pos_permut[pos_percentage + other_pos_percentage:]] + \
        [x_value_1[i] for i in neg_permut[neg_percentage + other_neg_percentage:]]
    y_train = [y_value_0[i] for i in pos_permut[:pos_percentage]] + \
        [y_value_1[i] for i in neg_permut[:neg_percentage]]
    y_valid = [y_value_0[i] for i in pos_permut[pos_percentage:pos_percentage + other_pos_percentage]] + \
        [y_value_1[i] for i in neg_permut[neg_percentage:neg_percentage + other_neg_percentage]]
    y_test = [y_value_0[i] for i in pos_permut[pos_percentage + other_pos_percentage:]] + \
        [y_value_1[i] for i in neg_permut[neg_percentage + other_neg_percentage:]]

    index_train = np.random.permutation(len(y_train))
    batch_size = 32
    index_valid = np.random.permutation(len(y_valid))
    index_test = np.random.permutation(len(y_test))
    x_train_ = [x_train[i].astype(int) for i in index_train]
    x_valid_ = [x_valid[i].astype(int) for i in index_valid]
    x_test_ = [x_test[i].astype(int) for i in index_test]
    y_train_ = [y_train[i] for i in index_train]
    y_valid_ = [y_valid[i] for i in index_valid]
    y_test_ = [y_test[i] for i in index_test]

    # pad every sentence with dwin/2 'PARSING' tokens on each of the 4 feature rows
    paddings = [[], [], [], []]
    for _ in range(dwin / 2):
        for j in xrange(4):
            paddings[j].append(dico[j]['PARSING'])
    paddings = np.asarray(paddings)
    #paddings = paddings.reshape((1, paddings.shape[0], paddings.shape[1]))
    x_train_ = [add_padding(elem, paddings) for elem in x_train_]
    x_valid_ = [add_padding(elem, paddings) for elem in x_valid_]
    x_test_ = [add_padding(elem, paddings) for elem in x_test_]

    # slide a window of width dwin over each padded sentence; every window inherits the sentence label
    x_train = []; x_valid = []; x_test = []
    y_train = []; y_valid = []; y_test = []
    for elem, label in zip(x_train_, y_train_):
        for i in range(elem.shape[1] - dwin):
            x_train.append(elem[:, i:i + dwin])
            y_train.append(label)
    for elem, label in zip(x_valid_, y_valid_):
        for i in range(elem.shape[1] - dwin):
            x_valid.append(elem[:, i:i + dwin])
            y_valid.append(label)
    for elem, label in zip(x_test_, y_test_):
        for i in range(elem.shape[1] - dwin):
            x_test.append(elem[:, i:i + dwin])
            y_test.append(label)

    index_train = np.random.permutation(len(y_train))
    index_valid = np.random.permutation(len(y_valid))
    index_test = np.random.permutation(len(y_test))
    x_train = [x_train[i].astype(int) for i in index_train]
    x_valid = [x_valid[i].astype(int) for i in index_valid]
    x_test = [x_test[i].astype(int) for i in index_test]
    y_train = [y_train[i] for i in index_train]
    y_valid = [y_valid[i] for i in index_valid]
    y_test = [y_test[i] for i in index_test]

    n_train = len(y_train) / batch_size
    n_valid = len(y_valid) / batch_size
    n_test = len(y_test) / batch_size
    print (n_train, n_valid, n_test)
    print (1. * sum(y_valid)) / len(y_valid)
    print (1. * sum(y_test)) / len(y_test)
    print "#############################"

    saving = 'JADT_2_Fev_H_G_'
    index_filename = 0
    epochs = 10  # number of iterations on the corpus
    for epoch in range(epochs):
        index_valid = n_train
        for minibatch_index in range(n_train):
            sentence = x_train[minibatch_index * batch_size:(minibatch_index + 1) * batch_size]
            y_value = y_train[minibatch_index * batch_size:(minibatch_index + 1) * batch_size]
            #before = valid_model(sentence, y_value)
            train_value = train_model(sentence, y_value)
            #after = valid_model(sentence, y_value)
            #print before - after
        # monitor the training cost and the error on the validation and test sets
        if True:
            train_cost = []
            for minibatch_train in range(n_train):
                sentence = x_train[minibatch_train * batch_size:(minibatch_train + 1) * batch_size]
                y_value = y_train[minibatch_train * batch_size:(minibatch_train + 1) * batch_size]
                train_value = valid_model(sentence, y_value)
                train_cost.append(train_value)
            print "Train : " + str(np.mean(train_cost) * 100)
            valid_cost = []
            predictions = []
            for minibatch_valid in range(n_valid):
                y_value = y_valid[minibatch_valid * batch_size:(minibatch_valid + 1) * batch_size]
                sentence = x_valid[minibatch_valid * batch_size:(minibatch_valid + 1) * batch_size]
                valid_value = test_model(sentence, y_value)
                valid_cost.append(valid_value)
            print "Valid : " + str(np.mean(valid_cost) * 100) + " in : " + (saving + str(index_filename))
            test_cost = []
            for minibatch_test in range(n_test):
                sentence = x_test[minibatch_test * batch_size:(minibatch_test + 1) * batch_size]
                y_value = y_test[minibatch_test * batch_size:(minibatch_test + 1) * batch_size]
                test_value = test_model(sentence, y_value)
                test_cost.append(test_value)
            print "Test : " + str(np.mean(test_cost) * 100)
            index_filename += 1
    t_nlp.save(repo, saving)
    return

    # unreachable below the return: scan the test set and keep the 20 most and
    # 20 least confident sentences
    scores = []
    for index in range(len(y_test)):
        x_value = x_test[index:index + 1]
        scores.append(predict_confidency(x_value))
    right = [x_test[i] for i in np.argsort(scores)[::-1][:20]]
    false = [x_test[i] for i in np.argsort(scores)[:20]]
    print scores[:10]
    with closing(open('data/sentence/relevant_sentence_H_G', 'wb')) as f:
        pickle.dump([right, false], f, protocol=pickle.HIGHEST_PROTOCOL)
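
# Minimal driver sketch (an assumption, not present in the original file): the paths,
# file names and hyper-parameter values below are placeholders showing how
# training_Hollande is meant to be invoked; adapt them to the actual repository layout.
if __name__ == '__main__':
    REPO = 'data'                                # placeholder data directory
    DICO = 'dico.pkl'                            # placeholder pickled dictionary
    FILES = ['speaker_A.txt', 'speaker_B.txt']   # placeholder speech files: lines of the
                                                 # first file get label 0, the rest label 1
    training_Hollande(REPO, DICO,
                      learning_rate=1e-3,        # Adam step size
                      decay_rate=0.9,            # only used by the commented-out RMSProp variant
                      filenames=FILES)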