def build_labelled_data(dwin, repo, filenames, labels, embeddings_filename):
    """Pickle the encoded sentences of several files with per-file labels.

    Each file of ``filenames`` is encoded on its own with
    ``get_input_from_files`` (using the dictionary returned by
    ``build_dictionnary`` for the whole file list). Every encoded sentence is
    stored together with the label of the file it came from, and the pair
    ``[values, data]`` is pickled to ``<repo>/<embeddings_filename>_labelled``.

    Args:
        dwin: Unused, kept for backward compatibility. The previous body
            overwrote it with a constant that was never read.
        repo: Directory containing the input files; also receives the output.
        filenames: Names of the input files, relative to ``repo``.
        labels: One label per entry of ``filenames`` (paired with ``zip``, so
            any surplus entries on either side are ignored).
        embeddings_filename: Base name of the output pickle.

    Returns:
        None. The result is written to disk.
    """
    dico, _ = build_dictionnary(repo, filenames)
    paddings = [[], [], [], []]  # four empty rows: no padding is requested

    values = []  # encoded sentences
    data = []    # label of the file each sentence came from
    for filename, label in zip(filenames, labels):
        # NOTE(review): other callers in this file unpack a 2-tuple from
        # get_input_from_files(repo, files, dico); confirm that the 4-argument
        # form used here still returns an iterable of sentences.
        for line in get_input_from_files(repo, [filename], dico, paddings):
            # The previous version also copied each sentence into a 4-row int
            # array and then discarded it; the raw sentence is what has always
            # been stored, so that dead conversion is gone.
            # NOTE(review): build_embedding_data stores the int array instead;
            # confirm that storing the raw sentence is intended here.
            values.append(line)
            data.append(label)

    print('kikou')  # progress marker kept from the original

    path = os.path.join(repo, embeddings_filename + "_labelled")
    with open(path, 'wb') as f:
        pickle.dump([values, data], f, protocol=pickle.HIGHEST_PROTOCOL)
def build_embedding_data(repo, filenames, database_name, filename_load,
                         filename_save):
    """Encode every sentence as a 4-row integer array and pickle the list.

    The sentences returned by ``get_input_from_files`` for ``filenames`` are
    each copied into a ``(4, len(sentence[0]))`` array, cast to ``int`` and
    collected in a list that is pickled to ``<repo>/<filename_save>_good``.

    Args:
        repo: Directory containing the input files; also receives the output.
        filenames: Names of the input files, relative to ``repo``.
        database_name: Unused by this function.
        filename_load: Unused by this function (it is only mentioned in
            disabled model-loading code).
        filename_save: Base name of the output pickle.

    Returns:
        None. The result is written to disk.
    """
    dico, _ = build_dictionnary(repo, filenames)

    # Four empty rows, i.e. a (4, 0) array: no padding is requested.
    no_padding = numpy.asarray([[], [], [], []])

    encoded = []
    for sentence in get_input_from_files(repo, filenames, dico, no_padding):
        rows = numpy.zeros((4, len(sentence[0])))
        for row_index in range(4):
            rows[row_index] = sentence[row_index]
        encoded.append(rows.astype(int))

    target = os.path.join(repo, filename_save + "_good")
    with closing(open(target, 'wb')) as handle:
        pickle.dump(encoded, handle, protocol=pickle.HIGHEST_PROTOCOL)
def build_database(repo, dico_filename, filenames, dwin, inverse_dico):
    """Map every reconstructed ``dwin``-wide window to its label and source.

    Sentences of the first file are labelled 0, sentences of all following
    files 1. Each encoded sentence is padded with ``add_padding``, cut into
    windows of ``dwin`` columns, and every window is turned back into a key
    with ``reconstruct_sentence``.

    NOTE(review): this module defines a second ``build_database`` with four
    parameters further down; being defined later, it replaces this function
    at import time.
    NOTE(review): the nesting below was reconstructed from a source whose
    line breaks were lost; confirm against the original layout.

    Args:
        repo: Directory containing the dictionary and the input files.
        dico_filename: Pickled dictionary, relative to ``repo``.
        filenames: Input files; the first one defines class 0.
        dwin: Window width, in columns.
        inverse_dico: Passed through to ``reconstruct_sentence``.

    Returns:
        dict mapping each reconstructed window to ``[label, original]``;
        a window seen several times keeps the last pair written.
    """
    with closing(open(os.path.join(repo, dico_filename), 'rb')) as stream:
        dico = pickle.load(stream)

    encoded = []
    labels = []
    raw_sentences = []
    for position, filename in enumerate(filenames):
        label = min(position, 1)  # 0 for the first file, 1 afterwards
        lines, w_lines = get_input_from_files(repo, [filename], dico)
        for line in lines:
            encoded.append(line)
            labels.append(label)
        for raw in w_lines:
            raw_sentences.append(raw)

    labels = [value.astype(int) for value in np.asarray(labels, dtype=int)]

    # One row per dictionary, dwin // 2 copies of its 'PARSING' entry each.
    pad_rows = [[], [], [], []]
    for _ in range(dwin // 2):
        for row in range(4):
            pad_rows[row].append(dico[row]['PARSING'])
    pad_block = np.asarray(pad_rows)

    padded = [add_padding(item.astype(int), pad_block) for item in encoded]

    recovery = {}
    for original, sample, label in zip(raw_sentences, padded, labels):
        for start in range(sample.shape[1] - dwin):
            window = sample[:, start:start + dwin]
            key = reconstruct_sentence(window, inverse_dico)
            recovery[key] = [label, original]
    return recovery
def _minibatch_mean(function, inputs, targets, n_batches, batch_size):
    """Return the mean of ``function`` over the first ``n_batches`` minibatches."""
    values = []
    for batch in range(n_batches):
        start = batch * batch_size
        stop = start + batch_size
        values.append(function(inputs[start:stop], targets[start:stop]))
    return np.mean(values)


def _sliding_windows(samples, labels, dwin):
    """Cut every sample into ``dwin``-column windows, repeating its label."""
    windows = []
    window_labels = []
    for sample, label in zip(samples, labels):
        for start in range(sample.shape[1] - dwin):
            windows.append(sample[:, start:start + dwin])
            window_labels.append(label)
    return windows, window_labels


def training_Hollande(repo, output_dico, learning_rate, decay_rate, filenames):
    """Train the two-class ``LookUpTrain`` model and save it after each epoch.

    Steps: load the pickled dictionary, build the model and its Theano
    functions (Adam updates), read the corpus (first file -> label 0, all
    other files -> label 1), balance the classes, split 80/10/10 into
    train/valid/test, pad, cut into windows of ``dwin`` columns, shuffle, and
    run 10 epochs with minibatches of 32. After each epoch the mean training
    cost and the mean validation/test error are printed and the model is
    saved with ``t_nlp.save``.

    Changes compared with the previous version (results are unaffected; the
    order of ``np.random`` calls is preserved):
      * the dictionary is loaded once instead of twice;
      * the code after the bare ``return`` (sentence ranking written to a
        hard-coded path) was unreachable and has been removed, together with
        the two Theano functions compiled only for it;
      * the copy-pasted evaluation and windowing loops are now helpers.

    NOTE(review): the nesting was reconstructed from a source whose line
    breaks were lost. It is assumed that evaluation and saving happen once
    per epoch and that the function returns after the last epoch; confirm.

    Args:
        repo: Directory with the dictionary and corpus; receives the model.
        output_dico: Pickled dictionary, relative to ``repo``.
        learning_rate: Learning rate handed to ``Adam``.
        decay_rate: Unused (only referenced by disabled RMSProp code); kept
            for backward compatibility.
        filenames: Corpus files; the first one defines class 0.

    Returns:
        None.
    """
    dwin = 20
    batch_size = 32
    epochs = 10  # number of passes over the corpus
    saving = 'JADT_2_Fev_H_G_'

    with open(os.path.join(repo, output_dico), 'rb') as f:
        dico = pickle.load(f)

    #########
    # MODEL #
    #########
    n_mot = [len(dico[i]) for i in dico.keys()]
    vect_size = [100, 10, 5, 5]
    n_hidden = [100, 50]
    t_nlp = LookUpTrain(dwin, n_mot, vect_size, n_hidden, n_out=2)
    t_nlp.initialize()

    x = T.itensor3('x')
    y = T.ivector('y')
    cost = T.mean(t_nlp.cost(x, y))
    error = T.mean(t_nlp.errors(x, y))
    params = getParams(t_nlp, x)
    updates, _ = Adam(cost, params, learning_rate)

    train_model = theano.function(inputs=[x, y], outputs=cost,
                                  updates=updates, allow_input_downcast=True)
    valid_model = theano.function(inputs=[x, y], outputs=cost,
                                  allow_input_downcast=True)
    test_model = theano.function(inputs=[x, y], outputs=error,
                                 allow_input_downcast=True)

    ########
    # DATA #
    ########
    x_value = []
    y_value = []
    index = 0
    for filename in filenames:
        lines, _ = get_input_from_files(repo, [filename], dico)
        for line in lines:
            x_value.append(line)
            y_value.append(index)
        if index == 0:
            index += 1
    y_value = np.asarray(y_value, dtype=int)

    # Labels are a run of 0s followed by a run of 1s, so argmax is the number
    # of class-0 samples; draw as many class-1 samples to balance the classes.
    n_first = np.argmax(y_value)
    x_value_0 = x_value[:n_first]
    y_value_0 = list(y_value[:n_first])
    indexes = np.random.permutation(y_value.shape[0] - n_first)[:n_first]
    x_value_1 = [x_value[i + n_first] for i in indexes]
    y_value_1 = [y_value[i + n_first] for i in indexes]

    # 80% train; the remainder is halved into valid and test (test gets the
    # odd one out).
    pos_train = int(len(y_value_0) * 0.8)
    neg_train = int(len(y_value_1) * 0.8)
    pos_valid = (len(y_value_0) - pos_train) // 2
    neg_valid = (len(y_value_1) - neg_train) // 2
    pos_permut = np.random.permutation(len(y_value_0))
    neg_permut = np.random.permutation(len(y_value_1))
    pos_parts = (pos_permut[:pos_train],
                 pos_permut[pos_train:pos_train + pos_valid],
                 pos_permut[pos_train + pos_valid:])
    neg_parts = (neg_permut[:neg_train],
                 neg_permut[neg_train:neg_train + neg_valid],
                 neg_permut[neg_train + neg_valid:])

    # splits holds (samples, labels) for train, valid, test -- in that order.
    splits = []
    for pos_ids, neg_ids in zip(pos_parts, neg_parts):
        xs = [x_value_0[i] for i in pos_ids] + [x_value_1[i] for i in neg_ids]
        ys = [y_value_0[i] for i in pos_ids] + [y_value_1[i] for i in neg_ids]
        splits.append((xs, ys))

    # Draw the three shuffles first (train, valid, test), then apply them.
    orders = [np.random.permutation(len(ys)) for _, ys in splits]
    splits = [([xs[i].astype(int) for i in order], [ys[i] for i in order])
              for (xs, ys), order in zip(splits, orders)]

    # One row per dictionary, dwin // 2 copies of its 'PARSING' entry each.
    paddings = [[], [], [], []]
    for _ in range(dwin // 2):
        for row in range(4):
            paddings[row].append(dico[row]['PARSING'])
    paddings = np.asarray(paddings)

    splits = [_sliding_windows([add_padding(elem, paddings) for elem in xs],
                               ys, dwin)
              for xs, ys in splits]

    # Shuffle the windows, again drawing all three permutations first.
    orders = [np.random.permutation(len(ys)) for _, ys in splits]
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = [
        ([xs[i].astype(int) for i in order], [ys[i] for i in order])
        for (xs, ys), order in zip(splits, orders)]

    # Incomplete trailing minibatches are dropped.
    n_train = len(y_train) // batch_size
    n_valid = len(y_valid) // batch_size
    n_test = len(y_test) // batch_size
    print((n_train, n_valid, n_test))
    print((1. * sum(y_valid)) / len(y_valid))
    print((1. * sum(y_test)) / len(y_test))
    print("#############################")

    ############
    # TRAINING #
    ############
    index_filename = 0
    for _ in range(epochs):
        for minibatch_index in range(n_train):
            start = minibatch_index * batch_size
            stop = start + batch_size
            train_model(x_train[start:stop], y_train[start:stop])

        # "Train" reports the cost; "Valid" and "Test" report the error rate.
        train_cost = _minibatch_mean(valid_model, x_train, y_train,
                                     n_train, batch_size)
        print("Train : " + str(train_cost * 100))

        valid_error = _minibatch_mean(test_model, x_valid, y_valid,
                                      n_valid, batch_size)
        print("Valid : " + str(valid_error * 100)
              + " in : " + (saving + str(index_filename)))

        test_error = _minibatch_mean(test_model, x_test, y_test,
                                     n_test, batch_size)
        print("Test : " + str(test_error * 100))

        index_filename += 1
        # NOTE(review): the message above mentions saving + index, but the
        # model has always been saved under `saving` alone (overwritten each
        # epoch); kept as is.
        t_nlp.save(repo, saving)
    return
def build_database(repo, dico_filename, filenames, dwin):
    """Build balanced, windowed and shuffled train/valid/test sets.

    Sentences of the first file are labelled 0, sentences of all following
    files 1. As many class-1 sentences as there are class-0 sentences are
    drawn at random, each class is split 80/10/10, and every sentence is
    padded with ``add_padding`` and cut into windows of ``dwin`` columns that
    inherit the sentence label. Windows are shuffled inside each subset.

    NOTE(review): the nesting below was reconstructed from a source whose
    line breaks were lost; confirm against the original layout.

    Args:
        repo: Directory containing the dictionary and the input files.
        dico_filename: Pickled dictionary, relative to ``repo``.
        filenames: Input files; the first one defines class 0.
        dwin: Window width, in columns.

    Returns:
        ``(x_train, y_train), (x_valid, y_valid), (x_test, y_test)`` where
        each ``x_*`` is a list of int arrays and each ``y_*`` the matching
        list of labels.
    """
    with closing(open(os.path.join(repo, dico_filename), 'rb')) as stream:
        dico = pickle.load(stream)

    samples = []
    labels = []
    for position, filename in enumerate(filenames):
        lines, _ = get_input_from_files(repo, [filename], dico)
        for line in lines:
            samples.append(line)
            labels.append(min(position, 1))  # 0 for the first file, else 1
    labels = np.asarray(labels, dtype=int)

    # Index of the first 1, i.e. the number of class-0 samples.
    boundary = np.argmax(labels)
    class0_x = samples[:boundary]
    class0_y = list(labels[:boundary])
    picked = np.random.permutation(labels.shape[0] - boundary)[:boundary]
    class1_x = [samples[offset + boundary] for offset in picked]
    class1_y = [labels[offset + boundary] for offset in picked]

    train0 = int(len(class0_y) * 0.8)
    train1 = int(len(class1_y) * 0.8)
    valid0 = (len(class0_y) - train0) // 2
    valid1 = (len(class1_y) - train1) // 2
    order0 = np.random.permutation(len(class0_y))
    order1 = np.random.permutation(len(class1_y))
    cuts0 = (order0[:train0],
             order0[train0:train0 + valid0],
             order0[train0 + valid0:])
    cuts1 = (order1[:train1],
             order1[train1:train1 + valid1],
             order1[train1 + valid1:])

    # subsets holds (samples, labels) for train, valid, test -- in that order.
    subsets = []
    for ids0, ids1 in zip(cuts0, cuts1):
        subsets.append((
            [class0_x[i] for i in ids0] + [class1_x[i] for i in ids1],
            [class0_y[i] for i in ids0] + [class1_y[i] for i in ids1],
        ))

    # All three permutations are drawn before any of them is applied.
    shuffles = [np.random.permutation(len(ys)) for _, ys in subsets]
    subsets = [
        ([xs[i].astype(int) for i in order], [ys[i] for i in order])
        for (xs, ys), order in zip(subsets, shuffles)
    ]

    # One row per dictionary, dwin // 2 copies of its 'PARSING' entry each.
    pad_rows = [[], [], [], []]
    for _ in range(dwin // 2):
        for row in range(4):
            pad_rows[row].append(dico[row]['PARSING'])
    pad_block = np.asarray(pad_rows)

    windowed = []
    for position, (xs, ys) in enumerate(subsets):
        if position == 0:
            # Only the training subset is cast right after padding.
            padded = [add_padding(item, pad_block).astype(int) for item in xs]
        else:
            padded = [add_padding(item, pad_block) for item in xs]
        windows = []
        window_labels = []
        for sample, label in zip(padded, ys):
            for start in range(sample.shape[1] - dwin):
                windows.append(sample[:, start:start + dwin])
                window_labels.append(label)
        windowed.append((windows, window_labels))

    shuffles = [np.random.permutation(len(ys)) for _, ys in windowed]
    train, valid, test = [
        ([xs[i].astype(int) for i in order], [ys[i] for i in order])
        for (xs, ys), order in zip(windowed, shuffles)
    ]
    return train, valid, test