Example #1
def build_labelled_data(dwin, repo, filenames, labels, embeddings_filename):
    # build the dictionary from the input files
    dico, _ = build_dictionnary(repo, filenames)
    dwin = 9  # NOTE: hard-coded window size, shadowing the dwin argument
    paddings = [[], [], [], []]
    values = []
    data = []
    for filename, label in zip(filenames, labels):
        input_sentences = get_input_from_files(repo, [filename], dico,
                                               paddings)
        for line in input_sentences:
            # pack the four parallel feature rows into one (4, n) int matrix
            np_line = numpy.zeros((4, len(line[0])))
            np_line[0] = line[0]
            np_line[1] = line[1]
            np_line[2] = line[2]
            np_line[3] = line[3]
            np_line = np_line.astype(int)
            values.append(np_line)
            data.append(label)
    """
	with closing(open(os.path.join(repo, embeddings_filename), 'rb')) as f:
		values = pickle.load(f)
	"""
    with closing(
            open(os.path.join(repo, embeddings_filename + "_labelled"),
                 'wb')) as f:
        pickle.dump([values, data], f, protocol=pickle.HIGHEST_PROTOCOL)
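
# A minimal standalone sketch (not part of the original source) of the
# row-packing step in build_labelled_data above: four parallel feature
# rows are stacked into a single (4, n) integer matrix. The toy `line`
# is made up for illustration; numpy is assumed imported as in the snippet.
def _sketch_pack_line():
    line = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [0, 1, 0]]  # 4 feature rows
    np_line = numpy.zeros((4, len(line[0])))
    for row in range(4):
        np_line[row] = line[row]
    return np_line.astype(int)  # shape (4, 3), dtype int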


def build_embedding_data(repo, filenames, database_name, filename_load,
                         filename_save):
    # build the dictionary from the input files
    dico, _ = build_dictionnary(repo, filenames)
    dwin = 9
    paddings = [[], [], [], []]
    """
	for i in range(dwin/2):
		for i in xrange(4):
			paddings[i].append(dico[i]['PADDING'])
	"""
    paddings = numpy.asarray(paddings)  # stays (4, 0) while the loop above is commented out
    """
	# parameters and creation of LookUpTrain:
	n_mot = [len(dico[i]) for i in dico.keys()]
	vect_size = [1000, 1000, 5, 5]
	n_hidden = 25
	x = T.imatrix('x')
	t_nlp = LookUpTrain(dwin, n_mot, vect_size, n_hidden)
	t_nlp.initialize()
	lookup = theano.function(inputs=[x], outputs=t_nlp.embedding(x), allow_input_downcast=True)
	"""
    # load lines
    input_sentences = get_input_from_files(repo, filenames, dico, paddings)
    embedding = []
    #t_nlp.load(repo, filename_load)
    for line in input_sentences:
        np_line = numpy.zeros((4, len(line[0])))
        np_line[0] = line[0]
        np_line[1] = line[1]
        np_line[2] = line[2]
        np_line[3] = line[3]
        np_line = np_line.astype(int)
        #latent_variables = lookup(np_line)
        embedding.append(np_line)

    path = os.path.join(repo, filename_save + "_good")

    with closing(open(path, 'wb')) as f:
        pickle.dump(embedding, f, protocol=pickle.HIGHEST_PROTOCOL)
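
A minimal sketch of reading the pickled embedding list back, mirroring the dump in build_embedding_data; the repo path and filename here are hypothetical:

import os
import pickle
from contextlib import closing

# hypothetical location, matching the "_good" suffix used by the dump above
path = os.path.join('data', 'my_embeddings' + "_good")
with closing(open(path, 'rb')) as f:
    embedding = pickle.load(f)  # a list of (4, n) int matrices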
Example #3
def build_database(repo, dico_filename, filenames, dwin, inverse_dico):
    index = 0
    y_value = []
    x_value = []
    original_lines = []
    with closing(open(os.path.join(repo, dico_filename), 'rb')) as f:
        dico = pickle.load(f)
    for filename in filenames:
        lines, w_lines = get_input_from_files(repo, [filename], dico)
        for line in lines:
            x_value.append(line)
            y_value.append(index)
        for w in w_lines:
            original_lines.append(w)
        if index == 0:
            index += 1  # label the first file 0 and every later file 1
    y_value = np.asarray(y_value, dtype=int)

    # cast samples and labels to int
    x = [x_.astype(int) for x_ in x_value]
    y = [y_.astype(int) for y_ in y_value]

    paddings = [[], [], [], []]
    for i in range(dwin / 2):
        for j in xrange(4):
            paddings[j].append(dico[j]['PARSING'])
    paddings = np.asarray(paddings)
    #paddings = paddings.reshape((1, paddings.shape[0], paddings.shape[1]))
    x_data = [add_padding(elem, paddings) for elem in x]

    x_final = []
    y_final = []
    recovery = {}
    for original, elem, label in zip(original_lines, x_data, y):
        for i in range(elem.shape[1] - dwin):
            sentence = elem[:, i:i + dwin]
            tmp = reconstruct_sentence(sentence, inverse_dico)
            recovery[tmp] = [label, original]
    return recovery
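
A short standalone sketch of the sliding-window slicing that build_database applies to each padded sentence matrix; the toy shapes are made up:

import numpy as np

elem = np.arange(48).reshape((4, 12))  # toy (4, 12) padded sentence matrix
dwin = 9                               # window width, as in the code above
windows = [elem[:, i:i + dwin] for i in range(elem.shape[1] - dwin)]
assert all(w.shape == (4, dwin) for w in windows)  # 3 windows of shape (4, 9)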
Example #4
def training_Hollande(repo, output_dico, learning_rate, decay_rate, filenames):
	
	#########
	# MODEL #
	#########
	dwin = 20
	with closing(open(os.path.join(repo, output_dico), 'rb')) as f:
		dico = pickle.load(f)
	n_mot = [len(dico[i]) for i in dico.keys()]
	vect_size = [100, 10, 5, 5]
	n_hidden = [100, 50]

	t_nlp = LookUpTrain(dwin, n_mot, vect_size, n_hidden, n_out=2)
	t_nlp.initialize()
	#t_nlp.load(repo, filename_load)

	x = T.itensor3('x')
	y = T.ivector('y')

	cost = T.mean(t_nlp.cost(x, y))
	error = T.mean(t_nlp.errors(x, y))


	params = getParams(t_nlp, x)
	updates, _ = Adam(cost, params, learning_rate)
	"""
	for p, i in zip(params, range(len(params))):
		p.name+='_'+str(i)

	# gradient computation with RMSProp
	updates = []
	caches = {}
	grad_params = T.grad(cost, params)
	for param, grad_param in zip(params, grad_params):

		if not caches.has_key(param.name):
			caches[param.name] = shared_floatx(param.get_value() * 0.,
												"cache_"+param.name)
		# update rule
		update_cache = decay_rate*caches[param.name]\
					+ (1 - decay_rate)*grad_param**2
		update_param = param  - learning_rate*grad_param/T.sqrt(update_cache + 1e-8)
		updates.append((caches[param.name], update_cache))
		updates.append((param, update_param))
	"""

	train_model = theano.function(inputs=[x,y], outputs=cost, updates=updates,
					allow_input_downcast=True)

	valid_model = theano.function(inputs=[x, y], outputs=cost, allow_input_downcast=True)
	test_model = theano.function(inputs=[x, y], outputs=error, allow_input_downcast=True)
	predict = theano.function(inputs=[x], outputs=t_nlp.predict(x), allow_input_downcast=True)
	predict_confidency = theano.function(inputs=[x], outputs=t_nlp.predict_confidency(x)[0], allow_input_downcast=True)
	index = 0
	y_value = []
	x_value = []
	with closing(open(os.path.join(repo, output_dico), 'rb')) as f:
		dico = pickle.load(f)
	for filename in filenames:
		lines, _ = get_input_from_files(repo, [filename], dico)
		for line in lines:
			x_value.append(line)
			y_value.append(index)
		if index == 0:
			index += 1  # label the first file 0 and every later file 1
	y_value = np.asarray(y_value, dtype=int)
	# balance the samples: np.argmax(y_value) is the index of the first 1,
	# i.e. the number of label-0 samples
	x_value_0 = [x_value[i] for i in range(np.argmax(y_value))]
	y_value_0 = [y_value[i] for i in range(np.argmax(y_value))]
	indexes = np.random.permutation(y_value.shape[0] - np.argmax(y_value))[:np.argmax(y_value)]
	x_value_1 = [x_value[i + np.argmax(y_value)] for i in indexes]  # subsample the 1s to match the 0s
	y_value_1 = [y_value[i + np.argmax(y_value)] for i in indexes]

	pos_percentage = int(len(y_value_0) * 0.8)
	neg_percentage = int(len(y_value_1) * 0.8)
	other_pos_percentage = (len(y_value_0) - pos_percentage) / 2
	other_neg_percentage = (len(y_value_1) - neg_percentage) / 2

	pos_permut = np.random.permutation(len(y_value_0))
	neg_permut = np.random.permutation(len(y_value_1))
	x_train = [x_value_0[i] for i in pos_permut[:pos_percentage]] + [x_value_1[i] for i in neg_permut[:neg_percentage]]
	x_valid = [x_value_0[i] for i in pos_permut[pos_percentage:pos_percentage+other_pos_percentage]] + \
		  [x_value_1[i] for i in neg_permut[neg_percentage:neg_percentage+other_neg_percentage]]
	x_test = [x_value_0[i] for i in pos_permut[pos_percentage+other_pos_percentage:]] + \
		  [x_value_1[i] for i in neg_permut[neg_percentage+other_neg_percentage:]]

	y_train = [y_value_0[i] for i in pos_permut[:pos_percentage]] + [y_value_1[i] for i in neg_permut[:neg_percentage]]
	y_valid = [y_value_0[i] for i in pos_permut[pos_percentage:pos_percentage+other_pos_percentage]] + \
		  [y_value_1[i] for i in neg_permut[neg_percentage:neg_percentage+other_neg_percentage]]
	y_test = [y_value_0[i] for i in pos_permut[pos_percentage+other_pos_percentage:]] + \
		  [y_value_1[i] for i in neg_permut[neg_percentage+other_neg_percentage:]]

	index_train = np.random.permutation(len(y_train))
	batch_size = 32
	index_valid = np.random.permutation(len(y_valid))
	index_test = np.random.permutation(len(y_test))
	x_train_ = [x_train[i].astype(int) for i in index_train]
	x_valid_ = [x_valid[i].astype(int) for i in index_valid]
	x_test_ = [x_test[i].astype(int) for i in index_test]
	y_train_ = [y_train[i] for i in index_train]
	y_valid_ = [y_valid[i] for i in index_valid]
	y_test_ = [y_test[i] for i in index_test]

	paddings = [[], [], [], []]
	for i in range(dwin / 2):
		for j in xrange(4):
			paddings[j].append(dico[j]['PARSING'])
	paddings = np.asarray(paddings)
	#paddings = paddings.reshape((1, paddings.shape[0], paddings.shape[1]))
	x_train_ = [add_padding(elem, paddings) for elem in x_train_]
	x_valid_ = [add_padding(elem, paddings) for elem in x_valid_]
	x_test_ = [add_padding(elem, paddings) for elem in x_test_]

	x_train=[]; x_valid=[]; x_test=[]
	y_train=[]; y_valid=[]; y_test=[]
	for elem, label in zip(x_train_, y_train_):
		for i in range(elem.shape[1] -dwin):
			x_train.append(elem[:,i:i+dwin])
			y_train.append(label)
	for elem, label in zip(x_valid_, y_valid_):
		for i in range(elem.shape[1] -dwin):
			x_valid.append(elem[:,i:i+dwin])
			y_valid.append(label)
	for elem, label in zip(x_test_, y_test_):
		for i in range(elem.shape[1] -dwin):
			x_test.append(elem[:,i:i+dwin])
			y_test.append(label)

	index_train = np.random.permutation(len(y_train))
	index_valid = np.random.permutation(len(y_valid))
	index_test = np.random.permutation(len(y_test))
	x_train = [x_train[i].astype(int) for i in index_train]
	x_valid = [x_valid[i].astype(int) for i in index_valid]
	x_test = [x_test[i].astype(int) for i in index_test]
	y_train = [y_train[i] for i in index_train]
	y_valid = [y_valid[i] for i in index_valid]
	y_test = [y_test[i] for i in index_test]

	n_train = len(y_train)/batch_size
	n_valid = len(y_valid)/batch_size
	n_test = len(y_test)/batch_size
	print (n_train, n_valid, n_test)
	print (1.*sum(y_valid))/len(y_valid)
	print (1.*sum(y_test))/len(y_test)
	print "#############################"
	saving ='JADT_2_Fev_H_G_'
	index_filename = 0
	epochs = 10  # number of iterations over the corpus
	for epoch in range(epochs):
		for minibatch_index in range(n_train):

			sentence = x_train[minibatch_index*batch_size:(minibatch_index+1)*batch_size]
			y_value = y_train[minibatch_index*batch_size:(minibatch_index+1)*batch_size]
			#before = valid_model(sentence, y_value)
			train_value = train_model(sentence, y_value)
			#after = valid_model(sentence, y_value)
			#print before - after
		if True:  # evaluate on train/valid/test every epoch
			train_cost = []
			for minibatch_train in range(n_train):
				sentence = x_train[minibatch_train*batch_size:(minibatch_train+1)*batch_size]
				y_value = y_train[minibatch_train*batch_size:(minibatch_train+1)*batch_size]
				train_value = valid_model(sentence, y_value)
				train_cost.append(train_value)
			print "Train : "+str(np.mean(train_cost)*100)
			valid_cost = []
			for minibatch_valid in range(n_valid):
				sentence = x_valid[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size]
				y_value = y_valid[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size]
				valid_value = test_model(sentence, y_value)
				valid_cost.append(valid_value)
			print "Valid : "+str(np.mean(valid_cost)*100)+" in : "+(saving+str(index_filename))
			test_cost = []
			for minibatch_test in range(n_test):
				sentence = x_test[minibatch_test*batch_size:(minibatch_test+1)*batch_size]
				y_value = y_test[minibatch_test*batch_size:(minibatch_test+1)*batch_size]
				test_value = test_model(sentence, y_value)
				test_cost.append(test_value)
			print "Test : "+str(np.mean(test_cost)*100)
			index_filename+=1

	t_nlp.save(repo, saving)
	return
	# NOTE: everything below is unreachable because of the return just above
	#### go through the test set: take the 10 most confident sentences ###
	#### go through the test set: take the 10 least confident sentences ###
	scores = []
	for index in range(len(y_test)):
		x_value = x_test[index:index+1]
		scores.append(predict_confidency(x_value))
	right = [x_test[i] for i in np.argsort(scores)[::-1][:20]]
	false = [x_test[i] for i in np.argsort(scores)[:20]]
	print scores[:10]

	with closing(open('data/sentence/relevant_sentence_H_G', 'wb')) as f:
		pickle.dump([right, false], f, protocol=pickle.HIGHEST_PROTOCOL)
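
# A standalone numpy sketch (not from the original source) of the RMSProp
# update rule kept in the commented-out block of training_Hollande above:
#   cache <- decay_rate*cache + (1 - decay_rate)*grad**2
#   param <- param - learning_rate*grad/sqrt(cache + 1e-8)
def _sketch_rmsprop_step(param, grad, cache, learning_rate=1e-3, decay_rate=0.9):
    cache = decay_rate * cache + (1 - decay_rate) * grad ** 2
    param = param - learning_rate * grad / np.sqrt(cache + 1e-8)
    return param, cache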


def build_database(repo, dico_filename, filenames, dwin):
    index = 0
    y_value = []
    x_value = []
    with closing(open(os.path.join(repo, dico_filename), 'rb')) as f:
        dico = pickle.load(f)
    for filename in filenames:
        lines, _ = get_input_from_files(repo, [filename], dico)
        for line in lines:
            x_value.append(line)
            y_value.append(index)
        if index == 0:
            index += 1  # label the first file 0 and every later file 1
    y_value = np.asarray(y_value, dtype=int)

    # np.argmax(y_value) is the index of the first 1, i.e. the number of 0s
    x_value_0 = [x_value[i] for i in range(np.argmax(y_value))]
    y_value_0 = [y_value[i] for i in range(np.argmax(y_value))]
    indexes = np.random.permutation(
        y_value.shape[0] - np.argmax(y_value))[:np.argmax(y_value)]  #TODO PUT IT BACK
    x_value_1 = [x_value[i + np.argmax(y_value)] for i in indexes]  # balance the numbers
    y_value_1 = [y_value[i + np.argmax(y_value)] for i in indexes]  # balance the numbers

    pos_percentage = int(len(y_value_0) * 0.8)
    neg_percentage = int(len(y_value_1) * 0.8)
    other_pos_percentage = (len(y_value_0) - pos_percentage) / 2
    other_neg_percentage = (len(y_value_1) - neg_percentage) / 2

    pos_permut = np.random.permutation(len(y_value_0))
    neg_permut = np.random.permutation(len(y_value_1))
    x_train = [x_value_0[i] for i in pos_permut[:pos_percentage]
               ] + [x_value_1[i] for i in neg_permut[:neg_percentage]]
    x_valid = [x_value_0[i] for i in pos_permut[pos_percentage:pos_percentage+other_pos_percentage]] + \
       [x_value_1[i] for i in neg_permut[neg_percentage:neg_percentage+other_neg_percentage]]
    x_test = [x_value_0[i] for i in pos_permut[pos_percentage+other_pos_percentage:]] + \
       [x_value_1[i] for i in neg_permut[neg_percentage+other_neg_percentage:]]

    y_train = [y_value_0[i] for i in pos_permut[:pos_percentage]
               ] + [y_value_1[i] for i in neg_permut[:neg_percentage]]
    y_valid = [y_value_0[i] for i in pos_permut[pos_percentage:pos_percentage+other_pos_percentage]] + \
       [y_value_1[i] for i in neg_permut[neg_percentage:neg_percentage+other_neg_percentage]]
    y_test = [y_value_0[i] for i in pos_permut[pos_percentage+other_pos_percentage:]] + \
       [y_value_1[i] for i in neg_permut[neg_percentage+other_neg_percentage:]]

    index_train = np.random.permutation(len(y_train))
    index_valid = np.random.permutation(len(y_valid))
    index_test = np.random.permutation(len(y_test))
    x_train_ = [x_train[i].astype(int) for i in index_train]
    x_valid_ = [x_valid[i].astype(int) for i in index_valid]
    x_test_ = [x_test[i].astype(int) for i in index_test]
    y_train_ = [y_train[i] for i in index_train]
    y_valid_ = [y_valid[i] for i in index_valid]
    y_test_ = [y_test[i] for i in index_test]

    paddings = [[], [], [], []]
    for i in range(dwin / 2):
        for j in xrange(4):
            paddings[j].append(dico[j]['PARSING'])
    paddings = np.asarray(paddings)
    #paddings = paddings.reshape((1, paddings.shape[0], paddings.shape[1]))
    x_train_ = [add_padding(elem, paddings).astype(int) for elem in x_train_]
    x_valid_ = [add_padding(elem, paddings) for elem in x_valid_]
    x_test_ = [add_padding(elem, paddings) for elem in x_test_]

    x_train = []
    x_valid = []
    x_test = []
    y_train = []
    y_valid = []
    y_test = []
    for elem, label in zip(x_train_, y_train_):
        for i in range(elem.shape[1] - dwin):
            x_train.append(elem[:, i:i + dwin])
            y_train.append(label)
    for elem, label in zip(x_valid_, y_valid_):
        for i in range(elem.shape[1] - dwin):
            x_valid.append(elem[:, i:i + dwin])
            y_valid.append(label)
    for elem, label in zip(x_test_, y_test_):
        for i in range(elem.shape[1] - dwin):
            x_test.append(elem[:, i:i + dwin])
            y_test.append(label)

    index_train = np.random.permutation(len(y_train))
    index_valid = np.random.permutation(len(y_valid))
    index_test = np.random.permutation(len(y_test))
    x_train = [x_train[i].astype(int) for i in index_train]
    x_valid = [x_valid[i].astype(int) for i in index_valid]
    x_test = [x_test[i].astype(int) for i in index_test]
    y_train = [y_train[i] for i in index_train]
    y_valid = [y_valid[i] for i in index_valid]
    y_test = [y_test[i] for i in index_test]

    return (x_train, y_train), (x_valid, y_valid), (x_test, y_test)
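
A small sketch of how the splits returned by build_database might be consumed in fixed-size minibatches, as the training loop above does; train_step is a hypothetical stand-in for a compiled model function:

import numpy as np

def iterate_minibatches(x, y, batch_size=32):
    # drop the trailing partial batch, as the training loop above does
    n_batches = len(y) // batch_size
    for b in range(n_batches):
        yield (np.asarray(x[b * batch_size:(b + 1) * batch_size]),
               np.asarray(y[b * batch_size:(b + 1) * batch_size]))

# usage (hypothetical):
# (x_train, y_train), _, _ = build_database(repo, dico_filename, filenames, dwin)
# for xb, yb in iterate_minibatches(x_train, y_train):
#     train_step(xb, yb)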