def unsupervised_training(learning_rate, decay_rate, epochs, repo, output_dico,
                          database_name):
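    """Unsupervised training of the LookUpTrain model with RMSProp.

    Minibatches stack corrupted windows (from data_c, labelled 0) with their
    correct counterparts (from data, labelled 1) and the model learns to
    separate them.
    """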
    dwin = 9
    with closing(open(os.path.join(repo, output_dico), 'rb')) as f:
        dico = pickle.load(f)
    n_mot = [len(dico[i]) for i in dico.keys()]
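    # n_mot: vocabulary size of each input channel; vect_size: the matching
    # embedding dimension for each channel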
    vect_size = [20, 10, 5, 5]
    n_hidden = 100
    x = T.itensor3('x')
    xc = T.itensor3('xc')  # corrupted-sentence input (declared but unused below)
    y = T.ivector('y')
    t_nlp = LookUpTrain(dwin, n_mot, vect_size, n_hidden)
    t_nlp.initialize()
    cost = T.mean(t_nlp.cost(x, y))
    error = T.mean(t_nlp.errors(x, y))

    params = getParams(t_nlp, x)
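    # give every parameter a unique name so it can index the RMSProp cache below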
    for i, p in enumerate(params):
        p.name += '_' + str(i)

    # gradient updates with RMSProp
    updates = []
    caches = {}
    grad_params = T.grad(cost, params)
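    # RMSProp: keep a running average of the squared gradients per parameter
    # (the "cache") and scale each step by 1 / sqrt(cache + epsilon)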
    for param, grad_param in zip(params, grad_params):

        if param.name not in caches:
            caches[param.name] = shared_floatx(param.get_value() * 0.,
                                               "cache_" + param.name)
        # update rule
        update_cache = decay_rate*caches[param.name]\
           + (1 - decay_rate)*grad_param**2
        update_param = param - learning_rate * grad_param / T.sqrt(
            update_cache + 1e-8)
        updates.append((caches[param.name], update_cache))
        updates.append((param, update_param))

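    # compile the Theano functions: training (with updates), validation cost
    # and test error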
    train_model = theano.function(inputs=[x, y],
                                  outputs=cost,
                                  updates=updates,
                                  allow_input_downcast=True)

    valid_model = theano.function(inputs=[x, y],
                                  outputs=cost,
                                  allow_input_downcast=True)
    test_model = theano.function(inputs=[x, y],
                                 outputs=error,
                                 allow_input_downcast=True)
    data_path = os.path.join(repo, database_name)
    with closing(open(data_path, 'rb')) as f:
        data, data_c = pickle.load(f)
    data = numpy.asarray(data).astype(int)
    # data_c must be converted as well, since it is sliced like data below
    data_c = numpy.asarray(data_c).astype(int)

    # optionally reduce the data size here for quick test runs
    # reading by minibatch
    batch_size = 15
    n_sample = data.shape[0] // batch_size
    # 80% of the minibatches go into the training set
    n_train = int(n_sample * 0.8)

    y_value = numpy.zeros((2 * batch_size), dtype=int)
    y_value[batch_size:] = 1 + y_value[batch_size:]
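    # y_value is the same for every minibatch: first half (corrupted) -> 0,
    # second half (correct) -> 1, matching the concatenation order below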
    index_filename = 0

    saving = "params_savings_bis_v4_"

    # uncomment to resume training from a saved checkpoint:
    #t_nlp.load(repo, (saving+str(95)))
    #index_filename = 96
    #saving = "params_savings_bis"

    for epoch in range(epochs):
        train_cost = []
        valid_cost = []
        index_valid = n_train
        for minibatch_index in range(n_train):

            correct_sentences = data[minibatch_index *
                                     batch_size:(minibatch_index + 1) *
                                     batch_size, :, :]
            incorrect_sentences = data_c[minibatch_index *
                                         batch_size:(minibatch_index + 1) *
                                         batch_size, :, :]
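            # corrupted windows first, then the correct ones, in the same order
            # as y_value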
            sentences = numpy.concatenate(
                [incorrect_sentences, correct_sentences], axis=0)
            train_value = train_model(sentences, y_value)
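            # every 10 minibatches: report the mean training cost, the error on
            # the held-out minibatches, and save a checkpoint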
            if minibatch_index % 10 == 0:
                train_cost = []
                for minibatch_train in range(n_train):
                    correct_sentences = data[minibatch_train *
                                             batch_size:(minibatch_train + 1) *
                                             batch_size, :, :]
                    incorrect_sentences = data_c[minibatch_train *
                                                 batch_size:(minibatch_train +
                                                             1) *
                                                 batch_size, :, :]
                    sentences = numpy.concatenate(
                        [incorrect_sentences, correct_sentences], axis=0)
                    train_value = valid_model(sentences, y_value)
                    train_cost.append(train_value)
                print "Train : " + str(numpy.mean(train_cost) * 100)
                valid_cost = []
                for minibatch_valid in range(n_train, n_sample):
                    correct_sentences = data[minibatch_valid *
                                             batch_size:(minibatch_valid + 1) *
                                             batch_size, :, :]
                    incorrect_sentences = data_c[minibatch_valid *
                                                 batch_size:(minibatch_valid +
                                                             1) *
                                                 batch_size, :, :]
                    sentences = numpy.concatenate(
                        [incorrect_sentences, correct_sentences], axis=0)
                    valid_value = test_model(sentences, y_value)
                    valid_cost.append(valid_value)
                print "Valid : " + str(
                    numpy.mean(valid_value) *
                    100) + " in : " + (saving + str(index_filename))
                t_nlp.save(repo, (saving + str(index_filename)))
                index_filename += 1
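
# Example call (a sketch; paths and hyperparameters are placeholders):
#     unsupervised_training(learning_rate=0.01, decay_rate=0.9, epochs=4,
#                           repo='data', output_dico='dico.pkl',
#                           database_name='sentences.pkl')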
# Example 2

def training_Hollande(repo, output_dico, learning_rate, decay_rate, filenames):
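	"""Supervised training of LookUpTrain with Adam to separate the corpora in
	`filenames`: windows from the first file get label 0, the rest label 1.
	"""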
	
	#########
	# MODEL #
	#########
	dwin = 20
	with closing(open(os.path.join(repo, output_dico), 'rb')) as f:
		dico = pickle.load(f)
	n_mot = [len(dico[i]) for i in dico.keys()]
	vect_size = [100, 10, 5, 5]
	n_hidden = [100, 50]

	t_nlp = LookUpTrain(dwin, n_mot, vect_size, n_hidden, n_out=2)
	t_nlp.initialize()
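	# uncomment to warm-start from a previously saved model: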
	#t_nlp.load(repo, filename_load)

	x = T.itensor3('x')
	y = T.ivector('y')

	cost = T.mean(t_nlp.cost(x, y))
	error = T.mean(t_nlp.errors(x, y))

	params = getParams(t_nlp, x)
	updates, _ = Adam(cost, params, learning_rate)
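	# Adam replaces the hand-rolled RMSProp kept below for reference, so the
	# decay_rate argument is unused in this variant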
	"""
	for p, i in zip(params, range(len(params))):
		p.name+='_'+str(i)

	# gradient computation with RMSProp
	updates = []
	caches = {}
	grad_params = T.grad(cost, params)
	for param, grad_param in zip(params, grad_params):

		if not caches.has_key(param.name):
			caches[param.name] = shared_floatx(param.get_value() * 0.,
												"cache_"+param.name)
		# update rule
		update_cache = decay_rate*caches[param.name]\
					+ (1 - decay_rate)*grad_param**2
		update_param = param  - learning_rate*grad_param/T.sqrt(update_cache + 1e-8)
		updates.append((caches[param.name], update_cache))
		updates.append((param, update_param))
	"""

	train_model = theano.function(inputs=[x,y], outputs=cost, updates=updates,
					allow_input_downcast=True)

	valid_model = theano.function(inputs=[x, y], outputs=cost, allow_input_downcast=True)
	test_model = theano.function(inputs=[x, y], outputs=error, allow_input_downcast=True)
	predict = theano.function(inputs=[x], outputs=t_nlp.predict(x), allow_input_downcast=True)
	predict_confidency = theano.function(inputs=[x], outputs=t_nlp.predict_confidency(x)[0], allow_input_downcast=True)
	index = 0
	y_value = []
	x_value = []
	with closing(open(os.path.join(repo, output_dico), 'rb')) as f:
		dico = pickle.load(f)
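	# label the data: lines from the first file get 0, lines from any later file get 1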
	for filename in filenames:
		lines, _ = get_input_from_files(repo, [filename], dico)
		for line in lines:
			x_value.append(line)
			y_value.append(index)
		if index == 0:
			index += 1
	y_value = np.asarray(y_value, dtype=int)
	# balance the classes: np.argmax(y_value) is the index of the first label 1,
	# i.e. the number of label-0 samples; subsample the label-1 pool to that size
	x_value_0 = [x_value[i] for i in range(np.argmax(y_value))]
	y_value_0 = [y_value[i] for i in range(np.argmax(y_value))]
	indexes = np.random.permutation(y_value.shape[0] - np.argmax(y_value))[:np.argmax(y_value)]
	x_value_1 = [x_value[i + np.argmax(y_value)] for i in indexes]
	y_value_1 = [y_value[i + np.argmax(y_value)] for i in indexes]

	# 80% of each class for training; the remainder is split evenly between
	# validation and test
	pos_percentage = int(len(y_value_0) * 0.8)
	neg_percentage = int(len(y_value_1) * 0.8)
	other_pos_percentage = (len(y_value_0) - pos_percentage) // 2
	other_neg_percentage = (len(y_value_1) - neg_percentage) // 2

	pos_permut = np.random.permutation(len(y_value_0))
	neg_permut = np.random.permutation(len(y_value_1))
	x_train = [x_value_0[i] for i in pos_permut[:pos_percentage]] + [x_value_1[i] for i in neg_permut[:neg_percentage]]
	x_valid = [x_value_0[i] for i in pos_permut[pos_percentage:pos_percentage+other_pos_percentage]] + \
		  [x_value_1[i] for i in neg_permut[neg_percentage:neg_percentage+other_neg_percentage]]
	x_test = [x_value_0[i] for i in pos_permut[pos_percentage+other_pos_percentage:]] + \
		  [x_value_1[i] for i in neg_permut[neg_percentage+other_neg_percentage:]]

	y_train = [y_value_0[i] for i in pos_permut[:pos_percentage]] + [y_value_1[i] for i in neg_permut[:neg_percentage]]
	y_valid = [y_value_0[i] for i in pos_permut[pos_percentage:pos_percentage+other_pos_percentage]] + \
		  [y_value_1[i] for i in neg_permut[neg_percentage:neg_percentage+other_neg_percentage]]
	y_test = [y_value_0[i] for i in pos_permut[pos_percentage+other_pos_percentage:]] + \
		  [y_value_1[i] for i in neg_permut[neg_percentage+other_neg_percentage:]]

	index_train = np.random.permutation(len(y_train))
	batch_size = 32
	index_valid = np.random.permutation(len(y_valid))
	index_test = np.random.permutation(len(y_test))
	x_train_ = [x_train[i].astype(int) for i in index_train]
	x_valid_ = [x_valid[i].astype(int) for i in index_valid]
	x_test_ = [x_test[i].astype(int) for i in index_test]
	y_train_ = [y_train[i] for i in index_train]
	y_valid_ = [y_valid[i] for i in index_valid]
	y_test_ = [y_test[i] for i in index_test]

	# build dwin/2 'PARSING' padding tokens per channel; add_padding presumably
	# uses them so that every token position gets a full dwin-wide window
	paddings = [[], [], [], []]
	for _ in range(dwin // 2):
		for channel in range(4):
			paddings[channel].append(dico[channel]['PARSING'])
	paddings = np.asarray(paddings)
	#paddings = paddings.reshape((1, paddings.shape[0], paddings.shape[1]))
	x_train_ = [add_padding(elem, paddings) for elem in x_train_]
	x_valid_ = [add_padding(elem, paddings) for elem in x_valid_]
	x_test_ = [add_padding(elem, paddings) for elem in x_test_]

	x_train=[]; x_valid=[]; x_test=[]
	y_train=[]; y_valid=[]; y_test=[]
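	# cut every padded sentence into overlapping windows of width dwin; each
	# window inherits the label of its source sentence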
	for elem, label in zip(x_train_, y_train_):
		for i in range(elem.shape[1] -dwin):
			x_train.append(elem[:,i:i+dwin])
			y_train.append(label)
	for elem, label in zip(x_valid_, y_valid_):
		for i in range(elem.shape[1] -dwin):
			x_valid.append(elem[:,i:i+dwin])
			y_valid.append(label)
	for elem, label in zip(x_test_, y_test_):
		for i in range(elem.shape[1] -dwin):
			x_test.append(elem[:,i:i+dwin])
			y_test.append(label)

	index_train = np.random.permutation(len(y_train))
	index_valid = np.random.permutation(len(y_valid))
	index_test = np.random.permutation(len(y_test))
	x_train = [x_train[i].astype(int) for i in index_train]
	x_valid = [x_valid[i].astype(int) for i in index_valid]
	x_test = [x_test[i].astype(int) for i in index_test]
	y_train = [y_train[i] for i in index_train]
	y_valid = [y_valid[i] for i in index_valid]
	y_test = [y_test[i] for i in index_test]

	n_train = len(y_train) // batch_size
	n_valid = len(y_valid) // batch_size
	n_test = len(y_test) // batch_size
	print (n_train, n_valid, n_test)
	# fraction of label-1 windows in the validation and test splits
	print (1.*sum(y_valid))/len(y_valid)
	print (1.*sum(y_test))/len(y_test)
	print "#############################"
	saving = 'JADT_2_Fev_H_G_'
	index_filename = 0
	epochs = 10  # number of passes over the corpus
	for epoch in range(epochs):
		index_valid = n_train
		for minibatch_index in range(n_train):

			sentence = x_train[minibatch_index*batch_size:(minibatch_index+1)*batch_size]
			y_value = y_train[minibatch_index*batch_size:(minibatch_index+1)*batch_size]
			#before = valid_model(sentence, y_value)
			train_value = train_model(sentence, y_value)
			#after = valid_model(sentence, y_value)
			#print before - after
		if True:  # evaluate once per epoch on the train, valid and test splits
			train_cost=[]
			for minibatch_train in range(n_train):
				sentence = x_train[minibatch_train*batch_size:(minibatch_train+1)*batch_size]
				y_value = y_train[minibatch_train*batch_size:(minibatch_train+1)*batch_size]
				train_value = valid_model(sentence, y_value)
				train_cost.append(train_value)
			print "Train : "+str(np.mean(train_cost)*100)
			valid_cost=[]
			predictions=[]
			for minibatch_valid in range(n_valid):
				y_value = y_valid[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size]
				sentence = x_valid[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size]
				valid_value = test_model(sentence, y_value)
				valid_cost.append(valid_value)
			print "Valid : "+str(np.mean(valid_cost)*100)+" in : "+(saving+str(index_filename))
			test_cost=[]
			for minibatch_test in range(n_test):
				sentence = x_test[minibatch_test*batch_size:(minibatch_test+1)*batch_size]
				y_value = y_test[minibatch_test*batch_size:(minibatch_test+1)*batch_size]
				test_value = test_model(sentence, y_value)
				test_cost.append(test_value)
			print "Test : "+str(np.mean(test_cost)*100)
			index_filename+=1

	t_nlp.save(repo, saving)
	return
	# NOTE: the code below is unreachable because of the early return above.
	# It scans the test set and keeps the most and the least confident sentences.
	scores = []
	for index in range(len(y_test)):
		x_value=x_test[index:index+1]
		scores.append(predict_confidency(x_value))
	right = [x_test[i] for i in np.argsort(scores)[::-1][:20]]
	false = [x_test[i] for i in np.argsort(scores)[:20]]
	print scores[:10]

	with closing(open('data/sentence/relevant_sentence_H_G', 'wb')) as f:
		pickle.dump([right, false], f, protocol=pickle.HIGHEST_PROTOCOL)
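
# Example call (a sketch; the file names and hyperparameters are placeholders):
#     training_Hollande(repo='data', output_dico='dico.pkl', learning_rate=0.001,
#                       decay_rate=0.9, filenames=['speaker_A.txt', 'speaker_B.txt'])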