def eval(input_x, input_y, test_x, test, label, write_folder=None):
    """Tune an SVR by grid search, score every test sentence and evaluate.

    A GridSearchCV over RBF and linear SVR kernels is fitted on
    (input_x, input_y) and used to predict one value per row of test_x.
    The predictions are consumed in order while walking
    test -> item.thread -> entry.sentences, and each one is also stored on
    the sentence as ``predict_score``.  For every item in ``test`` the
    entries at the tail of the ``sorted_index`` ordering are selected until
    30% of the item's sentences have been taken.

    Note: this function shadows the builtin ``eval`` in its module.

    Args:
        input_x: training features handed to ``GridSearchCV.fit``.
        input_y: training targets handed to ``GridSearchCV.fit``.
        test_x: features to predict; needs one row per sentence of ``test``
            in traversal order.
        test: iterable of objects with a ``thread`` attribute whose elements
            expose ``sentences`` (objects with an ``index`` attribute).
        label: not used by the current implementation.
        write_folder: forwarded unchanged to ``weightRecall``.

    Returns:
        Tuple ``(score, rouge_score)``: the ``weightRecall`` result and the
        ``'rouge_l_f_score'`` entry of the rouge evaluation.
    """
    search_space = [
        {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    ]
    grid_clf = GridSearchCV(SVR(C=1, epsilon=0.2), search_space)
    grid_clf.fit(input_x, input_y)
    print("params : \t")
    # NOTE(review): get_params() reports the GridSearchCV configuration; the
    # hyper-parameters picked by the search live in best_params_ - confirm
    # which one was meant to be logged.
    print(grid_clf.get_params())
    predictions = grid_clf.predict(test_x)

    cursor = 0  # position of the next unread prediction
    produce_set = []
    for item in test:
        scores = []
        indices = []
        for entry in item.thread:
            for sentence in entry.sentences:
                sentence.predict_score = predictions[cursor]
                cursor += 1
                scores.append(sentence.predict_score)
                indices.append(sentence.index)

        order = sorted_index(scores)
        total = len(indices)
        budget = float(total) * 0.3
        chosen = []
        picked = 0
        # Walk the ordering from its last entry backwards.
        # presumably sorted_index is an ascending argsort, which would make
        # these the best-scored sentences - verify against its definition.
        while picked < total and picked < budget:
            chosen.append(indices[order[total - picked - 1]])
            picked += 1
        produce_set.append(chosen)

    score = weightRecall(test, produce_set, write_folder)
    print(score)
    rouge_score = rouge(test, produce_set).eval()['rouge_l_f_score']
    print(rouge_score)
    return score, rouge_score
	def rnn_test(self):
		"""Score the sentences of self.test with self.rnn_model and evaluate.

		For every element of each item's ``thread`` the sentence features are
		collected, the feature list is concatenated with itself, converted to
		a float32 array and passed to ``self.rnn_model.prob``.  Only the
		second half of the returned array is used, one value per sentence.
		For every item the entries at the tail of the ``sorted_index``
		ordering are selected until 30% of its sentences have been taken.

		Returns:
			Tuple ``(score, rouge_score)``: the ``weightRecall`` result and
			the ``'rouge_l_f_score'`` entry of the rouge evaluation.
		"""
		produce_set = []
		for item in self.test:
			score_list = []
			index_list = []
			for entry in item.thread:
				features = []
				indices = []
				for sentence in entry.sentences:
					features.append(sentence.feature)
					indices.append(sentence.index)

				# The sequence is fed twice back to back and only the outputs
				# for the second copy are kept.
				# presumably so every kept output has seen the whole sequence
				# - confirm against the model definition.
				doubled = numpy.asarray(numpy.float32(features + features))
				softmax_array = self.rnn_model.prob(doubled)
				second_half = softmax_array[len(softmax_array) // 2:]
				for position, score in enumerate(second_half):
					score_list.append(score)
					index_list.append(indices[position])

			sorted_index_array = sorted_index(score_list)
			total = len(index_list)
			budget = float(total) * 0.3
			selected = []
			# Take entries from the end of the ordering until the 30% budget
			# is used up.
			for rank in range(total):
				if rank >= budget:
					break
				selected.append(index_list[sorted_index_array[total - rank - 1]])
			produce_set.append(selected)

		score = weightRecall(self.test, produce_set)
		print(score)
		rouge_eval = rouge(self.test, produce_set)
		rouge_score = rouge_eval.eval()['rouge_l_f_score']
		print(rouge_score)
		return score, rouge_score
# Example #3
# 0
def test_mlp(dataset, learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001,
             n_epochs=1000, batch_size=250, n_hidden=100):
    """Train an MLP with minibatch SGD and early stopping, then report results.

    The three (x, y) splits returned by ``load_data`` are used to train,
    validate and test an ``MLP`` with two outputs.  The cost is the negative
    log-likelihood plus the L1 and squared-L2 penalties.  After training,
    labels are predicted for the whole test split, converted with
    ``process_test_data`` and the ``weightRecall`` result and the produced
    set are printed.

    Args:
        dataset: passed unchanged to ``load_data``.
        learning_rate: step size of the gradient updates.
        L1_reg: weight of ``classifier.L1`` in the cost.
        L2_reg: weight of ``classifier.L2_sqr`` in the cost.
        n_epochs: maximum number of passes over the training split.
        batch_size: number of rows per minibatch; a trailing partial batch
            is never visited.
        n_hidden: forwarded to ``MLP`` as ``n_hidden``.

    Returns:
        None. Progress and results are printed.
    """
    datasets, length, testSet = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Floor division keeps the batch counts integral; they are used as
    # range() bounds below.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # input matrix, sliced by rows into minibatches
    y = T.ivector('y')  # labels as a 1D vector of ints

    rng = numpy.random.RandomState(1234)

    classifier = MLP(
        rng=rng,
        input=x,
        n_in=length,
        n_hidden=n_hidden,
        n_out=2
    )

    # The cost minimised during training: negative log-likelihood of the
    # model plus the regularisation terms (L1 and L2), expressed symbolically.
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )

    # Compiled functions returning classifier.errors(y) on one minibatch of
    # the test / validation split.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    # Gradient of the cost with respect to every model parameter.
    gparams = [T.grad(cost, param) for param in classifier.params]

    # Plain SGD: one (variable, update expression) pair per parameter.
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # Returns the cost of one training minibatch and, as a side effect,
    # applies `updates` to the model parameters.
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # Early-stopping parameters.
    patience = 10000  # look at this many minibatches regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # Validate after this many minibatches: once per epoch unless an epoch
    # is longer than half the patience.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            train_model(minibatch_index)
            # Number of minibatches trained before this one, over all epochs.
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # Mean error over the validation minibatches.
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # New best validation score so far.
                if this_validation_loss < best_validation_loss:
                    # Extend the patience only for a significant improvement.
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # Evaluate the new best model on the test split.
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            # Checked after every minibatch, not only on validation steps.
            if patience <= iteration:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    elapsed_minutes = (end_time - start_time) / 60.
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % elapsed_minutes + '\n')

    # Predict labels for the complete test split in a single call.
    prediction_model = theano.function(
        inputs=[],
        outputs=classifier.logRegressionLayer.y_pred,
        givens={
            x: test_set_x
        }
    )
    produceSet = process_test_data(prediction_model(), testSet)
    print(weightRecall(testSet, produceSet))
    print(produceSet)
def sgd_optimization_mnist(dataset, learning_rate=0.13, n_epochs=5000,
                           batch_size=250):
    """Train a single-output model with minibatch SGD and early stopping.

    The three (x, y) splits returned by ``load_data`` are used to train,
    validate and test a ``logisticRegression`` instance with ``n_out=1``.
    Both the training cost and the validation / test metric are
    ``classifier.mse(y)``.  After training, ``classifier.y_pred`` is
    evaluated on the whole test split, converted with ``process_test_data``
    and the ``weightRecall`` result and the produced set are printed.

    Args:
        dataset: passed unchanged to ``load_data``.
        learning_rate: step size of the gradient updates.
        n_epochs: maximum number of passes over the training split.
        batch_size: number of rows per minibatch; a trailing partial batch
            is never visited.

    Returns:
        None. Progress and results are printed.
    """
    datasets, length, testSet = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Floor division keeps the batch counts integral; they are used as
    # range() bounds below.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    index = T.lscalar()  # index to a [mini]batch

    x = T.matrix('x')  # input matrix, sliced by rows into minibatches
    y = T.matrix('y')  # targets, given as a matrix

    classifier = logisticRegression(input=x, n_in=length, n_out=1)
    cost = classifier.mse(y)

    # classifier.mse(y) on one minibatch of the test split.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.mse(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # Predictions for the complete test split in a single call.
    prediction_model = theano.function(
        inputs=[],
        outputs=classifier.y_pred,
        givens={
            x: test_set_x
        }
    )
    # classifier.mse(y) on one minibatch of the validation split.
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.mse(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # Plain SGD step for the weight matrix and the bias.
    updates = [
        (classifier.W, classifier.W - learning_rate * g_W),
        (classifier.b, classifier.b - learning_rate * g_b),
    ]

    # Returns the cost of one training minibatch and, as a side effect,
    # applies `updates` to the model parameters.
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    print('... training the model')
    # Early-stopping parameters.
    patience = 7000  # look at this many minibatches regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # relative improvement considered significant
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            train_model(minibatch_index)
            # Number of minibatches trained before this one, over all epochs.
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )
                if this_validation_loss < best_validation_loss:
                    # Extend the patience only for a significant improvement.
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iteration * patience_increase)
                    best_validation_loss = this_validation_loss

                    # Evaluate the new best model on the test split.
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                # NOTE: this check lives inside the validation branch, so
                # training can only stop right after a validation step.
                if patience <= iteration:
                    done_looping = True
                    break

    end_time = time.clock()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print('The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time)))
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.1fs' % (end_time - start_time) + '\n')

    produceSet = process_test_data(prediction_model(), testSet)
    print(weightRecall(testSet, produceSet))
    print(produceSet)