Example #1
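All of the examples in this listing assume roughly the same set of imports, which the listing omits. A reconstruction (the sklearn origin of roc_curve/auc and the project-local module names are inferred from usage, not shown in the original):

import codecs
import string
import numpy
import theano
import theano.tensor as T
from sklearn.metrics import roc_curve, auc  # used by the train/deploy example

# Project-local dependencies, names inferred from usage:
# CorpusReader, DocEmbeddingNN, DocEmbeddingNNOneDoc, HiddenLayer,
# LogisticRegression, transToTensor, loadParamsVal, saveParamsVal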
def work():
	print "Started!"
	
	print "Loading data."
	cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/split", labelset="data/traindataset2zgb")
	docMatrixes, docSentenceNums, sentenceWordNums, labels = transToTensor(cr.getCorpus([0, 12]))
	
# 	valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
	validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validLabels = transToTensor(cr.getCorpus([800, 870]))
	print "Data loaded."
	
	
	learning_rate = 0.1
	docSentenceCount = T.vector("docSentenceCount")
	sentenceWordCount = T.vector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel') 
	index = T.lscalar("index")
	rng = numpy.random.RandomState(23455)
	batchSize = 1
	mr = numpy.max([len(docMatrixes.get_value()), len(validDocMatrixes.get_value())])
	n_batches = (len(docSentenceNums.get_value()) - 1) / batchSize
	
	print "Train set size is ", len(docMatrixes.get_value())
	print "Validating set size is ", len(validDocMatrixes.get_value())
	print "Batch size is ", batchSize
	print "Number of training batches  is ", n_batches
	
	# for list-type data: layer0 presumably builds sentence embeddings with a
	# [5, 200] convolution over the word vectors, then document embeddings with
	# a [3, 100] convolution over sentences (inferred from the parameter names).
	layer0 = DocEmbeddingNN(
		corpus, docSentenceCount, sentenceWordCount, rng,
		wordEmbeddingDim=200,
		maxRandge=mr,
		sentenceLayerNodesNum=100,
		sentenceLayerNodesSize=[5, 200],
		docLayerNodesNum=100,
		docLayerNodesSize=[3, 100]
	)
	# for padding data
# 	layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, corpus_shape=(batchSize, cr.getMaxDocSentenceNum(), cr.getMaxSentenceWordNum(), cr.getDim()), \
# 													 maxRandge=mr, \
# 													 sentenceLayerNodesNum=100, \
# 													 sentenceLayerNodesSize=5, \
# 													 docLayerNodesNum=200, \
# 													 docLayerNodesSize=3)

	layer1 = HiddenLayer(
		rng,
		input=layer0.output,
		n_in=layer0.outputDimension,
		n_out=100,
		activation=T.tanh
	)
	
	layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)

	error = layer2.errors(docLabel)
	cost = layer2.negative_log_likelihood(docLabel)
	
	# construct the parameter array.
	params = layer2.params + layer1.params + layer0.params
	
	# Optionally load the parameters saved by a previous run.
	loadParamsVal(params)

	grads = T.grad(cost, params)

	# Plain SGD: move each parameter one step of size learning_rate
	# against its gradient.
	updates = [
		(param_i, param_i - learning_rate * grad_i)
		for param_i, grad_i in zip(params, grads)
	]
	
	
	print "Compiling computing graph."
	
	valid_model = theano.function(
		[],
		[cost, error],
		givens={
			corpus: validDocMatrixes,
			docSentenceCount: validDocSentenceNums,
			sentenceWordCount: validSentenceWordNums,
			docLabel: validLabels
		}
	)
	# for list-type data
	# Note the extra entry in the docSentenceCount slice: docSentenceNums
	# apparently stores sentence-count boundaries, so a batch of B documents
	# needs B + 1 of them.
	train_model = theano.function(
		[index],
		[cost, error],
		updates=updates,
		givens={
			corpus: docMatrixes,
			docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
			sentenceWordCount: sentenceWordNums,
			docLabel: labels[index * batchSize: (index + 1) * batchSize]
		}
	)
	
	# for padding data
	# 	train_model = theano.function(
	# 		[corpus, docLabel],
	# 		[cost, error],
	# 		updates=updates,
	# 	)
	print "Compiled."
	
	print "Start to train."
	epoch = 0
	n_epochs = 200
	ite = 0
	
	# ####Validate the model####
	costNum, errorNum = valid_model()
	print "Valid current model:"
	print "Cost: ", costNum
	print "Error: ", errorNum
	
	while (epoch < n_epochs):
		epoch = epoch + 1
		#######################
		for i in range(n_batches):
			# for list-type data
			costNum, errorNum = train_model(i)
			ite = ite + 1
			# for padding data
# 			costNum, errorNum = train_model(docMatrixes, labels)
# 			del docMatrixes, docSentenceNums, sentenceWordNums, labels
			# print ".", 
			if(ite % 1 == 0):
				print
				print "@iter: ", ite
				print "Cost: ", costNum
				print "Error: ", errorNum
				
		# Validate the model
		costNum, errorNum = valid_model()
		print "Valid current model:"
		print "Cost: ", costNum
		print "Error: ", errorNum
		
		# Save model
		print "Saving parameters."
		saveParamsVal(params)
		print "Saved."
	print "All finished!"
Example #2

def work():
	print "Started!"
	
	print "Loading data."
	cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/split", labelset="data/traindataset2zgb")
# 	valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
	validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validLabels = cr.getCorpus([800, 870])
	print "Data loaded."
	
	mr = numpy.max([cr.getMaxDocSentenceNum(), cr.getMaxSentenceWordNum()])
	learning_rate = 0.01
	docSentenceCount = T.fvector("docSentenceCount")
	sentenceWordCount = T.fmatrix("sentenceWordCount")
	corpus = T.ftensor4("corpus")
	docLabel = T.ivector('docLabel') 
	rng = numpy.random.RandomState(23455)
	batchSize = 40
	# for list-type data
	layer0 = DocEmbeddingNN(
		corpus, docSentenceCount, sentenceWordCount, rng,
		wordEmbeddingDim=200,
		maxRandge=mr,
		sentenceLayerNodesNum=100,
		sentenceLayerNodesSize=[5, 200],
		docLayerNodesNum=100,
		docLayerNodesSize=[3, 100]
	)
	# for padding data
# 	layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, corpus_shape=(batchSize, cr.getMaxDocSentenceNum(), cr.getMaxSentenceWordNum(), cr.getDim()), \
# 													 maxRandge=mr, \
# 													 sentenceLayerNodesNum=100, \
# 													 sentenceLayerNodesSize=5, \
# 													 docLayerNodesNum=200, \
# 													 docLayerNodesSize=3)

	layer1 = HiddenLayer(
		rng,
		input=layer0.output,
		n_in=layer0.outputDimension,
		n_out=100,
		activation=T.tanh
	)
	
	layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)

	error = layer2.errors(docLabel)
	cost = layer2.negative_log_likelihood(docLabel)
	
	# construct the parameter array.
	params = layer2.params + layer1.params + layer0.params
	
	# Optionally load the parameters saved by a previous run.
	loadParamsVal(params)

	grads = T.grad(cost, params)

	updates = [
		(param_i, param_i - learning_rate * grad_i)
		for param_i, grad_i in zip(params, grads)
	]
	
	
	print "Compiling computing graph."
	
	# for list-type data
	train_model = theano.function(
		[corpus, docSentenceCount, sentenceWordCount, docLabel],
		[cost, error],
		updates=updates,
	)

	valid_model = theano.function(
		[corpus, docSentenceCount, sentenceWordCount, docLabel],
		[cost, error]
	)

	# for padding data
	# 	train_model = theano.function(
	# 		[corpus, docLabel],
	# 		[cost, error],
	# 		updates=updates,
	# 	)
	print "Compiled."
	
	print "Start to train."
	epoch = 0
	n_epochs = 200
	ite = 0
	
	while (epoch < n_epochs):
		epoch = epoch + 1
		# ####Validate the model####
		# for list-type data
		costNum, errorNum = valid_model(validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validLabels)
		# for padding data
		# 	costNum, errorNum = train_model(docMatrixes, labels)
		print "Valid current model:"
		print "Cost: ", costNum
		print "Error: ", errorNum
		#######################
		for i in range(1000):
			
# 			if(i * batchSize >= 800):
# 				break
			
			docInfo = cr.getCorpus([i * batchSize, numpy.min([ (i + 1) * batchSize, 800])])
			if(docInfo is None):
				break
			ite = ite + 1
			
			docMatrixes, docSentenceNums, sentenceWordNums, labels = docInfo
			# for list-type data
			costNum, errorNum = train_model(docMatrixes, docSentenceNums, sentenceWordNums, labels)
			
			# for padding data
# 			costNum, errorNum = train_model(docMatrixes, labels)
			del docMatrixes, docSentenceNums, sentenceWordNums, labels
			# print ".", 
			if(ite % 1 == 0):
				print
				print "@iter: ", ite
				print "Cost: ", costNum
				print "Error: ", errorNum

		print "Saving parameters."
		saveParamsVal(params)
		print "Saved."
	print "All finished!"
Example #3

def work(argv):
	print "Started!"
	rng = numpy.random.RandomState(23455)
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel') 
	
	# for list-type data
	layer0 = DocEmbeddingNNOneDoc(
		corpus, sentenceWordCount, rng,
		wordEmbeddingDim=200,
		sentenceLayerNodesNum=100,
		sentenceLayerNodesSize=[5, 200],
		docLayerNodesNum=100,
		docLayerNodesSize=[3, 100]
	)

	layer1 = HiddenLayer(
		rng,
		input=layer0.output,
		n_in=layer0.outputDimension,
		n_out=100,
		activation=T.tanh
	)
	
	layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)

	# Saliency trick: use the NLL of the flipped prediction as a pseudo-cost;
	# its gradient w.r.t. each sentence embedding, dotted with the embedding
	# itself, gives a first-order estimate of that sentence's influence on
	# the predicted label.
	cost = layer2.negative_log_likelihood(1 - layer2.y_pred)

	grads = T.grad(cost, layer0.sentenceResults)
	# diag(G.dot(S.T)) is the row-wise inner product of G and S.
	score = T.diag(T.dot(grads, T.transpose(layer0.sentenceResults)))
	
	# construct the parameter array.
	params = layer2.params + layer1.params + layer0.params
	
	
	# Load the parameters last time, optionally.
	loadParamsVal(params)
	print "Compiling computing graph."
	output_model = theano.function(
 		[corpus, sentenceWordCount],
 		[layer2.y_pred, score]
 	)
	
	print "Compiled."
	cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split")
	count = 0
	while(count <= 1000):
		info = cr.getCorpus([count, count + 1])
		count += 1
		if info is None:
			print "Pass"
			continue
		docMatrixes, _, sentenceWordNums, ids, sentences = info
		docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
		sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
		print "start to predict: %s." % ids[0]
		pred_y, g = output_model(docMatrixes, sentenceWordNums)
		print "End predicting."
		print "Writing resfile."
		
		score_sentence_list = zip(g, sentences)
		score_sentence_list.sort(key=lambda x: -x[0])
		
		with codecs.open("data/output/" + str(pred_y[0]) + "/" + ids[0], "w", 'utf-8', "ignore") as f:
			f.write("pred_y: %i\n" % pred_y[0])
			for g0, s in score_sentence_list:
				f.write("%f\t%s\n" % (g0, string.join(s, " ")))
# 		print zip(ids, pred_y[0])
# 		f = file("data/test/res/res" + str(count), "w")
# 		f.write(str(zip(ids, pred_y[0])))
# 		f.close()
		print "Written." + str(count)
		
		
	print "All finished!"
Example #4

def work(mode, data_name, test_dataname):
	print "mode: ", mode
	print "data_name: ", data_name
	print "Started!"
	rng = numpy.random.RandomState(23455)
	docSentenceCount = T.ivector("docSentenceCount")
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel') 
	
	# for list-type data
	layer0 = DocEmbeddingNN(
		corpus, docSentenceCount, sentenceWordCount, rng,
		wordEmbeddingDim=200,
		sentenceLayerNodesNum=100,
		sentenceLayerNodesSize=[5, 200],
		docLayerNodesNum=100,
		docLayerNodesSize=[3, 100]
	)

	layer1 = HiddenLayer(
		rng,
		input=layer0.output,
		n_in=layer0.outputDimension,
		n_out=100,
		activation=T.tanh
	)
	
	layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)

	# construct the parameter array.
	params = layer2.params + layer1.params + layer0.params
	
	# Optionally load the parameters saved by a previous run.
	
# 	data_name = "car"
	
	para_path = "data/" + data_name + "/model/scnn.model"
	traintext = "data/" + data_name + "/train/text"
	trainlabel = "data/" + data_name + "/train/label"
	testtext = "data/" + test_dataname + "/test/text"
	testlabel = "data/" + test_dataname + "/test/label"
	
	
	loadParamsVal(para_path, params)

	if(mode == "train"):
		print "Loading train data."
		cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext, labelset=trainlabel)
		docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus([0, 100000])
		
# 		print "Right answer: "
# 		print zip(ids, labels)
		
		docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
		docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
		sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
		labels = transToTensor(labels, numpy.int32)
		
	# 	valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
		print
		print "Loading test data."
		cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext, labelset=testlabel)
		validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels = cr_test.getCorpus([0, 1000])
		
# 		print "Right answer: "
# 		print zip(validIds, validLabels)
		
		validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
		validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
		validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
		validLabels = transToTensor(validLabels, numpy.int32)
		print "Data loaded."
		
		learning_rate = 0.1
	
		index = T.lscalar("index")
		batchSize = 10
		n_batches = (len(docSentenceNums.get_value()) - 1) / batchSize + 1
		print
		print "Train set size is ", len(docMatrixes.get_value())
		print "Validating set size is ", len(validDocMatrixes.get_value())
		print "Batch size is ", batchSize
		print "Number of training batches  is ", n_batches
		error = layer2.errors(docLabel)
		cost = layer2.negative_log_likelihood(docLabel)
		
		grads = T.grad(cost, params)
	
		updates = [
			(param_i, param_i - learning_rate * grad_i)
			for param_i, grad_i in zip(params, grads)
		]
		
		
		print "Compiling computing graph."
		
		valid_model = theano.function(
			[],
			[cost, error, layer2.y_pred, docLabel, T.transpose(layer2.p_y_given_x)[1]],
			givens={
				corpus: validDocMatrixes,
				docSentenceCount: validDocSentenceNums,
				sentenceWordCount: validSentenceWordNums,
				docLabel: validLabels
			}
		)
		# for list-type data
		train_model = theano.function(
			[index],
			[cost, error, layer2.y_pred, docLabel],
			updates=updates,
			givens={
				corpus: docMatrixes,
				docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
				sentenceWordCount: sentenceWordNums,
				docLabel: labels[index * batchSize: (index + 1) * batchSize]
			}
		)
		
		print "Compiled."
		print "Start to train."
		epoch = 0
		n_epochs = 2000
		ite = 0
		
		# ####Validate the model####
		costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
		print "Valid current model:"
		print "Cost: ", costNum
		print "Error: ", errorNum
		print "Valid Pred: ", pred_label
		print "pred_prob: ", pred_prob
		
		# ROC AUC on the validation set (roc_curve/auc presumably from sklearn.metrics).
		fpr, tpr, _ = roc_curve(real_label, pred_prob)
		roc_auc = auc(fpr, tpr)
		print "data_name: ", data_name
		print "test_dataname: ", test_dataname
		print "ROC: ", roc_auc
			
		while (epoch < n_epochs):
			epoch = epoch + 1
			#######################
			for i in range(n_batches):
				# for list-type data
				costNum, errorNum, pred_label, real_label = train_model(i)
				ite = ite + 1
				# for padding data
	# 			costNum, errorNum = train_model(docMatrixes, labels)
	# 			del docMatrixes, docSentenceNums, sentenceWordNums, labels
				# print ".", 
				if(ite % 10 == 0):
					print
					print "@iter: ", ite
					print "Cost: ", costNum
					print "Error: ", errorNum
					
			# Validate the model
			costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
			print "Valid current model:"
			print "Cost: ", costNum
			print "Error: ", errorNum
			print "pred_prob: ", pred_prob
# 			print "Valid Pred: ", pred_label
			
			fpr, tpr, _ = roc_curve(real_label, pred_prob)
			roc_auc = auc(fpr, tpr)
			print "data_name: ", data_name
			print "test_dataname: ", test_dataname
			print "ROC: ", roc_auc
			
			# Save model
			print "Saving parameters."
			saveParamsVal(para_path, params)
			print "Saved."
	elif(mode == "deploy"):
		print "Compiling computing graph."
		output_model = theano.function(
	 		[corpus, docSentenceCount, sentenceWordCount],
	 		[layer2.y_pred]
	 	)
		print "Compiled."
		cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split")
		count = 21000
		while(count <= 21000):
			docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100])
			docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
			docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
			sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
			print "start to predict."
			pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums)
			print "End predicting."
			print "Writing resfile."
	# 		print zip(ids, pred_y[0])
			f = file("data/test/res/res" + str(count), "w")
			f.write(str(zip(ids, pred_y[0])))
			f.close()
			print "Written." + str(count)
			count += 100
		
		
	print "All finished!"