Code example #1
File: test.py Project: shockline/KnowlegeableCNN
def main3():
    rng = numpy.random.RandomState(23455)

    docList = ttList.TypedListType(
        ttList.TypedListType(TensorType(theano.config.floatX,
                                        (False, False))))("docList")
    docLabel = T.ivector('docLabel')
    layer0 = DocEmbeddingNN(docList, rng, 4)

    layer1 = HiddenLayer(rng,
                         input=layer0.output,
                         n_in=layer0.outputDimension,
                         n_out=10,
                         activation=T.tanh)

    layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=10)
    cost = layer2.negative_log_likelihood(docLabel)
    params = layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    f = theano.function([docList], layer2.y_pred)

    a = [[[[2, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
          [[1, 2, 4, 4], [1, 2, 3, 4]]],
         [[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
          [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]]]
    print f(a)
    print "All finished!"
Code example #2
File: test.py Project: shockline/KnowlegeableCNN
def main6():
    print "Start!"
    d = [[[[2, 2, 3, 4], [1, 2, 3, 4], [3, 1, 2, 3], [6, 4, 2, 1],
           [0, 0, 0, 0]],
          [[4, 3, 2, 1], [4, 6, 9, 2], [6, 6, 3, 1], [2, 5, 2, 9],
           [3, 2, 1, 7]]],
         [[[9, 8, 7, 6], [5, 4, 3, 2], [1, 9, 8, 7], [6, 5, 4, 3],
           [0, 0, 0, 0]],
          [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
           [0, 0, 0, 0]]]]
    mr = numpy.max([numpy.max([2, 1]), numpy.max([[4, 5], [4, 0]])])

    #     docSentenceCount = [2, 1]
    #     sentenceWordCount = [[4, 5], [4, 0]]
    docSentenceCount = T.fvector("docSentenceCount")
    sentenceWordCount = T.fmatrix("sentenceWordCount")
    corpus = T.ftensor4("corpus")
    rng = numpy.random.RandomState(23455)

    layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng,
                            4, mr, 3, [2, 2], 2, [1, 2])
    s = layer0.output.sum()
    g = theano.grad(s, layer0.params)
    print "Compiling!"
    f = theano.function([corpus, docSentenceCount, sentenceWordCount], g)
    print "Compiled!"
    print f(d, [2, 1], [[4, 5], [4, 0]])
    print "All finished!"
Code example #3
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"
    rng = numpy.random.RandomState(23455)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')

    # for list-type data
    layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
                 sentenceLayerNodesNum=50, \
                 sentenceLayerNodesSize=[5, 200], \
                 docLayerNodesNum=10, \
                 docLayerNodesSize=[3, 50],
                 pooling_mode=pooling_mode)
    # 	layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
    # 													 sentenceLayerNodesNum=100, \
    # 													 sentenceLayerNodesSize=[5, 200], \
    # 													 docLayerNodesNum=100, \
    # 													 docLayerNodesSize=[3, 100],
    # 													 pooling_mode=pooling_mode)

    layer1 = HiddenLayer(rng,
                         input=layer0.output,
                         n_in=layer0.outputDimension,
                         n_out=10,
                         activation=T.tanh)

    layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=2)

    # construct the parameter array.
    params = layer2.params + layer1.params + layer0.params

    # Optionally load the parameters saved last time.

    # 	data_name = "car"

    para_path = "data/" + data_name + "/model/" + pooling_mode + ".model"
    traintext = "data/" + data_name + "/train/text"
    trainlabel = "data/" + data_name + "/train/label"
    testtext = "data/" + test_dataname + "/test/text"
    testlabel = "data/" + test_dataname + "/test/label"

    loadParamsVal(para_path, params)

    if (mode == "train" or mode == "test"):
        learning_rate = 0.1
        error = layer2.errors(docLabel)
        cost = layer2.negative_log_likelihood(docLabel)

        grads = T.grad(cost, params)

        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(params, grads)]

        print "Loading test data."
        cr_test = CorpusReader(minDocSentenceNum=5,
                               minSentenceWordNum=5,
                               dataset=testtext,
                               labelset=testlabel)
        validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus(
            [0, 1000])

        # 		print "Right answer: "
        # 		print zip(validIds, validLabels)

        validDocMatrixes = transToTensor(validDocMatrixes,
                                         theano.config.floatX)
        validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
        validSentenceWordNums = transToTensor(validSentenceWordNums,
                                              numpy.int32)
        validLabels = transToTensor(validLabels, numpy.int32)
        print "Data loaded."

        valid_model = theano.function(
            [], [
                cost, error, layer2.y_pred, docLabel,
                T.transpose(layer2.p_y_given_x)[1]
            ],
            givens={
                corpus: validDocMatrixes,
                docSentenceCount: validDocSentenceNums,
                sentenceWordCount: validSentenceWordNums,
                docLabel: validLabels
            })

        # ####Validate the model####
        costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
        print "Valid current model:"
        print "Cost: ", costNum
        print "Error: ", errorNum
        # 		print "Valid Pred: ", pred_label
        # 		print "pred_prob: ", pred_prob

        fpr, tpr, _ = roc_curve(real_label, pred_prob)
        if mode == "test":
            print "tpr_all: ", tpr
            print "fpr_all: ", fpr
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "test_dataname: ", test_dataname
        print "ROC: ", roc_auc

        fpr, tpr, threshold = roc_curve(real_label, pred_label)

        index_of_one = list(threshold).index(1)
        ar = (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
        print "TPR: ", tpr[index_of_one]
        print "FPR: ", fpr[index_of_one]
        print "AR: ", ar
        print "threshold: ", threshold[index_of_one]
        if mode == "test":
            valid_model.free()
            return errorNum, roc_auc, tpr[index_of_one], fpr[index_of_one], ar

        print "Loading train data."
        cr_train = CorpusReader(minDocSentenceNum=5,
                                minSentenceWordNum=5,
                                dataset=traintext,
                                labelset=trainlabel)
        docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus(
            [0, 100000])

        # 		print "Right answer: "
        # 		print zip(ids, labels)

        docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
        docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
        sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
        labels = transToTensor(labels, numpy.int32)

        # 	valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
        print
        index = T.lscalar("index")
        batchSize = 10
        n_batches = (len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1
        print
        print "Train set size is ", len(docMatrixes.get_value())
        print "Validating set size is ", len(validDocMatrixes.get_value())
        print "Batch size is ", batchSize
        print "Number of training batches  is ", n_batches

        print "Compiling computing graph."

        # for list-type data
        train_model = theano.function(
            [index], [cost, error, layer2.y_pred, docLabel],
            updates=updates,
            givens={
                corpus:
                docMatrixes,
                docSentenceCount:
                docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
                sentenceWordCount:
                sentenceWordNums,
                docLabel:
                labels[index * batchSize:(index + 1) * batchSize]
            })

        print "Compiled."
        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0

        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            for i in range(n_batches):
                # for list-type data
                costNum, errorNum, pred_label, real_label = train_model(i)
                ite = ite + 1
                # for padding data
                # 			costNum, errorNum = train_model(docMatrixes, labels)
                # 			del docMatrixes, docSentenceNums, sentenceWordNums, labels
                # print ".",
                if (ite % 10 == 0):
                    print
                    print "@iter: ", ite
                    print "Cost: ", costNum
                    print "Error: ", errorNum

            # Validate the model
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
            print "Valid current model:"
            print "Cost: ", costNum
            print "Error: ", errorNum
            # 			print "pred_prob: ", pred_prob
            # 			print "Valid Pred: ", pred_label

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "test_dataname: ", test_dataname
            print "ROC: ", roc_auc

            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]
            # Save model
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
        valid_model.free()
        train_model.free()
    elif (mode == "deploy"):
        print "Compiling computing graph."
        output_model = theano.function(
            [corpus, docSentenceCount, sentenceWordCount], [layer2.y_pred])
        print "Compiled."
        cr = CorpusReader(minDocSentenceNum=5,
                          minSentenceWordNum=5,
                          dataset="data/train_valid/split")
        count = 21000
        while (count <= 21000):
            docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus(
                [count, count + 100])
            docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
            docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
            sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
            print "start to predict."
            pred_y = output_model(docMatrixes, docSentenceNums,
                                  sentenceWordNums)
            print "End predicting."
            print "Writing resfile."
            # 		print zip(ids, pred_y[0])
            f = file("data/test/res/res" + str(count), "w")
            f.write(str(zip(ids, pred_y[0])))
            f.close()
            print "Written." + str(count)
            count += 100
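
train_model above combines two standard Theano idioms: plain SGD written as an updates list of (param, param - learning_rate * grad) pairs, and a training set held in shared variables that givens slices with a symbolic batch index. The sketch below shows the same pattern on a toy logistic regression; the data, model, and batch size are made up for illustration (assumes Theano and numpy):

import numpy
import theano
import theano.tensor as T

rng = numpy.random.RandomState(23455)
# Toy dataset stored in shared variables, so it lives on the compute device.
data = theano.shared(rng.randn(100, 5).astype(theano.config.floatX))
target = theano.shared(rng.randint(0, 2, size=100).astype(numpy.int32))

index = T.lscalar("index")
x = T.matrix("x")
y = T.ivector("y")
W = theano.shared(numpy.zeros((5, 2), dtype=theano.config.floatX), name="W")
b = theano.shared(numpy.zeros(2, dtype=theano.config.floatX), name="b")

p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
cost = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])   # negative log likelihood

params = [W, b]
grads = T.grad(cost, params)
learning_rate = 0.1
updates = [(p, p - learning_rate * g) for p, g in zip(params, grads)]

batchSize = 10
train_model = theano.function(
    [index], cost,
    updates=updates,
    givens={x: data[index * batchSize:(index + 1) * batchSize],
            y: target[index * batchSize:(index + 1) * batchSize]})

for i in range(10):                       # one epoch over the 10 mini-batches
    print(train_model(i))
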
Code example #4
def work():
	print "Started!"
	
	print "Loading data."
	cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/split", labelset="data/traindataset2zgb")
	docMatrixes, docSentenceNums, sentenceWordNums, labels = transToTensor(cr.getCorpus([0, 12]))
	
# 	valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
	validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validLabels = transToTensor(cr.getCorpus([800, 870]))
	print "Data loaded."
	
	
	learning_rate = 0.1
	docSentenceCount = T.vector("docSentenceCount")
	sentenceWordCount = T.vector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel') 
	index = T.lscalar("index")
	rng = numpy.random.RandomState(23455)
	batchSize = 1
	mr =numpy.max([len(docMatrixes.get_value()), len(validDocMatrixes.get_value())])
	n_batches = (len(docSentenceNums.get_value()) -1 ) / batchSize
	
	print "Train set size is ", len(docMatrixes.get_value())
	print "Validating set size is ", len(validDocMatrixes.get_value())
	print "Batch size is ", batchSize
	print "Number of training batches  is ", n_batches
	
	# for list-type data
	layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
													 maxRandge=mr, \
													 sentenceLayerNodesNum=100, \
													 sentenceLayerNodesSize=[5, 200], \
													 docLayerNodesNum=100, \
													 docLayerNodesSize=[3, 100])
	# for padding data
# 	layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, corpus_shape=(batchSize, cr.getMaxDocSentenceNum(), cr.getMaxSentenceWordNum(), cr.getDim()), \
# 													 maxRandge=mr, \
# 													 sentenceLayerNodesNum=100, \
# 													 sentenceLayerNodesSize=5, \
# 													 docLayerNodesNum=200, \
# 													 docLayerNodesSize=3)

	layer1 = HiddenLayer(
		rng,
		input=layer0.output,
		n_in=layer0.outputDimension,
		n_out=100,
		activation=T.tanh
	)
	
	layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)

	error = layer2.errors(docLabel)
	cost = layer2.negative_log_likelihood(docLabel)
	
	# construct the parameter array.
	params = layer2.params + layer1.params + layer0.params
	
	# Optionally load the parameters saved last time.
	loadParamsVal(params)

	grads = T.grad(cost, params)

	updates = [
		(param_i, param_i - learning_rate * grad_i)
		for param_i, grad_i in zip(params, grads)
	]
	
	
	print "Compiling computing graph."
	
	valid_model = theano.function(
 		[],
 		[cost, error],
 		givens={
						corpus: validDocMatrixes,
						docSentenceCount: validDocSentenceNums,
						sentenceWordCount: validSentenceWordNums,
						docLabel: validLabels
				}
 	)
	# for list-type data
	train_model = theano.function(
 		[index],
 		[cost, error],
 		updates=updates,
 		givens={
						corpus: docMatrixes,
						docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
						sentenceWordCount: sentenceWordNums,
						docLabel: labels[index * batchSize: (index + 1) * batchSize]
					}
 	)
	
	# for padding data
	# 	train_model = theano.function(
	# 		[corpus, docLabel],
	# 		[cost, error],
	# 		updates=updates,
	# 	)
	print "Compiled."
	
	print "Start to train."
	epoch = 0
	n_epochs = 200
	ite = 0
	
	# ####Validate the model####
	costNum, errorNum = valid_model()
	print "Valid current model:"
	print "Cost: ", costNum
	print "Error: ", errorNum
	
	while (epoch < n_epochs):
		epoch = epoch + 1
		#######################
		for i in range(n_batches):
			# for list-type data
			costNum, errorNum = train_model(i)
			ite = ite + 1
			# for padding data
# 			costNum, errorNum = train_model(docMatrixes, labels)
# 			del docMatrixes, docSentenceNums, sentenceWordNums, labels
			# print ".", 
			if(ite % 1 == 0):
				print
				print "@iter: ", ite
				print "Cost: ", costNum
				print "Error: ", errorNum
				
		# Validate the model
		costNum, errorNum = valid_model()
		print "Valid current model:"
		print "Cost: ", costNum
		print "Error: ", errorNum
		
		# Save model
		print "Saving parameters."
		saveParamsVal(params)
		print "Saved."
	print "All finished!"
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"

    data_names = data_name.split(":")
    data_count = len(data_names)
    print "Train dataset:"
    for i in xrange(data_count):
        print "%d: %s" % (i, data_names[i])

    print "Test dataset:"
    test_data_names = test_dataname.split(":")
    test_data_count = len(test_data_names)
    for i in xrange(test_data_count):
        print "%d: %s" % (i, test_data_names[i])

    if test_data_count != data_count:
        raise Exception(
            "The number of test and train datasets must be the same.")

    rng = numpy.random.RandomState(23455)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')

    sentenceW = None
    sentenceB = None
    docW = None
    docB = None

    hidden_layer_w = None
    hidden_layer_b = None
    logistic_layer_w = None
    logistic_layer_b = None
    layer0 = list()
    layer1 = list()
    layer2 = list()
    local_params = list()
    # for list-type data
    for i in xrange(data_count):
        layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
                     sentenceLayerNodesNum=50, \
                     sentenceLayerNodesSize=[5, 200], \
                     docLayerNodesNum=10, \
                     docLayerNodesSize=[3, 50],
                     sentenceW=sentenceW,
                     sentenceB=sentenceB,
                     docW=docW,
                     docB=docB,
                     pooling_mode=pooling_mode))

        sentenceW = layer0[i].sentenceW
        sentenceB = layer0[i].sentenceB
        docW = layer0[i].docW
        docB = layer0[i].docB

        layer1.append(
            HiddenLayer(rng,
                        input=layer0[i].output,
                        n_in=layer0[i].outputDimension,
                        n_out=10,
                        activation=T.tanh,
                        W=hidden_layer_w,
                        b=hidden_layer_b))

        hidden_layer_w = layer1[i].W
        hidden_layer_b = layer1[i].b

        layer2.append(
            LogisticRegression(input=layer1[i].output,
                               n_in=10,
                               n_out=2,
                               W=logistic_layer_w,
                               b=logistic_layer_b))
        # 		logistic_layer_w = layer2[i].W
        # 		logistic_layer_b = layer2[i].b

        local_params.append(layer2[i].params)

    share_params = list(layer0[0].params + layer1[0].params)
    # construct the parameter array.
    params = list(layer0[0].params) + layer1[0].params

    for i in xrange(data_count):
        params += layer2[i].params

# 	data_name = "car"

    para_path = "data/" + data_name + "/share_hidden_low_model/" + pooling_mode + ".model"
    traintext = [
        "data/" + data_names[i] + "/train/text" for i in xrange(data_count)
    ]
    trainlabel = [
        "data/" + data_names[i] + "/train/label" for i in xrange(data_count)
    ]
    testtext = [
        "data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)
    ]
    testlabel = [
        "data/" + test_data_names[i] + "/test/label"
        for i in xrange(data_count)
    ]

    # Optionally load the parameters saved last time.
    loadParamsVal(para_path, params)

    if (mode == "train" or mode == "test"):
        train_model = list()
        valid_model = list()
        print "Loading train data."
        batchSize = 10
        share_learning_rate = 0.1
        local_learning_rate = 0.1
        n_batches = list()

        print "Loading test data."

        all_pred_label = list()
        all_real_label = list()
        all_pred_prob = list()
        for i in xrange(data_count):
            cr_train = CorpusReader(minDocSentenceNum=5,
                                    minSentenceWordNum=5,
                                    dataset=traintext[i],
                                    labelset=trainlabel[i])
            docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus(
                [0, 100000])

            docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
            docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
            sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
            labels = transToTensor(labels, numpy.int32)

            index = T.lscalar("index")

            n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) /
                             batchSize + 1)
            print "Dataname: %s" % data_names[i]
            print "Train set size is ", len(docMatrixes.get_value())
            print "Batch size is ", batchSize
            print "Number of training batches  is ", n_batches[i]
            error = layer2[i].errors(docLabel)
            cost = layer2[i].negative_log_likelihood(docLabel)

            share_grads = T.grad(cost, share_params)
            share_updates = [
                (param_i, param_i - share_learning_rate * grad_i)
                for param_i, grad_i in zip(share_params, share_grads)
            ]

            grads = T.grad(cost, local_params[i])
            local_updates = [
                (param_i, param_i - local_learning_rate * grad_i)
                for param_i, grad_i in zip(local_params[i], grads)
            ]
            updates = share_updates + local_updates
            print "Compiling train computing graph."
            if mode == "train":
                train_model.append(
                    theano.function(
                        [index], [cost, error, layer2[i].y_pred, docLabel],
                        updates=updates,
                        givens={
                            corpus:
                            docMatrixes,
                            docSentenceCount:
                            docSentenceNums[index *
                                            batchSize:(index + 1) * batchSize +
                                            1],
                            sentenceWordCount:
                            sentenceWordNums,
                            docLabel:
                            labels[index * batchSize:(index + 1) * batchSize]
                        }))
            print "Compiled."

            print "Load test dataname: %s" % test_data_names[i]
            cr_test = CorpusReader(minDocSentenceNum=5,
                                   minSentenceWordNum=5,
                                   dataset=testtext[i],
                                   labelset=testlabel[i])
            validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus(
                [0, 1000])
            validDocMatrixes = transToTensor(validDocMatrixes,
                                             theano.config.floatX)
            validDocSentenceNums = transToTensor(validDocSentenceNums,
                                                 numpy.int32)
            validSentenceWordNums = transToTensor(validSentenceWordNums,
                                                  numpy.int32)
            validLabels = transToTensor(validLabels, numpy.int32)
            print "Validating set size is ", len(validDocMatrixes.get_value())
            print "Data loaded."

            print "Compiling test computing graph."
            valid_model.append(
                theano.function(
                    [], [
                        cost, error, layer2[i].y_pred, docLabel,
                        T.transpose(layer2[i].p_y_given_x)[1]
                    ],
                    givens={
                        corpus: validDocMatrixes,
                        docSentenceCount: validDocSentenceNums,
                        sentenceWordCount: validSentenceWordNums,
                        docLabel: validLabels
                    }))
            print "Compiled."
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model[i]()

            all_pred_label.extend(pred_label)
            all_real_label.extend(real_label)
            all_pred_prob.extend(pred_prob)

            print "Valid current model :", data_names[i]
            print "Cost: ", costNum
            print "Error: ", errorNum

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "ROC: ", roc_auc
            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            if 1 in threshold:
                index_of_one = list(threshold).index(1)
                print "TPR: ", tpr[index_of_one]
                print "FPR: ", fpr[index_of_one]
                print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
                print "threshold: ", threshold[index_of_one]

        print "Valid current model :", data_names
        errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
        print "Error: ", errorNum

        fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
        if mode == "test":
            print "tpr_all: ", tpr
            print "fpr_all: ", fpr
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "ROC: ", roc_auc
        fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
        if 1 in threshold:
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]

        if mode == "test":
            return

        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0

        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            for i in range(max(n_batches)):
                for dataset_index in xrange(data_count):
                    if i >= n_batches[dataset_index]:
                        continue
                    # for list-type data
                    costNum, errorNum, pred_label, real_label = train_model[dataset_index](i)
                    ite = ite + 1
                    # for padding data
                    if (ite % 10 == 0):
                        print
                        print "Dataset name: ", data_names[dataset_index]
                        print "@iter: ", ite
                        print "Cost: ", costNum
                        print "Error: ", errorNum

            # Validate the model
            all_pred_label = list()
            all_real_label = list()
            all_pred_prob = list()
            for dataset_index in xrange(data_count):
                costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]()

                all_pred_label.extend(pred_label)
                all_real_label.extend(real_label)
                all_pred_prob.extend(pred_prob)

                print "Valid current model :", data_names[dataset_index]
                print "Cost: ", costNum
                print "Error: ", errorNum

                fpr, tpr, _ = roc_curve(real_label, pred_prob)
                roc_auc = auc(fpr, tpr)
                print "data_name: ", data_name
                print "ROC: ", roc_auc

                fpr, tpr, threshold = roc_curve(real_label, pred_label)
                index_of_one = list(threshold).index(1)
                print "TPR: ", tpr[index_of_one]
                print "FPR: ", fpr[index_of_one]
                print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
                print "threshold: ", threshold[index_of_one]

            print "Valid current model :", data_names
            errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
            print "Error: ", errorNum

            fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "ROC: ", roc_auc
            fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]
            # Save model
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
def work(mode, data_name, test_dataname):
	print "mode: ", mode
	print "data_name: ", data_name
	print "Started!"
	
	data_names = data_name.split(":")
	data_count = len(data_names)
	print "Train dataset:"
	for i in xrange(data_count):
		print "%d: %s" % (i, data_names[i])
		
	print "Test dataset:"
	test_data_names = test_dataname.split(":")
	test_data_count = len(test_data_names)
	for i in xrange(test_data_count):
		print "%d: %s" % (i, test_data_names[i])
	
	if test_data_count != data_count:
		raise Exception("The amount of test and train dataset must be the same.")
	
	rng = numpy.random.RandomState(23455)
	docSentenceCount = T.ivector("docSentenceCount")
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel')
	
	hidden_layer_w = None
	hidden_layer_b = None
	logistic_layer_w = None
	logistic_layer_b = None
	layer0 = list()
	layer1 = list()
	layer2 = list()
	local_params = list()
	# for list-type data
	for i in xrange(data_count):
		layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
														 sentenceLayerNodesNum=100, \
														 sentenceLayerNodesSize=[5, 200], \
														 docLayerNodesNum=100, \
														 docLayerNodesSize=[3, 100]))

		layer1.append(HiddenLayer(
			rng,
			input=layer0[i].output,
			n_in=layer0[i].outputDimension,
			n_out=100,
			activation=T.tanh,
			W=hidden_layer_w,
			b=hidden_layer_b
		))
		
		hidden_layer_w = layer1[i].W
		hidden_layer_b = layer1[i].b
	
		layer2.append(LogisticRegression(input=layer1[i].output, n_in=100, n_out=2, W=logistic_layer_w, b=logistic_layer_b))
		logistic_layer_w = layer2[i].W
		logistic_layer_b = layer2[i].b
		
		local_params.append(layer2[i].params + layer1[i].params + layer0[i].params)

	# construct the parameter array.
	params = layer2[0].params + layer1[0].params
	
	for i in xrange(data_count):
		params += layer0[i].params
		
	# Optionally load the parameters saved last time.
	
# 	data_name = "car"
	
	para_path = "data/" + data_name + "/model/scnn.model"
	traintext = ["data/" + data_names[i] + "/train/text"  for i in xrange(data_count)]
	trainlabel = ["data/" + data_names[i] + "/train/label"  for i in xrange(data_count)]
	testtext = ["data/" + test_data_names[i] + "/test/text"  for i in xrange(data_count)]
	testlabel =  ["data/" + test_data_names[i] + "/test/label"  for i in xrange(data_count)]
	
	loadParamsVal(para_path, params)

	if(mode == "train"):
		train_model = list()
		valid_model = list()
		print "Loading train data."
		batchSize = 10
		learning_rate = 0.1
		n_batches = list()
		
		print "Loading test data."
 		
		for i in xrange(data_count):
			cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i])
			docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus([0, 100000])
			
			docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
			docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
			sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
			labels = transToTensor(labels, numpy.int32)
			
			index = T.lscalar("index")
			
			n_batches.append((len(docSentenceNums.get_value()) - 1) / batchSize + 1)
			print "Dataname: %s" % data_names[i]
			print "Train set size is ", len(docMatrixes.get_value())
			print "Batch size is ", batchSize
			print "Number of training batches  is ", n_batches[i]
			error = layer2[i].errors(docLabel)
			cost = layer2[i].negative_log_likelihood(docLabel)
			
			grads = T.grad(cost, local_params[i])
		
			updates = [
				(param_i, param_i - learning_rate * grad_i)
				for param_i, grad_i in zip(local_params[i], grads)
			]
			print "Compiling train computing graph."
			
			train_model.append(theano.function(
		 		[index],
		 		[cost, error, layer2[i].y_pred, docLabel],
		 		updates=updates,
		 		givens={
								corpus: docMatrixes,
								docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
								sentenceWordCount: sentenceWordNums,
								docLabel: labels[index * batchSize: (index + 1) * batchSize]
							}
	 		))
			print "Compiled."
			
			print "Load test dataname: %s" % test_data_names[i]
			cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext[i], labelset=testlabel[i])
			validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels = cr_test.getCorpus([0, 1000])
			validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
			validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
			validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
			validLabels = transToTensor(validLabels, numpy.int32)
			print "Validating set size is ", len(validDocMatrixes.get_value())
			print "Data loaded."
			
			print "Compiling test computing graph."
			valid_model.append(theano.function(
		 		[],
		 		[cost, error, layer2[i].y_pred, docLabel, T.transpose(layer2[i].p_y_given_x)[1]],
		 		givens={
								corpus: validDocMatrixes,
								docSentenceCount: validDocSentenceNums,
								sentenceWordCount: validSentenceWordNums,
								docLabel: validLabels
						}
		 	))
			print "Compiled."
		# for list-type data

		print "Start to train."
		epoch = 0
		n_epochs = 2000
		ite = 0
		
		# ####Validate the model####
		for dataset_index in xrange(data_count):
			costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]()
			print "Valid current model :", data_names[dataset_index]
			print "Cost: ", costNum
			print "Error: ", errorNum
# 			print "Valid Pred: ", pred_label
# 			print "pred_prob: ", pred_prob
	 		
			fpr, tpr, _ = roc_curve(real_label, pred_prob)
			roc_auc = auc(fpr, tpr)
			print "data_name: ", data_name
			print "test_dataname: ", test_dataname
			print "ROC: ", roc_auc
			
		while (epoch < n_epochs):
			epoch = epoch + 1
			#######################
			for i in range(max(n_batches)):
				for dataset_index in xrange(data_count):
					if i >= n_batches[dataset_index]:
						continue
					# for list-type data
					costNum, errorNum, pred_label, real_label = train_model[dataset_index](i)
					ite = ite + 1
					# for padding data
		# 			costNum, errorNum = train_model(docMatrixes, labels)
		# 			del docMatrixes, docSentenceNums, sentenceWordNums, labels
					# print ".", 
					if(ite % 10 == 0):
						print
						print "Dataset name: ", data_names[dataset_index]
						print "@iter: ", ite
						print "Cost: ", costNum
						print "Error: ", errorNum
						
			# Validate the model
			for dataset_index in xrange(data_count):
				costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]()
				print "Valid current model :", data_names[dataset_index]
				print "Cost: ", costNum
				print "Error: ", errorNum
	# 			print "Valid Pred: ", pred_label
	# 			print "pred_prob: ", pred_prob
		 		
				fpr, tpr, _ = roc_curve(real_label, pred_prob)
				roc_auc = auc(fpr, tpr)
				print "data_name: ", data_name
				print "test_dataname: ", test_dataname
				print "ROC: ", roc_auc
		
				# Save model
				print "Saving parameters."
				saveParamsVal(para_path, params)
				print "Saved."
# 	elif(mode == "deploy"):
# 		print "Compiling computing graph."
# 		output_model = theano.function(
# 	 		[corpus, docSentenceCount, sentenceWordCount],
# 	 		[layer2.y_pred]
# 	 	)
# 		print "Compiled."
# 		cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split")
# 		count = 21000
# 		while(count <= 21000):
# 			docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100])
# 			docMatrixes = numpy.matrix(
# 			            docMatrixes,
# 			            dtype=theano.config.floatX
# 			        )
# 			docSentenceNums = numpy.array(
# 			            docSentenceNums,
# 			            dtype=numpy.int32
# 			        )
# 			sentenceWordNums = numpy.array(
# 			            sentenceWordNums,
# 			            dtype=numpy.int32
# 			        )
# 			print "start to predict."
# 			pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums)
# 			print "End predicting."
# 			print "Writing resfile."
# 	# 		print zip(ids, pred_y[0])
# 			f = file("data/test/res/res" + str(count), "w")
# 			f.write(str(zip(ids, pred_y[0])))
# 			f.close()
# 			print "Written." + str(count)
# 			count += 100
		
		
	print "All finished!"