def main3(): rng = numpy.random.RandomState(23455) docList = ttList.TypedListType( ttList.TypedListType(TensorType(theano.config.floatX, (False, False))))("docList") docLabel = T.ivector('docLabel') layer0 = DocEmbeddingNN(docList, rng, 4) layer1 = HiddenLayer(rng, input=layer0.output, n_in=layer0.outputDimension, n_out=10, activation=T.tanh) layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=10) cost = layer2.negative_log_likelihood(docLabel) params = layer2.params + layer1.params + layer0.params grads = T.grad(cost, params) f = theano.function([docList], layer2.y_pred) a = [[[[2, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], [[1, 2, 4, 4], [1, 2, 3, 4]]], [[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]]] print f(a) print "All finished!"
def main6(): print "Start!" d = [[[[2, 2, 3, 4], [1, 2, 3, 4], [3, 1, 2, 3], [6, 4, 2, 1], [0, 0, 0, 0]], [[4, 3, 2, 1], [4, 6, 9, 2], [6, 6, 3, 1], [2, 5, 2, 9], [3, 2, 1, 7]]], [[[9, 8, 7, 6], [5, 4, 3, 2], [1, 9, 8, 7], [6, 5, 4, 3], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]] mr = numpy.max([numpy.max([2, 1]), numpy.max([[4, 5], [4, 0]])]) # docSentenceCount = [2, 1] # sentenceWordCount = [[4, 5], [4, 0]] docSentenceCount = T.fvector("docSentenceCount") sentenceWordCount = T.fmatrix("sentenceWordCount") corpus = T.ftensor4("corpus") rng = numpy.random.RandomState(23455) layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, 4, mr, 3, [2, 2], 2, [1, 2]) s = layer0.output.sum() g = theano.grad(s, layer0.params) print "Compiling!" f = theano.function([corpus, docSentenceCount, sentenceWordCount], g) print "Compiled!" print f(d, [2, 1], [[4, 5], [4, 0]]) print "All finished!"
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    """Train, evaluate or deploy a single-dataset SCNN document classifier.

    mode: "train" trains with per-epoch validation and checkpointing;
        "test" evaluates once and returns (errorNum, roc_auc, tpr, fpr, ar);
        "deploy" batch-predicts unlabelled documents into result files.
    data_name: dataset directory name under data/ (training data, model file).
    test_dataname: dataset directory name under data/ providing test data.
    pooling_mode: pooling flavour forwarded to DocEmbeddingNN; also names
        the on-disk model file.
    """
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"
    # Fixed seed so parameter initialisation is reproducible across runs.
    rng = numpy.random.RandomState(23455)
    # Symbolic inputs: per-document sentence counts, per-sentence word
    # counts, the stacked word-embedding matrix and the label vector.
    # (Exact index semantics live inside DocEmbeddingNN -- TODO confirm.)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')
    # for list-type data
    layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
        sentenceLayerNodesNum=50, \
        sentenceLayerNodesSize=[5, 200], \
        docLayerNodesNum=10, \
        docLayerNodesSize=[3, 50],
        pooling_mode=pooling_mode)
    # layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
    # sentenceLayerNodesNum=100, \
    # sentenceLayerNodesSize=[5, 200], \
    # docLayerNodesNum=100, \
    # docLayerNodesSize=[3, 100],
    # pooling_mode=pooling_mode)
    layer1 = HiddenLayer(rng, input=layer0.output, n_in=layer0.outputDimension, n_out=10, activation=T.tanh)
    # Binary classification head (n_out=2).
    layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=2)
    # construct the parameter array.
    params = layer2.params + layer1.params + layer0.params
    # Load the parameters last time, optionally.
    # data_name = "car"
    para_path = "data/" + data_name + "/model/" + pooling_mode + ".model"
    traintext = "data/" + data_name + "/train/text"
    trainlabel = "data/" + data_name + "/train/label"
    testtext = "data/" + test_dataname + "/test/text"
    testlabel = "data/" + test_dataname + "/test/label"
    loadParamsVal(para_path, params)
    if (mode == "train" or mode == "test"):
        learning_rate = 0.1
        error = layer2.errors(docLabel)
        cost = layer2.negative_log_likelihood(docLabel)
        grads = T.grad(cost, params)
        # Plain SGD parameter updates.
        updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)]
        print "Loading test data."
        cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext, labelset=testlabel)
        validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus([0, 1000])
        # print "Right answer: "
        # print zip(validIds, validLabels)
        validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
        validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
        validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
        validLabels = transToTensor(validLabels, numpy.int32)
        print "Data loaded."
        # The whole validation set is evaluated in one call; the second row of
        # the transposed p_y_given_x is the positive-class probability for ROC.
        valid_model = theano.function(
            [],
            [cost, error, layer2.y_pred, docLabel, T.transpose(layer2.p_y_given_x)[1]],
            givens={
                corpus: validDocMatrixes,
                docSentenceCount: validDocSentenceNums,
                sentenceWordCount: validSentenceWordNums,
                docLabel: validLabels
            })
        # ####Validate the model####
        costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
        print "Valid current model:"
        print "Cost: ", costNum
        print "Error: ", errorNum
        # print "Valid Pred: ", pred_label
        # print "pred_prob: ", pred_prob
        fpr, tpr, _ = roc_curve(real_label, pred_prob)
        if mode == "test":
            print "tpr_all: ", tpr
            print "fpr_all: ", fpr
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "test_dataname: ", test_dataname
        print "ROC: ", roc_auc
        # Hard-label ROC: the point where threshold == 1 gives the TPR/FPR of
        # the discrete classifier; AR is their balanced combination.
        fpr, tpr, threshold = roc_curve(real_label, pred_label)
        index_of_one = list(threshold).index(1)
        ar = (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
        print "TPR: ", tpr[index_of_one]
        print "FPR: ", fpr[index_of_one]
        print "AR: ", ar
        print "threshold: ", threshold[index_of_one]
        if mode == "test":
            valid_model.free()
            return errorNum, roc_auc, tpr[index_of_one], fpr[index_of_one], ar
        print "Loading train data."
        cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext, labelset=trainlabel)
        docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus([0, 100000])
        # print "Right answer: "
        # print zip(ids, labels)
        docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
        docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
        sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
        labels = transToTensor(labels, numpy.int32)
        # valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
        print
        index = T.lscalar("index")
        batchSize = 10
        n_batches = (len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1
        print
        print "Train set size is ", len(docMatrixes.get_value())
        print "Validating set size is ", len(validDocMatrixes.get_value())
        print "Batch size is ", batchSize
        print "Number of training batches is ", n_batches
        print "Compiling computing graph."
        # for list-type data
        # NOTE(review): the docSentenceCount slice deliberately takes one extra
        # element -- presumably batchSize+1 offsets delimit batchSize documents;
        # confirm against DocEmbeddingNN's indexing.
        train_model = theano.function(
            [index],
            [cost, error, layer2.y_pred, docLabel],
            updates=updates,
            givens={
                corpus: docMatrixes,
                docSentenceCount: docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
                sentenceWordCount: sentenceWordNums,
                docLabel: labels[index * batchSize:(index + 1) * batchSize]
            })
        print "Compiled."
        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0
        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            for i in range(n_batches):
                # for list-type data
                costNum, errorNum, pred_label, real_label = train_model(i)
                ite = ite + 1
                # for padding data
                # costNum, errorNum = train_model(docMatrixes, labels)
                # del docMatrixes, docSentenceNums, sentenceWordNums, labels
                # print ".",
                if (ite % 10 == 0):
                    print
                    print "@iter: ", ite
                    print "Cost: ", costNum
                    print "Error: ", errorNum
            # Validate the model once per epoch.
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
            print "Valid current model:"
            print "Cost: ", costNum
            print "Error: ", errorNum
            # print "pred_prob: ", pred_prob
            # print "Valid Pred: ", pred_label
            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "test_dataname: ", test_dataname
            print "ROC: ", roc_auc
            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]
            # Save model after every epoch.
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
        valid_model.free()
        train_model.free()
    elif (mode == "deploy"):
        print "Compiling computing graph."
        output_model = theano.function([corpus, docSentenceCount, sentenceWordCount], [layer2.y_pred])
        print "Compiled."
        cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split")
        count = 21000
        # Loop is currently pinned to a single 100-document window.
        while (count <= 21000):
            docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100])
            docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
            docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
            sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
            print "start to predict."
            pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums)
            print "End predicting."
            print "Writing resfile."
            # print zip(ids, pred_y[0])
            f = file("data/test/res/res" + str(count), "w")
            f.write(str(zip(ids, pred_y[0])))
            f.close()
            print "Written." + str(count)
            count += 100
def work():
    """Train a single SCNN classifier on a fixed split of one corpus.

    Older no-argument variant (shadowed by later work() definitions in this
    file): hard-coded data paths, batch size 1, 200 epochs, validates and
    saves parameters every epoch.
    """
    print "Started!"
    print "Loading data."
    cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/split", labelset="data/traindataset2zgb")
    # NOTE(review): here transToTensor is applied to the whole getCorpus
    # result, unlike the other work() variants which convert each field with
    # an explicit dtype -- confirm transToTensor supports this tuple form.
    docMatrixes, docSentenceNums, sentenceWordNums, labels = transToTensor(cr.getCorpus([0, 12]))
    # valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
    validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validLabels = transToTensor(cr.getCorpus([800, 870]))
    print "Data loaded."
    learning_rate = 0.1
    # Symbolic inputs for the computation graph.
    docSentenceCount = T.vector("docSentenceCount")
    sentenceWordCount = T.vector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')
    index = T.lscalar("index")
    # Fixed seed for reproducible initialisation.
    rng = numpy.random.RandomState(23455)
    batchSize = 1
    # Largest index range either split can produce, passed to DocEmbeddingNN.
    mr = numpy.max([len(docMatrixes.get_value()), len(validDocMatrixes.get_value())])
    n_batches = (len(docSentenceNums.get_value()) - 1) / batchSize
    print "Train set size is ", len(docMatrixes.get_value())
    print "Validating set size is ", len(validDocMatrixes.get_value())
    print "Batch size is ", batchSize
    print "Number of training batches is ", n_batches
    # for list-type data
    layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
        maxRandge=mr, \
        sentenceLayerNodesNum=100, \
        sentenceLayerNodesSize=[5, 200], \
        docLayerNodesNum=100, \
        docLayerNodesSize=[3, 100])
    # for padding data
    # layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, corpus_shape=(batchSize, cr.getMaxDocSentenceNum(), cr.getMaxSentenceWordNum(), cr.getDim()), \
    # maxRandge=mr, \
    # sentenceLayerNodesNum=100, \
    # sentenceLayerNodesSize=5, \
    # docLayerNodesNum=200, \
    # docLayerNodesSize=3)
    layer1 = HiddenLayer(
        rng,
        input=layer0.output,
        n_in=layer0.outputDimension,
        n_out=100,
        activation=T.tanh
    )
    # Binary classification head.
    layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)
    error = layer2.errors(docLabel)
    cost = layer2.negative_log_likelihood(docLabel)
    # construct the parameter array.
    params = layer2.params + layer1.params + layer0.params
    # Load the parameters last time, optionally.
    loadParamsVal(params)
    grads = T.grad(cost, params)
    # Plain SGD updates.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]
    print "Compiling computing graph."
    # Whole validation split evaluated in a single call via givens.
    valid_model = theano.function(
        [],
        [cost, error],
        givens={
            corpus: validDocMatrixes,
            docSentenceCount: validDocSentenceNums,
            sentenceWordCount: validSentenceWordNums,
            docLabel: validLabels
        }
    )
    # for list-type data
    # NOTE(review): docSentenceCount slice takes one extra element --
    # presumably batchSize+1 offsets delimit batchSize documents.
    train_model = theano.function(
        [index],
        [cost, error],
        updates=updates,
        givens={
            corpus: docMatrixes,
            docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
            sentenceWordCount: sentenceWordNums,
            docLabel: labels[index * batchSize: (index + 1) * batchSize]
        }
    )
    # for padding data
    # train_model = theano.function(
    # [corpus, docLabel],
    # [cost, error],
    # updates=updates,
    # )
    print "Compiled."
    print "Start to train."
    epoch = 0
    n_epochs = 200
    ite = 0
    # ####Validate the model####
    costNum, errorNum = valid_model()
    print "Valid current model:"
    print "Cost: ", costNum
    print "Error: ", errorNum
    while (epoch < n_epochs):
        epoch = epoch + 1
        #######################
        for i in range(n_batches):
            # for list-type data
            costNum, errorNum = train_model(i)
            ite = ite + 1
            # for padding data
            # costNum, errorNum = train_model(docMatrixes, labels)
            # del docMatrixes, docSentenceNums, sentenceWordNums, labels
            # print ".",
            if(ite % 1 == 0):
                print
                print "@iter: ", ite
                print "Cost: ", costNum
                print "Error: ", errorNum
        # Validate the model once per epoch.
        costNum, errorNum = valid_model()
        print "Valid current model:"
        print "Cost: ", costNum
        print "Error: ", errorNum
        # Save model after every epoch.
        print "Saving parameters."
        saveParamsVal(params)
        print "Saved."
    print "All finished!"
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    """Multi-task SCNN trainer: one classifier head per dataset with shared
    convolutional and hidden layers.

    mode: "train" trains with interleaved batches and per-epoch validation;
        "test" only evaluates the saved model and returns.
    data_name: colon-separated train dataset names (e.g. "car:finance").
    test_dataname: colon-separated test dataset names; count must match.
    pooling_mode: pooling flavour forwarded to DocEmbeddingNN; also names
        the on-disk model file.
    """
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"
    data_names = data_name.split(":")
    data_count = len(data_names)
    print "Train dataset:"
    for i in xrange(data_count):
        print "%d: %s" % (i, data_names[i])
    print "Test dataset:"
    test_data_names = test_dataname.split(":")
    test_data_count = len(test_data_names)
    for i in xrange(test_data_count):
        print "%d: %s" % (i, test_data_names[i])
    if test_data_count != data_count:
        raise Exception(
            "The amount of test and train dataset must be the same.")
    # Fixed seed for reproducible initialisation.
    rng = numpy.random.RandomState(23455)
    # Shared symbolic inputs reused by every per-dataset model via givens.
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')
    # Weight holders threaded through the construction loop so every dataset
    # reuses the first dataset's embedding and hidden-layer weights.
    sentenceW = None
    sentenceB = None
    docW = None
    docB = None
    hidden_layer_w = None
    hidden_layer_b = None
    logistic_layer_w = None
    logistic_layer_b = None
    layer0 = list()
    layer1 = list()
    layer2 = list()
    local_params = list()
    # for list-type data
    for i in xrange(data_count):
        layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
            sentenceLayerNodesNum=50, \
            sentenceLayerNodesSize=[5, 200], \
            docLayerNodesNum=10, \
            docLayerNodesSize=[3, 50],
            sentenceW=sentenceW,
            sentenceB=sentenceB,
            docW=docW,
            docB=docB,
            pooling_mode=pooling_mode))
        sentenceW = layer0[i].sentenceW
        sentenceB = layer0[i].sentenceB
        docW = layer0[i].docW
        docB = layer0[i].docB
        layer1.append(
            HiddenLayer(rng,
                        input=layer0[i].output,
                        n_in=layer0[i].outputDimension,
                        n_out=10,
                        activation=T.tanh,
                        W=hidden_layer_w,
                        b=hidden_layer_b))
        hidden_layer_w = layer1[i].W
        hidden_layer_b = layer1[i].b
        # The logistic head is NOT shared (the reuse lines are commented out),
        # so each dataset trains its own output layer.
        layer2.append(
            LogisticRegression(input=layer1[i].output,
                               n_in=10,
                               n_out=2,
                               W=logistic_layer_w,
                               b=logistic_layer_b))
        # logistic_layer_w = layer2[i].W
        # logistic_layer_b = layer2[i].b
        local_params.append(layer2[i].params)
    share_params = list(layer0[0].params + layer1[0].params)
    # construct the parameter array.
    params = list(layer0[0].params) + layer1[0].params
    for i in xrange(data_count):
        params += layer2[i].params
    # data_name = "car"
    para_path = "data/" + data_name + "/share_hidden_low_model/" + pooling_mode + ".model"
    traintext = [
        "data/" + data_names[i] + "/train/text" for i in xrange(data_count)
    ]
    trainlabel = [
        "data/" + data_names[i] + "/train/label" for i in xrange(data_count)
    ]
    testtext = [
        "data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)
    ]
    testlabel = [
        "data/" + test_data_names[i] + "/test/label" for i in xrange(data_count)
    ]
    # Load the parameters last time, optionally.
    loadParamsVal(para_path, params)
    if (mode == "train" or mode == "test"):
        train_model = list()
        valid_model = list()
        print "Loading train data."
        batchSize = 10
        share_learning_rate = 0.1
        local_learning_rate = 0.1
        n_batches = list()
        print "Loading test data."
        # Accumulators for pooled metrics over all datasets.
        all_pred_label = list()
        all_real_label = list()
        all_pred_prob = list()
        for i in xrange(data_count):
            cr_train = CorpusReader(minDocSentenceNum=5,
                                    minSentenceWordNum=5,
                                    dataset=traintext[i],
                                    labelset=trainlabel[i])
            docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus(
                [0, 100000])
            docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
            docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
            sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
            labels = transToTensor(labels, numpy.int32)
            index = T.lscalar("index")
            n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1)
            print "Dataname: %s" % data_names[i]
            print "Train set size is ", len(docMatrixes.get_value())
            print "Batch size is ", batchSize
            print "Number of training batches is ", n_batches[i]
            error = layer2[i].errors(docLabel)
            cost = layer2[i].negative_log_likelihood(docLabel)
            # Separate SGD update lists so shared and local parameters can use
            # different learning rates.
            share_grads = T.grad(cost, share_params)
            share_updates = [
                (param_i, param_i - share_learning_rate * grad_i)
                for param_i, grad_i in zip(share_params, share_grads)
            ]
            grads = T.grad(cost, local_params[i])
            local_updates = [
                (param_i, param_i - local_learning_rate * grad_i)
                for param_i, grad_i in zip(local_params[i], grads)
            ]
            updates = share_updates + local_updates
            print "Compiling train computing graph."
            if mode == "train":
                # NOTE(review): docSentenceCount slice takes one extra element
                # -- presumably batchSize+1 offsets delimit batchSize docs.
                train_model.append(
                    theano.function(
                        [index], [cost, error, layer2[i].y_pred, docLabel],
                        updates=updates,
                        givens={
                            corpus: docMatrixes,
                            docSentenceCount: docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
                            sentenceWordCount: sentenceWordNums,
                            docLabel: labels[index * batchSize:(index + 1) * batchSize]
                        }))
            print "Compiled."
            print "Load test dataname: %s" % test_data_names[i]
            cr_test = CorpusReader(minDocSentenceNum=5,
                                   minSentenceWordNum=5,
                                   dataset=testtext[i],
                                   labelset=testlabel[i])
            validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus(
                [0, 1000])
            validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
            validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
            validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
            validLabels = transToTensor(validLabels, numpy.int32)
            print "Validating set size is ", len(validDocMatrixes.get_value())
            print "Data loaded."
            print "Compiling test computing graph."
            valid_model.append(
                theano.function(
                    [], [
                        cost, error, layer2[i].y_pred, docLabel,
                        T.transpose(layer2[i].p_y_given_x)[1]
                    ],
                    givens={
                        corpus: validDocMatrixes,
                        docSentenceCount: validDocSentenceNums,
                        sentenceWordCount: validSentenceWordNums,
                        docLabel: validLabels
                    }))
            print "Compiled."
            # Initial validation of the freshly-loaded model on this dataset.
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model[
                i]()
            all_pred_label.extend(pred_label)
            all_real_label.extend(real_label)
            all_pred_prob.extend(pred_prob)
            print "Valid current model :", data_names[i]
            print "Cost: ", costNum
            print "Error: ", errorNum
            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "ROC: ", roc_auc
            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            if 1 in threshold:
                index_of_one = list(threshold).index(1)
                print "TPR: ", tpr[index_of_one]
                print "FPR: ", fpr[index_of_one]
                print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
                print "threshold: ", threshold[index_of_one]
        # Pooled metrics over all datasets combined.
        print "Valid current model :", data_names
        errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
        print "Error: ", errorNum
        fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
        if mode == "test":
            print "tpr_all: ", tpr
            print "fpr_all: ", fpr
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "ROC: ", roc_auc
        fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
        if 1 in threshold:
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]
        if mode == "test":
            return
        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0
        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            # Interleave batches across datasets so shared weights see all
            # tasks within an epoch.
            for i in range(max(n_batches)):
                for dataset_index in xrange(data_count):
                    if i >= n_batches[dataset_index]:
                        continue
                    # for list-type data
                    costNum, errorNum, pred_label, real_label = train_model[
                        dataset_index](i)
                    ite = ite + 1
                    # for padding data
                    if (ite % 10 == 0):
                        print
                        print "Dataset name: ", data_names[dataset_index]
                        print "@iter: ", ite
                        print "Cost: ", costNum
                        print "Error: ", errorNum
            # Validate the model on every dataset once per epoch.
            all_pred_label = list()
            all_real_label = list()
            all_pred_prob = list()
            for dataset_index in xrange(data_count):
                costNum, errorNum, pred_label, real_label, pred_prob = valid_model[
                    dataset_index]()
                all_pred_label.extend(pred_label)
                all_real_label.extend(real_label)
                all_pred_prob.extend(pred_prob)
                print "Valid current model :", data_names[dataset_index]
                print "Cost: ", costNum
                print "Error: ", errorNum
                fpr, tpr, _ = roc_curve(real_label, pred_prob)
                roc_auc = auc(fpr, tpr)
                print "data_name: ", data_name
                print "ROC: ", roc_auc
                fpr, tpr, threshold = roc_curve(real_label, pred_label)
                index_of_one = list(threshold).index(1)
                print "TPR: ", tpr[index_of_one]
                print "FPR: ", fpr[index_of_one]
                print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
                print "threshold: ", threshold[index_of_one]
            # Pooled per-epoch metrics.
            print "Valid current model :", data_names
            errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
            print "Error: ", errorNum
            fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "ROC: ", roc_auc
            fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]
            # Save model after every epoch.
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
def work(mode, data_name, test_dataname): print "mode: ", mode print "data_name: ", data_name print "Started!" data_names = data_name.split(":") data_count = len(data_names) print "Train dataset:" for i in xrange(data_count): print "%d: %s" % (i, data_names[i]) print "Test dataset:" test_data_names = test_dataname.split(":") test_data_count = len(test_data_names) for i in xrange(test_data_count): print "%d: %s" % (i, test_data_names[i]) if test_data_count != data_count: raise Exception("The amount of test and train dataset must be the same.") rng = numpy.random.RandomState(23455) docSentenceCount = T.ivector("docSentenceCount") sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') hidden_layer_w = None hidden_layer_b = None logistic_layer_w = None logistic_layer_b = None layer0 = list() layer1 = list() layer2 = list() local_params = list() # for list-type data for i in xrange(data_count): layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100])) layer1.append(HiddenLayer( rng, input=layer0[i].output, n_in=layer0[i].outputDimension, n_out=100, activation=T.tanh, W=hidden_layer_w, b=hidden_layer_b )) hidden_layer_w = layer1[i].W hidden_layer_b = layer1[i].b layer2.append(LogisticRegression(input=layer1[i].output, n_in=100, n_out=2, W=logistic_layer_w, b=logistic_layer_b)) logistic_layer_w = layer2[i].W logistic_layer_b = layer2[i].b local_params.append(layer2[i].params + layer1[i].params + layer0[i].params) # construct the parameter array. params = layer2[0].params + layer1[0].params for i in xrange(data_count): params += layer0[i].params # Load the parameters last time, optionally. 
# data_name = "car" para_path = "data/" + data_name + "/model/scnn.model" traintext = ["data/" + data_names[i] + "/train/text" for i in xrange(data_count)] trainlabel = ["data/" + data_names[i] + "/train/label" for i in xrange(data_count)] testtext = ["data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)] testlabel = ["data/" + test_data_names[i] + "/test/label" for i in xrange(data_count)] loadParamsVal(para_path, params) if(mode == "train"): train_model = list() valid_model = list() print "Loading train data." batchSize = 10 learning_rate = 0.1 n_batches = list() print "Loading test data." for i in xrange(data_count): cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i]) docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus([0, 100000]) docMatrixes = transToTensor(docMatrixes, theano.config.floatX) docSentenceNums = transToTensor(docSentenceNums, numpy.int32) sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32) labels = transToTensor(labels, numpy.int32) index = T.lscalar("index") n_batches.append((len(docSentenceNums.get_value()) - 1) / batchSize + 1) print "Dataname: %s" % data_names[i] print "Train set size is ", len(docMatrixes.get_value()) print "Batch size is ", batchSize print "Number of training batches is ", n_batches[i] error = layer2[i].errors(docLabel) cost = layer2[i].negative_log_likelihood(docLabel) grads = T.grad(cost, local_params[i]) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(local_params[i], grads) ] print "Compiling train computing graph." train_model.append(theano.function( [index], [cost, error, layer2[i].y_pred, docLabel], updates=updates, givens={ corpus: docMatrixes, docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1], sentenceWordCount: sentenceWordNums, docLabel: labels[index * batchSize: (index + 1) * batchSize] } )) print "Compiled." 
print "Load test dataname: %s" % test_data_names[i] cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext[i], labelset=testlabel[i]) validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels = cr_test.getCorpus([0, 1000]) validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX) validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32) validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32) validLabels = transToTensor(validLabels, numpy.int32) print "Validating set size is ", len(validDocMatrixes.get_value()) print "Data loaded." print "Compiling test computing graph." valid_model.append(theano.function( [], [cost, error, layer2[i].y_pred, docLabel, T.transpose(layer2[i].p_y_given_x)[1]], givens={ corpus: validDocMatrixes, docSentenceCount: validDocSentenceNums, sentenceWordCount: validSentenceWordNums, docLabel: validLabels } )) print "Compiled." # for list-type data print "Start to train." 
epoch = 0 n_epochs = 2000 ite = 0 # ####Validate the model#### for dataset_index in xrange(data_count): costNum, errorNum, pred_label, real_label, pred_prob = valid_model[i]() print "Valid current model :", data_names[dataset_index] print "Cost: ", costNum print "Error: ", errorNum # print "Valid Pred: ", pred_label # print "pred_prob: ", pred_prob fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "test_dataname: ", test_dataname print "ROC: ", roc_auc while (epoch < n_epochs): epoch = epoch + 1 ####################### for i in range(max(n_batches)): for dataset_index in xrange(data_count): if i >= n_batches[dataset_index]: continue # for list-type data costNum, errorNum, pred_label, real_label = train_model[dataset_index](i) ite = ite + 1 # for padding data # costNum, errorNum = train_model(docMatrixes, labels) # del docMatrixes, docSentenceNums, sentenceWordNums, labels # print ".", if(ite % 10 == 0): print print "Dataset name: ", data_names[dataset_index] print "@iter: ", ite print "Cost: ", costNum print "Error: ", errorNum # Validate the model for dataset_index in xrange(data_count): costNum, errorNum, pred_label, real_label, pred_prob = valid_model[i]() print "Valid current model :", data_names[dataset_index] print "Cost: ", costNum print "Error: ", errorNum # print "Valid Pred: ", pred_label # print "pred_prob: ", pred_prob fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "test_dataname: ", test_dataname print "ROC: ", roc_auc # Save model print "Saving parameters." saveParamsVal(para_path, params) print "Saved." # elif(mode == "deploy"): # print "Compiling computing graph." # output_model = theano.function( # [corpus, docSentenceCount, sentenceWordCount], # [layer2.y_pred] # ) # print "Compiled." 
# cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split") # count = 21000 # while(count <= 21000): # docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100]) # docMatrixes = numpy.matrix( # docMatrixes, # dtype=theano.config.floatX # ) # docSentenceNums = numpy.array( # docSentenceNums, # dtype=numpy.int32 # ) # sentenceWordNums = numpy.array( # sentenceWordNums, # dtype=numpy.int32 # ) # print "start to predict." # pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums) # print "End predicting." # print "Writing resfile." # # print zip(ids, pred_y[0]) # f = file("data/test/res/res" + str(count), "w") # f.write(str(zip(ids, pred_y[0]))) # f.close() # print "Written." + str(count) # count += 100 print "All finished!"