def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
	"""Train, test, or deploy a document-CNN classifier on a single dataset.

	Builds DocEmbeddingNN -> HiddenLayer(10) -> LogisticRegression(2) over a
	shared corpus matrix, restoring parameters from
	data/<data_name>/model/<pooling_mode>.model if present.

	mode         -- "train" (SGD + per-epoch validation + checkpoint),
	                "test" (one validation pass, returns metrics), or
	                "deploy" (batch prediction written to res files).
	data_name    -- dataset directory name under data/ used for training.
	test_dataname-- dataset directory name under data/ used for validation.
	pooling_mode -- pooling strategy forwarded to DocEmbeddingNN.

	In "test" mode returns (errorNum, roc_auc, TPR, FPR, AR); otherwise None.
	"""
	print "mode: ", mode
	print "data_name: ", data_name
	print "pooling_mode: ", pooling_mode
	print "Started!"
	# Fixed seed so weight initialisation is reproducible across runs.
	rng = numpy.random.RandomState(23455)
	# Symbolic inputs: per-document sentence counts, per-sentence word counts,
	# the stacked word-embedding matrix, and the document labels.
	docSentenceCount = T.ivector("docSentenceCount")
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel')
	# for list-type data
	layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
							 sentenceLayerNodesNum=50, \
							 sentenceLayerNodesSize=[5, 200], \
							 docLayerNodesNum=10, \
							 docLayerNodesSize=[3, 50],
							 pooling_mode=pooling_mode)
	# layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
	# sentenceLayerNodesNum=100, \
	# sentenceLayerNodesSize=[5, 200], \
	# docLayerNodesNum=100, \
	# docLayerNodesSize=[3, 100],
	# pooling_mode=pooling_mode)
	layer1 = HiddenLayer(rng,
						 input=layer0.output,
						 n_in=layer0.outputDimension,
						 n_out=10,
						 activation=T.tanh)
	layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=2)
	# construct the parameter array.
	params = layer2.params + layer1.params + layer0.params
	# Load the parameters last time, optionally.
	# data_name = "car"
	para_path = "data/" + data_name + "/model/" + pooling_mode + ".model"
	traintext = "data/" + data_name + "/train/text"
	trainlabel = "data/" + data_name + "/train/label"
	testtext = "data/" + test_dataname + "/test/text"
	testlabel = "data/" + test_dataname + "/test/label"
	loadParamsVal(para_path, params)
	if (mode == "train" or mode == "test"):
		learning_rate = 0.1
		error = layer2.errors(docLabel)
		cost = layer2.negative_log_likelihood(docLabel)
		# Plain SGD: one update pair per parameter.
		grads = T.grad(cost, params)
		updates = [(param_i, param_i - learning_rate * grad_i)
				   for param_i, grad_i in zip(params, grads)]
		print "Loading test data."
		cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext, labelset=testlabel)
		validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus([0, 1000])
		# print "Right answer: "
		# print zip(validIds, validLabels)
		# Move the validation set into Theano shared variables once, so the
		# compiled function needs no per-call data transfer.
		validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
		validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
		validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
		validLabels = transToTensor(validLabels, numpy.int32)
		print "Data loaded."
		# Validation function: no inputs, whole validation set bound via givens.
		# The last output is the predicted probability of class 1.
		valid_model = theano.function(
			[],
			[cost, error, layer2.y_pred, docLabel, T.transpose(layer2.p_y_given_x)[1]],
			givens={
				corpus: validDocMatrixes,
				docSentenceCount: validDocSentenceNums,
				sentenceWordCount: validSentenceWordNums,
				docLabel: validLabels
			})
		# ####Validate the model####
		costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
		print "Valid current model:"
		print "Cost: ", costNum
		print "Error: ", errorNum
		# print "Valid Pred: ", pred_label
		# print "pred_prob: ", pred_prob
		fpr, tpr, _ = roc_curve(real_label, pred_prob)
		if mode == "test":
			print "tpr_all: ", tpr
			print "fpr_all: ", fpr
		roc_auc = auc(fpr, tpr)
		print "data_name: ", data_name
		print "test_dataname: ", test_dataname
		print "ROC: ", roc_auc
		# ROC over the hard 0/1 predictions: the point at threshold == 1 gives
		# the operating TPR/FPR of the classifier.
		# NOTE(review): .index(1) raises ValueError if 1 is absent from
		# threshold (e.g. all predictions are 0) — other variants in this file
		# guard with `if 1 in threshold`; confirm intended here.
		fpr, tpr, threshold = roc_curve(real_label, pred_label)
		index_of_one = list(threshold).index(1)
		# AR: balanced accuracy at the chosen operating point.
		ar = (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
		print "TPR: ", tpr[index_of_one]
		print "FPR: ", fpr[index_of_one]
		print "AR: ", ar
		print "threshold: ", threshold[index_of_one]
		if mode == "test":
			valid_model.free()
			return errorNum, roc_auc, tpr[index_of_one], fpr[index_of_one], ar
		print "Loading train data."
		cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext, labelset=trainlabel)
		docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus([0, 100000])
		# print "Right answer: "
		# print zip(ids, labels)
		docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
		docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
		sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
		labels = transToTensor(labels, numpy.int32)
		# valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
		print
		index = T.lscalar("index")
		batchSize = 10
		# docSentenceNums appears to be an offsets vector (length numDocs + 1,
		# see the `+ 1` in the givens slice below), so numDocs = len - 1 and
		# this is ceil(numDocs / batchSize). TODO confirm against CorpusReader.
		n_batches = (len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1
		print
		print "Train set size is ", len(docMatrixes.get_value())
		print "Validating set size is ", len(validDocMatrixes.get_value())
		print "Batch size is ", batchSize
		print "Number of training batches is ", n_batches
		print "Compiling computing graph."
		# for list-type data
		train_model = theano.function(
			[index],
			[cost, error, layer2.y_pred, docLabel],
			updates=updates,
			givens={
				corpus: docMatrixes,
				docSentenceCount: docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
				sentenceWordCount: sentenceWordNums,
				docLabel: labels[index * batchSize:(index + 1) * batchSize]
			})
		print "Compiled."
		print "Start to train."
		epoch = 0
		n_epochs = 10
		ite = 0
		while (epoch < n_epochs):
			epoch = epoch + 1
			#######################
			for i in range(n_batches):
				# for list-type data
				costNum, errorNum, pred_label, real_label = train_model(i)
				ite = ite + 1
				# for padding data
				# costNum, errorNum = train_model(docMatrixes, labels)
				# del docMatrixes, docSentenceNums, sentenceWordNums, labels
				# print ".",
				if (ite % 10 == 0):
					print
					print "@iter: ", ite
					print "Cost: ", costNum
					print "Error: ", errorNum
			# Validate the model once per epoch.
			costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
			print "Valid current model:"
			print "Cost: ", costNum
			print "Error: ", errorNum
			# print "pred_prob: ", pred_prob
			# print "Valid Pred: ", pred_label
			fpr, tpr, _ = roc_curve(real_label, pred_prob)
			roc_auc = auc(fpr, tpr)
			print "data_name: ", data_name
			print "test_dataname: ", test_dataname
			print "ROC: ", roc_auc
			fpr, tpr, threshold = roc_curve(real_label, pred_label)
			index_of_one = list(threshold).index(1)
			print "TPR: ", tpr[index_of_one]
			print "FPR: ", fpr[index_of_one]
			print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
			print "threshold: ", threshold[index_of_one]
			# Save model checkpoint after every epoch.
			print "Saving parameters."
			saveParamsVal(para_path, params)
			print "Saved."
		valid_model.free()
		train_model.free()
	elif (mode == "deploy"):
		print "Compiling computing graph."
		# Deploy: data passed as explicit inputs rather than shared givens.
		output_model = theano.function(
			[corpus, docSentenceCount, sentenceWordCount],
			[layer2.y_pred])
		print "Compiled."
		cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split")
		count = 21000
		# Loop currently runs for a single window [21000, 21100).
		while (count <= 21000):
			docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100])
			docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
			docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
			sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
			print "start to predict."
			pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums)
			print "End predicting."
			print "Writing resfile."
			# print zip(ids, pred_y[0])
			f = file("data/test/res/res" + str(count), "w")
			f.write(str(zip(ids, pred_y[0])))
			f.close()
			print "Written." + str(count)
			count += 100
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"): print "mode: ", mode print "data_name: ", data_name print "pooling_mode: ", pooling_mode print "Started!" data_names = data_name.split(":") data_count = len(data_names) print "Train dataset:" for i in xrange(data_count): print "%d: %s" % (i, data_names[i]) print "Test dataset:" test_data_names = test_dataname.split(":") test_data_count = len(test_data_names) for i in xrange(test_data_count): print "%d: %s" % (i, test_data_names[i]) if test_data_count != data_count: raise Exception("The amount of test and train dataset must be the same.") rng = numpy.random.RandomState(23455) docSentenceCount = T.ivector("docSentenceCount") sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') hidden_layer_w = None hidden_layer_b = None logistic_layer_w = None logistic_layer_b = None layer0 = list() layer1 = list() layer2 = list() local_params = list() # for list-type data for i in xrange(data_count): layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=50, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=10, \ docLayerNodesSize=[3, 50], pooling_mode=pooling_mode)) layer1.append(HiddenLayer( rng, input=layer0[i].output, n_in=layer0[i].outputDimension, n_out=10, activation=T.tanh, W=hidden_layer_w, b=hidden_layer_b )) # hidden_layer_w = layer1[i].W # hidden_layer_b = layer1[i].b layer2.append(LogisticRegression(input=layer1[i].output, n_in=10, n_out=2, W=logistic_layer_w, b=logistic_layer_b)) logistic_layer_w = layer2[i].W logistic_layer_b = layer2[i].b local_params.append(layer0[i].params + layer1[i].params) share_params = list(layer2[0].params) # construct the parameter array. 
params = list(layer2[0].params) for i in xrange(data_count): params += layer1[0].params + layer0[i].params # data_name = "car" para_path = "data/" + data_name + "/log_model/" + pooling_mode + ".model" traintext = ["data/" + data_names[i] + "/train/text" for i in xrange(data_count)] trainlabel = ["data/" + data_names[i] + "/train/label" for i in xrange(data_count)] testtext = ["data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)] testlabel = ["data/" + test_data_names[i] + "/test/label" for i in xrange(data_count)] # Load the parameters last time, optionally. loadParamsVal(para_path, params) if(mode == "train" or mode == "test"): train_model = list() valid_model = list() print "Loading train data." batchSize = 10 share_learning_rate = 0.01 local_learning_rate = 0.1 n_batches = list() print "Loading test data." for i in xrange(data_count): cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i]) docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus([0, 100000]) docMatrixes = transToTensor(docMatrixes, theano.config.floatX) docSentenceNums = transToTensor(docSentenceNums, numpy.int32) sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32) labels = transToTensor(labels, numpy.int32) index = T.lscalar("index") n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1) print "Dataname: %s" % data_names[i] print "Train set size is ", len(docMatrixes.get_value()) print "Batch size is ", batchSize print "Number of training batches is ", n_batches[i] error = layer2[i].errors(docLabel) cost = layer2[i].negative_log_likelihood(docLabel) share_grads = T.grad(cost, share_params) share_updates = [ (param_i, param_i - share_learning_rate * grad_i) for param_i, grad_i in zip(share_params, share_grads) ] grads = T.grad(cost, local_params[i]) local_updates = [ (param_i, param_i - local_learning_rate * grad_i) for param_i, grad_i in 
zip(local_params[i], grads) ] updates = share_updates + local_updates print "Compiling train computing graph." if mode == "train": train_model.append(theano.function( [index], [cost, error, layer2[i].y_pred, docLabel], updates=updates, givens={ corpus: docMatrixes, docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1], sentenceWordCount: sentenceWordNums, docLabel: labels[index * batchSize: (index + 1) * batchSize] } )) print "Compiled." print "Load test dataname: %s" % test_data_names[i] cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext[i], labelset=testlabel[i]) validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus([0, 1000]) validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX) validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32) validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32) validLabels = transToTensor(validLabels, numpy.int32) print "Validating set size is ", len(validDocMatrixes.get_value()) print "Data loaded." print "Compiling test computing graph." valid_model.append(theano.function( [], [cost, error, layer2[i].y_pred, docLabel, T.transpose(layer2[i].p_y_given_x)[1]], givens={ corpus: validDocMatrixes, docSentenceCount: validDocSentenceNums, sentenceWordCount: validSentenceWordNums, docLabel: validLabels } )) print "Compiled." 
costNum, errorNum, pred_label, real_label, pred_prob = valid_model[i]() print "Valid current model :", data_names[i] print "Cost: ", costNum print "Error: ", errorNum fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "ROC: ", roc_auc fpr, tpr, threshold = roc_curve(real_label, pred_label) if 1 in threshold: index_of_one = list(threshold).index(1) print "TPR: ", tpr[index_of_one] print "FPR: ", fpr[index_of_one] print "threshold: ", threshold[index_of_one] if mode == "test": return print "Start to train." epoch = 0 n_epochs = 10 ite = 0 # ####Validate the model#### # for dataset_index in xrange(data_count): # costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]() # print "Valid current model :", data_names[dataset_index] # print "Cost: ", costNum # print "Error: ", errorNum # # fpr, tpr, _ = roc_curve(real_label, pred_prob) # roc_auc = auc(fpr, tpr) # print "data_name: ", data_name # print "ROC: ", roc_auc # fpr, tpr, threshold = roc_curve(real_label, pred_label) # index_of_one = list(threshold).index(1) # print "TPR: ", tpr[index_of_one] # print "FPR: ", fpr[index_of_one] # print "threshold: ", threshold[index_of_one] while (epoch < n_epochs): epoch = epoch + 1 ####################### for i in range(max(n_batches)): for dataset_index in xrange(data_count): if i >= n_batches[dataset_index]: continue # for list-type data print "dataset_index: %d, i: %d" %(dataset_index, i) costNum, errorNum, pred_label, real_label = train_model[dataset_index](i) ite = ite + 1 # for padding data if(ite % 10 == 0): print print "Dataset name: ", data_names[dataset_index] print "@iter: ", ite print "Cost: ", costNum print "Error: ", errorNum # Validate the model for dataset_index in xrange(data_count): costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]() print "Valid current model :", data_names[dataset_index] print "Cost: ", costNum print "Error: ", errorNum fpr, tpr, _ 
= roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "ROC: ", roc_auc fpr, tpr, threshold = roc_curve(real_label, pred_label) index_of_one = list(threshold).index(1) print "TPR: ", tpr[index_of_one] print "FPR: ", fpr[index_of_one] print "threshold: ", threshold[index_of_one] # Save model print "Saving parameters." saveParamsVal(para_path, params) print "Saved."
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"): print "mode: ", mode print "data_name: ", data_name print "pooling_mode: ", pooling_mode print "Started!" rng = numpy.random.RandomState(23455) docSentenceCount = T.ivector("docSentenceCount") sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') # for list-type data layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=50, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=10, \ docLayerNodesSize=[3, 50], pooling_mode=pooling_mode) # layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ # sentenceLayerNodesNum=100, \ # sentenceLayerNodesSize=[5, 200], \ # docLayerNodesNum=100, \ # docLayerNodesSize=[3, 100], # pooling_mode=pooling_mode) layer2 = LogisticRegression(input=layer0.output, n_in=10, n_out=2) # construct the parameter array. params = layer2.params + layer0.params # Load the parameters last time, optionally. # data_name = "car" para_path = "data/" + data_name + "/model_nohidden/" + pooling_mode + ".model" traintext = "data/" + data_name + "/train/text" trainlabel = "data/" + data_name + "/train/label" testtext = "data/" + test_dataname + "/test/text" testlabel = "data/" + test_dataname + "/test/label" loadParamsVal(para_path, params) if(mode == "train" or mode == "test"): learning_rate = 0.1 error = layer2.errors(docLabel) cost = layer2.negative_log_likelihood(docLabel) grads = T.grad(cost, params) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] print "Loading test data." 
cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext, labelset=testlabel) validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus([0, 1000]) # print "Right answer: " # print zip(validIds, validLabels) validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX) validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32) validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32) validLabels = transToTensor(validLabels, numpy.int32) print "Data loaded." valid_model = theano.function( [], [cost, error, layer2.y_pred, docLabel, T.transpose(layer2.p_y_given_x)[1]], givens={ corpus: validDocMatrixes, docSentenceCount: validDocSentenceNums, sentenceWordCount: validSentenceWordNums, docLabel: validLabels } ) # ####Validate the model#### costNum, errorNum, pred_label, real_label, pred_prob = valid_model() print "Valid current model:" print "Cost: ", costNum print "Error: ", errorNum # print "Valid Pred: ", pred_label # print "pred_prob: ", pred_prob fpr, tpr, _ = roc_curve(real_label, pred_prob) if mode == "test": print "tpr_all: ", tpr print "fpr_all: ", fpr roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "test_dataname: ", test_dataname print "ROC: ", roc_auc fpr, tpr, threshold = roc_curve(real_label, pred_label) index_of_one = list(threshold).index(1) ar = (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2 print "TPR: ", tpr[index_of_one] print "FPR: ", fpr[index_of_one] print "AR: ", ar print "threshold: ", threshold[index_of_one] if mode == "test": valid_model.free() return errorNum, roc_auc, tpr[index_of_one], fpr[index_of_one], ar print "Loading train data." 
cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext, labelset=trainlabel) docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus([0, 100000]) # print "Right answer: " # print zip(ids, labels) docMatrixes = transToTensor(docMatrixes, theano.config.floatX) docSentenceNums = transToTensor(docSentenceNums, numpy.int32) sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32) labels = transToTensor(labels, numpy.int32) # valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt") print index = T.lscalar("index") batchSize = 10 n_batches = (len(docSentenceNums.get_value()) - 1) / batchSize + 1 print print "Train set size is ", len(docMatrixes.get_value()) print "Validating set size is ", len(validDocMatrixes.get_value()) print "Batch size is ", batchSize print "Number of training batches is ", n_batches print "Compiling computing graph." # for list-type data train_model = theano.function( [index], [cost, error, layer2.y_pred, docLabel], updates=updates, givens={ corpus: docMatrixes, docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1], sentenceWordCount: sentenceWordNums, docLabel: labels[index * batchSize: (index + 1) * batchSize] } ) print "Compiled." print "Start to train." 
epoch = 0 n_epochs = 10 ite = 0 while (epoch < n_epochs): epoch = epoch + 1 ####################### for i in range(n_batches): # for list-type data costNum, errorNum, pred_label, real_label = train_model(i) ite = ite + 1 # for padding data # costNum, errorNum = train_model(docMatrixes, labels) # del docMatrixes, docSentenceNums, sentenceWordNums, labels # print ".", if(ite % 10 == 0): print print "@iter: ", ite print "Cost: ", costNum print "Error: ", errorNum # Validate the model costNum, errorNum, pred_label, real_label, pred_prob = valid_model() print "Valid current model:" print "Cost: ", costNum print "Error: ", errorNum # print "pred_prob: ", pred_prob # print "Valid Pred: ", pred_label fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "test_dataname: ", test_dataname print "ROC: ", roc_auc fpr, tpr, threshold = roc_curve(real_label, pred_label) index_of_one = list(threshold).index(1) print "TPR: ", tpr[index_of_one] print "FPR: ", fpr[index_of_one] print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2 print "threshold: ", threshold[index_of_one] # Save model print "Saving parameters." saveParamsVal(para_path, params) print "Saved." valid_model.free() train_model.free() elif(mode == "deploy"): print "Compiling computing graph." output_model = theano.function( [corpus, docSentenceCount, sentenceWordCount], [layer2.y_pred] ) print "Compiled." cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split") count = 21000 while(count <= 21000): docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100]) docMatrixes = numpy.matrix( docMatrixes, dtype=theano.config.floatX ) docSentenceNums = numpy.array( docSentenceNums, dtype=numpy.int32 ) sentenceWordNums = numpy.array( sentenceWordNums, dtype=numpy.int32 ) print "start to predict." pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums) print "End predicting." 
print "Writing resfile." # print zip(ids, pred_y[0]) f = file("data/test/res/res" + str(count), "w") f.write(str(zip(ids, pred_y[0]))) f.close() print "Written." + str(count) count += 100
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
	"""Multi-task training: SHARED conv + hidden layers, per-dataset logistic layer.

	data_name / test_dataname are colon-separated lists of dataset directory
	names; the i-th test set must correspond to the i-th train set. All
	datasets share one DocEmbeddingNN weight set and one HiddenLayer W/b
	(share params), while each dataset gets its own LogisticRegression
	(local params). Checkpoints go to
	data/<data_name>/share_hidden_low_model/<pooling_mode>.model.

	mode -- "train" (interleaved SGD over all datasets, per-epoch validation
	        and checkpoint) or "test" (one validation pass, per-dataset and
	        aggregated metrics). Returns None.
	"""
	print "mode: ", mode
	print "data_name: ", data_name
	print "pooling_mode: ", pooling_mode
	print "Started!"
	data_names = data_name.split(":")
	data_count = len(data_names)
	print "Train dataset:"
	for i in xrange(data_count):
		print "%d: %s" % (i, data_names[i])
	print "Test dataset:"
	test_data_names = test_dataname.split(":")
	test_data_count = len(test_data_names)
	for i in xrange(test_data_count):
		print "%d: %s" % (i, test_data_names[i])
	if test_data_count != data_count:
		raise Exception(
			"The amount of test and train dataset must be the same.")
	# Fixed seed so weight initialisation is reproducible across runs.
	rng = numpy.random.RandomState(23455)
	docSentenceCount = T.ivector("docSentenceCount")
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel')
	# Weight handles captured after the first tower so later towers reuse the
	# same conv (sentenceW/B, docW/B) and hidden-layer (W/b) parameters.
	sentenceW = None
	sentenceB = None
	docW = None
	docB = None
	hidden_layer_w = None
	hidden_layer_b = None
	logistic_layer_w = None
	logistic_layer_b = None
	layer0 = list()
	layer1 = list()
	layer2 = list()
	local_params = list()
	# for list-type data
	for i in xrange(data_count):
		layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
									 sentenceLayerNodesNum=50, \
									 sentenceLayerNodesSize=[5, 200], \
									 docLayerNodesNum=10, \
									 docLayerNodesSize=[3, 50],
									 sentenceW=sentenceW,
									 sentenceB=sentenceB,
									 docW=docW,
									 docB=docB,
									 pooling_mode=pooling_mode))
		sentenceW = layer0[i].sentenceW
		sentenceB = layer0[i].sentenceB
		docW = layer0[i].docW
		docB = layer0[i].docB
		layer1.append(
			HiddenLayer(rng,
						input=layer0[i].output,
						n_in=layer0[i].outputDimension,
						n_out=10,
						activation=T.tanh,
						W=hidden_layer_w,
						b=hidden_layer_b))
		hidden_layer_w = layer1[i].W
		hidden_layer_b = layer1[i].b
		# Logistic layer is NOT shared in this variant (sharing lines below
		# are deliberately commented out): one classifier head per dataset.
		layer2.append(
			LogisticRegression(input=layer1[i].output, n_in=10, n_out=2, W=logistic_layer_w, b=logistic_layer_b))
		# logistic_layer_w = layer2[i].W
		# logistic_layer_b = layer2[i].b
		local_params.append(layer2[i].params)
	share_params = list(layer0[0].params + layer1[0].params)
	# construct the parameter array for checkpointing: shared conv + hidden
	# params once, then every dataset's own logistic params.
	params = list(layer0[0].params) + layer1[0].params
	for i in xrange(data_count):
		params += layer2[i].params
	# data_name = "car"
	para_path = "data/" + data_name + "/share_hidden_low_model/" + pooling_mode + ".model"
	traintext = [
		"data/" + data_names[i] + "/train/text" for i in xrange(data_count)
	]
	trainlabel = [
		"data/" + data_names[i] + "/train/label" for i in xrange(data_count)
	]
	testtext = [
		"data/" + test_data_names[i] + "/test/text"
		for i in xrange(data_count)
	]
	testlabel = [
		"data/" + test_data_names[i] + "/test/label"
		for i in xrange(data_count)
	]
	# Load the parameters last time, optionally.
	loadParamsVal(para_path, params)
	if (mode == "train" or mode == "test"):
		train_model = list()
		valid_model = list()
		print "Loading train data."
		batchSize = 10
		share_learning_rate = 0.1
		local_learning_rate = 0.1
		n_batches = list()
		print "Loading test data."
		# Aggregated predictions over all datasets, for the combined metrics
		# printed after the per-dataset loop.
		all_pred_label = list()
		all_real_label = list()
		all_pred_prob = list()
		for i in xrange(data_count):
			cr_train = CorpusReader(minDocSentenceNum=5,
									minSentenceWordNum=5,
									dataset=traintext[i],
									labelset=trainlabel[i])
			docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus(
				[0, 100000])
			docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
			docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
			sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
			labels = transToTensor(labels, numpy.int32)
			index = T.lscalar("index")
			# docSentenceNums appears to be an offsets vector (length
			# numDocs + 1, note the `+ 1` in the givens slice below), so this
			# is ceil(numDocs / batchSize). TODO confirm against CorpusReader.
			n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1)
			print "Dataname: %s" % data_names[i]
			print "Train set size is ", len(docMatrixes.get_value())
			print "Batch size is ", batchSize
			print "Number of training batches is ", n_batches[i]
			error = layer2[i].errors(docLabel)
			cost = layer2[i].negative_log_likelihood(docLabel)
			# Separate SGD update groups for shared and local parameters
			# (rates happen to be equal here, but are kept distinct).
			share_grads = T.grad(cost, share_params)
			share_updates = [
				(param_i, param_i - share_learning_rate * grad_i)
				for param_i, grad_i in zip(share_params, share_grads)
			]
			grads = T.grad(cost, local_params[i])
			local_updates = [
				(param_i, param_i - local_learning_rate * grad_i)
				for param_i, grad_i in zip(local_params[i], grads)
			]
			updates = share_updates + local_updates
			print "Compiling train computing graph."
			if mode == "train":
				train_model.append(
					theano.function(
						[index],
						[cost, error, layer2[i].y_pred, docLabel],
						updates=updates,
						givens={
							corpus: docMatrixes,
							docSentenceCount: docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
							sentenceWordCount: sentenceWordNums,
							docLabel: labels[index * batchSize:(index + 1) * batchSize]
						}))
			print "Compiled."
			print "Load test dataname: %s" % test_data_names[i]
			cr_test = CorpusReader(minDocSentenceNum=5,
								   minSentenceWordNum=5,
								   dataset=testtext[i],
								   labelset=testlabel[i])
			validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus(
				[0, 1000])
			validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
			validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
			validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
			validLabels = transToTensor(validLabels, numpy.int32)
			print "Validating set size is ", len(validDocMatrixes.get_value())
			print "Data loaded."
			print "Compiling test computing graph."
			# Last output is the predicted probability of class 1.
			valid_model.append(
				theano.function(
					[],
					[
						cost, error, layer2[i].y_pred, docLabel,
						T.transpose(layer2[i].p_y_given_x)[1]
					],
					givens={
						corpus: validDocMatrixes,
						docSentenceCount: validDocSentenceNums,
						sentenceWordCount: validSentenceWordNums,
						docLabel: validLabels
					}))
			print "Compiled."
			# Initial validation of the (possibly restored) model.
			costNum, errorNum, pred_label, real_label, pred_prob = valid_model[
				i]()
			all_pred_label.extend(pred_label)
			all_real_label.extend(real_label)
			all_pred_prob.extend(pred_prob)
			print "Valid current model :", data_names[i]
			print "Cost: ", costNum
			print "Error: ", errorNum
			fpr, tpr, _ = roc_curve(real_label, pred_prob)
			roc_auc = auc(fpr, tpr)
			print "data_name: ", data_name
			print "ROC: ", roc_auc
			fpr, tpr, threshold = roc_curve(real_label, pred_label)
			if 1 in threshold:
				index_of_one = list(threshold).index(1)
				print "TPR: ", tpr[index_of_one]
				print "FPR: ", fpr[index_of_one]
				print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
				print "threshold: ", threshold[index_of_one]
		# Aggregated metrics over the union of all validation sets.
		print "Valid current model :", data_names
		errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
		print "Error: ", errorNum
		fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
		if mode == "test":
			print "tpr_all: ", tpr
			print "fpr_all: ", fpr
		roc_auc = auc(fpr, tpr)
		print "data_name: ", data_name
		print "ROC: ", roc_auc
		fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
		if 1 in threshold:
			index_of_one = list(threshold).index(1)
			print "TPR: ", tpr[index_of_one]
			print "FPR: ", fpr[index_of_one]
			print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
			print "threshold: ", threshold[index_of_one]
		if mode == "test":
			return
		print "Start to train."
		epoch = 0
		n_epochs = 10
		ite = 0
		while (epoch < n_epochs):
			epoch = epoch + 1
			#######################
			# Interleave mini-batches across datasets so the shared layers see
			# all tasks within each sweep.
			for i in range(max(n_batches)):
				for dataset_index in xrange(data_count):
					if i >= n_batches[dataset_index]:
						continue
					# for list-type data
					costNum, errorNum, pred_label, real_label = train_model[
						dataset_index](i)
					ite = ite + 1
					# for padding data
					if (ite % 10 == 0):
						print
						print "Dataset name: ", data_names[dataset_index]
						print "@iter: ", ite
						print "Cost: ", costNum
						print "Error: ", errorNum
			# Validate the model once per epoch, per dataset and aggregated.
			all_pred_label = list()
			all_real_label = list()
			all_pred_prob = list()
			for dataset_index in xrange(data_count):
				costNum, errorNum, pred_label, real_label, pred_prob = valid_model[
					dataset_index]()
				all_pred_label.extend(pred_label)
				all_real_label.extend(real_label)
				all_pred_prob.extend(pred_prob)
				print "Valid current model :", data_names[dataset_index]
				print "Cost: ", costNum
				print "Error: ", errorNum
				fpr, tpr, _ = roc_curve(real_label, pred_prob)
				roc_auc = auc(fpr, tpr)
				print "data_name: ", data_name
				print "ROC: ", roc_auc
				fpr, tpr, threshold = roc_curve(real_label, pred_label)
				# NOTE(review): unlike the initial validation above, this
				# .index(1) is unguarded and raises ValueError if no positive
				# predictions exist — confirm whether a `if 1 in threshold`
				# guard is intended here.
				index_of_one = list(threshold).index(1)
				print "TPR: ", tpr[index_of_one]
				print "FPR: ", fpr[index_of_one]
				print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
				print "threshold: ", threshold[index_of_one]
			print "Valid current model :", data_names
			errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
			print "Error: ", errorNum
			fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
			roc_auc = auc(fpr, tpr)
			print "data_name: ", data_name
			print "ROC: ", roc_auc
			fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
			# NOTE(review): same unguarded .index(1) as above.
			index_of_one = list(threshold).index(1)
			print "TPR: ", tpr[index_of_one]
			print "FPR: ", fpr[index_of_one]
			print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
			print "threshold: ", threshold[index_of_one]
			# Save model checkpoint after every epoch.
			print "Saving parameters."
			saveParamsVal(para_path, params)
			print "Saved."
def work(model_name, dataset_name, pooling_mode): print "model_name: ", model_name print "dataset_name: ", dataset_name print "pooling_mode: ", pooling_mode print "Started!" rng = numpy.random.RandomState(23455) sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") # docLabel = T.ivector('docLabel') # for list-type data layer0 = DocEmbeddingNNOneDoc(corpus, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100], pooling_mode=pooling_mode) layer1_output_num = 100 layer1 = HiddenLayer( rng, input=layer0.output, n_in=layer0.outputDimension, n_out=layer1_output_num, activation=T.tanh ) layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2) cost = layer2.negative_log_likelihood(1 - layer2.y_pred) # calculate sentence sentence_score sentence_grads = T.grad(cost, layer0.sentenceResults) sentence_score = T.diag(T.dot(sentence_grads, T.transpose(layer0.sentenceResults))) # calculate word sentence_score against the whole network word_grad = T.grad(cost, corpus) word_score = T.diag(T.dot(word_grad, T.transpose(corpus))) # calculate word cell_scores = T.grad(cost, layer1.output) # calculate word score against cells word_score_against_cell = [T.diag(T.dot(T.grad(layer1.output[i], corpus), T.transpose(corpus))) for i in xrange(layer1_output_num)] # construct the parameter array. params = layer2.params + layer1.params + layer0.params # Load the parameters last time, optionally. model_path = "data/" + dataset_name + "/model_100,100,100,100,parameters/" + pooling_mode + ".model" loadParamsVal(model_path, params) print "Compiling computing graph." output_model = theano.function( [corpus, sentenceWordCount], [layer2.y_pred, sentence_score, word_score, layer1.output, cell_scores] + word_score_against_cell ) print "Compiled." 
input_filename = "data/" + dataset_name + "/train/small_text" cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=input_filename) count = 0 while(count < cr.getDocNum()): info = cr.getCorpus([count, count + 1]) count += 1 if info is None: print "Pass" continue docMatrixes, _, sentenceWordNums, ids, sentences, _ = info docMatrixes = numpy.matrix( docMatrixes, dtype=theano.config.floatX ) sentenceWordNums = numpy.array( sentenceWordNums, dtype=numpy.int32 ) print "start to predict: %s." % ids[0] info = output_model(docMatrixes, sentenceWordNums) pred_y = info[0] g = info[1] word_scores = info[2] cell_outputs = info[3] cell_scores = info[4] word_scores_against_cell = info[5:] if len(word_scores_against_cell) != len(cell_outputs): print "The dimension of word_socre and word are different." raise Exception("The dimension of word_socre and word are different.") print "End predicting." print "Writing resfile." score_sentence_list = zip(g, sentences) score_sentence_list.sort(key=lambda x:-x[0]) current_doc_dir = "data/output/" + model_name + "/" + pooling_mode + "/" + dataset_name + "/" + str(pred_y[0]) + "/" + ids[0] if not os.path.exists(current_doc_dir): os.makedirs(current_doc_dir) # sentence sentence_score with codecs.open(current_doc_dir + "/sentence_score", "w", 'utf-8', "ignore") as f: f .write("pred_y: %i\n" % pred_y[0]) for g0, s in score_sentence_list: f.write("%f\t%s\n" % (g0, string.join(s, " "))) wordList = list() for s in sentences: wordList.extend(s) print "length of word_scores", len(word_scores) print "length of wordList", len(wordList) score_word_list = zip(wordList , word_scores) with codecs.open(current_doc_dir + "/nn_word", "w", 'utf-8', "ignore") as f: for word, word_score in score_word_list: f.write("%s\t%f\n" % (word, word_score)) with codecs.open(current_doc_dir + "/nn_word_merged", "w", 'utf-8', "ignore") as f: merged_score_word_list = merge_kv(score_word_list) for word, word_score in merged_score_word_list: f.write("%s\t%f\n" % 
(word, word_score)) if not os.path.exists(current_doc_dir + "/nc_word"): os.makedirs(current_doc_dir + "/nc_word") neu_num = 0 for w, c_output, c_score in zip(word_scores_against_cell, cell_outputs, cell_scores): with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num), "w", 'utf-8', "ignore") as f: f.write("cell sentence_score: %lf\n" % c_output) for word, word_score in zip(wordList, w): f.write("%s\t%f\n" % (word, word_score)) merged_score_word_list = merge_kv(zip(wordList, w)) with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num) + "_merged", "w", 'utf-8', "ignore") as f: f.write("cell_scores: %lf\n" % c_score) f.write("cell_output: %lf\n" % c_output) for word, word_score in merged_score_word_list: f.write("%s\t%f\n" % (word, word_score)) neu_num += 1 print "Written." + str(count) print "All finished!"
def work(mode, data_name, test_dataname, pooling_mode): print "mode: ", mode print "data_name: ", data_name print "Started!" data_names = data_name.split(":") data_count = len(data_names) print "Train dataset:" for i in xrange(data_count): print "%d: %s" % (i, data_names[i]) print "Test dataset:" test_data_names = test_dataname.split(":") test_data_count = len(test_data_names) for i in xrange(test_data_count): print "%d: %s" % (i, test_data_names[i]) if test_data_count != data_count: raise Exception("The amount of test and train dataset must be the same.") rng = numpy.random.RandomState(23455) docSentenceCount = T.ivector("docSentenceCount") sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') hidden_layer_w = None hidden_layer_b = None logistic_layer_w = None logistic_layer_b = None layer0 = list() layer1 = list() layer2 = list() local_params = list() # for list-type data for i in xrange(data_count): layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100])) layer1.append(HiddenLayer( rng, input=layer0[i].output, n_in=layer0[i].outputDimension, n_out=100, activation=T.tanh, W=hidden_layer_w, b=hidden_layer_b )) hidden_layer_w = layer1[i].W hidden_layer_b = layer1[i].b layer2.append(LogisticRegression(input=layer1[i].output, n_in=100, n_out=2, W=logistic_layer_w, b=logistic_layer_b)) logistic_layer_w = layer2[i].W logistic_layer_b = layer2[i].b local_params.append(layer0[i].params) share_params = layer2[0].params + layer1[0].params # construct the parameter array. 
params = layer2[0].params + layer1[0].params for i in xrange(data_count): params += layer0[i].params # data_name = "car" para_path = "data/" + data_name + "/model/scnn.model" traintext = ["data/" + data_names[i] + "/train/text" for i in xrange(data_count)] trainlabel = ["data/" + data_names[i] + "/train/label" for i in xrange(data_count)] testtext = ["data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)] testlabel = ["data/" + test_data_names[i] + "/test/label" for i in xrange(data_count)] # Load the parameters last time, optionally. loadParamsVal(para_path, params) if(mode == "train"): train_model = list() valid_model = list() print "Loading train data." batchSize = 10 share_learning_rate = 0.01 local_learning_rate = 0.1 n_batches = list() for i in xrange(data_count): cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i]) docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus([0, 100000]) docMatrixes = transToTensor(docMatrixes, theano.config.floatX) docSentenceNums = transToTensor(docSentenceNums, numpy.int32) sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32) labels = transToTensor(labels, numpy.int32) index = T.lscalar("index") n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1) print "Dataname: %s" % data_names[i] print "Train set size is ", len(docMatrixes.get_value()) print "Batch size is ", batchSize print "Number of training batches is ", n_batches[i] error = layer2[i].errors(docLabel) cost = layer2[i].negative_log_likelihood(docLabel) share_grads = T.grad(cost, share_params) share_updates = [ (param_i, param_i - share_learning_rate * grad_i) for param_i, grad_i in zip(share_params, share_grads) ] grads = T.grad(cost, local_params[i]) local_updates = [ (param_i, param_i - local_learning_rate * grad_i) for param_i, grad_i in zip(local_params[i], grads) ] updates = share_updates + local_updates print "Compiling train 
computing graph." train_model.append(theano.function( [index], [cost, error, layer2[i].y_pred, docLabel], updates=updates, givens={ corpus: docMatrixes, docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1], sentenceWordCount: sentenceWordNums, docLabel: labels[index * batchSize: (index + 1) * batchSize] } )) print "Compiled." print "Start to train." epoch = 0 n_epochs = 10 ite = 0 # ####Validate the model#### for dataset_index in xrange(data_count): costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]() print "Valid current model :", data_names[dataset_index] print "Cost: ", costNum print "Error: ", errorNum fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "ROC: ", roc_auc print "All finished!"
def work(model_name, dataset_name, pooling_mode): print "model_name: ", model_name print "dataset_name: ", dataset_name print "pooling_mode: ", pooling_mode print "Started!" rng = numpy.random.RandomState(23455) sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") # docLabel = T.ivector('docLabel') # for list-type data layer0 = DocEmbeddingNNOneDoc( corpus, sentenceWordCount, rng, wordEmbeddingDim=200, sentenceLayerNodesNum=100, sentenceLayerNodesSize=[5, 200], docLayerNodesNum=100, docLayerNodesSize=[3, 100], pooling_mode=pooling_mode, ) layer1_output_num = 100 layer1 = HiddenLayer( rng, input=layer0.output, n_in=layer0.outputDimension, n_out=layer1_output_num, activation=T.tanh ) layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2) cost = layer2.negative_log_likelihood(1 - layer2.y_pred) # calculate sentence sentence_score sentence_grads = T.grad(cost, layer0.sentenceResults) sentence_score = T.diag(T.dot(sentence_grads, T.transpose(layer0.sentenceResults))) # calculate word sentence_score against the whole network word_grad = T.grad(cost, corpus) word_score = T.diag(T.dot(word_grad, T.transpose(corpus))) # calculate word cell_scores = T.grad(cost, layer1.output) # calculate word score against cells word_score_against_cell = [ T.diag(T.dot(T.grad(layer1.output[i], corpus), T.transpose(corpus))) for i in xrange(layer1_output_num) ] # construct the parameter array. params = layer2.params + layer1.params + layer0.params # Load the parameters last time, optionally. model_path = "data/" + dataset_name + "/model_100,100,100,100,parameters/" + pooling_mode + ".model" loadParamsVal(model_path, params) print "Compiling computing graph." output_model = theano.function( [corpus, sentenceWordCount], [layer2.y_pred, sentence_score, word_score, layer1.output, cell_scores] + word_score_against_cell, ) print "Compiled." 
input_filename = "data/" + dataset_name + "/train/small_text" cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=input_filename) count = 0 while count < cr.getDocNum(): info = cr.getCorpus([count, count + 1]) count += 1 if info is None: print "Pass" continue docMatrixes, _, sentenceWordNums, ids, sentences, _ = info docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX) sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32) print "start to predict: %s." % ids[0] info = output_model(docMatrixes, sentenceWordNums) pred_y = info[0] g = info[1] word_scores = info[2] cell_outputs = info[3] cell_scores = info[4] word_scores_against_cell = info[5:] if len(word_scores_against_cell) != len(cell_outputs): print "The dimension of word_socre and word are different." raise Exception("The dimension of word_socre and word are different.") print "End predicting." print "Writing resfile." score_sentence_list = zip(g, sentences) score_sentence_list.sort(key=lambda x: -x[0]) current_doc_dir = ( "data/output/" + model_name + "/" + pooling_mode + "/" + dataset_name + "/" + str(pred_y[0]) + "/" + ids[0] ) if not os.path.exists(current_doc_dir): os.makedirs(current_doc_dir) # sentence sentence_score with codecs.open(current_doc_dir + "/sentence_score", "w", "utf-8", "ignore") as f: f.write("pred_y: %i\n" % pred_y[0]) for g0, s in score_sentence_list: f.write("%f\t%s\n" % (g0, string.join(s, " "))) wordList = list() for s in sentences: wordList.extend(s) print "length of word_scores", len(word_scores) print "length of wordList", len(wordList) score_word_list = zip(wordList, word_scores) with codecs.open(current_doc_dir + "/nn_word", "w", "utf-8", "ignore") as f: for word, word_score in score_word_list: f.write("%s\t%f\n" % (word, word_score)) with codecs.open(current_doc_dir + "/nn_word_merged", "w", "utf-8", "ignore") as f: merged_score_word_list = merge_kv(score_word_list) for word, word_score in merged_score_word_list: f.write("%s\t%f\n" % 
(word, word_score)) if not os.path.exists(current_doc_dir + "/nc_word"): os.makedirs(current_doc_dir + "/nc_word") neu_num = 0 for w, c_output, c_score in zip(word_scores_against_cell, cell_outputs, cell_scores): with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num), "w", "utf-8", "ignore") as f: f.write("cell sentence_score: %lf\n" % c_output) for word, word_score in zip(wordList, w): f.write("%s\t%f\n" % (word, word_score)) merged_score_word_list = merge_kv(zip(wordList, w)) with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num) + "_merged", "w", "utf-8", "ignore") as f: f.write("cell_scores: %lf\n" % c_score) f.write("cell_output: %lf\n" % c_output) for word, word_score in merged_score_word_list: f.write("%s\t%f\n" % (word, word_score)) neu_num += 1 print "Written." + str(count) print "All finished!"
def work(mode, data_name, test_dataname): print "mode: ", mode print "data_name: ", data_name print "Started!" data_names = data_name.split(":") data_count = len(data_names) print "Train dataset:" for i in xrange(data_count): print "%d: %s" % (i, data_names[i]) print "Test dataset:" test_data_names = test_dataname.split(":") test_data_count = len(test_data_names) for i in xrange(test_data_count): print "%d: %s" % (i, test_data_names[i]) if test_data_count != data_count: raise Exception("The amount of test and train dataset must be the same.") rng = numpy.random.RandomState(23455) docSentenceCount = T.ivector("docSentenceCount") sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') hidden_layer_w = None hidden_layer_b = None logistic_layer_w = None logistic_layer_b = None layer0 = list() layer1 = list() layer2 = list() local_params = list() # for list-type data for i in xrange(data_count): layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100])) layer1.append(HiddenLayer( rng, input=layer0[i].output, n_in=layer0[i].outputDimension, n_out=100, activation=T.tanh, W=hidden_layer_w, b=hidden_layer_b )) hidden_layer_w = layer1[i].W hidden_layer_b = layer1[i].b layer2.append(LogisticRegression(input=layer1[i].output, n_in=100, n_out=2, W=logistic_layer_w, b=logistic_layer_b)) logistic_layer_w = layer2[i].W logistic_layer_b = layer2[i].b local_params.append(layer2[i].params + layer1[i].params + layer0[i].params) # construct the parameter array. params = layer2[0].params + layer1[0].params for i in xrange(data_count): params += layer0[i].params # Load the parameters last time, optionally. 
# data_name = "car" para_path = "data/" + data_name + "/model/scnn.model" traintext = ["data/" + data_names[i] + "/train/text" for i in xrange(data_count)] trainlabel = ["data/" + data_names[i] + "/train/label" for i in xrange(data_count)] testtext = ["data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)] testlabel = ["data/" + test_data_names[i] + "/test/label" for i in xrange(data_count)] loadParamsVal(para_path, params) if(mode == "train"): train_model = list() valid_model = list() print "Loading train data." batchSize = 10 learning_rate = 0.1 n_batches = list() print "Loading test data." for i in xrange(data_count): cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i]) docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus([0, 100000]) docMatrixes = transToTensor(docMatrixes, theano.config.floatX) docSentenceNums = transToTensor(docSentenceNums, numpy.int32) sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32) labels = transToTensor(labels, numpy.int32) index = T.lscalar("index") n_batches.append((len(docSentenceNums.get_value()) - 1) / batchSize + 1) print "Dataname: %s" % data_names[i] print "Train set size is ", len(docMatrixes.get_value()) print "Batch size is ", batchSize print "Number of training batches is ", n_batches[i] error = layer2[i].errors(docLabel) cost = layer2[i].negative_log_likelihood(docLabel) grads = T.grad(cost, local_params[i]) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(local_params[i], grads) ] print "Compiling train computing graph." train_model.append(theano.function( [index], [cost, error, layer2[i].y_pred, docLabel], updates=updates, givens={ corpus: docMatrixes, docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1], sentenceWordCount: sentenceWordNums, docLabel: labels[index * batchSize: (index + 1) * batchSize] } )) print "Compiled." 
print "Load test dataname: %s" % test_data_names[i] cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext[i], labelset=testlabel[i]) validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels = cr_test.getCorpus([0, 1000]) validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX) validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32) validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32) validLabels = transToTensor(validLabels, numpy.int32) print "Validating set size is ", len(validDocMatrixes.get_value()) print "Data loaded." print "Compiling test computing graph." valid_model.append(theano.function( [], [cost, error, layer2[i].y_pred, docLabel, T.transpose(layer2[i].p_y_given_x)[1]], givens={ corpus: validDocMatrixes, docSentenceCount: validDocSentenceNums, sentenceWordCount: validSentenceWordNums, docLabel: validLabels } )) print "Compiled." # for list-type data print "Start to train." 
epoch = 0 n_epochs = 2000 ite = 0 # ####Validate the model#### for dataset_index in xrange(data_count): costNum, errorNum, pred_label, real_label, pred_prob = valid_model[i]() print "Valid current model :", data_names[dataset_index] print "Cost: ", costNum print "Error: ", errorNum # print "Valid Pred: ", pred_label # print "pred_prob: ", pred_prob fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "test_dataname: ", test_dataname print "ROC: ", roc_auc while (epoch < n_epochs): epoch = epoch + 1 ####################### for i in range(max(n_batches)): for dataset_index in xrange(data_count): if i >= n_batches[dataset_index]: continue # for list-type data costNum, errorNum, pred_label, real_label = train_model[dataset_index](i) ite = ite + 1 # for padding data # costNum, errorNum = train_model(docMatrixes, labels) # del docMatrixes, docSentenceNums, sentenceWordNums, labels # print ".", if(ite % 10 == 0): print print "Dataset name: ", data_names[dataset_index] print "@iter: ", ite print "Cost: ", costNum print "Error: ", errorNum # Validate the model for dataset_index in xrange(data_count): costNum, errorNum, pred_label, real_label, pred_prob = valid_model[i]() print "Valid current model :", data_names[dataset_index] print "Cost: ", costNum print "Error: ", errorNum # print "Valid Pred: ", pred_label # print "pred_prob: ", pred_prob fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "test_dataname: ", test_dataname print "ROC: ", roc_auc # Save model print "Saving parameters." saveParamsVal(para_path, params) print "Saved." # elif(mode == "deploy"): # print "Compiling computing graph." # output_model = theano.function( # [corpus, docSentenceCount, sentenceWordCount], # [layer2.y_pred] # ) # print "Compiled." 
# cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split") # count = 21000 # while(count <= 21000): # docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100]) # docMatrixes = numpy.matrix( # docMatrixes, # dtype=theano.config.floatX # ) # docSentenceNums = numpy.array( # docSentenceNums, # dtype=numpy.int32 # ) # sentenceWordNums = numpy.array( # sentenceWordNums, # dtype=numpy.int32 # ) # print "start to predict." # pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums) # print "End predicting." # print "Writing resfile." # # print zip(ids, pred_y[0]) # f = file("data/test/res/res" + str(count), "w") # f.write(str(zip(ids, pred_y[0]))) # f.close() # print "Written." + str(count) # count += 100 print "All finished!"
def work(mode, data_name, test_dataname, pooling_mode): print "mode: ", mode print "data_name: ", data_name print "Started!" data_names = data_name.split(":") data_count = len(data_names) print "Train dataset:" for i in xrange(data_count): print "%d: %s" % (i, data_names[i]) print "Test dataset:" test_data_names = test_dataname.split(":") test_data_count = len(test_data_names) for i in xrange(test_data_count): print "%d: %s" % (i, test_data_names[i]) if test_data_count != data_count: raise Exception( "The amount of test and train dataset must be the same.") rng = numpy.random.RandomState(23455) docSentenceCount = T.ivector("docSentenceCount") sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') hidden_layer_w = None hidden_layer_b = None logistic_layer_w = None logistic_layer_b = None layer0 = list() layer1 = list() layer2 = list() local_params = list() # for list-type data for i in xrange(data_count): layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100])) layer1.append( HiddenLayer(rng, input=layer0[i].output, n_in=layer0[i].outputDimension, n_out=100, activation=T.tanh, W=hidden_layer_w, b=hidden_layer_b)) hidden_layer_w = layer1[i].W hidden_layer_b = layer1[i].b layer2.append( LogisticRegression(input=layer1[i].output, n_in=100, n_out=2, W=logistic_layer_w, b=logistic_layer_b)) logistic_layer_w = layer2[i].W logistic_layer_b = layer2[i].b local_params.append(layer0[i].params) share_params = layer2[0].params + layer1[0].params # construct the parameter array. 
params = layer2[0].params + layer1[0].params for i in xrange(data_count): params += layer0[i].params # data_name = "car" para_path = "data/" + data_name + "/model/scnn.model" traintext = [ "data/" + data_names[i] + "/train/text" for i in xrange(data_count) ] trainlabel = [ "data/" + data_names[i] + "/train/label" for i in xrange(data_count) ] testtext = [ "data/" + test_data_names[i] + "/test/text" for i in xrange(data_count) ] testlabel = [ "data/" + test_data_names[i] + "/test/label" for i in xrange(data_count) ] # Load the parameters last time, optionally. loadParamsVal(para_path, params) if (mode == "train"): train_model = list() valid_model = list() print "Loading train data." batchSize = 10 share_learning_rate = 0.01 local_learning_rate = 0.1 n_batches = list() for i in xrange(data_count): cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i]) docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus( [0, 100000]) docMatrixes = transToTensor(docMatrixes, theano.config.floatX) docSentenceNums = transToTensor(docSentenceNums, numpy.int32) sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32) labels = transToTensor(labels, numpy.int32) index = T.lscalar("index") n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1) print "Dataname: %s" % data_names[i] print "Train set size is ", len(docMatrixes.get_value()) print "Batch size is ", batchSize print "Number of training batches is ", n_batches[i] error = layer2[i].errors(docLabel) cost = layer2[i].negative_log_likelihood(docLabel) share_grads = T.grad(cost, share_params) share_updates = [ (param_i, param_i - share_learning_rate * grad_i) for param_i, grad_i in zip(share_params, share_grads) ] grads = T.grad(cost, local_params[i]) local_updates = [ (param_i, param_i - local_learning_rate * grad_i) for param_i, grad_i in zip(local_params[i], grads) ] updates = share_updates + local_updates print "Compiling 
train computing graph." train_model.append( theano.function( [index], [cost, error, layer2[i].y_pred, docLabel], updates=updates, givens={ corpus: docMatrixes, docSentenceCount: docSentenceNums[index * batchSize:(index + 1) * batchSize + 1], sentenceWordCount: sentenceWordNums, docLabel: labels[index * batchSize:(index + 1) * batchSize] })) print "Compiled." print "Start to train." epoch = 0 n_epochs = 10 ite = 0 # ####Validate the model#### for dataset_index in xrange(data_count): costNum, errorNum, pred_label, real_label, pred_prob = valid_model[ dataset_index]() print "Valid current model :", data_names[dataset_index] print "Cost: ", costNum print "Error: ", errorNum fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "ROC: ", roc_auc print "All finished!"