def work(): print "Started!" print "Loading data." cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/split", labelset="data/traindataset2zgb") docMatrixes, docSentenceNums, sentenceWordNums, labels = transToTensor(cr.getCorpus([0, 12])) # valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt") validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validLabels = transToTensor(cr.getCorpus([800, 870])) print "Data loaded." learning_rate = 0.1 docSentenceCount = T.vector("docSentenceCount") sentenceWordCount = T.vector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') index = T.lscalar("index") rng = numpy.random.RandomState(23455) batchSize = 1 mr =numpy.max([len(docMatrixes.get_value()), len(validDocMatrixes.get_value())]) n_batches = (len(docSentenceNums.get_value()) -1 ) / batchSize print "Train set size is ", len(docMatrixes.get_value()) print "Validating set size is ", len(validDocMatrixes.get_value()) print "Batch size is ", batchSize print "Number of training batches is ", n_batches # for list-type data layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ maxRandge=mr, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100]) # for padding data # layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, corpus_shape=(batchSize, cr.getMaxDocSentenceNum(), cr.getMaxSentenceWordNum(), cr.getDim()), \ # maxRandge=mr, \ # sentenceLayerNodesNum=100, \ # sentenceLayerNodesSize=5, \ # docLayerNodesNum=200, \ # docLayerNodesSize=3) layer1 = HiddenLayer( rng, input=layer0.output, n_in=layer0.outputDimension, n_out=100, activation=T.tanh ) layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2) error = layer2.errors(docLabel) cost = layer2.negative_log_likelihood(docLabel) # construct the parameter array. params = layer2.params + layer1.params + layer0.params # Load the parameters last time, optionally. loadParamsVal(params) grads = T.grad(cost, params) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] print "Compiling computing graph." valid_model = theano.function( [], [cost, error], givens={ corpus: validDocMatrixes, docSentenceCount: validDocSentenceNums, sentenceWordCount: validSentenceWordNums, docLabel: validLabels } ) # for list-type data train_model = theano.function( [index], [cost, error], updates=updates, givens={ corpus: docMatrixes, docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1], sentenceWordCount: sentenceWordNums, docLabel: labels[index * batchSize: (index + 1) * batchSize] } ) # for padding data # train_model = theano.function( # [corpus, docLabel], # [cost, error], # updates=updates, # ) print "Compiled." print "Start to train." 
epoch = 0 n_epochs = 200 ite = 0 # ####Validate the model#### costNum, errorNum = valid_model() print "Valid current model:" print "Cost: ", costNum print "Error: ", errorNum while (epoch < n_epochs): epoch = epoch + 1 ####################### for i in range(n_batches): # for list-type data costNum, errorNum = train_model(i) ite = ite + 1 # for padding data # costNum, errorNum = train_model(docMatrixes, labels) # del docMatrixes, docSentenceNums, sentenceWordNums, labels # print ".", if(ite % 1 == 0): print print "@iter: ", ite print "Cost: ", costNum print "Error: ", errorNum # Validate the model costNum, errorNum = valid_model() print "Valid current model:" print "Cost: ", costNum print "Error: ", errorNum # Save model print "Saving parameters." saveParamsVal(params) print "Saved." print "All finished!"
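# The data/parameter helpers used above are defined elsewhere in the repo.
# A minimal sketch of what this script appears to assume they do; the pickle
# format and model path here are guesses, not the actual implementation.
import cPickle

def transToTensor(corpusInfo):
    # Wrap each array returned by cr.getCorpus() in a Theano shared variable
    # so train_model/valid_model can slice them through `givens`.
    docMatrixes, docSentenceNums, sentenceWordNums, labels = corpusInfo
    return (
        theano.shared(numpy.array(docMatrixes, dtype=theano.config.floatX), borrow=True),
        theano.shared(numpy.array(docSentenceNums, dtype=numpy.int32), borrow=True),
        theano.shared(numpy.array(sentenceWordNums, dtype=numpy.int32), borrow=True),
        theano.shared(numpy.array(labels, dtype=numpy.int32), borrow=True),
    )

def saveParamsVal(params):
    # Persist the current parameter values (path is a guess).
    with open("data/model/scnn.model", "wb") as f:
        cPickle.dump([p.get_value() for p in params], f, protocol=cPickle.HIGHEST_PROTOCOL)

def loadParamsVal(params):
    # Restore saved values if a model file exists; otherwise keep the random init.
    try:
        with open("data/model/scnn.model", "rb") as f:
            vals = cPickle.load(f)
        for p, v in zip(params, vals):
            p.set_value(v)
    except IOError:
        pass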
def work(): print "Started!" print "Loading data." cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/split", labelset="data/traindataset2zgb") # valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt") validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validLabels = cr.getCorpus([800, 870]) print "Data loaded." mr = numpy.max([cr.getMaxDocSentenceNum(), cr.getMaxDocSentenceNum()]) learning_rate = 0.01 docSentenceCount = T.fvector("docSentenceCount") sentenceWordCount = T.fmatrix("sentenceWordCount") corpus = T.ftensor4("corpus") docLabel = T.ivector('docLabel') rng = numpy.random.RandomState(23455) batchSize = 40 # for list-type data layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ maxRandge=mr, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100]) # for padding data # layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, corpus_shape=(batchSize, cr.getMaxDocSentenceNum(), cr.getMaxSentenceWordNum(), cr.getDim()), \ # maxRandge=mr, \ # sentenceLayerNodesNum=100, \ # sentenceLayerNodesSize=5, \ # docLayerNodesNum=200, \ # docLayerNodesSize=3) layer1 = HiddenLayer( rng, input=layer0.output, n_in=layer0.outputDimension, n_out=100, activation=T.tanh ) layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2) error = layer2.errors(docLabel) cost = layer2.negative_log_likelihood(docLabel) # construct the parameter array. params = layer2.params + layer1.params + layer0.params # Load the parameters last time, optionally. loadParamsVal(params) grads = T.grad(cost, params) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] print "Compiling computing graph." # for list-type data train_model = theano.function( [corpus, docSentenceCount, sentenceWordCount, docLabel], [cost, error], updates=updates, ) valid_model = theano.function( [corpus, docSentenceCount, sentenceWordCount, docLabel], [cost, error] ) # for padding data # train_model = theano.function( # [corpus, docLabel], # [cost, error], # updates=updates, # ) print "Compiled." print "Start to train." epoch = 0 n_epochs = 200 ite = 0 while (epoch < n_epochs): epoch = epoch + 1 # ####Validate the model#### # for list-type data costNum, errorNum = valid_model(validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validLabels) # for padding data # costNum, errorNum = train_model(docMatrixes, labels) print "Valid current model:" print "Cost: ", costNum print "Error: ", errorNum ####################### for i in range(1000): # if(i * batchSize >= 800): # break docInfo = cr.getCorpus([i * batchSize, numpy.min([ (i + 1) * batchSize, 800])]) if(docInfo is None): break ite = ite + 1 docMatrixes, docSentenceNums, sentenceWordNums, labels = docInfo # for list-type data costNum, errorNum = train_model(docMatrixes, docSentenceNums, sentenceWordNums, labels) # for padding data # costNum, errorNum = train_model(docMatrixes, labels) del docMatrixes, docSentenceNums, sentenceWordNums, labels # print ".", if(ite % 1 == 0): print print "@iter: ", ite print "Cost: ", costNum print "Error: ", errorNum print "Saving parameters." saveParamsVal(params) print "Saved." print "All finished!"
def work(argv): print "Started!" rng = numpy.random.RandomState(23455) sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') # for list-type data layer0 = DocEmbeddingNNOneDoc(corpus, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100]) layer1 = HiddenLayer(rng, input=layer0.output, n_in=layer0.outputDimension, n_out=100, activation=T.tanh) layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2) cost = layer2.negative_log_likelihood(1 - layer2.y_pred) grads = T.grad(cost, layer0.sentenceResults) score = T.diag(T.dot(grads, T.transpose(layer0.sentenceResults))) # construct the parameter array. params = layer2.params + layer1.params + layer0.params # Load the parameters last time, optionally. loadParamsVal(params) print "Compiling computing graph." output_model = theano.function([corpus, sentenceWordCount], [layer2.y_pred, score]) print "Compiled." cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split") count = 0 while (count <= 1000): info = cr.getCorpus([count, count + 1]) count += 1 if info is None: print "Pass" continue docMatrixes, _, sentenceWordNums, ids, sentences = info docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX) sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32) print "start to predict: %s." % ids[0] pred_y, g = output_model(docMatrixes, sentenceWordNums) print "End predicting." print "Writing resfile." score_sentence_list = zip(g, sentences) score_sentence_list.sort(key=lambda x: -x[0]) with codecs.open("data/output/" + str(pred_y[0]) + "/" + ids[0], "w", 'utf-8', "ignore") as f: f.write("pred_y: %i\n" % pred_y[0]) for g0, s in score_sentence_list: f.write("%f\t%s\n" % (g0, string.join(s, " "))) # print zip(ids, pred_y[0]) # f = file("data/test/res/res" + str(count), "w") # f.write(str(zip(ids, pred_y[0]))) # f.close() print "Written." + str(count) print "All finished!"
def work(argv): print "Started!" rng = numpy.random.RandomState(23455) sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') # for list-type data layer0 = DocEmbeddingNNOneDoc(corpus, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100]) layer1 = HiddenLayer( rng, input=layer0.output, n_in=layer0.outputDimension, n_out=100, activation=T.tanh ) layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2) cost = layer2.negative_log_likelihood(1 - layer2.y_pred) grads = T.grad(cost, layer0.sentenceResults) score = T.diag(T.dot(grads, T.transpose(layer0.sentenceResults))) # construct the parameter array. params = layer2.params + layer1.params + layer0.params # Load the parameters last time, optionally. loadParamsVal(params) print "Compiling computing graph." output_model = theano.function( [corpus, sentenceWordCount], [layer2.y_pred, score] ) print "Compiled." cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split") count = 0 while(count <= 1000): info = cr.getCorpus([count, count + 1]) count += 1 if info is None: print "Pass" continue docMatrixes, _, sentenceWordNums, ids, sentences = info docMatrixes = numpy.matrix( docMatrixes, dtype=theano.config.floatX ) sentenceWordNums = numpy.array( sentenceWordNums, dtype=numpy.int32 ) print "start to predict: %s." % ids[0] pred_y, g = output_model(docMatrixes, sentenceWordNums) print "End predicting." print "Writing resfile." score_sentence_list = zip(g, sentences) score_sentence_list.sort(key=lambda x:-x[0]) with codecs.open("data/output/" + str(pred_y[0]) + "/" + ids[0], "w", 'utf-8', "ignore") as f: f .write("pred_y: %i\n" % pred_y[0]) for g0, s in score_sentence_list: f.write("%f\t%s\n" % (g0, string.join(s, " "))) # print zip(ids, pred_y[0]) # f = file("data/test/res/res" + str(count), "w") # f.write(str(zip(ids, pred_y[0]))) # f.close() print "Written." + str(count) print "All finished!"
def work(mode, data_name, test_dataname):
    print "mode: ", mode
    print "data_name: ", data_name
    print "Started!"

    rng = numpy.random.RandomState(23455)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')

    # for list-type data
    layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
                            sentenceLayerNodesNum=100, \
                            sentenceLayerNodesSize=[5, 200], \
                            docLayerNodesNum=100, \
                            docLayerNodesSize=[3, 100])

    layer1 = HiddenLayer(
        rng,
        input=layer0.output,
        n_in=layer0.outputDimension,
        n_out=100,
        activation=T.tanh
    )

    layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)

    # construct the parameter array.
    params = layer2.params + layer1.params + layer0.params

    # Load the parameters last time, optionally.
    # data_name = "car"
    para_path = "data/" + data_name + "/model/scnn.model"
    traintext = "data/" + data_name + "/train/text"
    trainlabel = "data/" + data_name + "/train/label"
    testtext = "data/" + test_dataname + "/test/text"
    testlabel = "data/" + test_dataname + "/test/label"
    loadParamsVal(para_path, params)

    if(mode == "train"):
        print "Loading train data."
        cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext, labelset=trainlabel)
        docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus([0, 100000])
        # print "Right answer: "
        # print zip(ids, labels)
        docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
        docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
        sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
        labels = transToTensor(labels, numpy.int32)
        # valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
        print
        print "Loading test data."
        cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext, labelset=testlabel)
        validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels = cr_test.getCorpus([0, 1000])
        # print "Right answer: "
        # print zip(validIds, validLabels)
        validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
        validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
        validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
        validLabels = transToTensor(validLabels, numpy.int32)
        print "Data loaded."

        learning_rate = 0.1
        index = T.lscalar("index")
        batchSize = 10
        n_batches = (len(docSentenceNums.get_value()) - 1) / batchSize + 1
        print
        print "Train set size is ", len(docMatrixes.get_value())
        print "Validating set size is ", len(validDocMatrixes.get_value())
        print "Batch size is ", batchSize
        print "Number of training batches is ", n_batches

        error = layer2.errors(docLabel)
        cost = layer2.negative_log_likelihood(docLabel)
        grads = T.grad(cost, params)
        updates = [
            (param_i, param_i - learning_rate * grad_i)
            for param_i, grad_i in zip(params, grads)
        ]

        print "Compiling computing graph."
        valid_model = theano.function(
            [],
            [cost, error, layer2.y_pred, docLabel, T.transpose(layer2.p_y_given_x)[1]],
            givens={
                corpus: validDocMatrixes,
                docSentenceCount: validDocSentenceNums,
                sentenceWordCount: validSentenceWordNums,
                docLabel: validLabels
            }
        )
        # for list-type data
        train_model = theano.function(
            [index],
            [cost, error, layer2.y_pred, docLabel],
            updates=updates,
            givens={
                corpus: docMatrixes,
                docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
                sentenceWordCount: sentenceWordNums,
                docLabel: labels[index * batchSize: (index + 1) * batchSize]
            }
        )
        print "Compiled."
        print "Start to train."

        epoch = 0
        n_epochs = 2000
        ite = 0

        # ####Validate the model####
        costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
        print "Valid current model:"
        print "Cost: ", costNum
        print "Error: ", errorNum
        print "Valid Pred: ", pred_label
        print "pred_prob: ", pred_prob

        fpr, tpr, _ = roc_curve(real_label, pred_prob)
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "test_dataname: ", test_dataname
        print "ROC: ", roc_auc

        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            for i in range(n_batches):
                # for list-type data
                costNum, errorNum, pred_label, real_label = train_model(i)
                ite = ite + 1
                # for padding data
                # costNum, errorNum = train_model(docMatrixes, labels)
                # del docMatrixes, docSentenceNums, sentenceWordNums, labels
                # print ".",
                if(ite % 10 == 0):
                    print
                    print "@iter: ", ite
                    print "Cost: ", costNum
                    print "Error: ", errorNum

            # Validate the model
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
            print "Valid current model:"
            print "Cost: ", costNum
            print "Error: ", errorNum
            print "pred_prob: ", pred_prob
            # print "Valid Pred: ", pred_label

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "test_dataname: ", test_dataname
            print "ROC: ", roc_auc

            # Save model
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
    elif(mode == "deploy"):
        print "Compiling computing graph."
        output_model = theano.function([corpus, docSentenceCount, sentenceWordCount], [layer2.y_pred])
        print "Compiled."

        cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split")
        count = 21000
        while(count <= 21000):
            docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100])
            docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
            docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
            sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
            print "start to predict."
            pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums)
            print "End predicting."
            print "Writing resfile."
            # print zip(ids, pred_y[0])
            f = file("data/test/res/res" + str(count), "w")
            f.write(str(zip(ids, pred_y[0])))
            f.close()
            print "Written." + str(count)
            count += 100
    print "All finished!"
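# A hypothetical command-line entry point for this train/deploy variant;
# the original driver is not shown, so the argument handling below is a guess:
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 4:
        print "usage: python scnn.py <train|deploy> <data_name> <test_dataname>"
        sys.exit(1)
    work(sys.argv[1], sys.argv[2], sys.argv[3])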