import codecs
import os
import sys
import time

import numpy
import theano
import theano.tensor as T

# load_data_word, WsdConvPoolLayer, HiddenLayer and LogisticRegression are
# project-local helpers assumed to be defined or imported elsewhere in this
# project (they follow the deeplearning.net Theano tutorial conventions).


def trainword(keyword, window_radius=3, learning_rate=0.1, n_epochs=10,
              batch_size=1, nkerns=1, filter_height=3, filter_width=50,
              pool_height=1, pool_width=1, loginput_num=50, vector_size=50,
              normalized=False, sequence=0):
    print '== training parameters =='
    print 'window_radius: ' + str(window_radius)
    print 'vector_size: ' + str(vector_size)
    print 'filter_height: ' + str(filter_height)
    print 'filter_width: ' + str(filter_width)
    print 'pool_height: ' + str(pool_height)
    print 'pool_width: ' + str(pool_width)
    print 'nkerns: ' + str(nkerns)
    print 'loginput_num: ' + str(loginput_num)
    print 'learning_rate: ' + str(learning_rate)
    print 'n_epochs: ' + str(n_epochs)
    print 'batch_size: ' + str(batch_size)

    rng = numpy.random.RandomState(23455)

    datasets = load_data_word(keyword, window_radius, vector_size, normalized,
                              sequence)
    train_set_x, train_set_y, trainsentence = datasets[0][0]
    valid_set_x, valid_set_y, validsentence = datasets[0][1]
    test_set_x, test_set_y, testsentence = datasets[0][2]
    senselist = datasets[1]

    # compute the number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    print n_train_batches, n_valid_batches, n_test_batches

    index = T.lscalar()  # index to a minibatch
    x = T.matrix('x')    # one flattened context window per row
    y = T.ivector('y')   # sense labels as integer indices into senselist

    print '... building the model for ' + keyword

    # Reshape each example into a (batch, channel, height, width) stack:
    # one channel, 2*window_radius+1 context words as rows, and the
    # vector_size embedding dimensions as columns.
    layer0_input = x.reshape(
        (batch_size, 1, 2 * window_radius + 1, vector_size))

    layer0 = WsdConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 2 * window_radius + 1, vector_size),
        filter_shape=(nkerns, 1, filter_height, filter_width),
        poolsize=(pool_height, pool_width))

    layer1_input = layer0.output.flatten(2)
    #layer1_input = layer0_input.flatten(2)  # ablation: skip the conv layer

    # Fully connected layer; n_in is the flattened size of the conv+pool
    # output: nkerns maps of (conv_height / pool_height) x
    # (conv_width / pool_width), where conv_height = 2*window_radius + 2 -
    # filter_height and conv_width = vector_size + 1 - filter_width.
    layer1 = HiddenLayer(
        rng,
        input=layer1_input,
        n_in=nkerns
        * int((2 * window_radius + 2 - filter_height) / float(pool_height))
        * int((vector_size + 1 - filter_width) / float(pool_width)),
        n_out=loginput_num,
        activation=T.tanh)

    # softmax classifier over at most 20 senses (a fixed upper bound on the
    # number of senses per keyword)
    layer2 = LogisticRegression(input=layer1.output, n_in=loginput_num,
                                n_out=20)

    cost = layer2.negative_log_likelihood(y)

    test_model = theano.function(
        [index],
        layer2.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer2.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # debugging helpers: layer shapes, predictions and raw activations
    output_size = theano.function(
        [index],
        [layer0.output.shape, layer1_input.shape, layer1.output.shape],
        givens={x: test_set_x[index * batch_size:(index + 1) * batch_size]})

    output_model = theano.function(
        [index],
        [layer2.y_pred],
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    output_test = theano.function(
        [index],
        [layer2.y_pred],
        givens={x: test_set_x[index * batch_size:(index + 1) * batch_size]})

    output_test2 = theano.function(
        [index],
        [layer0.output, layer1_input],
        givens={x: test_set_x[index * batch_size:(index + 1) * batch_size]})
    print output_test2(0)

    # plain minibatch SGD on all model parameters
    params = layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print '... training'
    # early-stopping parameters
    patience = 12000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:
                # compute the zero-one loss on the validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if the loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save the best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # evaluate on the test set and print per-sentence
                    # predictions (sentence, predicted sense, gold sense)
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    for index in range(0, n_valid_batches):
                        for i in range(0, batch_size):
                            true_i = batch_size * index + i
                            print validsentence[true_i], '\t', senselist[
                                output_model(index)[0][i]], '\t', senselist[
                                    valid_set_y[true_i].eval()]
                    test_score = numpy.mean(test_losses)
                    for index in range(0, n_test_batches):
                        for i in range(0, batch_size):
                            true_i = batch_size * index + i
                            print testsentence[true_i], '\t', senselist[
                                output_test(index)[0][i]], '\t', senselist[
                                    test_set_y[true_i].eval()]
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    # final per-sentence predictions on the test set
    for index in range(0, n_test_batches):
        for i in range(0, batch_size):
            true_i = batch_size * index + i
            print testsentence[true_i], '\t', senselist[output_test(
                index)[0][i]], '\t', senselist[test_set_y[true_i].eval()]
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return [best_validation_loss * 100., test_score * 100.]
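
# A minimal sketch of how trainword() might be swept over its two most
# model-specific hyper-parameters. This driver is an assumption for
# illustration (the grid values and selecting on validation error are not
# part of the original code); it relies only on trainword() returning
# [validation_error_%, test_error_%] as above.
def gridsearch_word(keyword):
    best = None
    for radius in [2, 3, 4]:     # hypothetical context-window radii to try
        for kerns in [1, 2, 4]:  # hypothetical kernel counts to try
            valid_err, test_err = trainword(keyword, window_radius=radius,
                                            nkerns=kerns)
            if best is None or valid_err < best[0]:
                best = (valid_err, test_err, radius, kerns)
    print ('best: validation %.2f%%, test %.2f%% '
           '(window_radius=%d, nkerns=%d)' % best)
    return best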
def crf(keyword, window_size, vector_size, valid_to_test=0, sequence=0):
    # NOTE: load_data_word takes (keyword, window_radius, vector_size,
    # normalized, sequence); pass sequence by name so it does not land in
    # the normalized slot.
    datasets = load_data_word(keyword, window_size, vector_size,
                              sequence=sequence)
    train_set_x, train_set_y, trainsentence = datasets[0][0]
    valid_set_x, valid_set_y, validsentence = datasets[0][1]
    test_set_x, test_set_y, testsentence = datasets[0][2]
    senselist = datasets[1]

    # Build CRF++ training/test files in two flavours: character-based
    # (content/testcontent) and word-vector-based (vectorcontent/
    # testvectorcontent). Each line is "features flag label", where flag is
    # 1 on the target word (which sits at the middle position of every
    # sentence window) and its label is the gold sense; all other positions
    # get flag 0 and label X. Sentences are separated by blank lines.
    content = ''
    testcontent = ''
    vectorcontent = ''
    testvectorcontent = ''

    for i in range(0, len(trainsentence)):
        vector = train_set_x[i].eval()
        for j in range(0, len(trainsentence[i])):
            for k in range(vector_size * j, vector_size * j + vector_size):
                vectorcontent += str(vector[k]) + ' '
            if j != len(trainsentence[i]) / 2:
                vectorcontent += ' 0 X\n'
            else:
                vectorcontent += ' 1 ' + senselist[
                    train_set_y[i].eval()] + '\n'
        vectorcontent += '\n'
        for j in range(0, len(trainsentence[i]) / 2):
            if trainsentence[i][j] == ' ':
                continue
            content += trainsentence[i][j] + ' 0 ' + 'X\n'
        content += trainsentence[i][len(trainsentence[i]) / 2] + ' 1 ' + \
            senselist[train_set_y[i].eval()] + '\n'
        for j in range(len(trainsentence[i]) / 2 + 1, len(trainsentence[i])):
            if trainsentence[i][j] == ' ':
                continue
            content += trainsentence[i][j] + ' 0 ' + 'X\n'
        content += '\n'

    if valid_to_test:
        # use the validation sentences as extra test data
        for i in range(0, len(validsentence)):
            vector = valid_set_x[i].eval()
            for j in range(0, len(validsentence[i])):
                for k in range(vector_size * j,
                               vector_size * j + vector_size):
                    testvectorcontent += str(vector[k]) + ' '
                if j != len(validsentence[i]) / 2:
                    testvectorcontent += ' 0 X\n'
                else:
                    testvectorcontent += ' 1 ' + senselist[
                        valid_set_y[i].eval()] + '\n'
            testvectorcontent += '\n'
            for j in range(0, len(validsentence[i]) / 2):
                if validsentence[i][j] == ' ':
                    continue
                testcontent += validsentence[i][j] + ' 0 ' + 'X\n'
            testcontent += validsentence[i][len(validsentence[i]) / 2] + \
                ' 1 ' + senselist[valid_set_y[i].eval()] + '\n'
            for j in range(len(validsentence[i]) / 2 + 1,
                           len(validsentence[i])):
                if validsentence[i][j] == ' ':
                    continue
                testcontent += validsentence[i][j] + ' 0 ' + 'X\n'
            testcontent += '\n'
    else:
        # fold the validation sentences into the training data
        for i in range(0, len(validsentence)):
            vector = valid_set_x[i].eval()
            for j in range(0, len(validsentence[i])):
                for k in range(vector_size * j,
                               vector_size * j + vector_size):
                    vectorcontent += str(vector[k]) + ' '
                if j != len(validsentence[i]) / 2:
                    vectorcontent += ' 0 X\n'
                else:
                    vectorcontent += ' 1 ' + senselist[
                        valid_set_y[i].eval()] + '\n'
            vectorcontent += '\n'
            for j in range(0, len(validsentence[i]) / 2):
                if validsentence[i][j] == ' ':
                    continue
                content += validsentence[i][j] + ' 0 ' + 'X\n'
            content += validsentence[i][len(validsentence[i]) / 2] + \
                ' 1 ' + senselist[valid_set_y[i].eval()] + '\n'
            for j in range(len(validsentence[i]) / 2 + 1,
                           len(validsentence[i])):
                if validsentence[i][j] == ' ':
                    continue
                content += validsentence[i][j] + ' 0 ' + 'X\n'
            content += '\n'

    for i in range(0, len(testsentence)):
        vector = test_set_x[i].eval()
        for j in range(0, len(testsentence[i])):
            for k in range(vector_size * j, vector_size * j + vector_size):
                testvectorcontent += str(vector[k]) + ' '
            if j != len(testsentence[i]) / 2:
                testvectorcontent += ' 0 X\n'
            else:
                testvectorcontent += ' 1 ' + senselist[
                    test_set_y[i].eval()] + '\n'
        testvectorcontent += '\n'
        for j in range(0, len(testsentence[i]) / 2):
            if testsentence[i][j] == ' ':
                continue
            testcontent += testsentence[i][j] + ' 0 ' + 'X\n'
        testcontent += testsentence[i][len(testsentence[i]) / 2] + ' 1 ' + \
            senselist[test_set_y[i].eval()] + '\n'
        for j in range(len(testsentence[i]) / 2 + 1, len(testsentence[i])):
            if testsentence[i][j] == ' ':
                continue
            testcontent += testsentence[i][j] + ' 0 ' + 'X\n'
        testcontent += '\n'

    vectoroutput = codecs.open('crf/train/' + keyword + '_vector_crf.txt',
                               'wb', 'utf-8')
    vectoroutput.write(vectorcontent)
    vectoroutput.close()
    output = codecs.open('crf/train/' + keyword + '_crf.txt', 'wb', 'utf-8')
    output.write(content)
    output.close()
    testvectoroutput = codecs.open('crf/test/' + keyword + '_vector_crf.txt',
                                   'wb', 'utf-8')
    testvectoroutput.write(testvectorcontent)
    testvectoroutput.close()
    testoutput = codecs.open('crf/test/' + keyword + '_crf.txt', 'wb',
                             'utf-8')
    testoutput.write(testcontent)
    testoutput.close()

    # train and apply the CRF++ model on the character-based files
    time.sleep(1)
    os.system('crf_learn crf/template crf/train/' + keyword.encode('utf-8') +
              '_crf.txt crf/model/' + keyword.encode('utf-8') + '_crf')
    time.sleep(1)
    os.system('crf_test -m crf/model/' + keyword.encode('utf-8') +
              '_crf crf/test/' + keyword.encode('utf-8') +
              '_crf.txt > crf/output/' + keyword.encode('utf-8') + '_crf.txt')
    '''
    os.system('crf_learn crf/template crf/train/' + keyword.encode('utf-8') +
              '_vector_crf.txt crf/model/' + keyword.encode('utf-8') +
              '_vector_crf')
    time.sleep(1)
    os.system('crf_test -m crf/model/' + keyword.encode('utf-8') +
              '_vector_crf crf/test/' + keyword.encode('utf-8') +
              '_vector_crf.txt > crf/output/' + keyword.encode('utf-8') +
              '_vector_crf.txt')
    '''

    # check results: each CRF++ output line is
    # "char <tab> flag <tab> gold <tab> predicted"
    print '== normal test =='
    num = 0
    taggednum = 0
    predictnum = 0
    rightnum = 0
    inp = open('crf/output/' + keyword.encode('utf-8') + '_crf.txt',
               'rb').read()
    print inp
    words = inp.split('\n')
    for word in words:
        if word == '':
            continue
        tokens = word.split('\t')
        if tokens[0] == keyword.encode('utf-8') and tokens[1] != 'X':
            num += 1
            if tokens[2] != 'X':
                taggednum += 1
            if tokens[3] != 'X':
                predictnum += 1
            if tokens[2] == tokens[3]:
                rightnum += 1
    print 'tagged: ' + str(taggednum)
    print 'predict: ' + str(predictnum)
    print 'right: ' + str(rightnum)
    # guard against empty prediction/gold sets to avoid a ZeroDivisionError
    precision = 1. * rightnum / predictnum if predictnum else 0.0
    recall = 1. * rightnum / taggednum if taggednum else 0.0
    if precision + recall == 0:
        F = 0.0
    else:
        F = 2 * precision * recall / (precision + recall)
    print 'precision: ' + str(precision) + ' recall: ' + str(
        recall) + ' F: ' + str(F)
    return F

    # NOTE: the word-vector evaluation below is unreachable because of the
    # return above; it is disabled together with the commented-out vector
    # CRF++ run.
    print '== vector test =='
    num = 0
    taggednum = 0
    predictnum = 0
    rightnum = 0
    inp = open('crf/output/' + keyword.encode('utf-8') + '_vector_crf.txt',
               'rb').read()
    words = inp.split('\n')
    for word in words:
        if word == '':
            continue
        tokens = word.split('\t')
        if tokens[vector_size] == '1':
            num += 1
            if tokens[vector_size + 1] != 'X':
                taggednum += 1
            if tokens[vector_size + 2] != 'X':
                predictnum += 1
            if tokens[vector_size + 1] == tokens[vector_size + 2]:
                rightnum += 1
    print 'tagged: ' + str(taggednum)
    print 'predict: ' + str(predictnum)
    print 'right: ' + str(rightnum)
    precision = 1. * rightnum / predictnum if predictnum else 0.0
    recall = 1. * rightnum / taggednum if taggednum else 0.0
    if precision + recall == 0:
        F = 0.0
    else:
        F = 2 * precision * recall / (precision + recall)
    print 'precision: ' + str(precision) + ' recall: ' + str(
        recall) + ' F: ' + str(F)
    return F
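
# Hypothetical driver comparing the two entry points on a single target
# word; the keyword value and the error-to-accuracy conversion are
# illustrative assumptions. trainword() reports error percentages while
# crf() reports an F-score, so the two numbers are only roughly comparable.
if __name__ == '__main__':
    kw = u'\u610f\u601d'  # hypothetical target word
    valid_err, test_err = trainword(kw)
    crf_f = crf(kw, window_size=3, vector_size=50)
    print 'CNN test accuracy: %.4f' % (1. - test_err / 100.)
    print 'CRF F-score:       %.4f' % crf_f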