Example #1
0
def trainword(keyword,
              window_radius=3,
              learning_rate=0.1,
              n_epochs=10,
              batch_size=1,
              nkerns=1,
              filter_height=3,
              filter_width=50,
              pool_height=1,
              pool_width=1,
              loginput_num=50,
              vector_size=50,
              normalized=False,
              sequence=0):
    """Train a per-keyword word-sense-disambiguation CNN with Theano.

    The model stacks WsdConvPoolLayer -> HiddenLayer(tanh) ->
    LogisticRegression over a (2*window_radius+1) x vector_size window of
    word vectors centred on `keyword`, trains with minibatch SGD and
    early stopping on the validation split, and prints per-sentence
    predicted-vs-gold senses whenever the validation score improves.

    keyword        -- target word whose senses are classified
    window_radius  -- context positions taken on each side of the keyword
    learning_rate  -- SGD step size
    n_epochs       -- maximum passes over the training set
    batch_size     -- minibatch size (also used to slice eval output)
    nkerns         -- number of convolution kernels in layer 0
    filter_height, filter_width, pool_height, pool_width -- conv/pool geometry
    loginput_num   -- hidden-layer width feeding the softmax
    vector_size    -- dimensionality of each word vector
    normalized, sequence -- forwarded unchanged to load_data_word

    Returns [best validation error %, test error % at that best iteration].
    """

    print '==training parameters=='
    print 'window_radius: ' + str(window_radius)
    print 'vector_size: ' + str(vector_size)
    print 'filter_height: ' + str(filter_height)
    print 'filter_width: ' + str(filter_width)
    print 'pool_height: ' + str(pool_height)
    print 'pool_width: ' + str(pool_width)
    print 'nkerns: ' + str(nkerns)
    print 'loginput_num: ' + str(loginput_num)
    print 'learning_rate: ' + str(learning_rate)
    print 'n_epochs: ' + str(n_epochs)
    print 'batch_size: ' + str(batch_size)

    # Fixed seed so weight initialisation (and thus runs) are reproducible.
    rng = numpy.random.RandomState(23455)
    datasets = load_data_word(keyword, window_radius, vector_size, normalized,
                              sequence)

    # datasets[0] holds the (x, y, raw-sentence) triples for the three splits;
    # datasets[1] maps class index -> sense label string.
    train_set_x, train_set_y, trainsentence = datasets[0][0]
    valid_set_x, valid_set_y, validsentence = datasets[0][1]
    test_set_x, test_set_y, testsentence = datasets[0][2]

    senselist = datasets[1]

    # Py2 integer division: number of whole minibatches per split
    # (any remainder examples are silently dropped).
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    print n_train_batches, n_valid_batches, n_test_batches

    index = T.lscalar()  # symbolic minibatch index

    x = T.matrix('x')  # flattened word-vector windows
    y = T.ivector('y')  # gold sense class indices

    print '... building the model for ' + keyword

    # Reshape to (batch, 1 channel, window rows, vector columns) for the
    # convolutional layer.
    layer0_input = x.reshape(
        (batch_size, 1, 2 * window_radius + 1, vector_size))

    layer0 = WsdConvPoolLayer(rng,
                              input=layer0_input,
                              image_shape=(batch_size, 1,
                                           2 * window_radius + 1, vector_size),
                              filter_shape=(nkerns, 1, filter_height,
                                            filter_width),
                              poolsize=(pool_height, pool_width))

    layer1_input = layer0.output.flatten(2)
    #layer1_input = layer0_input.flatten(2)

    layer1 = HiddenLayer(
        rng,
        input=layer1_input,
        #n_in=(2*window_radius+1)*(vector_size+1-filter_width+1-pool_width),
        # Flattened conv/pool output size:
        # nkerns * pooled_height * pooled_width (valid convolution).
        n_in=nkerns * int(
            (2 * window_radius + 2 - filter_height) / float(pool_height)) *
        int((vector_size + 1 - filter_width) / float(pool_width)),
        n_out=loginput_num,
        activation=T.tanh)

    # NOTE(review): class count is hard-coded to 20 — assumes every keyword
    # has at most 20 senses in senselist; confirm against load_data_word.
    layer2 = LogisticRegression(input=layer1.output,
                                n_in=loginput_num,
                                n_out=20)

    cost = layer2.negative_log_likelihood(y)

    # Compiled evaluation functions: zero-one error on one minibatch.
    test_model = theano.function(
        [index],
        layer2.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer2.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Debug helper: shapes of the intermediate layer outputs.
    output_size = theano.function(
        [index],
        [layer0.output.shape, layer1_input.shape, layer1.output.shape],
        givens={x: test_set_x[index * batch_size:(index + 1) * batch_size]})

    # Predicted class indices for one validation / test minibatch.
    output_model = theano.function(
        [index], [layer2.y_pred],
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    output_test = theano.function(
        [index], [layer2.y_pred],
        givens={x: test_set_x[index * batch_size:(index + 1) * batch_size]})

    output_test2 = theano.function(
        [index],
        [layer0.output, layer1_input],
        #[layer1_input],
        givens={x: test_set_x[index * batch_size:(index + 1) * batch_size]})
    # Debug print of the raw conv output for the first test minibatch.
    print output_test2(0)
    params = layer2.params + layer1.params + layer0.params

    grads = T.grad(cost, params)

    # Plain SGD update rule for every parameter.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print '... training'
    # early-stopping parameters
    patience = 12000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    # NOTE(review): best_params is never updated — the snapshot line below is
    # commented out, so no best-model weights are actually saved.
    best_params = 0
    best_iter = 0
    test_score = 0.
    # NOTE(review): time.clock() is CPU or wall time depending on platform
    # and was removed in Python 3.8.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            # Global iteration counter across epochs, used for early stopping.
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                #for index in range(0, n_valid_batches):
                #    print output_model(index)
                #    print valid_set_y[index * batch_size: (index + 1) * batch_size].eval()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    #best_params = [copy.deepcopy(layer0.params), copy.deepcopy(layer1.params), copy.deepcopy(layer2.params)]

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    #print params[0].eval()
                    #print (params[0].eval() == layer2.params[0].eval())
                    #print validation_losses
                    # Print every validation sentence with its predicted and
                    # gold sense labels.
                    for index in range(0, n_valid_batches):
                        for i in range(0, batch_size):
                            true_i = batch_size * index + i
                            #print output_model(index)
                            print validsentence[true_i], '\t', senselist[
                                output_model(index)[0][i]], '\t', senselist[
                                    valid_set_y[true_i].eval()]
                    #print test_losses
                    test_score = numpy.mean(test_losses)
                    # Same predicted-vs-gold dump for the test split.
                    for index in range(0, n_test_batches):
                        for i in range(0, batch_size):
                            true_i = batch_size * index + i
                            #print output_model(index)
                            print testsentence[true_i], '\t', senselist[
                                output_test(index)[0][i]], '\t', senselist[
                                    test_set_y[true_i].eval()]
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    # Final predicted-vs-gold dump for the test split (with the last weights,
    # not the best ones — see best_params note above).
    for index in range(0, n_test_batches):
        for i in range(0, batch_size):
            true_i = batch_size * index + i
            #print output_model(index)
            print testsentence[true_i], '\t', senselist[output_test(
                index)[0][i]], '\t', senselist[test_set_y[true_i].eval()]
    #print output_test2(0)
    '''
    for index in range(0, n_test_batches):
        for i in range(0, batch_size):
            true_i = batch_size*index+i
            #print output_model(index)
            print testsentence[true_i], '\t',senselist[output_test(index)[0][i]], '\t', senselist[test_set_y[true_i].eval()]
    '''
    print(
        'Best validation score of %f %% obtained at iteration %i, '
        'with test performance %f %%' %
        (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return [best_validation_loss * 100., test_score * 100.]
Example #2
0
def crf(keyword, window_size, vector_size, valid_to_test=0, sequence=0):
    datasets = load_data_word(keyword, window_size, vector_size, sequence)

    train_set_x, train_set_y, trainsentence = datasets[0][0]
    valid_set_x, valid_set_y, validsentence = datasets[0][1]
    test_set_x, test_set_y, testsentence = datasets[0][2]

    senselist = datasets[1]

    content = ''
    testcontent = ''
    vectorcontent = ''
    testvectorcontent = ''

    for i in range(0, len(trainsentence)):
        vector = train_set_x[i].eval()
        for j in range(0, len(trainsentence[i])):
            for k in range(vector_size * j, vector_size * j + vector_size):
                vectorcontent += str(vector[k]) + ' '
            if j != len(trainsentence[i]) / 2:
                vectorcontent += ' 0 X\n'
            else:
                vectorcontent += ' 1 ' + senselist[
                    train_set_y[i].eval()] + '\n'
        vectorcontent += '\n'
        for j in range(0, len(trainsentence[i]) / 2):
            if trainsentence[i][j] == ' ':
                continue
            content += trainsentence[i][j] + ' 0 ' + 'X\n'
        content += trainsentence[i][len(trainsentence[i]) /
                                    2] + ' 1 ' + senselist[
                                        train_set_y[i].eval()] + '\n'
        for j in range(len(trainsentence[i]) / 2 + 1, len(trainsentence[i])):
            if trainsentence[i][j] == ' ':
                continue
            content += trainsentence[i][j] + ' 0 ' + 'X\n'
        content += '\n'

    if valid_to_test:
        for i in range(0, len(validsentence)):
            vector = valid_set_x[i].eval()
            for j in range(0, len(validsentence[i])):
                for k in range(vector_size * j, vector_size * j + vector_size):
                    testvectorcontent += str(vector[k]) + ' '
                if j != len(validsentence[i]) / 2:
                    testvectorcontent += ' 0 X\n'
                else:
                    testvectorcontent += ' 1 ' + senselist[
                        valid_set_y[i].eval()] + '\n'
            testvectorcontent += '\n'
            for j in range(0, len(validsentence[i]) / 2):
                if validsentence[i][j] == ' ':
                    continue
                testcontent += validsentence[i][j] + ' 0 ' + 'X\n'
            testcontent += validsentence[i][len(validsentence[i]) /
                                            2] + ' 1 ' + senselist[
                                                valid_set_y[i].eval()] + '\n'
            for j in range(
                    len(validsentence[i]) / 2 + 1, len(validsentence[i])):
                if validsentence[i][j] == ' ':
                    continue
                testcontent += validsentence[i][j] + ' 0 ' + 'X\n'
            testcontent += '\n'
    else:
        for i in range(0, len(validsentence)):
            vector = valid_set_x[i].eval()
            for j in range(0, len(validsentence[i])):
                for k in range(vector_size * j, vector_size * j + vector_size):
                    vectorcontent += str(vector[k]) + ' '
                if j != len(validsentence[i]) / 2:
                    vectorcontent += ' 0 X\n'
                else:
                    vectorcontent += ' 1 ' + senselist[
                        valid_set_y[i].eval()] + '\n'
            vectorcontent += '\n'
            for j in range(0, len(validsentence[i]) / 2):
                if validsentence[i][j] == ' ':
                    continue
                content += validsentence[i][j] + ' 0 ' + 'X\n'
            content += validsentence[i][len(validsentence[i]) /
                                        2] + ' 1 ' + senselist[
                                            valid_set_y[i].eval()] + '\n'
            for j in range(
                    len(validsentence[i]) / 2 + 1, len(validsentence[i])):
                if validsentence[i][j] == ' ':
                    continue
                content += validsentence[i][j] + ' 0 ' + 'X\n'
            content += '\n'

    for i in range(0, len(testsentence)):
        vector = test_set_x[i].eval()
        for j in range(0, len(testsentence[i])):
            for k in range(vector_size * j, vector_size * j + vector_size):
                testvectorcontent += str(vector[k]) + ' '
            if j != len(testsentence[i]) / 2:
                testvectorcontent += ' 0 X\n'
            else:
                testvectorcontent += ' 1 ' + senselist[
                    test_set_y[i].eval()] + '\n'
        testvectorcontent += '\n'
        for j in range(0, len(testsentence[i]) / 2):
            if testsentence[i][j] == ' ':
                continue
            testcontent += testsentence[i][j] + ' 0 ' + 'X\n'
        testcontent += testsentence[i][len(testsentence[i]) /
                                       2] + ' 1 ' + senselist[
                                           test_set_y[i].eval()] + '\n'
        for j in range(len(testsentence[i]) / 2 + 1, len(testsentence[i])):
            if testsentence[i][j] == ' ':
                continue
            testcontent += testsentence[i][j] + ' 0 ' + 'X\n'
        testcontent += '\n'

    vectoroutput = codecs.open('crf//train//' + keyword + '_vector_crf.txt',
                               'wb', 'utf-8')
    vectoroutput.write(vectorcontent)
    vectoroutput.close()
    output = codecs.open('crf//train//' + keyword + '_crf.txt', 'wb', 'utf-8')
    output.write(content)
    output.close()

    testvectoroutput = codecs.open('crf//test//' + keyword + '_vector_crf.txt',
                                   'wb', 'utf-8')
    testvectoroutput.write(testvectorcontent)
    testvectoroutput.close()
    testoutput = codecs.open('crf//test//' + keyword + '_crf.txt', 'wb',
                             'utf-8')
    testoutput.write(testcontent)
    testoutput.close()

    time.sleep(1)

    os.system('crf_learn crf/template crf/train/' + keyword.encode('utf-8') +
              '_crf.txt crf/model/' + keyword.encode('utf-8') + '_crf')
    time.sleep(1)
    os.system('crf_test -m crf/model/' + keyword.encode('utf-8') +
              '_crf crf/test/' + keyword.encode('utf-8') +
              '_crf.txt > crf/output/' + keyword.encode('utf-8') + '_crf.txt')
    '''
    os.system('crf_learn crf/template crf/train/'+keyword.encode('utf-8')+'_vector_crf.txt crf/model/'+keyword.encode('utf-8')+'_vector_crf')
    time.sleep(1)
    os.system('crf_test -m crf/model/'+keyword.encode('utf-8')+'_vector_crf crf/test/'+keyword.encode('utf-8')+'_vector_crf.txt > crf/output/'+keyword.encode('utf-8')+'_vector_crf.txt')
    '''
    #check resuilt

    print '== normal test =='
    num = 0
    taggednum = 0
    predictnum = 0
    rightnum = 0
    inp = open('crf/output/' + keyword.encode('utf-8') + '_crf.txt',
               'rb').read()
    print inp
    words = inp.split('\n')
    for word in words:
        if word == '':
            continue
        tokens = word.split('\t')
        if tokens[0] == keyword.encode('utf-8') and tokens[1] != 'X':
            num += 1
            if tokens[2] != 'X':
                taggednum += 1
                if tokens[3] != 'X':
                    predictnum += 1
                    if tokens[2] == tokens[3]:
                        rightnum += 1

    print 'tagged: ' + str(taggednum)
    print 'predict: ' + str(predictnum)
    print 'right: ' + str(rightnum)

    precision = 1. * rightnum / predictnum
    recall = 1. * rightnum / taggednum
    if precision + recall == 0:
        F = 0.0
    else:
        F = 2 * precision * recall / (precision + recall)
    print 'precision: ' + str(precision) + ' recall: ' + str(
        recall) + ' F: ' + str(F)
    return F

    print '== vector test =='
    num = 0
    taggednum = 0
    predictnum = 0
    rightnum = 0
    inp = open('crf/output/' + keyword.encode('utf-8') + '_vector_crf.txt',
               'rb').read()
    words = inp.split('\n')
    for word in words:
        if word == '':
            continue
        tokens = word.split('\t')
        if tokens[vector_size] == '1':
            num += 1
            if tokens[vector_size + 1] != 'X':
                taggednum += 1
                if tokens[vector_size + 2] != 'X':
                    predictnum += 1
                    if tokens[vector_size + 1] == tokens[vector_size + 2]:
                        rightnum += 1

    print 'tagged: ' + str(taggednum)
    print 'predict: ' + str(predictnum)
    print 'right: ' + str(rightnum)

    precision = 1. * rightnum / predictnum
    recall = 1. * rightnum / taggednum
    if precision + recall == 0:
        F = 0.0
    else:
        F = 2 * precision * recall / (precision + recall)
    print 'precision: ' + str(precision) + ' recall: ' + str(
        recall) + ' F: ' + str(F)
    return F