Example #1
0
def xtest_print_all_srl_tag():
    print '*' * 20
    print 'print all srl tags of cornll 05'
    home = os.path.expanduser('~')
    filename = os.path.join(home,"data/conll05/training-set")
    raw_corpora = Conll05.loadraw(filename)
    all_tags = set()
    for sent in raw_corpora:
        iobsents = Conll05.sentence2iobsentece(sent)
        for iobsent in iobsents:
            tags = set([i[2] for i in iobsent[1]])
            all_tags = all_tags.union(tags)
    for tag in all_tags:
        print '\'%s\',' % (tag)
Example #2
0
def xtest_Cornll05():
    print '*' * 20
    print 'test cornll 05'
    home = os.path.expanduser('~')
    filename = os.path.join(home,"data/conll05/training-set")
    raw_corpora = Conll05.loadraw(filename)
    print 'raw corpora size=%d' % (len(raw_corpora))
    max_sent_len = -1
    sum_sent_len = 0
    cnt = 0
    srl_corpora_size = 0
    '''
    for sent in raw_corpora:
        cnt += 1
        sz = len(sent)
        sum_sent_len += sz
        if sz > max_sent_len:
            max_sent_len = sz
        #print sent
        srl_corpora_size += len(sent[0]) - 2
        iobsent = Conll05.sentence2iobsentece(sent)
        #srl_corpora_size += len(iobsent)
    print "maxium sentece length=%d" % (max_sent_len)
    print 'sum sentece length=%d' % (sum_sent_len)
    print 'avg sentece length=%d' % (sum_sent_len/cnt)
    print 'srl corpora size=%d' % (srl_corpora_size)
    '''
    '''
    ss = rawstr.split('\n')
    sentence = list()
    for line in ss:
        sss = line.split()
        if len(sss) == 0:
            continue
        sss = sss[:2] + sss[6:]
        sentence.append(sss)
    for i in Conll05.sentence2iobsentece(sentence):
        print i
        print
    '''
    print 'test cornll 05 done'
    print '*' * 20
def test_srl_conv_network():
    print '*' * 20
    print 'test_srl_conv_network'
    home = os.path.expanduser('~')
    filename = os.path.join(home,'data/conll05/training-set')
    raw_corpora = Conll05.loadraw(filename)
    srl_sents = []
    for sent in raw_corpora:
        iobsent = Conll05.sentence2iobsentece(sent)
        srl_sents += iobsent

    words = Corpora(pading_lst=[Word.padding_word(),Word.padding_word2()])
    pos = Corpora(pading_lst=[Word.padding_pos(),Word.padding_pos2()])
    srl_problem = SrlProblem(words,pos,srl_sents)
    max_term_per_sent = 141
    window_size = 11
    pos_conv_size = 15
    max_size = max_term_per_sent + window_size - 1
    print 'corpora has words',len(Conll05.words)
    print 'corpora has pos',len(Conll05.pos)
    print 'corpora has tags',len(Conll05.tags)
    print 'window_size' , window_size
    print 'pos_conv_size', pos_conv_size
    print 'max_term_per_sent', max_term_per_sent
    print 'max_size',max_size


    validation_frequency = 100

    model_params = dict()
    model_params['L1_reg'] = 0.0
    model_params['L2_reg'] = 0.1

    # max_sentence_length = max_term_per_sent + window_size - 1
    # which is the maximum length of each sentence with padding
    model_params['max_sentence_length'] = max_size
    # which is also conv window size
    model_params['window_size'] = window_size
    model_params['word_num'] = len(Conll05.words)
    model_params['POS_num'] = len(Conll05.pos)
    # how many pos should we consider in model
    model_params['verbpos_num'] = window_size + 1
    model_params['wordpos_num'] = window_size + 1
    model_params['position_conv_half_window'] = (window_size - 1) / 2

    # the dimension of word vector
    model_params['word_feature_num'] = 30
    # the dimension of POS vector
    model_params['POS_feature_num'] = 30
    # the dimension of word's position vector
    model_params['wordpos_feature_num'] = 30
    # the dimension of verb's position vector
    model_params['verbpos_feature_num'] = 30

    model_params['conv_window'] = window_size
    model_params['conv_hidden_feature_num'] = 20

    model_params['hidden_layer_size'] = 100
    model_params['tags_num'] = len(SrlTypes.SRLTYPE_ID_MAP) + 1

    model_params['learning_rate'] = 0.3

    print 'model params'
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(model_params)

    rng = np.random.RandomState(1234)
    model = SentenceLevelNeuralModel(rng,model_params)

    # exp params
    n_epochs = 1000
    batch_iter_num = 10
    validation_frequency = 2

    epoch = 0
    done_looping = False

    from collections import Counter
    cnt = Counter()
    while (epoch <= n_epochs) and (not done_looping):
        iter = 1
        for idx,data in enumerate(srl_problem.get_batch(batch_size = 10000,
            window_size = window_size,
            max_size = max_size)):
            X,Y,sent_len,masks = data
            print 'data %d, X shape %s,Y shape %s,sent_len shape %s,masks shape %s' % (idx,str(X.shape),str(Y.shape),str(sent_len.shape),str(masks.shape))
            X = X.astype(np.int32)
            Y = Y.astype(np.int32)
            #np.savetxt('/home/kingsfield/data/Y',Y,delimiter=',')
            #np.save('/home/kingsfield/data/Y',Y)
            #np.save('/home/kingsfield/data/masks',masks)
            sent_len = sent_len.astype(np.int32)
            masks = masks.astype(np.int32)


            for row,l in zip(Y,sent_len):
                for idx in xrange(l):
                    cnt[row[idx]]+= 1

            if iter % validation_frequency == 0:
                error,pred,time_cost = model.valid(X,Y,sent_len,masks)
                #np.savetxt('/home/kingsfield/data/pred',pred,delimiter=',')
                #np.save('/home/kingsfield/data/pred',pred)
                print >> sys.stderr, 'epoch %i, minibatch %i/%i, validation error %f %%,cost time %f' % \
                     (epoch, iter,100,error * 100.,time_cost)
                pass
            else:
                minibatch_avg_cost,time_cost = model.fit_batch(X,Y,sent_len,masks)
                print >> sys.stderr, 'epoch %i, minibatch %i/%i, minibatch cost %f,cost time %f' % \
                        (epoch,iter,100,minibatch_avg_cost,time_cost)
            iter += 1
        epoch += 1
    s = sum(cnt.values())
    idmap = dict([v,k] for k,v in SrlTypes.SRLTYPE_ID_MAP.items())
    for k,v in cnt.most_common():
        print '\t',k,idmap[k],v,v * 100. / s
    print len(cnt),len(SrlTypes.SRLTYPE_ID_MAP)
    s1 = set([idmap[i] for i in cnt.keys()])
    s2 = set(SrlTypes.SRLTYPE_ID_MAP.keys())
    print 's1-s2',s1-s2
    print 's2-s1',s2-s1