コード例 #1
0
def parse_eval_main(rightf,testf,pf=None):
    rtrees=[x.strip().decode('utf8') for x in file(rightf)]
    ttrees=[x.strip().decode('utf8') for x in file(testf)]
    right=0
    get_all=0
    right_all=0
    i=0###############
    print 'trees num:',len(rtrees),len(ttrees)
    print '##########'
    longsen=0
    while i<len(rtrees):
        if len(rtrees[i])==0:
            i+=1
            continue
        rt=read_tree(rtrees[i])
        word_len=len(rt.get_words())
        #print word_len
        if word_len<20:
            i+=1
            continue
        longsen+=1
        tt=read_tree(ttrees[i])
        ri,ga,ra=get_two_eval(rt,tt)
        right+=ri
        get_all+=ga
        right_all+=ra
##        if not ga==ra==ri:
##            print 'right',rtrees[i]
##            print 'wrong',ttrees[i]
        i+=1
    ##########
    print '##########',longsen
    if get_all*right_all*right==0:
        print 'error,F=LP=LR=0'
        return None
    print right_all,get_all,right
    if pf!=None:
        num=count_piece_num(pf)
        get_all-=num
        right_all-=num
        right-=num
    LP=right*1.0/get_all
    LR=right*1.0/right_all
    F=2*LP*LR/(LP+LR)
    print right_all,get_all,right
    print 'LP:',round(LP,4)
    print 'LR:',round(LR,4)
    print 'F1:',round(F,4)
    return LP,LR,F
コード例 #2
0
def CTB_main(psj=None,sen=None):
##    sen=raw_input('请输入中文句子~\n')
##    sen=sen.decode('gbk')
##    print 'sen:',sen
    ###tag model###todo old tag model
    #tag_model=tag_model_class(joint=False)
    #tag_joint_model=tag_model_class(joint=True)
    #tag_models=[tag_model,tag_joint_model]
    tag_models=[{},{}]# empty, tag's model based on crf model
    pcfg_model=cPickle.load(file(pcfg_pickle_file,'r'))
    #tag_models=[tag_model,tag_model]
    wll=[]
    if sen!=None:
        #sen='( (IP (IP (NP-PN (NR 上海)(NR 浦东))(VP (VP (LCP (NP (NT 近年))(LC 来))(VP (VCD (VV 颁布)(VV 实行))(VP* (AS 了)(NP (CP (IP (VP (VV 涉及)(NP (NP (NP (NN 经济)(NP* (PU 、)(NP* (NN 贸易)(NP* (PU 、)(NP* (NN 建设)(NP* (PU 、)(NP* (NN 规划)(NP* (PU 、)(NP* (NN 科技)(NP* (PU 、)(NN 文教)))))))))))(ETC 等))(NP (NN 领域)))))(DEC 的))(NP* (QP (CD 七十一)(CLP (M 件)))(NP (NN 法规性)(NN 文件)))))))(VP* (PU ,)(VP (VP* (VV 确保)(AS 了))(NP (DNP (NP (NP-PN (NR 浦东))(NP (NN 开发)))(DEG 的))(NP* (ADJP (JJ 有序))(NP (NN 进行))))))))(PU 。)) )'
        #sen=sen.decode('gbk')
        t=read_tree(sen)
        wl=t.get_words()#[(word,pos),,,]
        wll=[wl]
    if psj!=None:
        wll=read_pos_json(psj)
    #0for 未切分,1 for 已切分,2 for 已pos
    treel=[]
    for wl in wll:
        print wl
        treel.append(CTB_parse_sen(wl,2,tag_models,pcfg_model))
    #print tree.show()
    return treel
コード例 #3
0
def CTB_split_tag_main(src_file,resf):
    tl=[x.strip().decode('utf8') for x in file(src_file)]#[:10]
    print len(tl)
    res=[]
    #####
    a=open(resf,'w')
    a.close()
    #####
    i=0
    for line in tl:
        if len(line)==0:
            continue
        t=read_tree(line)
        if len(t.get_words())==1:
            continue
        if t.tag=='S' and len(t.son)==1:
            t=t.son[0]
        t=CCG_head(t)
        t=update_level(t)
        features=tree_get_tag_feature(t)
        res.extend(features)
        i+=1
        if i%2000==0:
            print i
            write_file_add(resf,res)
            res=[]
    write_file_add(resf,res)
def test_main(tf,resf):#测试语料,已经句法分析的树
    # tag_model=tag_model_class(joint=False)
    # tag_joint_model=tag_model_class(joint=True)
    # tag_models=[tag_model,tag_joint_model]
    # tag_models=[tag_model,tag_model]
    tag_models=[None,None]
    pcfg_model=cPickle.load(file(pcfg_pickle_file,'r'))
    print 'test file:',tf
    senl=[x.strip().decode('utf8') for x in file(tf)]
    #senl=senl[60:61]
    res=[]
    i=0
    total_w_len=0
    total_c_len=0
    for asen in senl[:]:
        if len(asen)<1:
            continue
        t=read_tree(asen)
        wt=t.get_words()
        words=[x[0] for x in wt]
        total_w_len+=len(words)
        total_c_len+=len(''.join(words))
        wl=t.get_words()#[(word,pos),,,]
        new_t=CTB_parse_sen(wl,2,tag_models,pcfg_model)
        res.append(new_t[0][0].show())
        i+=1
        print i#,asen
    #######
    mean_w_len=total_w_len*1.0/len(senl)
    mean_c_len=total_c_len*1.0/len(senl)
    print '句子平均词数:'.decode('gbk'),mean_w_len
    print '句子平均字数:'.decode('gbk'),mean_c_len
    write_file(resf,res)