def parse_eval_main(rightf,testf,pf=None): rtrees=[x.strip().decode('utf8') for x in file(rightf)] ttrees=[x.strip().decode('utf8') for x in file(testf)] right=0 get_all=0 right_all=0 i=0############### print 'trees num:',len(rtrees),len(ttrees) print '##########' longsen=0 while i<len(rtrees): if len(rtrees[i])==0: i+=1 continue rt=read_tree(rtrees[i]) word_len=len(rt.get_words()) #print word_len if word_len<20: i+=1 continue longsen+=1 tt=read_tree(ttrees[i]) ri,ga,ra=get_two_eval(rt,tt) right+=ri get_all+=ga right_all+=ra ## if not ga==ra==ri: ## print 'right',rtrees[i] ## print 'wrong',ttrees[i] i+=1 ########## print '##########',longsen if get_all*right_all*right==0: print 'error,F=LP=LR=0' return None print right_all,get_all,right if pf!=None: num=count_piece_num(pf) get_all-=num right_all-=num right-=num LP=right*1.0/get_all LR=right*1.0/right_all F=2*LP*LR/(LP+LR) print right_all,get_all,right print 'LP:',round(LP,4) print 'LR:',round(LR,4) print 'F1:',round(F,4) return LP,LR,F
def CTB_main(psj=None, sen=None):
    """Parse Chinese sentences with the CTB PCFG parser.

    Exactly one of the two inputs is normally supplied:
      psj -- path to a POS json file; read_pos_json turns it into word lists
      sen -- a bracketed tree string; its (word, pos) list is re-parsed
    Returns a list with one CTB_parse_sen result per input word list.
    """
    ## Interactive console input, disabled:
    ## sen=raw_input('请输入中文句子~\n')
    ## sen=sen.decode('gbk')
    ## print 'sen:',sen
    ### tag model ### TODO: old tag model, no longer used
    #tag_model=tag_model_class(joint=False)
    #tag_joint_model=tag_model_class(joint=True)
    #tag_models=[tag_model,tag_joint_model]
    tag_models = [{}, {}]  # empty placeholders; tagging is based on the CRF model
    pcfg_model = cPickle.load(file(pcfg_pickle_file, 'r'))
    #tag_models=[tag_model,tag_model]
    wll = []  # list of word lists, each [(word, pos), ...]
    if sen != None:
        # Example gold tree kept for manual testing:
        #sen='( (IP (IP (NP-PN (NR 上海)(NR 浦东))(VP (VP (LCP (NP (NT 近年))(LC 来))(VP (VCD (VV 颁布)(VV 实行))(VP* (AS 了)(NP (CP (IP (VP (VV 涉及)(NP (NP (NP (NN 经济)(NP* (PU 、)(NP* (NN 贸易)(NP* (PU 、)(NP* (NN 建设)(NP* (PU 、)(NP* (NN 规划)(NP* (PU 、)(NP* (NN 科技)(NP* (PU 、)(NN 文教)))))))))))(ETC 等))(NP (NN 领域)))))(DEC 的))(NP* (QP (CD 七十一)(CLP (M 件)))(NP (NN 法规性)(NN 文件)))))))(VP* (PU ,)(VP (VP* (VV 确保)(AS 了))(NP (DNP (NP (NP-PN (NR 浦东))(NP (NN 开发)))(DEG 的))(NP* (ADJP (JJ 有序))(NP (NN 进行))))))))(PU 。)) )'
        #sen=sen.decode('gbk')
        t = read_tree(sen)
        wl = t.get_words()  # [(word, pos), ...]
        wll = [wl]
    if psj != None:
        wll = read_pos_json(psj)
    # Mode flag for CTB_parse_sen: 0 = unsegmented, 1 = segmented, 2 = POS-tagged
    treel = []
    for wl in wll:
        print wl
        # 2: the word list already carries POS tags
        treel.append(CTB_parse_sen(wl, 2, tag_models, pcfg_model))
        #print tree.show()
    return treel
def CTB_split_tag_main(src_file,resf): tl=[x.strip().decode('utf8') for x in file(src_file)]#[:10] print len(tl) res=[] ##### a=open(resf,'w') a.close() ##### i=0 for line in tl: if len(line)==0: continue t=read_tree(line) if len(t.get_words())==1: continue if t.tag=='S' and len(t.son)==1: t=t.son[0] t=CCG_head(t) t=update_level(t) features=tree_get_tag_feature(t) res.extend(features) i+=1 if i%2000==0: print i write_file_add(resf,res) res=[] write_file_add(resf,res)
def test_main(tf,resf):#测试语料,已经句法分析的树 # tag_model=tag_model_class(joint=False) # tag_joint_model=tag_model_class(joint=True) # tag_models=[tag_model,tag_joint_model] # tag_models=[tag_model,tag_model] tag_models=[None,None] pcfg_model=cPickle.load(file(pcfg_pickle_file,'r')) print 'test file:',tf senl=[x.strip().decode('utf8') for x in file(tf)] #senl=senl[60:61] res=[] i=0 total_w_len=0 total_c_len=0 for asen in senl[:]: if len(asen)<1: continue t=read_tree(asen) wt=t.get_words() words=[x[0] for x in wt] total_w_len+=len(words) total_c_len+=len(''.join(words)) wl=t.get_words()#[(word,pos),,,] new_t=CTB_parse_sen(wl,2,tag_models,pcfg_model) res.append(new_t[0][0].show()) i+=1 print i#,asen ####### mean_w_len=total_w_len*1.0/len(senl) mean_c_len=total_c_len*1.0/len(senl) print '句子平均词数:'.decode('gbk'),mean_w_len print '句子平均字数:'.decode('gbk'),mean_c_len write_file(resf,res)