def get_undone(kb, beam_size):
    """Deduplicate candidates, keep the best ``beam_size``, and split them
    into finished and unfinished groups.

    Each candidate is indexed as ``cand[0]`` (a sequence of nodes, each
    exposing ``show()``) and ``cand[-1]`` (a numeric score; larger scores
    rank first -- presumably a PCFG score, per the original comment).

    Args:
        kb: iterable of candidates as described above.
        beam_size: maximum number of candidates kept after ranking.

    Returns:
        A ``(done, undone)`` pair of lists. ``done`` holds the kept
        candidates whose node sequence has exactly one element; ``undone``
        holds the remaining kept candidates. Both preserve ranking order.
    """
    # Drop candidates that render to the same node sequence, keeping the first
    # occurrence. The key is the tuple of rendered nodes, not their plain
    # concatenation: joining with '' would make ["ab", "c"] and ["a", "bc"]
    # look identical and wrongly discard a distinct candidate.
    seen = set()
    unique = []
    for cand in kb:
        key = tuple(node.show() for node in cand[0])
        if key not in seen:
            seen.add(key)
            unique.append(cand)

    # Highest score first; the sort is stable, so ties keep input order.
    ranked = sorted(unique, key=lambda cand: -cand[-1])
    beam = ranked[:beam_size]

    # TODO: a single remaining node that is a unary (single-branch) chain may
    # still need further processing before it is really "done".
    done = [cand for cand in beam if len(cand[0]) == 1]
    undone = [cand for cand in beam if len(cand[0]) != 1]
    return done, undone
def split_sen(fn, resf):
    """Read trees from ``fn``, split each one, and write the pieces to ``resf``.

    Every line of ``fn`` is stripped and decoded as UTF-8. Each non-blank
    line is passed to ``read_tree``; the result is passed to ``split_main``,
    and ``show()`` of every returned node is collected, each followed by an
    empty string. The collected list is handed to ``write_file(resf, res)``.
    A progress counter is printed every 1000 input lines.

    Args:
        fn: path of the input file (one tree per line, UTF-8 encoded).
        resf: output destination, passed unchanged to ``write_file``.
    """
    # Binary mode plus an explicit decode behaves the same on Python 2 and 3,
    # and the context manager closes the handle (the bare file(fn) never did).
    with open(fn, 'rb') as src:
        lines = [raw.strip().decode('utf8') for raw in src]

    res = []
    for i, line in enumerate(lines, 1):
        # The second strip works on decoded text, so lines made only of
        # non-ASCII whitespace are skipped as well.
        if line.strip():
            tree = read_tree(line)
            # NOTE: collapsing of unary chains (descending through non-leaf
            # nodes with a single son) was disabled in the original and
            # stays disabled here.
            for node in split_main(tree):
                res.append(node.show())
                # NOTE(review): the original nesting of this separator and of
                # the counter was lost in the source formatting; this assumes
                # one blank entry after every emitted piece and a counter
                # that advances on every input line -- confirm.
                res.append('')
        if i % 1000 == 0:
            print(i)
    write_file(resf, res)