def kbest_conll_to_sents(fh, ignore_errs=True):
    """
    Yield groups of sentences from a k-best CoNLL stream.

    The stream is read as a sequence of groups.  Every group starts with a
    line holding an integer count and is followed by up to that many
    blank-line separated sentences, as split by ``yutils.tokenize_blanks``.

    :param fh: iterator over the lines of the input (e.g. an open file)
    :param ignore_errs: when true, drop every sentence whose first line has
        a first field beginning with "@" (presumably an error marker --
        confirm against the producer of the file)
    :return: yields one list per group; each element is the list of
        ``to_tok`` results for one sentence of that group
    :raises ValueError: if a group header line is not an integer
    """
    while True:
        try:
            header = next(fh)
        except StopIteration:
            # End of input.  Finish explicitly instead of letting
            # StopIteration escape the generator body: that is an error
            # on Python 3.7+ (PEP 479) and was only implicit before.
            return
        count = int(header.strip())
        group = []
        # ``range`` must stay the first argument: zip stops as soon as it is
        # exhausted, so no sentence beyond ``count`` is pulled from the stream.
        for _, sent in zip(range(count), yutils.tokenize_blanks(fh)):
            if ignore_errs and sent[0][0][0] == "@":
                continue
            group.append([to_tok(line) for line in sent])
        yield group
def conll_to_sents(file):
    """
    Generate the sentences of a corpus one at a time.

    :param file: corpus file, handed unchanged to ``tokenize_blanks``
    :return: generator; every item is the list of ``to_tok`` results for
        the lines of one sentence
    """
    for block in tokenize_blanks(file):
        tokens = []
        for row in block:
            tokens.append(to_tok(row))
        yield tokens
def conll_to_sents2(fh, ignore_errs=True):
    """
    Yield sentences whose tokens also carry a reference to their parent token.

    Each token produced by ``to_tok`` gets a ``'partok'`` entry derived from
    its ``'parent'`` value:

    * ``0``  -> the ``ROOT`` object imported from ``common``
    * ``-1`` -> ``None``
    * other  -> the token at position ``parent - 1`` of the same sentence

    :param fh: input handed to ``yutils.tokenize_blanks``
    :param ignore_errs: when true, skip sentences whose first line has a
        first field beginning with "@"
    :return: yields one list of tokens per sentence
    """
    from common import ROOT

    for block in yutils.tokenize_blanks(fh):
        if ignore_errs and block[0][0][0] == "@":
            continue
        tokens = [to_tok(row) for row in block]
        for tok in tokens:
            head = tok['parent']
            if head == 0:
                partok = ROOT
            elif head == -1:
                partok = None
            else:
                # parent values are 1-based positions within the sentence
                partok = tokens[head - 1]
            tok['partok'] = partok
        yield tokens
def add_parents_annotation(sents, parents_file):
    """
    Attach candidate-parent ids from an annotation file to every token.

    The annotation file is split into sentences with
    ``yutils.tokenize_blanks``.  Each annotation row is read as
    ``[token_id, "parent:rest", "parent:rest", ...]``; only the integer
    before the first colon of each entry is kept.  The text after the colon
    (presumably a score -- confirm with the file's producer) is not used.

    :param sents: iterable of sentences, each a sequence of token dicts
        with an ``'id'`` key
    :param parents_file: path of the annotation file
    :return: the sentences as a list; every token has gained a
        ``'cand_parents'`` list of ints (tokens are modified in place)
    :raises AssertionError: if the sentence count, a sentence length, or a
        token id does not match between ``sents`` and the annotation file
    """
    sents = list(sents)
    # Read everything inside ``with`` so the handle is closed even when
    # tokenize_blanks is lazy or raises.
    with open(parents_file) as fh:
        parents_annotations = list(yutils.tokenize_blanks(fh))
    assert len(sents) == len(parents_annotations)
    for sent, sent_parents in zip(sents, parents_annotations):
        assert len(sent) == len(sent_parents)
        for tok, parents in zip(sent, sent_parents):
            tok_id = int(parents[0])
            cand_parents = [int(x.split(":")[0]) for x in parents[1:]]
            assert tok_id == tok['id']
            tok['cand_parents'] = cand_parents
    return sents
def conll_to_sents(fh, ignore_errs=True):
    """
    Yield sentences from a CoNLL stream, keeping only well-formed rows.

    A row is dropped when any of the following holds:

    * its first field, stripped, starts with "#" (comment row)
    * it does not have exactly 10 fields
    * field 6 or field 7 (0-based), stripped, is "_" (presumably the HEAD
      and DEPREL columns of the 10-column CoNLL layout -- confirm)

    Sentences left with no rows are not yielded.

    :param fh: input handed to ``yutils.tokenize_blanks``
    :param ignore_errs: when true, skip sentences whose first line has a
        first field beginning with "@"
    :return: yields, per sentence, the list of ``to_tok`` results for the
        rows that were kept
    """
    for sent in yutils.tokenize_blanks(fh):
        if ignore_errs and sent[0][0][0] == "@":
            continue
        lines = []
        for x in sent:
            if x[0].strip().startswith("#"):
                continue
            # Check the length before touching x[6] / x[7]; otherwise a
            # short row raises IndexError instead of being skipped.
            if len(x) != 10:
                continue
            if x[6].strip() == "_" or x[7].strip() == "_":
                continue
            lines.append(x)
        if lines:
            yield [to_tok(l) for l in lines]
def conll_to_sents_strids(fh, ignore_errs=True):
    """
    Yield sentences whose rows are converted with ``to_tok_str``.

    :param fh: input handed to ``yutils.tokenize_blanks``
    :param ignore_errs: when true, skip sentences whose first line has a
        first field beginning with "@"
    :return: yields one list of ``to_tok_str`` results per sentence
    """
    for block in yutils.tokenize_blanks(fh):
        # Keep the sentence unless error-skipping is on AND it is flagged;
        # the "@" test is only evaluated when ignore_errs is true.
        if not ignore_errs or block[0][0][0] != "@":
            yield [to_tok_str(row) for row in block]
# NOTE(review): the three statements below are the tail of a function whose
# header lies above this chunk (presumably projectivize_parents, which the
# code further down calls) -- confirm against the full file.
    # Take the first result produced by the decoder.
    projective=decoder.decode(1).next()
    # Each item is unpacked as a pair (p, c); ordering by the second element
    # and keeping the first gives one value per item in order of c
    # (presumably parent per child position -- confirm).
    new_parents=[p for (p,c) in sorted(projective,key=lambda x:x[1])]
    return new_parents

def projectivize_conll(conll_sent):
    """
    Replace, in place, the fourth-from-last field of every token.

    The integer value of field ``[-4]`` of each token is collected into a
    list, passed to ``projectivize_parents``, and the returned values are
    written back to the same field as strings.

    :param conll_sent: sequence of tokens, each a mutable sequence of fields
    :return: None; ``conll_sent`` is modified in place
    """
    parents = [int(tok[-4]) for tok in conll_sent]
    new_parents=projectivize_parents(parents)
    for tok,p in zip(conll_sent,new_parents):
        tok[-4]=str(p)

if __name__=='__main__':
    # MODE is assigned twice; the second assignment wins, so only the
    # "conll" branch below runs unless this line is edited.
    MODE = "parents"
    MODE = "conll"
    if MODE=="parents":
        # One sentence per line; each whitespace-separated entry contributes
        # the integer before its first ":".
        for line in file(sys.argv[1]):
            parents = [int(x.split(":")[0]) for x in line.strip().split()]
            if not parents:
                continue
            print "\t".join(map(str,projectivize_parents(parents)))
    if MODE=="conll":
        import yutils
        for sent in yutils.tokenize_blanks(file(sys.argv[1])):
            projectivize_conll(sent)
            for tok in sent:
                print "\t".join(tok)
            # Blank line after each sentence.
            print
            # One "." per sentence on stderr as a progress indicator.
            sys.stderr.write(".")
            sys.stderr.flush()
def conll_to_sents(fh, ignore_errs=True):
    """
    Yield sentences from a CoNLL stream as lists of tokens.

    :param fh: input handed to ``yutils.tokenize_blanks``
    :param ignore_errs: when true, skip sentences whose first line has a
        first field beginning with "@"
    :return: yields one list of ``to_tok`` results per sentence
    """
    for block in yutils.tokenize_blanks(fh):
        if ignore_errs:
            # Only inspect the marker when error-skipping is requested.
            if block[0][0][0] == "@":
                continue
        tokens = []
        for row in block:
            tokens.append(to_tok(row))
        yield tokens