Esempio n. 1
0
def kbest_conll_to_sents(fh,ignore_errs=True):
    """Yield groups of converted sentences from a k-best stream.

    The input is read as repeated groups: one line holding an integer
    count, followed by up to that many sentences as delivered by
    ``yutils.tokenize_blanks``.  Each line of a kept sentence is converted
    with ``to_tok``.

    :param fh: line iterator (e.g. an open file) positioned on a count line
    :param ignore_errs: when true, skip sentences for which
        ``sent[0][0][0] == "@"`` (presumably a parser error marker)
    :return: generator of lists, one list of converted sentences per group
    :raises ValueError: if a count line is not an integer
    """
    from itertools import islice

    while True:
        try:
            header = next(fh)
        except StopIteration:
            # End of input: stop explicitly instead of letting StopIteration
            # escape from the generator body (an error under PEP 479).
            return
        count = int(header.strip())
        k = []
        # islice takes at most `count` sentences and never reads ahead, so
        # the next count line is left in `fh`.  max() keeps a negative count
        # meaning "no sentences", as it did with xrange().
        for sent in islice(yutils.tokenize_blanks(fh), max(count, 0)):
            if ignore_errs and sent[0][0][0]=="@":
                continue
            k.append([to_tok(l) for l in sent])
        yield k
Esempio n. 2
0
def kbest_conll_to_sents(fh,ignore_errs=True):
    """Read a k-best stream and yield one list of sentences per group.

    Each group starts with a line containing an integer count; up to that
    many sentences are then taken from ``yutils.tokenize_blanks(fh)`` and
    every line of a kept sentence is passed through ``to_tok``.

    :param fh: line iterator (e.g. an open file) positioned on a count line
    :param ignore_errs: when true, sentences with ``sent[0][0][0] == "@"``
        are dropped (presumably an error marker -- confirm with the producer)
    :return: generator yielding, per group, the list of converted sentences
    :raises ValueError: if a count line cannot be parsed as an integer
    """
    from itertools import islice

    while True:
        try:
            count_line = next(fh)
        except StopIteration:
            # Input exhausted.  Returning here makes termination explicit;
            # a StopIteration leaking out of a generator body is turned
            # into RuntimeError by PEP 479.
            return
        count = int(count_line.strip())
        # A negative count yields an empty group, matching xrange(count).
        limit = max(count, 0)
        k = []
        # islice consumes exactly `limit` sentences (fewer at end of input)
        # without pulling an extra one from the underlying stream.
        for sent in islice(yutils.tokenize_blanks(fh), limit):
            if ignore_errs and sent[0][0][0]=="@":
                continue
            k.append([to_tok(l) for l in sent])
        yield k
Esempio n. 3
0
def conll_to_sents(file):
    """
    Generate the sentences of a corpus one at a time.

    :param file: corpus file, passed unchanged to ``tokenize_blanks``
    :return: generator; each item is the list of ``to_tok`` results for the
        lines of one sentence
    """
    for block in tokenize_blanks(file):
        tokens = []
        for line in block:
            tokens.append(to_tok(line))
        yield tokens
Esempio n. 4
0
def conll_to_sents2(fh,ignore_errs=True):
    """Yield sentences whose tokens carry a reference to their parent token.

    Every line of a sentence is converted with ``to_tok``; each resulting
    token then receives a ``'partok'`` entry derived from its ``'parent'``
    value: ``0`` maps to ``common.ROOT``, ``-1`` maps to ``None``, and any
    other value ``p`` maps to the token at position ``p - 1`` of the same
    sentence.

    :param fh: input handed to ``yutils.tokenize_blanks``
    :param ignore_errs: when true, skip sentences for which
        ``sent[0][0][0] == "@"``
    :return: generator of token lists
    """
    from common import ROOT
    for raw in yutils.tokenize_blanks(fh):
        if ignore_errs and raw[0][0][0]=="@":
            continue
        toks = [to_tok(fields) for fields in raw]
        for tok in toks:
            head = tok['parent']
            if head==0:
                tok['partok'] = ROOT
            elif head==-1:
                tok['partok'] = None
            else:
                tok['partok'] = toks[head-1]
        yield toks
Esempio n. 5
0
def conll_to_sents2(fh,ignore_errs=True):
    """Convert sentences with ``to_tok`` and link each token to its parent.

    For every token the ``'parent'`` value selects what is stored under
    ``'partok'``: ``common.ROOT`` for ``0``, ``None`` for ``-1``, otherwise
    the token found at index ``parent - 1`` in the same sentence.

    :param fh: input handed to ``yutils.tokenize_blanks``
    :param ignore_errs: when true, sentences with ``sent[0][0][0] == "@"``
        are skipped
    :return: generator yielding one list of tokens per sentence
    """
    from common import ROOT
    for lines in yutils.tokenize_blanks(fh):
        if ignore_errs:
            if lines[0][0][0]=="@":
                continue
        sentence = []
        for line in lines:
            sentence.append(to_tok(line))
        for token in sentence:
            parent_idx = token['parent']
            # Same test order as a 0 / -1 / other chain.
            token['partok'] = (
                ROOT if parent_idx==0
                else None if parent_idx==-1
                else sentence[parent_idx-1]
            )
        yield sentence
Esempio n. 6
0
def add_parents_annotation(sents, parents_file):
    """Attach candidate parent ids from *parents_file* to every token.

    The annotation file is split with ``yutils.tokenize_blanks`` into one
    group per sentence and one row per token.  In each row, ``row[0]`` is
    the token id and every further item has the form ``"<parent>:<rest>"``.
    Only the integer before the first ``":"`` is kept; the remainder
    (apparently a score) is ignored.

    :param sents: iterable of sentences, each a sequence of tokens that
        support item access and carry an integer ``'id'``
    :param parents_file: path of the annotation file
    :return: list of the sentences, each token extended in place with a
        ``'cand_parents'`` list of ints
    :raises AssertionError: if the sentence count, a sentence length or a
        token id does not match the annotation file
    """
    sents = list(sents)
    # Read everything inside the `with` block so the file is closed even
    # if tokenizing fails.
    with open(parents_file) as fh:
        parents_annotations = list(yutils.tokenize_blanks(fh))
    assert len(sents)==len(parents_annotations)
    for s,p in zip(sents, parents_annotations):
        assert len(s)==len(p)
        for tok,parents in zip(s,p):
            tok_id = int(parents[0])
            pars = [int(x.split(":")[0]) for x in parents[1:]]
            assert tok_id==tok['id']
            tok['cand_parents'] = pars
    return sents
Esempio n. 7
0
def add_parents_annotation(sents, parents_file):
    """Annotate tokens with the candidate parents listed in *parents_file*.

    ``yutils.tokenize_blanks`` splits the file into one group per sentence
    with one row per token.  The first item of a row is the token id; each
    remaining item is ``"<parent>:<rest>"``, of which only the integer
    parent part is used (the part after ``":"`` looks like a score and is
    not stored).

    :param sents: iterable of sentences; tokens must support item access
        and expose an integer ``'id'``
    :param parents_file: path of the annotation file
    :return: the sentences as a list, tokens updated in place with a
        ``'cand_parents'`` list of ints
    :raises AssertionError: when sentences, tokens or ids do not line up
        with the annotation file
    """
    sents = list(sents)
    # The context manager closes the handle; the original left it open.
    with open(parents_file) as annotations_fh:
        parents_annotations = list(yutils.tokenize_blanks(annotations_fh))
    assert len(sents)==len(parents_annotations)
    for sentence, rows in zip(sents, parents_annotations):
        assert len(sentence)==len(rows)
        for tok, row in zip(sentence, rows):
            token_id = int(row[0])
            candidates = [int(item.split(":")[0]) for item in row[1:]]
            assert token_id==tok['id']
            tok['cand_parents'] = candidates
    return sents
Esempio n. 8
0
def conll_to_sents(fh, ignore_errs=True):
    """Yield sentences as lists of ``to_tok`` results, dropping unusable rows.

    A row is dropped when its first field starts with ``"#"`` after
    stripping, when it does not have exactly 10 fields, or when field 6 or
    field 7 is ``"_"`` after stripping.  Sentences left with no rows are
    not yielded.

    :param fh: input handed to ``yutils.tokenize_blanks``
    :param ignore_errs: when true, skip sentences for which
        ``sent[0][0][0] == "@"``
    :return: generator of non-empty lists of converted rows
    """
    for sent in yutils.tokenize_blanks(fh):
        if ignore_errs and sent[0][0][0] == "@":
            continue
        lines = []
        for x in sent:
            if x[0].strip().startswith("#"):
                continue
            # Check the width before reading fields 6 and 7, so a short row
            # is skipped instead of raising IndexError.
            if len(x) != 10:
                continue
            if x[6].strip() == "_" or x[7].strip() == "_":
                continue
            lines.append(x)
        if lines:
            yield [to_tok(l) for l in lines]
Esempio n. 9
0
def conll_to_sents_strids(fh,ignore_errs=True):
    """Yield each sentence as the list of ``to_tok_str`` results for its lines.

    :param fh: input handed to ``yutils.tokenize_blanks``
    :param ignore_errs: when true, sentences with ``sent[0][0][0] == "@"``
        are skipped
    :return: generator of converted sentences
    """
    for raw in yutils.tokenize_blanks(fh):
        # Short-circuits the same way as "ignore_errs and <marker test>".
        if not ignore_errs or not (raw[0][0][0]=="@"):
            converted = []
            for row in raw:
                converted.append(to_tok_str(row))
            yield converted
Esempio n. 10
0
   projective=decoder.decode(1).next()
   new_parents=[p for (p,c) in sorted(projective,key=lambda x:x[1])]
   return new_parents 

def projectivize_conll(conll_sent):
    """Rewrite the parent column of *conll_sent* in place.

    The fourth-from-last field of every row is read as an integer parent,
    the whole list is passed to ``projectivize_parents``, and the returned
    values are written back to the same field as strings.

    :param conll_sent: sequence of mutable rows (lists of strings)
    :return: None; the rows are modified in place
    """
    heads = []
    for row in conll_sent:
        heads.append(int(row[-4]))
    fixed_heads = projectivize_parents(heads)
    for row, head in zip(conll_sent, fixed_heads):
        row[-4] = str(head)


if __name__=='__main__':
    # Input format selector.  The second assignment overrides the first, so
    # "conll" is the active mode; remove or reorder it to use "parents".
    MODE = "parents"
    MODE = "conll"

    if MODE=="parents":
        # One sentence per line: whitespace-separated items whose part
        # before the first ":" is an integer parent index.
        with open(sys.argv[1]) as fh:
            for line in fh:
                parents = [int(x.split(":")[0]) for x in line.strip().split()]
                if not parents: continue
                print("\t".join(map(str,projectivize_parents(parents))))
    if MODE=="conll":
        import yutils
        # Sentences come from yutils.tokenize_blanks as lists of rows; each
        # row is written back tab-joined after its parent field is rewritten.
        with open(sys.argv[1]) as fh:
            for sent in yutils.tokenize_blanks(fh):
                projectivize_conll(sent)
                for tok in sent:
                    print("\t".join(tok))
                # Empty line after each sentence.
                print("")
                # Progress marker on stderr, one dot per sentence.
                sys.stderr.write(".")
                sys.stderr.flush()

Esempio n. 11
0
def conll_to_sents(fh,ignore_errs=True):
    """Yield each sentence as the list of ``to_tok`` results for its lines.

    :param fh: input handed to ``yutils.tokenize_blanks``
    :param ignore_errs: when true, skip sentences for which
        ``sent[0][0][0] == "@"``
    :return: generator of converted sentences
    """
    for chunk in yutils.tokenize_blanks(fh):
        if ignore_errs:
            if chunk[0][0][0]=="@":
                continue
        yield [to_tok(fields) for fields in chunk]