def predict(args):
    """
    main() to launch everything: load the trained second-layer models and
    run them over every sentence read from args.input.
    """

    model_dir = args.model[0]
    if not os.path.exists(model_dir):
        sys.stderr.write("no model %s\n" % model_dir)
        sys.exit(1)

    sys.stderr.write("loading models\n")
    # Three second-layer components: relativizers, conjunct propagation,
    # and external subjects (xs needs no trained model).
    rel = second_layer.Relativizers(second_layer.Model(model_dir, u"rel"))
    ccprop = second_layer.ConjPropagation(second_layer.Model(model_dir, u"ccprop"))
    xs = second_layer.Xsubjects()

    sys.stderr.write("predicting\n")
    sent_count = 0
    for comments, sent in read_conll(args.input):
        t = tree.Tree(sent)
        rel.predict(t)
        # NOTE(review): xs.predict runs both before and after ccprop in the
        # original pipeline — presumably ccprop exposes new xsubj
        # candidates; order preserved exactly, confirm before changing.
        xs.predict(t)
        ccprop.predict(t)
        xs.predict(t)
        for comm in comments:
            sys.stdout.write(comm.encode(u"utf-8") + "\n")
        if args.no_conllu:
            t.tree_to_conll()
        else:
            t.to_conllu()
        sent_count += 1
        if sent_count % 100 == 0:
            sys.stderr.write("%d\n" % sent_count)

    sys.stderr.write("sentences: %d\n" % sent_count)
 def parse(self, inp, outp):
     """Parse every CoNLL sentence from inp and write the result to outp.

     outp should be a file open for writing unicode.
     """
     for sent in read_conll(inp):
         # Start from a single non-syntactic initial state and advance the
         # whole beam one step at a time until the parser reports it ready.
         states = [State(sent, syn=False)]
         while not self.beam_ready(states):
             # This looks wasteful, but it is what the beam will do anyway.
             states = self.give_next_state(states)
         fill_conll(sent, states[0])
         write_conll(outp, sent)
# Beispiel #3
# 0
 def parse(self, inp, outp):
     """Run beam search over each sentence in inp, writing CoNLL to outp.

     outp should be a file open for writing unicode.
     """
     for sentence in read_conll(inp):
         beam = [State(sentence, syn=False)]
         while not self.beam_ready(beam):
             # Advancing stepwise looks wasteful, but it is what the beam
             # will do anyway.
             beam = self.give_next_state(beam)
         # Once the beam is ready, the best state sits at index 0.
         fill_conll(sentence, beam[0])
         write_conll(outp, sentence)
# Beispiel #4
# 0
 def collect(cls, model_name, corpus, cutoff=2, conll_format="conll09"):
     """Count (gov_pos, pos, deprel) triples in corpus and build a model.

     Deprels seen strictly more than cutoff times for a (gov_pos, pos)
     pair are kept.  The resulting dictionary is pickled to model_name
     and also used to instantiate the returned model.
     """
     form = formats[conll_format]
     pairs = defaultdict(int)  # (gov_pos, pos, deprel) -> count
     for sent in read_conll(corpus):
         for token in sent:
             gov = int(token[form.HEAD])
             if gov == 0:
                 continue  # ROOT: no governor row to look up
             gov_pos = sent[gov - 1][form.POS]
             pos, deprel = token[form.POS], token[form.DEPREL]
             pairs[(gov_pos, pos, deprel)] += 1
     types = {}
     for (gov_pos, pos, deprel), value in pairs.iteritems():
         if value > cutoff:
             types.setdefault((gov_pos, pos), set()).add(deprel)
     # Now that the dictionary is collected, pickle it and create a model
     # instance.  `with` guarantees the file is closed even if dump raises
     # (the original leaked the handle in that case); codecs.open with
     # encoding=None returns the builtin file object, so plain open is
     # equivalent here.
     with open(model_name, "wb") as f:
         cPickle.dump(types, f)
     return cls(types)
 def collect(cls,model_name,corpus,cutoff=2,conll_format="conll09"):
     """Build and return a model of permitted deprels per (gov_pos,pos).

     Every (gov_pos,pos,deprel) triple in corpus is counted; triples seen
     strictly more than cutoff times are kept.  The dictionary is pickled
     to model_name before the model instance is returned.
     """
     form=formats[conll_format]
     pairs=defaultdict(int)
     for sent in read_conll(corpus):
         for token in sent:
             gov=int(token[form.HEAD])
             if gov==0: continue # ROOT has no governor token
             gov_pos=sent[gov-1][form.POS]
             pos,deprel=token[form.POS],token[form.DEPREL]
             pairs[(gov_pos,pos,deprel)]+=1
     types={}
     for (gov_pos,pos,deprel),value in pairs.iteritems():
         if value>cutoff:
             types.setdefault((gov_pos,pos),set()).add(deprel)
     # now we have collected the dictionary, pickle it and create a model instance
     # try/finally fixes the original's leak: the handle stayed open if dump raised
     f=codecs.open(model_name,u"wb")
     try:
         cPickle.dump(types,f)
     finally:
         f.close()
     return cls(types)
def train(args):
    """
    main() to launch everything: collect training data for the ccprop and
    rel classifiers (unless disabled) and convert the training files.
    """

    # Set up each component plus its training-data file, or leave both
    # None when the component is switched off on the command line.
    ccprop, cc_trainf = None, None
    if not args.no_ccprop:
        cc_dir = os.path.join(args.output, u"ccprop")
        if not os.path.exists(cc_dir):
            os.makedirs(cc_dir)
        cc_trainf = codecs.open(os.path.join(cc_dir, "train.txt"), "wt", u"utf-8")
        ccprop = second_layer.ConjPropagation()

    rel, rel_trainf = None, None
    if not args.no_rel:
        rel_dir = os.path.join(args.output, u"rel")
        if not os.path.exists(rel_dir):
            os.makedirs(rel_dir)
        rel_trainf = codecs.open(os.path.join(rel_dir, "train.txt"), "wt", u"utf-8")
        rel = second_layer.Relativizers()

    count = 0
    print >> sys.stderr, "collecting training data"
    for comments, sent in read_conll(args.input):
        t = tree.Tree(sent)
        if rel is not None:
            rel.learn(t, rel_trainf)
        if ccprop is not None:
            ccprop.learn(t, cc_trainf)
        count += 1
    print >> sys.stderr, "sentences:", count

    print >> sys.stderr, "converting training files"
    if ccprop is not None:
        cc_trainf.close()
        convert_toNumbers(False, u"ccprop", args.output)

    if rel is not None:
        rel_trainf.close()
        convert_toNumbers(False, u"rel", args.output)
 def train(self,inp,progress=0.0,quiet=False):
     """If inp is string, it will be interpreted as a file, otherwise as open file reading unicode"""
     total=0
     failed=0
     non=0
     for sent in read_conll(inp):
         total+=1
         gs_tree=Tree.new_from_conll(conll=sent,syn=True)
         non_projs=gs_tree.is_nonprojective()
         if len(non_projs)>0:
             gs_tree.define_projective_order(non_projs)
             non+=1
         try:
             gs_transitions=self.extract_transitions(gs_tree,sent)
             self.train_one_sent(gs_transitions,sent,progress) # sent is a conll sentence
         except ValueError:
             traceback.print_exc()
             failed+=1 
     if not quiet:
         print u"Failed to parse:",failed
         print u"Total number of trees:",total
         print u"Non-projectives:",non
         print u"Progress:",progress
# Beispiel #8
# 0
 def train(self, inp, progress=0.0, quiet=False):
     """If inp is string, it will be interpreted as a file, otherwise as open file reading unicode"""
     total = 0
     failed = 0
     non = 0
     for sent in read_conll(inp):
         total += 1
         gs_tree = Tree.new_from_conll(conll=sent, syn=True)
         non_projs = gs_tree.is_nonprojective()
         if len(non_projs) > 0:
             gs_tree.define_projective_order(non_projs)
             non += 1
         try:
             gs_transitions = self.extract_transitions(gs_tree, sent)
             self.train_one_sent(gs_transitions, sent,
                                 progress)  # sent is a conll sentence
         except ValueError:
             traceback.print_exc()
             failed += 1
     if not quiet:
         print u"Failed to parse:", failed
         print u"Total number of trees:", total
         print u"Non-projectives:", non
         print u"Progress:", progress