def predict(args):
    """Prediction entry point: load the trained second-layer models and
    annotate every sentence read from args.input."""
    model_dir = args.model[0]
    if not os.path.exists(model_dir):
        print >> sys.stderr, "no model", model_dir
        sys.exit(1)
    print >> sys.stderr, "loading models"
    # One annotator per phenomenon, each backed by its own trained model.
    rel = second_layer.Relativizers(second_layer.Model(model_dir, u"rel"))
    ccprop = second_layer.ConjPropagation(second_layer.Model(model_dir, u"ccprop"))
    xs = second_layer.Xsubjects()
    print >> sys.stderr, "predicting"
    n_done = 0
    for comments, sent in read_conll(args.input):
        sent_tree = tree.Tree(sent)
        rel.predict(sent_tree)
        xs.predict(sent_tree)
        ccprop.predict(sent_tree)
        # NOTE(review): xsubjects is predicted a second time after conjunct
        # propagation — presumably to pick up newly propagated dependencies;
        # confirm this repetition is intentional.
        xs.predict(sent_tree)
        for comm in comments:
            print >> sys.stdout, comm.encode(u"utf-8")
        if args.no_conllu:
            sent_tree.tree_to_conll()
        else:
            sent_tree.to_conllu()
        n_done += 1
        if n_done % 100 == 0:
            print >> sys.stderr, n_done
    print >> sys.stderr, "sentences:", n_done
def parse(self, inp, outp):
    """Parse every sentence read from *inp* and write the result to *outp*.

    outp should be a file open for writing unicode.
    """
    for sent in read_conll(inp):
        # Seed the beam with the single initial (syntax-free) state and
        # advance the whole beam until it reports completion.
        beam = [State(sent, syn=False)]
        while not self.beam_ready(beam):
            beam = self.give_next_state(beam)  # This looks wasteful, but it is what the beam will do anyway
        # The best-scoring state fills in the CoNLL fields of the sentence.
        fill_conll(sent, beam[0])
        write_conll(outp, sent)
def parse(self, inp, outp):
    """Run the beam parser over the CoNLL data in *inp*, writing each
    parsed sentence to *outp* (a file open for writing unicode)."""
    for conll_sent in read_conll(inp):
        states = [State(conll_sent, syn=False)]
        # Step the beam forward until it signals that it is finished.
        while not self.beam_ready(states):
            states = self.give_next_state(states)  # looks wasteful, but it is what the beam will do anyway
        # Serialize the top state back into the CoNLL rows and emit them.
        fill_conll(conll_sent, states[0])
        write_conll(outp, conll_sent)
def collect(cls, model_name, corpus, cutoff=2, conll_format="conll09"):
    """Collect allowed dependency types from *corpus* and build a model.

    Counts (governor POS, dependent POS, deprel) triples over the corpus,
    keeps the deprels seen more than *cutoff* times for each POS pair,
    pickles the resulting dictionary to *model_name*, and returns a new
    instance of *cls* wrapping that dictionary.
    """
    form = formats[conll_format]
    pairs = defaultdict(int)  # (gov_pos, pos, deprel) -> occurrence count
    for sent in read_conll(corpus):
        for token in sent:
            gov = int(token[form.HEAD])
            if gov == 0:
                continue  # ROOT: no governor token to look up
            gov_pos = sent[gov - 1][form.POS]
            pos, deprel = token[form.POS], token[form.DEPREL]
            pairs[(gov_pos, pos, deprel)] += 1
    # Keep only dependency types attested more than *cutoff* times.
    types = {}
    for (gov_pos, pos, deprel), value in pairs.iteritems():
        if value > cutoff:
            types.setdefault((gov_pos, pos), set()).add(deprel)
    # Pickle the collected dictionary; "with" guarantees the file is closed
    # even if dumping raises (the original leaked the handle on error).
    with codecs.open(model_name, u"wb") as f:
        cPickle.dump(types, f)
    return cls(types)
def collect(cls, model_name, corpus, cutoff=2, conll_format="conll09"):
    """Count (governor POS, dependent POS, deprel) triples in *corpus*,
    keep the deprels seen more than *cutoff* times per POS pair, pickle
    the resulting dictionary to *model_name*, and return it wrapped in a
    new instance of *cls*."""
    fmt = formats[conll_format]
    triple_counts = defaultdict(int)
    for sentence in read_conll(corpus):
        for tok in sentence:
            head = int(tok[fmt.HEAD])
            if head == 0:
                continue  # ROOT
            head_pos = sentence[head - 1][fmt.POS]
            triple_counts[(head_pos, tok[fmt.POS], tok[fmt.DEPREL])] += 1
    allowed = {}
    for (head_pos, dep_pos, rel), freq in triple_counts.iteritems():
        if freq > cutoff:
            allowed.setdefault((head_pos, dep_pos), set()).add(rel)
    # now we have collected the dictionary, pickle it and create a model instance
    out = codecs.open(model_name, u"wb")
    cPickle.dump(allowed, out)
    out.close()
    return cls(allowed)
def _open_train_file(output_dir, name):
    """Ensure output_dir/name exists and return name/train.txt opened for UTF-8 writing."""
    subdir = os.path.join(output_dir, name)
    if not os.path.exists(subdir):
        os.makedirs(subdir)
    return codecs.open(os.path.join(output_dir, name, "train.txt"), "wt", u"utf-8")


def train(args):
    """ main() to launch everything: collect second-layer training data from
    args.input and convert it to the numeric training format under args.output """
    # Each enabled component gets its own subdirectory and raw train.txt file.
    if not args.no_ccprop:
        cc_trainf = _open_train_file(args.output, u"ccprop")
        ccprop = second_layer.ConjPropagation()
    else:
        ccprop = None
    if not args.no_rel:
        rel_trainf = _open_train_file(args.output, u"rel")
        rel = second_layer.Relativizers()
    else:
        rel = None
    count = 0
    print >> sys.stderr, "collecting training data"
    for comments, sent in read_conll(args.input):
        t = tree.Tree(sent)
        if rel is not None:
            rel.learn(t, rel_trainf)
        if ccprop is not None:
            ccprop.learn(t, cc_trainf)
        count += 1
    print >> sys.stderr, "sentences:", count
    print >> sys.stderr, "converting training files"
    # Close the raw text files, then convert them to the numeric format.
    if not args.no_ccprop:
        cc_trainf.close()
        convert_toNumbers(False, u"ccprop", args.output)
    if not args.no_rel:
        rel_trainf.close()
        convert_toNumbers(False, u"rel", args.output)
def train(self,inp,progress=0.0,quiet=False): """If inp is string, it will be interpreted as a file, otherwise as open file reading unicode""" total=0 failed=0 non=0 for sent in read_conll(inp): total+=1 gs_tree=Tree.new_from_conll(conll=sent,syn=True) non_projs=gs_tree.is_nonprojective() if len(non_projs)>0: gs_tree.define_projective_order(non_projs) non+=1 try: gs_transitions=self.extract_transitions(gs_tree,sent) self.train_one_sent(gs_transitions,sent,progress) # sent is a conll sentence except ValueError: traceback.print_exc() failed+=1 if not quiet: print u"Failed to parse:",failed print u"Total number of trees:",total print u"Non-projectives:",non print u"Progress:",progress
def train(self, inp, progress=0.0, quiet=False): """If inp is string, it will be interpreted as a file, otherwise as open file reading unicode""" total = 0 failed = 0 non = 0 for sent in read_conll(inp): total += 1 gs_tree = Tree.new_from_conll(conll=sent, syn=True) non_projs = gs_tree.is_nonprojective() if len(non_projs) > 0: gs_tree.define_projective_order(non_projs) non += 1 try: gs_transitions = self.extract_transitions(gs_tree, sent) self.train_one_sent(gs_transitions, sent, progress) # sent is a conll sentence except ValueError: traceback.print_exc() failed += 1 if not quiet: print u"Failed to parse:", failed print u"Total number of trees:", total print u"Non-projectives:", non print u"Progress:", progress