def __init__(self, datapath, features):
    """Create the stemmer and the classifier used for labelling.

    datapath -- path prefix of the model; ".megam" is appended to it to
                obtain the classifier's model file.
    features -- stored unchanged on the instance as ``self.features``.
    """
    # Dictionary loading is currently disabled:
    # self.diclist = self.load_dict(datapath)
    self.stemmer = Stemmer()
    self.features = features
    model_file = datapath + ".megam"
    self.classifier = MegamClassifier(modelfile=model_file)
class FunctionLabeller:
    """Assign function labels (``funlabel``) to nodes of a parse tree.

    For every non-terminal node whose head child is labelled "VN", the
    still-unlabelled dependents are collected, turned into feature vectors
    and classified one by one with a MegamClassifier.
    """

    # Labels of the nodes that do_clitics() picks up under a VN node.
    _CLITIC_LABELS = ('CL', 'CLS', 'CLO', 'CLR')

    def __init__(self, datapath, features):
        """Create the stemmer and the classifier used for labelling.

        datapath -- path prefix of the model; the classifier is given
                    ``datapath + ".megam"`` as its model file.
        features -- mapping whose ``keys()`` are used as the feature names.
                    NOTE(review): the i-th key is matched with the i-th
                    entry of each feature vector, so the key order
                    presumably has to be stable -- confirm with the caller.
        """
        # Dictionary loading is currently disabled:
        # self.diclist = self.load_dict(datapath)
        self.stemmer = Stemmer()
        self.features = features
        self.classifier = MegamClassifier(modelfile=datapath + ".megam")

    def label_tree(self, root):
        """Decorate the tree rooted at ``root``, then label its functions."""
        decorate_tree(root)
        self.label_functions(root)

    def label_functions(self, root):
        """Recursively set ``funlabel`` on the dependents found under ``root``.

        Terminal nodes are left untouched.  For a non-terminal whose head
        (as returned by ``find_head``) is labelled "VN", every collected
        dependent receives the label predicted by ``label_sequence``; the
        string "None" is stored as the ``None`` object.
        """
        if root.is_terminal_sym():
            return

        # Formerly tested:
        # root.label in ["SENT","Sinf","Ssub","Sint","Srel","VN","VPinf","VPpart"]
        if find_head(root).label == "VN":
            # Build the feature-name list once instead of once per feature.
            feat_names = list(self.features.keys())
            dependents = self._collect_dependents(root)
            rows = [self._feature_row(root, node, feat_names)
                    for node in dependents]
            labels = self.label_sequence(rows, feat_names)
            # The returned classes keep the input order:
            # labels[i] belongs to dependents[i].
            for i, node in enumerate(dependents):
                label = labels[i]
                node.funlabel = None if label == "None" else label

        for child in root.children:
            self.label_functions(child)

    def _collect_dependents(self, root):
        """Return the nodes under ``root`` that still need a function label."""
        dependents = []
        for child in root.children:
            if child.funlabel is not None:
                continue
            if child.label == "VN":
                dependents += self.do_clitics(child)
            # marie: skip nodes that are supposed to be None
            # (formerly just: elif not child.head)
            elif not child.head and not (child.is_terminal_sym()
                                         or child.label == 'COORD'):
                dependents.append(child)
        return dependents

    def _feature_row(self, root, node, feat_names):
        """Return the feature vector of ``node`` with *WORD values stemmed."""
        featvector = grab_features(root, node, feat_names)
        # The stemmer is called simply to escape punctuation marks etc.
        for i, value in enumerate(featvector):
            if feat_names[i].endswith("WORD"):
                featvector[i] = self.stemmer.stem(value, stem_len=100)
        return featvector

    def do_clitics(self, tree):
        """Return the non-head clitic children of ``tree`` (a VN node).

        A child qualifies when its label is one of CL, CLS, CLO, CLR and
        its ``head`` attribute is false.  The result is empty when there
        is no such child, including when ``tree`` has no children.
        """
        # The result list used to be re-created for every child, which
        # dropped all clitics but the last one and left the name unbound
        # for a node without children.
        return [child for child in tree.children
                if child.label in self._CLITIC_LABELS and not child.head]

    def label_sequence(self, seq_list, feat_names):
        """Classify every dependent independently and return the labels.

        seq_list   -- list of feature vectors, one per dependent.
        feat_names -- feature names matching the vector positions.

        Returns a list with one label per entry of the ``dependents``
        attribute of the ``Sequence`` built from the arguments.
        """
        seq = Sequence(seq_list, feat_names)
        labels = []
        # Pointwise classification of each dependent.
        for i in range(len(seq.dependents)):
            inst = PointwiseInstance(seq.dependents, i)
            labels.append(
                self.classifier.get_best_label(inst.feature_vector()))
        return labels

    # Disabled beam-search variant of label_sequence, kept for reference.
    #
    # def label_sequence( self, seq_list, feat_names, beamsize=3 ):
    #     # call constructor Sequence
    #     seq = Sequence( seq_list, feat_names )
    #     dependents = seq.dependents
    #     # maintain N-best sequences of dependent assignments
    #     sequences = [([],0.0)] # log prob.
    #     for i in range( len(dependents) ):
    #         n_best_sequences = []
    #         # compute static features (these are cached)
    #         cached_inst = PointwiseInstance( dependents, i )
    #         for j in range( len(sequences) ):
    #             seq_j,log_pr_j = sequences[j]
    #             deps_j = seq_j+dependents[i:]
    #             # add sequential features
    #             inst = PointwiseInstance( deps_j, i )
    #             inst.fv = deepcopy(cached_inst.fv)
    #             inst.add_sequential_features( deps_j, i )
    #             # get pr distrib for different classes
    #             label_pr_distrib = self.classifier.get_label_probs(inst.feature_vector())
    #             # extend sequence with dependent i
    #             for (cl,pr) in label_pr_distrib:
    #                 dep = deepcopy(dependents[i])
    #                 dep.cl = cl
    #                 n_best_sequences.append((seq_j+[dep],log_pr_j+math.log(pr)))
    #         # sort sequences
    #         n_best_sequences.sort(lambda x,y:cmp(x[1],y[1]))
    #         # keep N best
    #         sequences = n_best_sequences[-beamsize:]
    #     # best sequence is sequence with highest prob.
    #     best_sequence = sequences[-1][0]
    #     # return labels for best_sequence
    #     return [d.cl for d in best_sequence]

    def load_dict(self, datapath):
        """Unpickle and return the object stored in ``datapath + ".dict"``.

        The file is opened in binary mode and closed even when unpickling
        fails.
        """
        # SECURITY: pickle can execute arbitrary code while loading; only
        # use this on dictionary files that come from a trusted source.
        with open(datapath + ".dict", "rb") as dict_file:
            return pickle.load(dict_file)