Example #1
 def generate_training_data(self, infile, feat_options={}, encoding='utf-8'):
     data_file_name = tempfile.mktemp()
     data_file = codecs.open(data_file_name,'w',encoding)
     inst_ct = 0
     for s in BrownReader(infile):
         # build token list for each sentence (urgh! FIXME)
         tokens = []
         for wd,tag in s:
             token = Token( string=wd, pos=tag )
             token.label = token.pos # label is POS tag
             tokens.append( token )
         # create training instance for each token
         for i in range(len(tokens)):
             inst_ct += 1
             os.write(1, "\b"*len(str(inst_ct)) + str(inst_ct)) # overwrite the running counter on stdout
             inst = Instance( label=tokens[i].label,
                              index=i, tokens=tokens,
                              feat_selection=feat_options,
                              lex_dict=self.lex_dict,
                              tag_dict=self.tag_dict,
                              cache=self.cache )
             inst.get_features()
             print >> data_file, str(inst)
             # print >> sys.stderr, inst.__str__().encode('utf8')
     os.write(1, '\n')
     data_file.close()
     return data_file_name
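
The os.write calls in Example #1 drive a simple in-place progress counter: backspace characters erase the previously printed count before the new value is written. Below is a minimal, self-contained sketch of the same trick, using sys.stdout rather than a raw file descriptor; show_progress is a hypothetical helper, not part of the original code.

import sys

def show_progress(count, prev_len):
    # erase the previously printed count with backspaces, then print the new one
    text = str(count)
    sys.stdout.write("\b" * prev_len + text)
    sys.stdout.flush()
    return len(text)

prev_len = 0
for ct in range(1, 10001):
    prev_len = show_progress(ct, prev_len)
sys.stdout.write("\n")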
Example #2
 def tag_token_sequence(self, tokens, feat_options={}, beam_size=3):
     ''' N-best breadth (beam) search for the best tag sequence for each sentence '''
     # maintain N-best sequences of tagged tokens
     sequences = [([],0.0)]  # log prob.
     for i,token in enumerate(tokens):
         n_best_sequences = []
         # cache static features
         cached_inst = Instance( label=tokens[i].label,
                                 index=i, tokens=tokens,
                                 feat_selection=feat_options,
                                 lex_dict=self.lex_dict,
                                 tag_dict=self.tag_dict,
                                 cache=self.cache )
         cached_inst.get_static_features()
         # get possible tags: union of tags found in tag_dict and
         # lex_dict
         wd = token.string
         legit_tags1 = self.tag_dict.get(wd,{})
         legit_tags2 = {} # self.lex_dict.get(wd,{}) 
         for j,seq in enumerate(sequences):
             seq_j,log_pr_j = sequences[j]
             tokens_j = seq_j+tokens[i:] # tokens with previous labels
             # classify token
             inst = Instance( label=tokens[i].label,
                              index=i, tokens=tokens_j, 
                              feat_selection=feat_options,
                              lex_dict=self.lex_dict,
                              tag_dict=self.tag_dict,
                              cache=self.cache )
             inst.fv = cached_inst.fv[:]
             inst.get_sequential_features()
             label_pr_distrib = self.classifier.class_distribution(inst.fv)
             # extend sequence j with current token
             for (cl,pr) in label_pr_distrib:
                 # make sure that cl is a legal tag
                 if legit_tags1 or legit_tags2:
                     if (cl not in legit_tags1) and (cl not in legit_tags2):
                         continue
                 labelled_token = Token(string=token.string, pos=token.pos,
                                        comment=token.comment,
                                        label=cl, label_pr_distrib=label_pr_distrib)
                 n_best_sequences.append((seq_j+[labelled_token],log_pr_j+math.log(pr)))
         # sort sequences
         n_best_sequences.sort( key=operator.itemgetter(1) )
         # keep N best
         sequences = n_best_sequences[-beam_size:]
     # return sequence with highest prob. 
     best_sequence = sequences[-1][0]
     # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
     return best_sequence
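
The core of Example #2 is the N-best bookkeeping: each partial sequence carries a log probability, is extended with every (legal) tag proposed for the next token, and only the beam_size highest-scoring extensions survive to the next position. The following is a minimal sketch of that loop with the Instance/feature machinery stripped out; class_distribution here is a hypothetical stand-in for self.classifier.class_distribution(inst.fv) and is assumed to return (label, probability) pairs.

import math

def beam_search(tokens, class_distribution, beam_size=3):
    # sequences holds (labelled prefix, log probability) pairs
    sequences = [([], 0.0)]
    for token in tokens:
        candidates = []
        for prefix, log_pr in sequences:
            # extend each surviving prefix with every candidate label
            for label, pr in class_distribution(prefix, token):
                candidates.append((prefix + [(token, label)], log_pr + math.log(pr)))
        # keep only the beam_size highest-scoring partial sequences
        candidates.sort(key=lambda item: item[1])
        sequences = candidates[-beam_size:]
    # return the labelled sequence with the highest log probability
    return sequences[-1][0]

# toy usage: a "classifier" that ignores context and always proposes two tags
toy = lambda prefix, token: [("N", 0.6), ("V", 0.4)]
print(beam_search(["time", "flies"], toy, beam_size=2))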