Ejemplo n.º 1
0
 def __init__(self, grammar, input, options=None, *args, **kwargs):
     """
     Initialise the tagger and tag the whole input sequence up front.
     
     For every position in the input, each category known to the model 
     is tried as a tag. Where the grammar yields a sign for the input 
     item under that tag, a (sign, tag, probability) tuple is recorded.
     
     Results are stored on the instance:
      - self._tagged_data: one list per input position, holding the 
        (sign, tag, probability) tuples, highest probability first.
      - self._batch_ranges: one list per input position, holding 
        (start, end) index pairs into the corresponding _tagged_data 
        list, one pair per batch.
     
     grammar, input, options and any extra positional/keyword arguments 
     are forwarded to the superclass constructor. If options is omitted 
     (or None), a new empty dict is passed on.
     """
     if options is None:
         # Build a fresh dict per instance: a literal {} default would be 
         # a single object shared by every call that omits the argument
         options = {}
     super(Baseline3Tagger, self).__init__(grammar, input, options, *args, **kwargs)
     # NOTE(review): presumably this is what sets self.durations and 
     #  self.times, which are read below -- confirm
     process_chord_input(self)
     
     #### Tag the input sequence ####
     self._tagged_data = []
     self._batch_ranges = []
     # Group the input into pairs; the pair for each position supplies 
     #  the arguments to the model's probability lookup
     inpairs = group_pairs(self.input, none_final=True)
     for index, pair in enumerate(inpairs):
         features = {
             'duration' : self.durations[index],
             'time' : self.times[index],
         }
         word_signs = []
         # Assign a probability to each tag the grammar can build a sign for
         for tag in self.model.category_count.keys():
             sign = self.grammar.get_sign_for_word_by_tag(
                             self.input[index], tag, extra_features=features)
             if sign is None:
                 # The grammar has no sign for this item/tag combination
                 continue
             probability = self.model.get_prob_cat_given_chord_pair(tag, *pair)
             word_signs.append((sign, tag, probability))
         # Highest probability first. A stable ascending sort followed by 
         #  a reverse keeps the same order among equal probabilities as 
         #  reversed(sorted(...)) would
         word_signs.sort(key=lambda x: x[2])
         word_signs.reverse()
         self._tagged_data.append(word_signs)
         
         # Work out the sizes of the batches to return these in
         batches = batch_sizes([p for __,__,p in word_signs], self.batch_ratio)
         # Turn the sizes into (start, end) ranges, which are easier to 
         #  use for slicing out the signs of each batch
         so_far = 0
         batch_ranges = []
         for batch in batches:
             batch_ranges.append((so_far, so_far+batch))
             so_far += batch
         self._batch_ranges.append(batch_ranges)
Ejemplo n.º 2
0
 def _tags_from_output(self, output):
     """
     Turn the tagger's textual output into signs with probabilities.
     
     Every non-blank line of output is read as tab-separated columns: 
     column 2 is the number of results and the columns from 3 onwards 
     alternate between a tag and its probability. Any tag in 
     self.tag_list that a line does not mention is added with 
     probability 0.0. Each line's results are put in decreasing order 
     of the parsed probability.
     
     If the 'unseen_tag_prob' option is greater than 0, that much 
     probability mass is reserved at each position and shared equally 
     among all of the position's tags, the parsed probabilities being 
     scaled down to make room.
     
     If the 'ignore-unknown' option is set, tags that are not in 
     self.grammar.families are left out of the result and a warning is 
     logged once for each such tag.
     
     Returns a list with one entry per input position, each entry a 
     list of (sign, tag, probability) tuples. As a side effect, sets 
     self.batch_sizes to one batch_sizes() result per position.
     
     Raises CandcTaggingError if the number of non-blank lines in 
     output is not equal to self.input_length.
     """
     tags = []
     # Split up the output text to extract tags and probabilities
     for line in output.split("\n"):
         line = line.strip()
         if not line:
             continue
         cols = line.split("\t")
         num_results = int(cols[2])
         results = []
         seen_tags = set()
         # Get the tags and probs from the output
         for result_num in range(num_results):
             cat = cols[3+result_num*2]
             prob = float(cols[4+result_num*2])
             results.append((cat, prob))
             seen_tags.add(cat)
         
         # Check all the tags are covered and add them with 0 prob if not
         for tag in self.tag_list:
             if tag not in seen_tags:
                 results.append((tag, 0.0))
         
         # Highest probability first. A stable ascending sort followed by 
         #  a reverse keeps the same order among equal probabilities as 
         #  reversed(sorted(...)) would
         results.sort(key=lambda x: x[1])
         results.reverse()
         tags.append(results)
     
     if len(tags) != self.input_length:
         raise CandcTaggingError("C&C output did not give a correct "
             "set of tags: %s" % output)
     
     # Redistribute the tag probability to account for unseen tags
     unseen_prob = self.options['unseen_tag_prob']
     if unseen_prob > 0.0:
         # Scale down everything that has a probability
         prob_scale = 1.0 - unseen_prob
         for i, tag_probs in enumerate(tags):
             if not tag_probs:
                 # Nothing to share the reserved mass between (and the 
                 #  division below would fail)
                 continue
             # Add reserved mass equally to every tag
             prob_add = unseen_prob / len(tag_probs)
             tags[i] = [(tag, (prob*prob_scale+prob_add))
                                 for tag, prob in tag_probs]
     
     # Work out what tags we're going to ignore altogether. A set keeps 
     #  each tag once, so the warning is only logged once per tag
     skip_tags = set()
     if self.options['ignore-unknown']:
         for tag_sequence in tags:
             for tag, __ in tag_sequence:
                 if tag in skip_tags:
                     continue
                 if tag not in self.grammar.families:
                     # This tag's not in the grammar: just ignore it
                     skip_tags.add(tag)
                     logger.warning("Ignoring tag '%s', which is not in "
                         "the grammar.", tag)
     
     signs = [[] for i in range(self.input_length)]
     # Get an actual sign for each word/tag combination
     for index, word in enumerate(self.tokens):
         for (tag, prob) in tags[index]:
             if tag in skip_tags:
                 continue
             # Consult the grammar to get a suitable sign if we can
             # NOTE(review): the result is stored even if the grammar 
             #  returns None here -- confirm that callers expect that
             sign = self.grammar.get_sign_for_word_by_tag(
                                     word,
                                     tag,
                                     extra_features={
                                         'time' : self.times[index],
                                         'duration' : self.durations[index]
                                     })
             signs[index].append((sign, tag, prob))
     
     # Work out the batches that each position's results should be 
     #  returned in
     self.batch_sizes = [
         batch_sizes([p for __,__,p in results], self.tag_batch_ratio)
         for results in signs
     ]
     return signs