Beispiel #1
0
    def __init__(self, grammar, input, options=None, *args, **kwargs):
        """
        Tags using an ngram model backed by NLTK.

        The whole input sequence is tagged when the tagger is constructed.
        Two attributes are filled in:

        - ``self._tagged_data``: one list per input word of
          ``(sign, tag, probability)`` triples, most probable first.
        - ``self._batch_ranges``: one list per input word of ``(start, end)``
          index ranges into the corresponding ``_tagged_data`` list, one
          range per batch of tags.

        :param grammar: grammar whose ``get_sign_for_word_by_tag`` is queried
            for the sign of every word/tag combination.
        :param input: input sequence; passed on to the superclass constructor.
        :param options: tagger options; passed on to the superclass
            constructor. ``None`` (the default) means "no options". After the
            superclass has been initialised, ``self.options['decode']`` and
            ``self.options['best']`` are read.

        NOTE(review): ``self.model``, ``self.durations``, ``self.times`` and
        ``self.batch_ratio`` are not set here; presumably the superclass
        constructor or ``process_chord_input`` provides them - confirm.

        """
        # Use a fresh dict per call: a literal ``{}`` default is a single
        # object shared by every call that omits the argument
        if options is None:
            options = {}
        super(NgramTagger, self).__init__(grammar, input, options, *args,
                                          **kwargs)
        process_chord_input(self)

        #### Tag the input sequence ####
        self._tagged_data = []
        self._batch_ranges = []
        # Group the input into pairs to get observations
        inpairs = group_pairs(self.input, none_final=True)
        # Convert the pairs into observations
        observations = [
            observation_from_chord_pair(pair[0], pair[1], self.model.chordmap)
            for pair in inpairs
        ]

        # Use the ngram model to get tag probabilities for each input.
        # Any 'decode' value other than "viterbi" or "forward" selects
        # forward-backward decoding
        decode = self.options['decode']
        if decode == "viterbi":
            probabilities = self.model.viterbi_probabilities(observations)
        elif decode == "forward":
            probabilities = self.model.forward_probabilities(observations)
        else:
            probabilities = self.model.forward_backward_probabilities(
                observations)

        word_tag_probs = []

        for index, probs in enumerate(probabilities):
            features = {
                'duration': self.durations[index],
                'time': self.times[index],
            }
            word_signs = []
            # Now assign a probability to each tag, given the observation
            for tag in self.model.tags:
                # Read a full sign out of the grammar
                sign = self.grammar.get_sign_for_word_by_tag(
                    self.input[index], tag, extra_features=features)
                if sign is not None:
                    # Read off the probability from the matrix
                    word_signs.append((sign, tag, probs[tag]))

            # Shuffle before the (stable) sort so that tags with equal
            # probability end up in a random relative order
            random.shuffle(word_signs)
            # Sort ascending by probability, then flip to get the most
            # probable tag first
            word_signs.sort(key=lambda x: x[2])
            word_signs.reverse()
            self._tagged_data.append(word_signs)

            # Store the list of probabilities for tags, which we'll use
            #  after we've tagged every word to work out the sizes
            #  of the tag batches
            word_tag_probs.append([p for __, __, p in word_signs])

        if self.options['best']:
            # Only return one for each word
            self._batch_ranges = [[(0, 1)] for __ in range(len(self.input))]
        else:
            # Work out the number of tags to return in each batch
            batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
            # Every tag that has a sign was given a probability above. No
            #  batch is trimmed off here, so the least probable batch of
            #  tags is kept as well.
            # Transform the sizes into (start, end) index ranges, which are
            #  easier to use for getting the signs
            self._batch_ranges = [
                [(sum(batches[:i]), sum(batches[:i + 1]))
                 for i in range(len(batches))]
                for batches in batch_sizes
            ]
Beispiel #2
0
 def __init__(self, grammar, input, options=None, *args, **kwargs):
     """
     Tags using an ngram model backed by NLTK.
     
     The whole input sequence is tagged when the tagger is constructed.
     Two attributes are filled in:
     
     - ``self._tagged_data``: one list per input word of 
       ``(sign, tag, probability)`` triples, most probable first.
     - ``self._batch_ranges``: one list per input word of ``(start, end)`` 
       index ranges into the corresponding ``_tagged_data`` list, one 
       range per batch of tags.
     
     :param grammar: grammar whose ``get_sign_for_word_by_tag`` is queried 
         for the sign of every word/tag combination.
     :param input: input sequence; passed on to the superclass constructor.
     :param options: tagger options; passed on to the superclass 
         constructor. ``None`` (the default) means "no options". After the 
         superclass has been initialised, ``self.options['decode']`` and 
         ``self.options['best']`` are read.
     
     NOTE(review): ``self.model``, ``self.durations``, ``self.times`` and 
     ``self.batch_ratio`` are not set here; presumably the superclass 
     constructor or ``process_chord_input`` provides them - confirm.
     
     """
     # Use a fresh dict per call: a literal ``{}`` default is a single 
     # object shared by every call that omits the argument
     if options is None:
         options = {}
     super(NgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
     process_chord_input(self)
     
     #### Tag the input sequence ####
     self._tagged_data = []
     self._batch_ranges = []
     # Group the input into pairs to get observations
     inpairs = group_pairs(self.input, none_final=True)
     # Convert the pairs into observations
     observations = [
         observation_from_chord_pair(pair[0], pair[1], self.model.chordmap)
         for pair in inpairs
     ]
     
     # Use the ngram model to get tag probabilities for each input.
     # Any 'decode' value other than "viterbi" or "forward" selects 
     # forward-backward decoding
     decode = self.options['decode']
     if decode == "viterbi":
         probabilities = self.model.viterbi_probabilities(observations)
     elif decode == "forward":
         probabilities = self.model.forward_probabilities(observations)
     else:
         probabilities = self.model.forward_backward_probabilities(observations)
         
     word_tag_probs = []
     
     for index, probs in enumerate(probabilities):
         features = {
             'duration' : self.durations[index],
             'time' : self.times[index],
         }
         word_signs = []
         # Now assign a probability to each tag, given the observation
         for tag in self.model.tags:
             # Read a full sign out of the grammar
             sign = self.grammar.get_sign_for_word_by_tag(self.input[index], tag, extra_features=features)
             if sign is not None:
                 # Read off the probability from the matrix
                 word_signs.append((sign, tag, probs[tag]))
         
         # Shuffle before the (stable) sort so that tags with equal 
         # probability end up in a random relative order
         random.shuffle(word_signs)
         # Sort ascending by probability, then flip to get the most 
         # probable tag first
         word_signs.sort(key=lambda x: x[2])
         word_signs.reverse()
         self._tagged_data.append(word_signs)
         
         # Store the list of probabilities for tags, which we'll use 
         #  after we've tagged every word to work out the sizes
         #  of the tag batches
         word_tag_probs.append([p for __, __, p in word_signs])
     
     if self.options['best']:
         # Only return one for each word
         self._batch_ranges = [[(0, 1)] for __ in range(len(self.input))]
     else:
         # Work out the number of tags to return in each batch
         batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
         # Every tag that has a sign was given a probability above. No 
         #  batch is trimmed off here, so the least probable batch of 
         #  tags is kept as well.
         # Transform the sizes into (start, end) index ranges, which are 
         #  easier to use for getting the signs
         self._batch_ranges = [
             [(sum(batches[:i]), sum(batches[:i + 1]))
              for i in range(len(batches))]
             for batches in batch_sizes
         ]
Beispiel #3
0
 def __init__(self, grammar, input, options=None, *args, **kwargs):
     """
     Tag the input with the ngram model and precompute the spans to 
     return for every batch of tags.
     
     Two attributes are filled in:
     
     - ``self._tagged_times``: one list per timestep of 
       ``(sign, (root, schema), probability)`` triples, most probable 
       first. States with zero probability, or for which the grammar 
       returns no sign, are left out.
     - ``self._tagged_spans``: one list per batch offset of 
       ``(start, end, (sign, (root, schema), probability))`` spans. 
       Besides the length-1 span of every tag in the batch, this holds 
       the longer spans reported by the ``SpanCombiner`` for repeated 
       ``(root, schema)`` pairs. The final entry is always an empty 
       list: the batch offset at which nothing more could be added.
     
     ``self._batch_ranges`` is initialised to an empty list and not 
     filled in here; the batch ranges are only used locally.
     
     :param grammar: grammar whose ``get_signs_for_tag`` is queried for 
         the signs of each schema.
     :param input: input to tag; passed on to the superclass constructor 
         (and given directly to ``lattice_to_emissions`` for 
         ``WeightedChordLabelInput``).
     :param options: tagger options; passed on to the superclass 
         constructor. ``None`` (the default) means "no options". After 
         the superclass has been initialised, ``self.options['decode']``, 
         ``self.options['best']`` and (unless 'best' is set) 
         ``self.options['max_batch']`` are read.
     :raises TypeError: if ``self.wrapped_input`` is not a ``ChordInput``, 
         ``DbInput`` or ``WeightedChordLabelInput``.
     
     NOTE(review): ``self.model``, ``self.wrapped_input``, 
     ``self.durations``, ``self.times`` and ``self.batch_ratio`` are not 
     set here; presumably the superclass constructor or 
     ``process_chord_input`` provides them - confirm.
     
     """
     # Use a fresh dict per call: a literal ``{}`` default is a single 
     # object shared by every call that omits the argument
     if options is None:
         options = {}
     super(MultiChordNgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
     process_chord_input(self)
     
     #### Tag the input sequence ####
     self._tagged_times = []
     self._tagged_spans = []
     self._batch_ranges = []
     word_tag_probs = []
     
     # Map the chord types as the model requires
     chord_map = self.model.chordmap
     
     if isinstance(self.wrapped_input, ChordInput):
         chords = self.wrapped_input.to_db_input().chords
         observations = [(chord.root, chord_map[chord.type]) for chord in chords]
         self.input = chords
     elif isinstance(self.wrapped_input, DbInput):
         observations = [(chord.root, chord_map[chord.type]) for chord in self.wrapped_input.chords]
     elif isinstance(self.wrapped_input, WeightedChordLabelInput):
         observations = lattice_to_emissions(input, chord_map=chord_map)
     else:
         # Fail with a clear message instead of hitting an unbound 
         # ``observations`` further down
         raise TypeError("%s cannot tag input of type %s" % (
             type(self).__name__, type(self.wrapped_input).__name__))
         
     # Use the ngram model to get tag probabilities for each input.
     # Any 'decode' value other than "forward" selects forward-backward 
     # decoding
     if self.options['decode'] == "forward":
         probabilities = self.model.forward_probabilities(observations)
     else:
         probabilities = self.model.forward_backward_probabilities(observations)
     
     # Filter out zero probability states and order by desc prob.
     # The ascending sort is flipped with a slice so that each timestep is 
     # a real list rather than a one-shot iterator
     probabilities = [
         sorted(
             [(state, prob) for (state, prob) in timestep.items() if prob > 0.0],
             key=lambda x: x[1])[::-1]
         for timestep in probabilities]
     
     for index, probs in enumerate(probabilities):
         step_duration = self.durations[index]
         step_time = self.times[index]
         
         word_signs = []
         for (state, prob) in probs:
             root, schema = state
             # Instantiate a sign for this state. A new feature dict is 
             # built for every state so that no two grammar calls share 
             # (and overwrite the root of) the same dict object
             features = {
                 'duration' : step_duration,
                 'time' : step_time,
                 'root' : root,
             }
             signs = self.grammar.get_signs_for_tag(schema, features)
             # There should only be one of these
             if not signs:
                 continue
             word_signs.append((signs[0], (root, schema), prob))
         
         self._tagged_times.append(word_signs)
         
         # Store the list of probabilities for tags, which we'll use 
         #  after we've tagged every word to work out the sizes
         #  of the tag batches
         word_tag_probs.append([p for __, __, p in word_signs])
     
     if self.options['best']:
         # Only return one for each word
         batch_ranges = [[(0, 1)] for __ in range(len(self.input))]
     else:
         # Work out the number of tags to return in each batch
         batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio, max_batch=self.options['max_batch'])
         # Transform these into a form that's easier to use for getting the signs
         batch_ranges = [
             [(sum(batches[:i]), sum(batches[:i + 1]))
              for i in range(len(batches))]
             for batches in batch_sizes]
     
     # Step through adding each to see which we should also add to combine 
     #  repetitions of identical schema,root pairs
     def prob_combiner(probs):
         # A combined span gets the mean of the probabilities it combines
         return sum(probs, 0.0) / float(len(probs))
     combiner = SpanCombiner()
     added = True
     offset = 0
     # One pass per batch offset; stop after the first pass that finds no 
     #  tag at that offset for any timestep (that pass still appends its 
     #  empty span list)
     while added:
         added = False
         batch_spans = []
         for time, ranges in enumerate(batch_ranges):
             if offset >= len(ranges):
                 # This timestep has no batch at this offset
                 continue
             start, end = ranges[offset]
             tagged = self._tagged_times[time]
             # Never read past the tags that actually exist: in 'best' 
             #  mode the range is always (0,1), even for a timestep that 
             #  ended up with no signs
             for sign_offset in range(start, min(end, len(tagged))):
                 sign, (root, schema), prob = tagged[sign_offset]
                 added = True
                 # Add the length 1 span
                 batch_spans.append((time, time+1, (sign, (root, schema), prob)))
                 # Add this to the combiner to see if it combines 
                 #  with anything we've previously added
                 combined = combiner.combine_edge(
                                     (time, time+1, (root, schema)),
                                     properties=prob,
                                     prop_combiner=prob_combiner)
                 # Add each additional span with the same sign
                 for (span_start, span_end) in combined:
                     # Set the probability of the combined categories
                     new_prob = combiner.edge_properties[
                                 (span_start, span_end, (root, schema))]
                     # Set timing properties of this spanning category
                     span_features = {
                         'duration' : sum(
                                 self.durations[span_start:span_end]),
                         'time' : self.times[span_start],
                         'root' : root,
                     }
                     # Technically there could be multiple of these, 
                     #  though in fact there never are
                     new_signs = \
                         self.grammar.get_signs_for_tag(schema, span_features)
                     for new_sign in new_signs:
                         batch_spans.append(
                             (span_start, span_end, 
                                 (new_sign, (root, schema), new_prob)))
         self._tagged_spans.append(batch_spans)
         offset += 1