def viterbi(self, input_distribution, incremental_best=False):
    """Standard non-incremental (sequence-level) viterbi over the
    input_distribution input.

    Keyword arguments:
    input_distribution -- the emission probabilities of each step in
        the sequence, an array of width n_classes
    incremental_best -- whether the tag sequence prefix is stored for
        each step in the sequence (slightly 'hack-remental')
    """
    incremental_best_list = []
    sentlen = len(input_distribution)
    self.viterbi_init()
    for word_index in range(0, sentlen):
        self.viterbi_step(input_distribution, word_index,
                          word_index == 0)
        # INCREMENTAL RESULTS (hack-remental: doing it post-hoc):
        # the best result we have so far, not given the next one
        if incremental_best:
            inc_best_tag_sequence = self.get_best_tag_sequence()
            incremental_best_list.append(
                deepcopy(inc_best_tag_sequence[1:]))
    # done with all words/input in the sequence;
    # find the probability of each tag having "se" (end of utterance)
    # next, and use that to find the overall best sequence
    prev_converted = self.converted[-1]
    prev_viterbi = self.viterbi[-1]
    best_previous = max(
        prev_viterbi.keys(),
        key=lambda prevtag: prev_viterbi[prevtag] +
        log(self.cpd_tags[prev_converted[prevtag]].prob("se")))
    self.best_tagsequence = ["se", best_previous]
    # invert the list of backpointers
    self.backpointer.reverse()
    # go backwards through the list of backpointers
    # (or in this case forward, as we've inverted the list);
    # at each step, the next best tag is the one listed under
    # the backpointer for the current best tag
    current_best_tag = best_previous
    for bp in self.backpointer:
        self.best_tagsequence.append(bp[current_best_tag])
        current_best_tag = bp[current_best_tag]
    self.best_tagsequence.reverse()
    if incremental_best:
        # NB this also consumes the end-of-utterance token,
        # so the last two results are the same
        incremental_best_list.append(self.best_tagsequence[1:-1])
        return incremental_best_list
    return self.best_tagsequence[1:-1]
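# Illustrative usage sketch, not part of the original class: `hmm` and
# `dist` are hypothetical names for a trained tagger instance and an
# emission matrix with one column per tag (here n_classes == 3):
#
#     import numpy as np
#     dist = np.asarray([[0.7, 0.2, 0.1],   # per-word emission probs
#                        [0.1, 0.8, 0.1],
#                        [0.2, 0.1, 0.7]])
#     final = hmm.viterbi(dist)  # best sequence, "s"/"se" stripped
#     prefixes = hmm.viterbi(dist, incremental_best=True)
#     # prefixes[i] is the best tag-sequence prefix after word i;
#     # the last two entries are identical, as the final entry has
#     # consumed the end-of-utterance token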
def get_best_n_tag_sequences(self, n, noisy_channel_source_model=None):
    """Return the best n tag sequences via a breadth-first search
    back through the viterbi matrix: try the best final tag and its
    backpointers, then the second best final tag, and so on; once all
    final tags are exhausted and n > len(final tags), move to the
    second best penultimate tags for each tag from best to worst,
    then the third row, etc. The search terminates when n sequences
    have been found.
    """
    num_seq = n
    best_n = []  # the tag sequences with their probability (tuples)
    for viterbi_depth in range(len(self.viterbi) - 1, -1, -1):
        if len(best_n) == num_seq:
            break
        inc_prev_viterbi = deepcopy(self.viterbi[viterbi_depth])
        inc_previous = sorted(inc_prev_viterbi.items(),
                              key=lambda x: x[1],
                              reverse=True)
        for tag, prob in inc_previous:
            if prob == log(0):
                continue  # prune zero-probability paths
            inc_best_tag_sequence = [tag]
            # invert the list of backpointers
            inc_backpointer = deepcopy(self.backpointer)
            inc_backpointer.reverse()
            # go backwards through the list of backpointers
            # (or in this case forward, as we've inverted the list)
            inc_current_best_tag = tag
            for bp in inc_backpointer:
                inc_best_tag_sequence.append(bp[inc_current_best_tag])
                inc_current_best_tag = bp[inc_current_best_tag]
            inc_best_tag_sequence.reverse()
            best_n.append((inc_best_tag_sequence, prob))
            if len(best_n) == num_seq:
                break
    best_n = sorted(best_n, key=lambda x: x[1], reverse=True)
    debug = False
    if debug:
        print "getting best n"
        for s, p in best_n:
            print s[-1], p
        print "***"
    assert best_n[0][1] > log(0.0), "best prob 0!"
    if not noisy_channel_source_model:
        return [x[0] for x in best_n]
    # in noisy channel mode, do the interpolation:
    # the whole beam is entertained for both the channel model
    # and the source model
    channel_beam = [
        (x[0], tag_conversion.convert_to_source_model_tags(x[0]), x[1])
        for x in best_n
    ]
    best_seqs = noisy_channel_source_model.\
        interpolate_probs_with_n_best(
            channel_beam,
            source_beam_width=1000,
            output_beam_width=n)
    return best_seqs
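# Illustrative sketch (hypothetical names): after a decode, the n-best
# beam can be inspected like this; entries come back best-first since
# the list is sorted by log probability before returning:
#
#     beam = hmm.get_best_n_tag_sequences(5)
#     for seq in beam:
#         print seq[-1]  # final tag of each candidate sequence
#
# When a noisy_channel_source_model is passed, the (tag sequence,
# source-tag sequence, log prob) channel beam is interpolated with the
# source model's beam, and the top n interpolated sequences are
# returned instead.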
def viterbi_step(self, input_distribution, word_index,
                 sequence_initial=False, timing_data=None):
    """The principal viterbi calculation for an extension to the
    input prefix, i.e. not resetting.
    """
    if sequence_initial:
        # the first step requires initialization with the
        # start-of-sequence tag
        first_viterbi = {}
        first_backpointer = {}
        first_converted = {}
        if self.noisy_channel_source_model:
            first_noisy_channel = {}
        for tag in self.observation_tags:
            # don't record anything for the START/END tags
            if tag == "s" or tag == "se":
                continue
            tag_prob = self.cpd_tags["s"].prob(self.convert_tag("s", tag))
            if tag_prob >= 0.00001:  # allowing for margin of error
                if self.constraint_only:
                    # TODO for now treating this like a {0,1} constraint
                    tag_prob = 1.0
            else:
                tag_prob = 0.0
            prob = log(tag_prob) + \
                log(input_distribution[word_index][self.tagToIndexDict[tag]])
            # no timing bias at the start
            if self.noisy_channel_source_model:
                # the noisy channel eliminates the missing tags
                source_tags = tag_conversion.\
                    convert_to_source_model_tags([tag], uttseg=True)
                source_prob, node = self.noisy_channel_source_model.\
                    get_log_diff_of_tag_suffix(source_tags, n=1)
                first_noisy_channel[tag] = node
                prob += (SOURCE_WEIGHT * source_prob)
            first_viterbi[tag] = prob
            first_backpointer[tag] = "s"
            first_converted[tag] = self.convert_tag("s", tag)
            assert first_converted[tag] in self.tag_set,\
                first_converted[tag] + " not in: " + str(self.tag_set)
        # store first_viterbi (the dictionary for the first word)
        # in the viterbi list, and record that the best previous tag
        # for any first tag is "s" (the start-of-sequence tag)
        self.viterbi.append(first_viterbi)
        self.backpointer.append(first_backpointer)
        self.converted.append(first_converted)
        if self.noisy_channel_source_model:
            self.noisy_channel.append(first_noisy_channel)
        self.add_to_history(first_viterbi, first_backpointer,
                            first_converted)
        return
    # else we're beyond the first word:
    # start a new dictionary where we can store, for each tag, the
    # prob of the best tag sequence ending in that tag
    # for the current word in the sentence
    this_viterbi = {}
    # we also store the best previous converted tag
    this_converted = {}
    # and a new dictionary where we can store, for each tag,
    # the best previous tag
    this_backpointer = {}
    # prev_viterbi is a dictionary that stores, for each tag,
    # the probability of the best tag sequence up to the previous
    # word ending in that tag
    prev_viterbi = self.viterbi[-1]
    prev_converted = self.converted[-1]
    if self.noisy_channel_source_model:
        this_noisy_channel = {}
        prev_noisy_channel = self.noisy_channel[-1]
    # for each tag, determine what the best previous tag is, and what
    # the probability is of the best tag sequence ending in that tag;
    # store this information in the dictionary this_viterbi
    if timing_data and self.timing_model:
        # get a distribution over the timing classes from the
        # timing classifier
        # TODO timing_data may already be an array
        X = self.timing_model_scaler.transform(np.asarray([timing_data]))
        input_distribution_timing = self.timing_model.predict_proba(X)
    for tag in self.observation_tags:
        # don't record anything for the START/END tags
        if tag in ["s", "se"]:
            continue
        # joint probability calculation:
        # if this tag is X and the current word is w, then
        # find the previous tag Y such that
        # the best tag sequence that ends in X
        # actually ends in Y X,
        # that is, the Y that maximizes
        # prev_viterbi[Y] * P(X | Y) * P(w | X)
        best_previous = None
        best_prob = log(0.0)  # has to be -inf for log numbers
        # the inner loop, which makes this quadratic complexity
        # in the size of the tag set
        for prevtag in prev_viterbi.keys():
            # the best converted tag; needs to access the previous one
            prev_converted_tag = prev_converted[prevtag]
            # TODO there could be several conversions for this tag
            converted_tag = self.convert_tag(prev_converted_tag, tag)
            assert converted_tag in self.tag_set, tag + " " + \
                converted_tag + " prev:" + str(prev_converted_tag)
            tag_prob = self.cpd_tags[prev_converted_tag].prob(
                converted_tag)
            if tag_prob >= 0.000001:  # allowing for margin of error
                if self.constraint_only:
                    # TODO for now treating this like a {0,1} constraint
                    tag_prob = 1.0
                test = converted_tag.lower()
                # check for different boosts for different tags
                if "rps" in test:
                    # boost for repair start (rps) tags
                    tag_prob = tag_prob * SPARSE_WEIGHT_RPS
                if "rpe" in test:
                    # boost for repair end (rpe) tags
                    tag_prob = tag_prob * SPARSE_WEIGHT_RPE
                if "t_" in test[:2]:
                    # boost for utterance-initial t tags
                    tag_prob = tag_prob * SPARSE_WEIGHT_T_
                if "_t" in test:
                    # boost for utterance-final t tags
                    tag_prob = tag_prob * SPARSE_WEIGHT_T
                if timing_data and self.timing_model:
                    found = False
                    for k, v in self.simple_trp_idx2label.items():
                        if v in tag:
                            timing_tag = k
                            found = True
                            break
                    if not found:
                        raw_input("warning: no timing tag for " + tag)
                    # use the prob from the timing classifier,
                    # an array over the different timing classes;
                    # the higher the timing weight, the more influence
                    # the timing classifier has
                    timing_prob = input_distribution_timing[0][timing_tag]
                    tag_prob = (TIMING_WEIGHT * timing_prob) + tag_prob
            else:
                tag_prob = 0.0
            # the principal joint log prob
            prob = prev_viterbi[prevtag] + log(tag_prob) + \
                log(input_distribution[word_index][self.tagToIndexDict[tag]])
            # gets updated by the noisy channel if in that mode
            if self.noisy_channel_source_model:
                prev_n_ch_node = prev_noisy_channel[prevtag]
                # The noisy channel model adds its score, assuming
                # this tag and the backpointed path from the previous
                # tag. All tags are converted to source tags first.
                # NB this is what slows things down; we need to go
                # from the known index in the noisy channel model.
                full_backtrack_method = False
                if full_backtrack_method:
                    inc_best_tag_sequence = [prevtag]
                    # invert the list of backpointers
                    inc_backpointer = deepcopy(self.backpointer)
                    inc_backpointer.reverse()
                    # go backwards through the list of backpointers
                    # (or in this case forward, as we've inverted
                    # the list)
                    inc_current_best_tag = prevtag
                    for b_count, bp in enumerate(inc_backpointer):
                        inc_best_tag_sequence.append(
                            bp[inc_current_best_tag])
                        inc_current_best_tag = bp[inc_current_best_tag]
                        if b_count > 9:
                            break
                    inc_best_tag_sequence.reverse()
                    inc_best_tag_sequence.append(tag)  # add this tag
                    source_tags = tag_conversion.\
                        convert_to_source_model_tags(
                            inc_best_tag_sequence[1:], uttseg=True)
                    source_prob, nc_node = \
                        self.noisy_channel_source_model.\
                        get_log_diff_of_tag_suffix(
                            source_tags, n=1)
                else:
                    # NB these only change if there is a
                    # backward-looking tag
                    if "<rm-" in tag:
                        m = re.search("<rm-([0-9]+)\/>", tag)
                        if m:
                            back = min([int(m.group(1)),
                                        len(self.backpointer)])
                            suffix = ["<e/>"] * back + ["<f/>"]
                            # to get the change in probability due to
                            # this tag we need to backtrack further
                            n = len(suffix)
                    else:
                        suffix = tag_conversion.\
                            convert_to_source_model_tags([tag])
                        n = 1  # just a monotonic extension
                    source_prob, nc_node = \
                        self.noisy_channel_source_model.\
                        get_log_diff_of_tag_suffix(
                            suffix,
                            start_node_ID=prev_n_ch_node,
                            n=n)
                prob += (SOURCE_WEIGHT * source_prob)
            if prob >= best_prob:
                best_converted = converted_tag
                best_previous = prevtag
                best_prob = prob
                if self.noisy_channel_source_model:
                    best_n_c_node = nc_node
        # if the best result is 0 do not add it (pruning);
        # this threshold could be set higher
        if best_prob > log(0.0):
            this_converted[tag] = best_converted
            this_viterbi[tag] = best_prob
            # the most likely preceding tag for this current tag
            this_backpointer[tag] = best_previous
            if self.noisy_channel_source_model:
                this_noisy_channel[tag] = best_n_c_node
    # done with all tags in this iteration,
    # so store the current viterbi step
    self.viterbi.append(this_viterbi)
    self.backpointer.append(this_backpointer)
    self.converted.append(this_converted)
    if self.noisy_channel_source_model:
        self.noisy_channel.append(this_noisy_channel)
    self.add_to_history(this_viterbi, this_backpointer, this_converted)
    return
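# Illustrative sketch of driving viterbi_step() incrementally
# (hypothetical names `hmm` and `dist`): each step extends the prefix
# by one word without resetting the lattice, which is what viterbi()
# above does internally:
#
#     hmm.viterbi_init()
#     for i in range(len(dist)):
#         hmm.viterbi_step(dist, i, sequence_initial=(i == 0))
#         print hmm.get_best_tag_sequence()[1:]  # current best prefix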