Code example #1
File: hmm.py Project: sumtuck/deep_disfluency
    def viterbi(self, input_distribution, incremental_best=False):
        """Standard non incremental (sequence-level) viterbi over input_distribution input

        Keyword arguments:
        input_distribution -- the emission probabilities of each step in the sequence,
        array of width n_classes
        incremental_best -- whether the tag sequence prefix is stored for
        each step in the sequence (slightly 'hack-remental')
        """
        incrementalBest = []
        sentlen = len(input_distribution)
        self.viterbi_init()

        for word_index in range(0, sentlen):
            self.viterbi_step(input_distribution, word_index, word_index == 0)
            # INCREMENTAL RESULTS ('hack-remental': computed post-hoc)
            # the best result we have so far, not given the next one
            if incremental_best:
                inc_best_tag_sequence = self.get_best_tag_sequence()
                incrementalBest.append(deepcopy(inc_best_tag_sequence[1:]))
        # done with all words/input in the sentence/sequence
        # find the probability of each tag having "se" next (end of utterance)
        # and use that to find the overall best sequence
        prev_converted = self.converted[-1]
        prev_viterbi = self.viterbi[-1]
        best_previous = max(
            prev_viterbi.keys(),
            key=lambda prevtag: prev_viterbi[prevtag] + log(self.cpd_tags[
                prev_converted[prevtag]].prob("se")))
        self.best_tagsequence = ["se", best_previous]
        # invert the list of backpointers
        self.backpointer.reverse()
        # go backwards through the list of backpointers
        # (or in this case forward, we've inverted the backpointer list)
        # in each case:
        # the following best tag is the one listed under
        # the backpointer for the current best tag
        current_best_tag = best_previous
        for bp in self.backpointer:
            self.best_tagsequence.append(bp[current_best_tag])
            current_best_tag = bp[current_best_tag]
        self.best_tagsequence.reverse()
        if incremental_best:
            # NB this also consumes the end-of-utterance token,
            # so the last two results are the same
            incrementalBest.append(self.best_tagsequence[1:-1])
            return incrementalBest
        return self.best_tagsequence[1:-1]
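
The core of the decode above is the backtrace over the inverted backpointer
list. The sketch below isolates just that step; the function and data are
illustrative stand-ins, not part of deep_disfluency's API.

def backtrace(backpointers, final_tag):
    """Follow backpointers from the final tag back to the start."""
    path = [final_tag]
    for bp in reversed(backpointers):  # newest step first
        path.append(bp[path[-1]])      # best predecessor of the current tag
    path.reverse()                     # built back-to-front, so flip it
    return path

# toy run: two steps over tags {"A", "B"} with start tag "s"
backpointers = [{"A": "s", "B": "s"},   # best predecessor at step 1
                {"A": "A", "B": "A"}]   # best predecessor at step 2
print(backtrace(backpointers, "B"))     # -> ['s', 'A', 'B']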
Code example #2
File: hmm.py Project: sumtuck/deep_disfluency
    def get_best_n_tag_sequences(self, n, noisy_channel_source_model=None):
        # Do a breadth-first search:
        # try the best final tag and its backpointers, then the second
        # best final tag, etc.
        # Once all final tags are done and n > len(final tags),
        # move to the second best penultimate tags for each tag
        # from best to worst, then the 3rd row.
        # It terminates when n sequences have been found.
        # use the history self.history = [{"viterbi": deepcopy(viterbi),
        #                 "backpointer": deepcopy(backpointer),
        #                 "converted": deepcopy(converted)}] + self.history
        # num_seq = n if not noisy_channel_source_model else 1000
        num_seq = n
        best_n = []  # the tag sequences with their probability (tuple)
        # print "len viterbi", len(self.viterbi)
        # print "len backpoint", len(self.backpointer)
        for viterbi_depth in range(len(self.viterbi) - 1, -1, -1):
            if len(best_n) == num_seq:
                break
            inc_prev_viterbi = deepcopy(self.viterbi[viterbi_depth])
            # inc_best_previous = max(inc_prev_viterbi.keys(),
            #                        key=lambda prevtag:
            # inc_prev_viterbi[prevtag])
            inc_previous = sorted(inc_prev_viterbi.items(),
                                  key=lambda x: x[1],
                                  reverse=True)
            for tag, prob in inc_previous:
                # print tag, prob
                # prob = inc_prev_viterbi[inc_best_previous]
                # assert(prob != log(0)), "highest likelihood is 0!"
                if prob == log(0):  # NB assumes log(0) gives -inf here
                    continue
                inc_best_tag_sequence = [tag]
                # invert the list of backpointers
                inc_backpointer = deepcopy(self.backpointer)
                inc_backpointer.reverse()
                # go backwards through the list of backpointers
                # (or in this case forward, we have inverted the
                # backpointer list)
                inc_current_best_tag = tag
                # print "backpointer..."
                d = 0
                for bp in inc_backpointer:
                    d += 1
                    # print "depth", d, "find bp for", inc_current_best_tag
                    inc_best_tag_sequence.append(bp[inc_current_best_tag])
                    inc_current_best_tag = bp[inc_current_best_tag]
                # print "..."
                inc_best_tag_sequence.reverse()
                best_n.append((inc_best_tag_sequence, prob))
                if len(best_n) == num_seq:
                    break
        best_n = sorted(best_n, key=lambda x: x[1], reverse=True)
        debug = False
        if debug:
            print "getting best n"
            for s, p in best_n:
                print s[-1], p
            print "***"
        assert (best_n[0][1] > log(0.0)), "best prob 0!"

        if not noisy_channel_source_model:
            # return inc_best_tag_sequence
            return [x[0] for x in best_n]
        # if noisy channel do the interpolation
        # need to entertain the whole beam for the channel model and source
        # model
        # channel_beam = best_n  # the tag sequences with their probability
        # source_beam = noisy_channel.get_best_n_tag_sequences(1000)
        # self.interpolate_(channel_beam, source_beam)
        channel_beam = [
            (x[0], tag_conversion.convert_to_source_model_tags(x[0]), x[1])
            for x in best_n
        ]
        best_seqs = noisy_channel_source_model.\
            interpolate_probs_with_n_best(
                channel_beam,
                source_beam_width=1000,
                output_beam_width=n)
        return best_seqs
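
For reference, the ranking-and-backtrace core of get_best_n_tag_sequences can
be sketched in isolation as below. This is a simplified sketch under stated
assumptions: it ranks only the tags at the final step (no fallback to earlier
steps) and omits the noisy-channel interpolation; all names are illustrative.

NEG_INF = float("-inf")

def n_best_paths(viterbi, backpointers, n):
    """Return up to n (path, log prob) pairs, best first."""
    ranked = sorted(viterbi[-1].items(), key=lambda kv: kv[1], reverse=True)
    paths = []
    for tag, logp in ranked[:n]:
        if logp == NEG_INF:              # prune zero-probability endpoints
            continue
        path = [tag]
        for bp in reversed(backpointers):
            path.append(bp[path[-1]])    # best path ending in this tag
        path.reverse()
        paths.append((path, logp))
    return paths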
Code example #3
File: hmm.py Project: sumtuck/deep_disfluency
    def viterbi_step(self,
                     input_distribution,
                     word_index,
                     sequence_initial=False,
                     timing_data=None):
        """The principal viterbi calculation for an extension to the
        input prefix, i.e. not reseting.
        """
        # source_weight = 13 # higher for WML
        if sequence_initial:
            # first time requires initialization with the start of sequence tag
            first_viterbi = {}
            first_backpointer = {}
            first_converted = {}
            if self.noisy_channel_source_model:
                first_noisy_channel = {}
            for tag in self.observation_tags:
                # don't record anything for the START tag
                # print tag
                if tag == "s" or tag == 'se':
                    continue
                # print word_index
                # print input_distribution.shape
                # print self.tagToIndexDict[tag]
                # print input_distribution[word_index][self.tagToIndexDict[tag]]
                tag_prob = self.cpd_tags["s"].prob(self.convert_tag("s", tag))
                if tag_prob >= 0.00001:  # allowing for margin of error
                    if self.constraint_only:
                        # TODO for now treating this like a {0,1} constraint
                        tag_prob = 1.0
                else:
                    tag_prob = 0.0

                prob = log(tag_prob) + \
                    log(input_distribution[word_index][self.tagToIndexDict[tag]])
                # no timing bias to start
                if self.noisy_channel_source_model:
                    # noisy channel eliminate the missing tags
                    source_tags = tag_conversion.\
                                    convert_to_source_model_tags([tag],
                                                                 uttseg=True)
                    source_prob, node = self.noisy_channel_source_model.\
                        get_log_diff_of_tag_suffix(source_tags,
                                                   n=1)
                    first_noisy_channel[tag] = node
                    # prob = (source_weight * source_prob) + \
                    #        ((1 - source_weight) * prob)
                    prob += (SOURCE_WEIGHT * source_prob)
                first_viterbi[tag] = prob
                first_backpointer[tag] = "s"
                first_converted[tag] = self.convert_tag("s", tag)
                assert first_converted[tag] in self.tag_set,\
                    first_converted[tag] + " not in: " + str(self.tag_set)
            # store first_viterbi (the dictionary for the first word)
            # in the viterbi list, and record that the best previous tag
            # for any first tag is "s" (start of sequence tag)
            self.viterbi.append(first_viterbi)
            self.backpointer.append(first_backpointer)
            self.converted.append(first_converted)
            if self.noisy_channel_source_model:
                self.noisy_channel.append(first_noisy_channel)
            self.add_to_history(first_viterbi, first_backpointer,
                                first_converted)
            return
        # else we're beyond the first word
        # start a new dictionary where we can store, for each tag, the prob
        # of the best tag sequence ending in that tag
        # for the current word in the sentence
        this_viterbi = {}
        # we also store the best previous converted tag
        this_converted = {}  # added for the best converted tags
        # start a new dictionary where we can store, for each tag,
        # the best previous tag
        this_backpointer = {}
        # prev_viterbi is a dictionary that stores, for each tag, the prob
        # of the best tag sequence ending in that tag
        # for the previous word in the sentence.
        # So it stores, for each tag, the probability of a tag sequence
        # up to the previous word
        # ending in that tag.
        prev_viterbi = self.viterbi[-1]
        prev_converted = self.converted[-1]
        if self.noisy_channel_source_model:
            this_noisy_channel = {}
            prev_noisy_channel = self.noisy_channel[-1]
        # for each tag, determine what the best previous-tag is,
        # and what the probability is of the best tag sequence ending in it.
        # store this information in the dictionary this_viterbi
        if timing_data and self.timing_model:
            # print timing_data
            # X = self.timing_model_scaler.transform(np.asarray(
            # [timing_data[word_index-2:word_index+1]]))
            # TODO may already be an array
            # print "calculating timing"
            # print timing_data
            X = self.timing_model_scaler.transform(np.asarray([timing_data]))
            input_distribution_timing = self.timing_model.predict_proba(X)
            # print input_distribution_timing
            # raw_input()
        for tag in self.observation_tags:
            # don't record anything for the START/END tag
            if tag in ["s", "se"]:
                continue
            # joint probability calculation:
            # if this tag is X and the current word is w, then
            # find the previous tag Y such that
            # the best tag sequence that ends in X
            # actually ends in Y X
            # that is, the Y that maximizes
            # prev_viterbi[Y] * P(X | Y) * P(w | X)
            # (computed in log space in the loop below)
            best_previous = None
            best_prob = log(0.0)  # has to be -inf for log numbers
            # the inner loop which makes this quadratic complexity
            # in the size of the tag set
            for prevtag in prev_viterbi.keys():
                # the best converted tag, needs to access the previous one
                prev_converted_tag = prev_converted[prevtag]
                # TODO there could be several conversions for this tag
                converted_tag = self.convert_tag(prev_converted_tag, tag)
                assert converted_tag in self.tag_set, tag + " " + \
                    converted_tag + " prev:" + str(prev_converted_tag)
                tag_prob = self.cpd_tags[prev_converted_tag].prob(
                    converted_tag)
                if tag_prob >= 0.000001:  # allowing for margin of error
                    if self.constraint_only:
                        # TODO for now treating this like a {0,1} constraint
                        tag_prob = 1.0
                    test = converted_tag.lower()
                    # check for different boosts for different tags
                    if "rps" in test:  # boost for start tags
                        # boost for rps
                        tag_prob = tag_prob * SPARSE_WEIGHT_RPS
                    if "rpe" in test:
                        # boost for rp end tags
                        tag_prob = tag_prob * SPARSE_WEIGHT_RPE
                    if "t_" in test[:2]:
                        # boost for t tags
                        tag_prob = tag_prob * SPARSE_WEIGHT_T_
                    if "_t" in test:
                        tag_prob = tag_prob * SPARSE_WEIGHT_T
                    if timing_data and self.timing_model:
                        found = False
                        for k, v in self.simple_trp_idx2label.items():
                            if v in tag:
                                timing_tag = k
                                found = True
                                break
                        if not found:
                            # pause for inspection: no timing tag was found
                            raw_input("warning")
                        # using the prob from the timing classifier
                        # array over the different classes
                        timing_prob = input_distribution_timing[0][timing_tag]
                        # just adapting the prob of the timing tag:
                        # tag_prob = timing_prob
                        # the higher the timing weight, the more influence
                        # the timing classifier has
                        tag_prob = (TIMING_WEIGHT * timing_prob) + tag_prob
                else:
                    tag_prob = 0.0
                # the principal joint log prob
                prob = prev_viterbi[prevtag] + log(tag_prob) + \
                    log(input_distribution[word_index][self.tagToIndexDict[tag]])

                # gets updated by noisy channel if in this mode
                if self.noisy_channel_source_model:
                    prev_n_ch_node = prev_noisy_channel[prevtag]
                    # The noisy channel model adds the score
                    # if we assume this tag and the backpointed path
                    # from the prev tag
                    # Converting all to source tags first
                    # NB this is what is slowing things down
                    # Need to go from the known index
                    # in the nc model
                    full_backtrack_method = False
                    if full_backtrack_method:
                        inc_best_tag_sequence = [prevtag]
                        # invert the list of backpointers
                        inc_backpointer = deepcopy(self.backpointer)
                        inc_backpointer.reverse()
                        # go backwards through the list of backpointers
                        # (or in this case forward, we have inverted the
                        # backpointer list)
                        inc_current_best_tag = prevtag
                        for b_count, bp in enumerate(inc_backpointer):
                            inc_best_tag_sequence.append(
                                bp[inc_current_best_tag])
                            inc_current_best_tag = bp[inc_current_best_tag]
                            if b_count > 9:
                                break
                        inc_best_tag_sequence.reverse()
                        inc_best_tag_sequence.append(tag)  # add tag
                        source_tags = tag_conversion.\
                            convert_to_source_model_tags(
                                                inc_best_tag_sequence[1:],
                                                uttseg=True)
                        source_prob, nc_node = \
                            self.noisy_channel_source_model.\
                            get_log_diff_of_tag_suffix(
                                source_tags,
                                n=1)
                    else:
                        # NB these only change if there is a backward
                        # looking tag
                        if "<rm-" in tag:
                            m = re.search("<rm-([0-9]+)\/>", tag)
                            # a "<rm-" tag is assumed to always match;
                            # fail loudly rather than leave suffix unbound
                            assert m, "malformed rm tag: " + tag
                            back = min([int(m.group(1)),
                                        len(self.backpointer)])
                            suffix = ["<e/>"] * back + ["<f/>"]
                            # to get the change in probability due to this
                            # we need to backtrack further
                            n = len(suffix)
                        else:
                            suffix = tag_conversion.\
                                        convert_to_source_model_tags([tag])
                            n = 1  # just a monotonic extension
                            # print back, i, source_tags
                        source_prob, nc_node = \
                            self.noisy_channel_source_model.\
                            get_log_diff_of_tag_suffix(
                                        suffix,
                                        start_node_ID=prev_n_ch_node,
                                        n=n)

                    prob += (SOURCE_WEIGHT * source_prob)

                if prob >= best_prob:
                    best_converted = converted_tag
                    best_previous = prevtag
                    best_prob = prob
                    if self.noisy_channel_source_model:
                        best_n_c_node = nc_node
            # if best result is 0 do not add, pruning, could set this higher
            if best_prob > log(0.0):
                this_converted[tag] = best_converted
                this_viterbi[tag] = best_prob
                # the most likely preceding tag for this current tag
                this_backpointer[tag] = best_previous
                if self.noisy_channel_source_model:
                    this_noisy_channel[tag] = best_n_c_node
        # done with all tags in this iteration
        # so store the current viterbi step
        self.viterbi.append(this_viterbi)
        self.backpointer.append(this_backpointer)
        self.converted.append(this_converted)
        if self.noisy_channel_source_model:
            self.noisy_channel.append(this_noisy_channel)
        self.add_to_history(this_viterbi, this_backpointer, this_converted)
        return
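
Stripped of the tag boosts, the timing model, and the noisy-channel terms,
the recursion viterbi_step performs for each new observation reduces to the
standard max-over-predecessors update in log space. A minimal runnable
sketch, assuming toy transition and emission tables (none of these names are
the project's API):

from math import log

NEG_INF = float("-inf")

def viterbi_update(prev_viterbi, trans, emit_col):
    """One Viterbi step: for each tag, maximize
    prev score + log P(tag | prev) + log P(obs | tag)."""
    this_viterbi, this_backpointer = {}, {}
    for tag in emit_col:
        best_prev, best_prob = None, NEG_INF
        for prev, prev_prob in prev_viterbi.items():
            prob = prev_prob + trans.get((prev, tag), NEG_INF) + emit_col[tag]
            if prob >= best_prob:
                best_prev, best_prob = prev, prob
        if best_prob > NEG_INF:          # prune unreachable tags, as above
            this_viterbi[tag] = best_prob
            this_backpointer[tag] = best_prev
    return this_viterbi, this_backpointer

# toy run: one previous step, two tags
prev = {"A": log(0.6), "B": log(0.4)}
trans = {("A", "A"): log(0.7), ("A", "B"): log(0.3),
         ("B", "A"): log(0.5), ("B", "B"): log(0.5)}
emit = {"A": log(0.2), "B": log(0.8)}
print(viterbi_update(prev, trans, emit))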