Beispiel #1
0
def do_accumilate_posterior_obs(accumilation_dict, obs, aj, ei, posterior_unigram_val):
    """Fold one posterior value into the observation/state/emission counters.

    The counters are kept in log space (original author's note): an existing
    entry is combined with ``lu.logadd`` and a missing entry is initialised
    to ``posterior_unigram_val``.

    Keys updated, in this order:
        ('count_obs', obs)
        ('count_state', aj)
        ('count_emission', obs, ei)
        ('any_emission_from', ei)

    :param accumilation_dict: dict of counters; it is mutated in place.
    :param obs: observed token; must be a ``str``.
    :param aj: state identifier; the only check is that it is not a tuple.
    :param ei: emitting token; must be a ``str``.
    :param posterior_unigram_val: value folded into each of the four counters.
    :return: the same ``accumilation_dict`` object that was passed in.
    :raises SystemExit: when the argument types are rejected.
    """
    arguments_ok = (isinstance(obs, str)
                    and not isinstance(aj, tuple)
                    and isinstance(ei, str))
    if not arguments_ok:
        # sys.exit(message) writes the message to stderr and terminates with
        # status 1.  The previous print + exit() reported success (status 0)
        # and relied on the `exit` helper that only the `site` module installs.
        import sys
        sys.exit('obs must be string, aj must be str, ei must be string')

    # The four counters share identical update logic, so drive them from one
    # tuple instead of four copy-pasted if/else stanzas.
    counter_keys = (
        ('count_obs', obs),
        ('count_state', aj),
        ('count_emission', obs, ei),
        ('any_emission_from', ei),  # total over everything emitted from ei
    )
    for key in counter_keys:
        if key in accumilation_dict:
            accumilation_dict[key] = lu.logadd(accumilation_dict[key], posterior_unigram_val)
        else:
            accumilation_dict[key] = posterior_unigram_val
    return accumilation_dict
def do_accumilate_posterior_obs(accumilation_dict, obs, aj, ei, posterior_unigram_val):
    """Fold one posterior value into the observation/state/emission counters.

    The counters are kept in log space (original author's note): an existing
    entry is combined with ``lu.logadd`` and a missing entry is initialised
    to ``posterior_unigram_val``.

    Keys updated, in this order:
        ('count_obs', obs)
        ('count_state', aj)
        ('count_emission', obs, ei)
        ('any_emission_from', ei)

    :param accumilation_dict: dict of counters; it is mutated in place.
    :param obs: observed token; must be a ``basestring`` instance.
    :param aj: state identifier; the only check is that it is not a tuple.
    :param ei: emitting token; must be a ``basestring`` instance.
    :param posterior_unigram_val: value folded into each of the four counters.
    :return: the same ``accumilation_dict`` object that was passed in.
    :raises SystemExit: when the argument types are rejected.
    """
    arguments_ok = (isinstance(obs, basestring)
                    and not isinstance(aj, tuple)
                    and isinstance(ei, basestring))
    if not arguments_ok:
        # sys.exit(message) writes the message to stderr and terminates with
        # status 1.  The previous print + exit() reported success (status 0)
        # and relied on the `exit` helper that only the `site` module installs.
        import sys
        sys.exit('obs must be string, aj must be str, ei must be string')

    # One loop replaces four copy-pasted if/else stanzas with identical logic.
    counter_keys = (
        ('count_obs', obs),
        ('count_state', aj),
        ('count_emission', obs, ei),
        ('any_emission_from', ei),  # total over everything emitted from ei
    )
    for key in counter_keys:
        if key in accumilation_dict:
            accumilation_dict[key] = lu.logadd(accumilation_dict[key], posterior_unigram_val)
        else:
            accumilation_dict[key] = posterior_unigram_val
    return accumilation_dict
def get_backwards(obs, trelis, alpha_pi, source_len=None):
    """Backward sweep over the trellis that also gathers posterior statistics.

    Positions are visited from ``len(obs) - 1`` down to 1.  For each state
    ``v`` at position ``k`` a unigram posterior is derived from ``beta_pi``
    and ``alpha_pi``; for each state ``u`` at position ``k - 1`` the entry
    ``beta_pi[(k - 1, u)]`` is extended and a bigram posterior is derived.
    Scores are combined with ``+``/``-`` and ``lu.logadd``, i.e. they are
    handled as log-domain values.

    :param obs: indexable sequence of observed (target) tokens.
    :param trelis: indexable by position; each state is an indexable pair
        whose element 0 is the state id and element 1 the source token.
    :param alpha_pi: mapping ``(position, state) -> score`` (presumably the
        forward scores -- confirm against the caller).
    :param source_len: passed through unchanged to ``get_jump_transition``
        and ``do_accumilate_posterior_bigrams_jump``.
    :return: tuple ``(p_unigrams, p_trans, p_obs, S, beta_pi)``.
    """
    last = len(obs) - 1  # index of last word
    final_key = (last, (BOUNDRY_STATE, BOUNDRY_STATE))
    beta_pi = {final_key: 0.0}
    S = alpha_pi[final_key]  # from line 13 in pseudo code
    p_unigrams, p_obs, p_trans = {}, {}, {}

    for k in range(last, 0, -1):
        for v in trelis[k]:
            # beta for (k, v) is final here: the inner loop only writes
            # entries for position k - 1.
            beta_kv = beta_pi[(k, v)]
            aj, source_token = v[0], v[1]
            posterior_unigram_val = beta_kv + alpha_pi[(k, v)] - S
            target_token = obs[k]
            p_obs = do_accumilate_posterior_obs(p_obs, target_token, aj, source_token, posterior_unigram_val)
            p_unigrams = do_append_posterior_unigrams(p_unigrams, k, v, posterior_unigram_val)

            for u in trelis[k - 1]:
                aj_1 = u[0]
                q = get_jump_transition(aj, aj_1, source_len)
                e = get_emission(target_token, source_token)
                p = q + e
                beta_p = beta_kv + p
                prev_key = (k - 1, u)
                if prev_key in beta_pi:
                    beta_pi[prev_key] = lu.logadd(beta_pi[prev_key], beta_p)
                else:  # implements lines 16
                    beta_pi[prev_key] = beta_p
                posterior_bigram_val = alpha_pi[prev_key] + p + beta_kv - S
                p_trans = do_accumilate_posterior_bigrams_jump(p_trans, aj, aj_1, posterior_bigram_val, source_len)

    return p_unigrams, p_trans, p_obs, S, beta_pi
def get_jump_mle(alignments_split, source_split):
    """Count alignment jumps over paired alignment/source sequences.

    Each adjacent pair ``(a[i], a[i - 1])`` of an alignment sequence is turned
    into a key by ``jump_key(current, previous, len(source))`` and that key's
    counter is combined with ``0.0`` through ``lu.logadd`` (a missing counter
    starts at ``-inf``).

    :param alignments_split: iterable of indexable alignment sequences.
    :param source_split: iterable of source sequences, paired with the
        alignments by ``zip``.
    :return: dict mapping jump keys to their accumulated counters.
    """
    jcounts = {}
    for alignment, source in zip(alignments_split, source_split):
        # Pair every element (from the second on) with its predecessor.
        for current, previous in zip(alignment[1:], alignment):
            key = jump_key(current, previous, len(source))
            running = jcounts.get(key, float('-inf'))
            jcounts[key] = lu.logadd(running, 0.0)
    return jcounts
def do_accumilate_posterior_bigrams_jump(accumilation_dict, aj, aj_1, posterior_bigram_val, sent_length):
    """Fold one bigram posterior into the counter of its jump key.

    The counters are kept in log space (original author's note): the value is
    combined with the existing counter via ``lu.logadd``, with a missing
    counter starting at ``-inf``.

    :param accumilation_dict: dict of counters; it is mutated in place.
    :param aj: current state id; must not be a tuple.
    :param aj_1: previous state id; must not be a tuple.
    :param posterior_bigram_val: value to fold into the counter.
    :param sent_length: forwarded to ``jump_key`` as its third argument.
    :return: the same ``accumilation_dict`` object that was passed in.
    :raises SystemExit: when either state id is a tuple.
    """
    # The old guard `not isinstance(aj, tuple) or isinstance(aj_1, tuple)`
    # parsed as `(not A) or B`, so any tuple aj_1 slipped through.  Reject a
    # tuple in either position.
    if isinstance(aj, tuple) or isinstance(aj_1, tuple):
        # sys.exit(message) reports on stderr with status 1; the previous
        # print + exit() terminated with status 0.
        import sys
        sys.exit('aj and aj_1 should be str ### or int %s %s' % (aj, aj_1))

    jkey = jump_key(aj, aj_1, sent_length)
    accumilation_dict[jkey] = lu.logadd(accumilation_dict.get(jkey, float('-inf')), posterior_bigram_val)
    return accumilation_dict
def accumilate(accumilator, addition):
    """Merge ``addition`` into ``accumilator`` in place and return it.

    * ``float`` values are combined with the current entry through
      ``lu.logadd``; a missing entry starts at ``-inf``.
    * ``set`` values are unioned into the current entry; a missing entry
      starts as a new empty set.
    * values of any other type are skipped.

    :param accumilator: dict receiving the merged values (mutated).
    :param addition: mapping providing ``iteritems()``.
    :return: the same ``accumilator`` object.
    """
    for key, value in addition.iteritems():
        if isinstance(value, float):
            current = accumilator.get(key, float('-inf'))
            accumilator[key] = lu.logadd(current, value)
            continue
        if isinstance(value, set):
            accumilator.setdefault(key, set()).update(value)
    return accumilator
Beispiel #7
0
def accumilate(accumilator, addition):
    """Fold every entry of ``addition`` into ``accumilator`` and return it.

    Floats are merged with ``lu.logadd`` (default ``-inf`` when the key is
    new), sets are merged by union (default: a new empty set), and entries of
    any other type are ignored.  ``accumilator`` is modified in place.

    :param accumilator: dict receiving the merged values.
    :param addition: mapping providing ``iteritems()``.
    :return: the same ``accumilator`` object.
    """
    for name, incoming in addition.iteritems():
        is_number = isinstance(incoming, float)
        if is_number:
            accumilator[name] = lu.logadd(accumilator.get(name, float('-inf')), incoming)
        elif isinstance(incoming, set):
            bucket = accumilator.setdefault(name, set())
            bucket.update(incoming)
    return accumilator
Beispiel #8
0
def do_accumilate_posterior_bigrams(accumilation_dict, aj, aj_1, posterior_bigram_val):
    """Fold one bigram posterior into the transition counters.

    The counters are kept in log space (original author's note): an existing
    entry is combined with ``lu.logadd`` and a missing entry is initialised
    to ``posterior_bigram_val``.

    Keys updated, in this order:
        ('count_transition', aj, aj_1)
        ('any_transition_from', aj_1)

    :param accumilation_dict: dict of counters; it is mutated in place.
    :param aj: current state id; must not be a tuple.
    :param aj_1: previous state id; must not be a tuple.
    :param posterior_bigram_val: value folded into both counters.
    :return: the same ``accumilation_dict`` object that was passed in.
    :raises SystemExit: when either state id is a tuple.
    """
    # The old guard `not isinstance(aj, tuple) or isinstance(aj_1, tuple)`
    # parsed as `(not A) or B`, so any tuple aj_1 slipped through.  Reject a
    # tuple in either position.
    if isinstance(aj, tuple) or isinstance(aj_1, tuple):
        # sys.exit(message) reports on stderr with status 1; the previous
        # print + exit() terminated with status 0.
        import sys
        sys.exit('aj and aj_1 should be str ### or int %s %s' % (aj, aj_1))

    for key in (('count_transition', aj, aj_1), ('any_transition_from', aj_1)):
        if key in accumilation_dict:
            accumilation_dict[key] = lu.logadd(accumilation_dict[key], posterior_bigram_val)
        else:
            accumilation_dict[key] = posterior_bigram_val
    return accumilation_dict
Beispiel #9
0
def do_accumilate_posterior_bigrams(accumilation_dict, aj, aj_1, posterior_bigram_val):
    """Add one bigram posterior to the per-transition and per-source counters.

    The counters are kept in log space (original author's note).  For each of
    the keys ``('count_transition', aj, aj_1)`` and
    ``('any_transition_from', aj_1)``, in that order, a missing entry is set
    to ``posterior_bigram_val`` and an existing entry is combined with it via
    ``lu.logadd``.

    :param accumilation_dict: dict of counters; it is mutated in place.
    :param aj: current state id; must not be a tuple.
    :param aj_1: previous state id; must not be a tuple.
    :param posterior_bigram_val: value folded into both counters.
    :return: the same ``accumilation_dict`` object that was passed in.
    :raises SystemExit: when either state id is a tuple.
    """
    # `not A or B` binds as `(not A) or B`; the original guard therefore let
    # every tuple aj_1 through.  Both ids are validated here.
    either_is_tuple = isinstance(aj, tuple) or isinstance(aj_1, tuple)
    if either_is_tuple:
        # Exit through sys.exit so the failure is reported on stderr with a
        # non-zero status instead of print + exit() with status 0.
        import sys
        sys.exit('aj and aj_1 should be str ### or int %s %s' % (aj, aj_1))

    transition_key = ('count_transition', aj, aj_1)
    source_key = ('any_transition_from', aj_1)
    for key in (transition_key, source_key):
        if key not in accumilation_dict:
            accumilation_dict[key] = posterior_bigram_val
        else:
            accumilation_dict[key] = lu.logadd(accumilation_dict[key], posterior_bigram_val)
    return accumilation_dict
Beispiel #10
0
def do_accumilate_posterior_bigrams_jump(accumilation_dict, aj, aj_1, posterior_bigram_val, sent_length):
    """Fold one bigram posterior into its jump counter and index the jump key.

    Two entries of ``accumilation_dict`` are touched:

    * ``jump_key(aj, aj_1)`` -- a log-space counter (original author's note)
      combined with ``posterior_bigram_val`` via ``lu.logadd``; a missing
      counter starts at ``-inf``.
    * ``jump_iip_key(sent_length, aj_1)`` -- a set that collects every jump
      key recorded under it.

    :param accumilation_dict: dict of counters/sets; it is mutated in place.
    :param aj: current state id; must not be a tuple.
    :param aj_1: previous state id; must not be a tuple.
    :param posterior_bigram_val: value to fold into the jump counter.
    :param sent_length: forwarded to ``jump_iip_key`` as its first argument.
    :return: the same ``accumilation_dict`` object that was passed in.
    :raises SystemExit: when either state id is a tuple.
    """
    # The old guard `not isinstance(aj, tuple) or isinstance(aj_1, tuple)`
    # parsed as `(not A) or B`, so any tuple aj_1 slipped through.  Reject a
    # tuple in either position.
    if isinstance(aj, tuple) or isinstance(aj_1, tuple):
        # sys.exit(message) reports on stderr with status 1; the previous
        # print + exit() terminated with status 0.
        import sys
        sys.exit('aj and aj_1 should be str ### or int %s %s' % (aj, aj_1))

    jkey = jump_key(aj, aj_1)
    jiip_key = jump_iip_key(sent_length, aj_1)
    accumilation_dict[jkey] = lu.logadd(accumilation_dict.get(jkey, float('-inf')), posterior_bigram_val)
    # setdefault avoids building a throw-away set when the entry exists.
    accumilation_dict.setdefault(jiip_key, set()).add(jkey)
    return accumilation_dict
Beispiel #11
0
def get_jump_mle(alignments_split, source_split):
    """Count alignment jumps and total them per ``jump_iip_key`` group.

    For each adjacent pair ``(a[i], a[i - 1])`` of an alignment sequence:

    * the counter under ``jump_key(current, previous)`` is combined with
      ``0.0`` through ``lu.logadd`` (a missing counter starts at ``-inf``);
    * the jump key is recorded in the set stored under
      ``jump_iip_key(len(source), previous)``.

    Afterwards every group is reduced with ``lu.logadd_of_list`` over the
    counters of the jump keys it collected.

    :param alignments_split: iterable of indexable alignment sequences.
    :param source_split: iterable of source sequences, paired with the
        alignments by ``zip``.
    :return: tuple ``(jump_counts, jiip)`` of the two dicts described above.
    """
    jump_counts = {}
    grouped_keys = {}
    for alignment, source in zip(alignments_split, source_split):
        # Pair every element (from the second on) with its predecessor.
        for current, previous in zip(alignment[1:], alignment):
            jkey = jump_key(current, previous)
            group = jump_iip_key(len(source), previous)
            jump_counts[jkey] = lu.logadd(jump_counts.get(jkey, float('-inf')), 0.0)
            grouped_keys.setdefault(group, set()).add(jkey)

    jiip = {}
    for group, members in grouped_keys.iteritems():
        jiip[group] = lu.logadd_of_list([jump_counts[member] for member in members])
    return jump_counts, jiip
def get_jump_mle(alignments_split, source_split):
    """Build jump counters plus one aggregate per ``jump_iip_key`` group.

    Walks every adjacent pair ``(a[i], a[i - 1])`` of each alignment
    sequence.  The counter keyed by ``jump_key(j1, j0)`` is combined with
    ``0.0`` via ``lu.logadd`` (``-inf`` when new), and the jump key is added
    to the set kept under ``jump_iip_key(len(s), j0)``.  Each set is then
    reduced to a single value with ``lu.logadd_of_list`` applied to the
    counters of its members.

    :param alignments_split: iterable of indexable alignment sequences.
    :param source_split: iterable of source sequences, paired with the
        alignments by ``zip``.
    :return: tuple ``(jump_counts, jiip)``.
    """
    neg_inf = float("-inf")
    jump_counts = {}
    keys_per_group = {}
    for a, s in zip(alignments_split, source_split):
        for j1, j0 in zip(a[1:], a):
            jkey = jump_key(j1, j0)
            jiip_key = jump_iip_key(len(s), j0)
            seen_so_far = jump_counts.get(jkey, neg_inf)
            jump_counts[jkey] = lu.logadd(seen_so_far, 0.0)
            keys_per_group.setdefault(jiip_key, set()).add(jkey)

    jiip = dict(
        (jiip_key, lu.logadd_of_list([jump_counts[jk] for jk in jkeys]))
        for jiip_key, jkeys in keys_per_group.iteritems()
    )
    return jump_counts, jiip
def get_jump_transition(current_state, prev_state, sent_length):
    '''
    This method implements eq (5) in the Vogel & Ney paper (HMM-Based Word Alignment in Statistical Translation)
    it returns the probability P(aj | aj-1, L)

    The value is ``jump_counts[jkey] - denom`` where ``denom`` is the
    ``lu.logadd`` total of ``jump_counts`` over every jump from ``prev_state``
    to a position in ``range(sent_length)``.  The total is memoised in the
    module-level ``jump_denoms`` dict under ``(prev_state, sent_length)``.

    :param current_state: state being jumped to.
    :param prev_state: state being jumped from.
    :param sent_length: sentence length; must be an int whenever the
        denominator has to be computed (it is passed to ``range``).
    :return: ``-100.00`` when the jump key has no entry in ``jump_counts``
        (presumably a floor log-probability for unseen jumps -- confirm),
        otherwise the normalised value described above.
    '''
    jkey = jump_key(current_state, prev_state, sent_length)
    if jkey not in jump_counts:
        return -100.00

    # TODO: using a normal distribution to get probability of jump widths might be much faster!
    denom_key = (prev_state, sent_length)
    if denom_key in jump_denoms:
        denom = jump_denoms[denom_key]
    else:
        denom = float('-inf')
        for l in range(sent_length):
            jl_key = jump_key(l, prev_state, sent_length)
            denom = lu.logadd(denom, jump_counts.get(jl_key, float('-inf')))
        # Store the finished total once.  The store used to sit inside the
        # loop, rewriting the cache entry with a partial sum every iteration.
        jump_denoms[denom_key] = denom
    return jump_counts[jkey] - denom
Beispiel #14
0
def get_backwards(obs, trelis, alpha_pi):
    """Backward sweep over the trellis that also gathers posterior statistics.

    Positions are visited from ``len(obs) - 1`` down to 1.  For each state
    ``v`` at position ``k`` a unigram posterior is derived from ``beta_pi``
    and ``alpha_pi``; for each state ``u`` at position ``k - 1`` the entry
    ``beta_pi[(k - 1, u)]`` is extended and a bigram posterior is derived.
    Scores are combined with ``+``/``-`` and ``lu.logadd``, i.e. they are
    handled as log-domain values.

    :param obs: indexable sequence of observed (target) tokens.
    :param trelis: indexable by position; each state is an indexable pair
        whose element 0 is the state id and element 1 the source token.
    :param alpha_pi: mapping ``(position, state) -> score`` (presumably the
        forward scores -- confirm against the caller).
    :return: tuple ``(p_unigrams, p_trans, p_obs, S, beta_pi)``.
    """
    n = len(obs) - 1  # index of last word
    end_key = (n, (BOUNDRY_STATE, BOUNDRY_STATE))
    beta_pi = {end_key: 0.0}
    S = alpha_pi[end_key]  # from line 13 in pseudo code
    p_unigrams = {}
    p_obs = {}
    p_trans = {}

    for k in range(n, 0, -1):
        for v in trelis[k]:
            # beta for (k, v) is final here: the inner loop only writes
            # entries for position k - 1.
            pb = beta_pi[(k, v)]
            aj, source_token = v[0], v[1]
            posterior_unigram_val = pb + alpha_pi[(k, v)] - S
            target_token = obs[k]
            p_obs = do_accumilate_posterior_obs(
                p_obs, target_token, aj, source_token, posterior_unigram_val)
            p_unigrams = do_append_posterior_unigrams(
                p_unigrams, k, v, posterior_unigram_val)

            for u in trelis[k - 1]:
                aj_1 = u[0]
                q = get_transition(aj, aj_1)
                e = get_emission(target_token, source_token)
                p = q + e
                beta_p = pb + p
                earlier_key = (k - 1, u)
                if earlier_key in beta_pi:
                    beta_pi[earlier_key] = lu.logadd(beta_pi[earlier_key], beta_p)
                else:  # implements lines 16
                    beta_pi[earlier_key] = beta_p
                posterior_bigram_val = alpha_pi[earlier_key] + p + pb - S
                p_trans = do_accumilate_posterior_bigrams(
                    p_trans, aj, aj_1, posterior_bigram_val)
    return p_unigrams, p_trans, p_obs, S, beta_pi
Beispiel #15
0
def accumilate(accumilator, addition):
    """Merge ``addition`` into ``accumilator`` with ``lu.logadd``.

    For every key of ``addition`` the entry in ``accumilator`` (``-inf`` when
    absent) is combined with the incoming value and stored back.

    :param accumilator: dict receiving the merged values (mutated in place).
    :param addition: mapping of values to merge in.
    :return: the same ``accumilator`` object.
    """
    neg_inf = float('-inf')
    for key in addition:
        accumilator[key] = lu.logadd(accumilator.get(key, neg_inf), addition[key])
    return accumilator
Beispiel #16
0
def accumilate(accumilator, addition):
    """Fold each value of ``addition`` into ``accumilator`` and return it.

    The existing entry for a key (or ``-inf`` when there is none) and the
    incoming value are passed to ``lu.logadd``; the result replaces the entry.
    ``accumilator`` is updated in place.

    :param accumilator: dict receiving the merged values.
    :param addition: mapping of values to merge in.
    :return: the same ``accumilator`` object.
    """
    for name in addition:
        previous = accumilator.get(name, float('-inf'))
        accumilator[name] = lu.logadd(previous, addition[name])
    return accumilator