예제 #1
0
def bigram_perplexity(text, biprobs, unicnts, vocab):
    """Return the bigram perplexity of a token sequence.

    Every adjacent pair ``(text[i], text[i + 1])`` is scored with
    ``bi.katz_backoff_prob2``. Pairs whose first token is ``EOS0``, ``EOS1``
    or ``SOS1`` are skipped and do not count towards the average.

    Args:
        text: Sequence of tokens supporting ``len``-style slicing.
        biprobs: Bigram table, forwarded unchanged to
            ``bi.katz_backoff_prob2``.
        unicnts: Unigram table, forwarded unchanged to the same helper.
        vocab: Vocabulary, forwarded unchanged to the same helper.

    Returns:
        ``exp`` of the mean negative log-probability of the scored pairs.

    Raises:
        ZeroDivisionError: If no pair is scored (fewer than two tokens, or
            every pair is skipped).
    """
    neg_log_total = 0.0
    scored_pairs = 0
    for history, target in zip(text, text[1:]):
        # Boundary markers are not valid histories for scoring.
        if history in (EOS0, EOS1, SOS1):
            continue
        scored_pairs += 1
        likelihood = bi.katz_backoff_prob2(
            biprobs, unicnts, vocab, history, target)
        neg_log_total -= np.log(likelihood)
    return np.exp(neg_log_total / scored_pairs)
예제 #2
0
def bigram_token_generate(bicnts, unicnts, vocab):
    """Sample a token sequence from the bigram model and print it.

    Generation starts from the history ``SOS0``. At each step every entry of
    ``vocab`` is scored with ``bi.katz_backoff_prob2`` given the current
    history, and the next token is drawn from those scores with
    ``np.random.choice``. Draws equal to ``unknown``, ``EOS1``, ``SOS1`` or
    ``SOS0`` are rejected and redrawn. Generation stops once ``EOS0`` is
    drawn; ``EOS0`` itself is not part of the output.

    Args:
        bicnts: Bigram table, forwarded to ``bi.katz_backoff_prob2``.
        unicnts: Unigram table, forwarded to ``bi.katz_backoff_prob2``.
        vocab: Iterable of candidate tokens; also forwarded to the helper.

    Returns:
        None. The generated tokens, each preceded by a single space, are
        printed, followed by a space and a blank line.

    Raises:
        ValueError: From ``np.random.choice`` when the scores for a history
            are not a valid probability vector.
    """
    # The candidate list does not depend on the history, so build it once.
    candidates = list(vocab)
    history = SOS0
    pieces = []

    while history != EOS0:
        weights = [
            bi.katz_backoff_prob2(bicnts, unicnts, vocab, history, token)
            for token in candidates
        ]

        # Rejection sampling: markers that must never be emitted are simply
        # redrawn. NOTE(review): this never terminates if all probability
        # mass sits on the rejected tokens.
        while True:
            drawn = np.random.choice(candidates, 1, p=weights)[0]
            if drawn not in (unknown, EOS1, SOS1, SOS0):
                break

        history = drawn
        if history != EOS0:
            pieces.append(' ' + history)

    # A single-argument call is valid in both Python 2 and Python 3 and
    # writes exactly what the former ``print ret, '\n'`` statement wrote:
    # the text, a space, a newline, and the terminating newline.
    print(''.join(pieces) + ' \n')
예제 #3
0
def katz_backoff3(tricnts, bicnts, unicnts, vocab):
    """Precompute the Katz back-off terms of every trigram history in place.

    For each two-token history ``(ww, vv)`` in ``tricnts``, the follower-count
    mapping stored at ``tricnts[ww][vv]`` is replaced by the tuple
    ``(pdist, balance, sumb)`` where

    * ``pdist`` is the original follower-count mapping (same object);
    * ``balance`` is ``1`` minus the sum of
      ``(count - discount) / bicnts[ww][0][vv]`` over the followers in
      ``pdist`` (Class A: c(w, v, u) > 0);
    * ``sumb`` is ``1`` minus the sum of
      ``bi.katz_backoff_prob2(bicnts, unicnts, vocab, vv, uu)`` over the same
      followers; ``katz_backoff_prob3`` divides by it when it backs off
      (Class B: c(w, v, u) = 0).

    Calling this a second time on the same table fails, because the entries
    are then tuples rather than mappings.

    Args:
        tricnts: Mapping ``ww -> vv -> {uu: count}``; mutated in place.
        bicnts: Mapping whose ``bicnts[ww][0][vv]`` is used as the
            denominator for history ``(ww, vv)``; also forwarded to
            ``bi.katz_backoff_prob2``.
        unicnts: Forwarded unchanged to ``bi.katz_backoff_prob2``.
        vocab: Forwarded unchanged to ``bi.katz_backoff_prob2``.

    Returns:
        None.

    Raises:
        KeyError: If ``ww`` is missing from ``bicnts``, or ``vv`` is missing
            from ``bicnts[ww][0]`` for a history that has followers.
    """
    for ww, histories in tricnts.items():
        history_counts = bicnts[ww][0]
        # Snapshot the keys: the values are replaced while looping.
        for vv in list(histories):
            pdist = histories[vv]
            seen_mass = 0
            seen_backoff_mass = 0
            for uu, count in pdist.items():
                # NOTE(review): under Python 2 this is floor division when
                # the count, `discount` and the denominator are all ints --
                # confirm `discount` is a float.
                seen_mass += (count - discount) / history_counts[vv]
                seen_backoff_mass += bi.katz_backoff_prob2(
                    bicnts, unicnts, vocab, vv, uu)
            histories[vv] = (pdist, 1 - seen_mass, 1 - seen_backoff_mass)
예제 #4
0
def katz_backoff_prob3(tricnts, bicnts, unicnts, vocab, ww, vv, uu):
    """Return the Katz back-off trigram probability of ``uu`` after ``ww vv``.

    ``tricnts[w][v]`` must hold ``(pdist, balance, sumb)`` tuples, the layout
    written by ``katz_backoff3``. A history token that is missing from the
    table is replaced by ``unknown`` before the lookup.

    * If ``uu`` was seen after the history, the result is the discounted
      relative frequency ``(pdist[uu] - discount) / bicnts[ww'][0][vv']``.
    * Otherwise the result backs off to the bigram model:
      ``balance * bi.katz_backoff_prob2(..., vv', uu) / sumb``.

    (``ww'`` / ``vv'`` denote the history tokens after the ``unknown``
    substitution.)

    Args:
        tricnts: Trigram table ``w -> v -> (pdist, balance, sumb)``.
        bicnts: Bigram table; ``bicnts[w][0][v]`` is the denominator of the
            seen case, and the table is forwarded to
            ``bi.katz_backoff_prob2``.
        unicnts: Forwarded unchanged to ``bi.katz_backoff_prob2``.
        vocab: Forwarded unchanged to ``bi.katz_backoff_prob2``.
        ww: First history token.
        vv: Second history token.
        uu: Token whose probability is requested.

    Returns:
        The probability computed by one of the two cases above.

    Raises:
        KeyError: If a required entry (including the ``unknown`` fallback
            entries) is missing from ``tricnts`` or ``bicnts``.
    """
    # `in` replaces dict.has_key(), which no longer exists in Python 3.
    base0 = ww if ww in tricnts else unknown
    histories = tricnts[base0]
    base1 = vv if vv in histories else unknown

    # Fetched before branching so that a history absent from `bicnts` raises
    # KeyError on both paths, not only on the seen-trigram path.
    bicnt = bicnts[base0]
    entry = histories[base1]
    pdist, balance, sumb = entry[0], entry[1], entry[2]

    if uu in pdist:
        # NOTE(review): under Python 2 this is floor division when the
        # count, `discount` and the denominator are all ints -- confirm
        # `discount` is a float.
        return (pdist[uu] - discount) / bicnt[0][base1]

    # Unseen trigram: hand the reserved mass to the bigram estimate.
    return balance * (
        bi.katz_backoff_prob2(bicnts, unicnts, vocab, base1, uu) / sumb)