def bigram_perplexity(text, biprobs, unicnts, vocab):
    """Return the perplexity of ``text`` under the bigram model.

    Each adjacent pair ``(text[i], text[i + 1])`` is scored with
    ``bi.katz_backoff_prob2`` (presumably the probability of the second
    token given the first -- confirm against that helper).  Pairs whose
    first token is ``EOS0``, ``EOS1`` or ``SOS1`` are skipped.  The result
    is ``exp`` of the mean negative log-probability of the scored pairs.

    Args:
        text: Token sequence; must support ``len`` and slicing.
        biprobs: Passed through to ``bi.katz_backoff_prob2``.
        unicnts: Passed through to ``bi.katz_backoff_prob2``.
        vocab: Passed through to ``bi.katz_backoff_prob2``.

    Returns:
        ``np.exp`` of the average negative log-probability.

    Raises:
        ZeroDivisionError: If no pair is scored (for example ``text`` has
            fewer than two tokens, or every context is a skipped marker).
    """
    skipped_contexts = (EOS0, EOS1, SOS1)
    neg_log_total = 0.0
    scored = 0
    # Walk the sequence as (context, target) pairs.
    for context, target in zip(text, text[1:]):
        if context in skipped_contexts:
            continue
        scored += 1
        neg_log_total -= np.log(
            bi.katz_backoff_prob2(biprobs, unicnts, vocab, context, target))
    return np.exp(neg_log_total / scored)
def bigram_token_generate(bicnts, unicnts, vocab):
    """Sample a token sequence from the bigram model and print it.

    Generation starts from the context ``SOS0``.  At every step each
    vocabulary entry is weighted by
    ``bi.katz_backoff_prob2(bicnts, unicnts, vocab, context, token)`` and
    the next token is drawn with ``np.random.choice``.  Draws that land on
    ``unknown``, ``EOS1``, ``SOS1`` or ``SOS0`` are discarded and redrawn
    from the same weights.  Generation stops when ``EOS0`` is drawn;
    ``EOS0`` itself is not part of the printed text.

    Args:
        bicnts: Passed through to ``bi.katz_backoff_prob2``.
        unicnts: Passed through to ``bi.katz_backoff_prob2``.
        vocab: Iterable of candidate tokens; also passed through to
            ``bi.katz_backoff_prob2``.  Tokens are concatenated with
            ``' '``, so they are assumed to be strings.

    Returns:
        None.  The generated text is written to standard output, each
        token preceded by a single space.

    NOTE(review): the weights are handed to ``np.random.choice`` as ``p``,
    so they must form a valid probability vector for every context --
    confirm that ``bi.katz_backoff_prob2`` guarantees this.
    """
    # Tokens that must never be emitted; a draw that hits one is repeated.
    rejected = (unknown, EOS1, SOS1, SOS0)
    # The candidate list does not depend on the context, so build it once.
    candidates = list(vocab)
    generated = []
    context = SOS0
    while context != EOS0:
        weights = [
            bi.katz_backoff_prob2(bicnts, unicnts, vocab, context, token)
            for token in candidates
        ]
        context = np.random.choice(candidates, 1, p=weights)[0]
        while context in rejected:
            context = np.random.choice(candidates, 1, p=weights)[0]
        if context != EOS0:
            generated.append(context)
    sentence = ''.join(' ' + token for token in generated)
    # A single parenthesised argument prints the same bytes whether `print`
    # is the Python 2 statement or the Python 3 function: the sentence, a
    # space, and then two newlines.
    print(sentence + ' \n')
def katz_backoff3(tricnts, bicnts, unicnts, vocab):
    """Attach Katz back-off normalisers to every trigram context, in place.

    For each pair of keys ``(ww, vv)`` in ``tricnts`` the count mapping
    ``pdist = tricnts[ww][vv]`` is replaced by the tuple
    ``(pdist, balance, sumb)`` where

    * ``balance`` is ``1`` minus the sum over the keys ``uu`` of ``pdist``
      of ``(pdist[uu] - discount) / bicnts[ww][0][vv]``;
    * ``sumb`` is ``1`` minus the sum over the same keys of
      ``bi.katz_backoff_prob2(bicnts, unicnts, vocab, vv, uu)``.

    Class A: c(w, v, u) > 0
    Class B: c(w, v, u) = 0

    Args:
        tricnts: Two-level mapping ``ww -> vv -> {uu: count}``; mutated.
        bicnts: Mapping whose entry ``bicnts[ww][0]`` maps ``vv`` to the
            denominator used for the discounted trigram estimates.
        unicnts: Passed through to ``bi.katz_backoff_prob2``.
        vocab: Passed through to ``bi.katz_backoff_prob2``.

    Returns:
        None.  After the call every ``tricnts[ww][vv]`` is a 3-tuple, so
        the function must not be applied twice to the same structure.

    NOTE(review): under Python 2 the discounted ratio is an integer
    division if both the counts and ``discount`` are ints -- confirm that
    ``discount`` is a float (or that true division is enabled).
    """
    for ww in tricnts:
        bicnt = bicnts[ww]
        contexts = tricnts[ww]
        # Only the values of existing keys are replaced below, so the dict
        # can be iterated directly without taking a copy of its keys.
        for vv in contexts:
            pdist = contexts[vv]
            seen_mass = 0
            lower_order_mass = 0
            for uu in pdist:
                seen_mass += (pdist[uu] - discount) / bicnt[0][vv]
                lower_order_mass += bi.katz_backoff_prob2(
                    bicnts, unicnts, vocab, vv, uu)
            contexts[vv] = (pdist, 1 - seen_mass, 1 - lower_order_mass)
def katz_backoff_prob3(tricnts, bicnts, unicnts, vocab, ww, vv, uu):
    """Return the Katz back-off trigram estimate for ``uu`` after ``ww vv``.

    ``tricnts[ww][vv]`` is read as an indexable ``(pdist, balance, sumb)``
    entry, i.e. the layout that ``katz_backoff3`` stores.  A first token
    missing from ``tricnts`` is replaced by ``unknown``; a second token
    missing from the selected first-level entry is likewise replaced by
    ``unknown``.

    * If ``uu`` is a key of ``pdist`` the result is the discounted ratio
      ``(pdist[uu] - discount) / bicnts[base0][0][base1]``.
    * Otherwise the result backs off to the bigram model:
      ``balance * bi.katz_backoff_prob2(bicnts, unicnts, vocab, base1, uu)
      / sumb``.

    Args:
        tricnts: Two-level mapping of trigram contexts to
            ``(pdist, balance, sumb)`` entries.
        bicnts: Mapping whose entry ``bicnts[base0][0]`` maps the second
            context token to the denominator of the discounted estimate.
        unicnts: Passed through to ``bi.katz_backoff_prob2``.
        vocab: Passed through to ``bi.katz_backoff_prob2``.
        ww: First context token.
        vv: Second context token.
        uu: Token whose probability is requested.

    Returns:
        The probability estimate as computed above.

    Raises:
        KeyError: With plain dicts, if a required ``unknown`` fallback
            entry (``tricnts[unknown]`` or ``tricnts[base0][unknown]``) or
            ``bicnts[base0]`` is absent.
    """
    # `in` replaces dict.has_key, which no longer exists in Python 3.
    base0 = ww if ww in tricnts else unknown
    base1 = vv if vv in tricnts[base0] else unknown

    bicnt = bicnts[base0]
    entry = tricnts[base0][base1]
    pdist, balance, sumb = entry[0], entry[1], entry[2]

    if uu in pdist:
        # Seen trigram: discounted relative frequency.
        return (pdist[uu] - discount) / bicnt[0][base1]
    # Unseen trigram: redistribute the left-over mass via the bigram model.
    return balance * (
        bi.katz_backoff_prob2(bicnts, unicnts, vocab, base1, uu) / sumb)