from math import log

# The helpers and Django models below are assumed to live in project-local
# modules; adjust the import paths to match this project's layout.
# from .utils import make_string, split_n_gramm, n_gramm_estimation
# from .models import Ngramm, Translation


def uncertainty(orig, langin, trans, langout, change):
    """Score a candidate translation `trans` of the n-gram list `orig`.

    Combines an n-gram entropy estimate of the translated text with the
    summed log-probabilities of the individual n-gram translations.
    """
    t = make_string(trans)
    words = split_n_gramm(t)

    # Entropy of the translated text, estimated over windows of n tokens.
    sum_entropy = 0.0
    i = 0
    n = 5
    while i < len(words):
        try:
            if i + n < len(words):
                sum_entropy += log(n_gramm_estimation(words[i:i + n], langout, n), 2)
            else:
                sum_entropy += log(n_gramm_estimation(words[i:len(words)], langout, len(words) - i), 2)
        except Exception:
            # Unseen n-gram or zero estimate: apply a large penalty instead of failing.
            sum_entropy += -99999
        i += n  # advance outside the try block, otherwise a failed estimate loops forever

    # Summed log-probabilities of each n-gram translation.
    sum_max_prob = 0.0
    e_log = sum_entropy
    for i, n_gramm in enumerate(trans):
        try:
            # Both n-grams must be known in the target language;
            # otherwise Ngramm.DoesNotExist is raised and the penalty applies.
            Ngramm.objects.get(n_gramm=n_gramm, lang=langout)
            Ngramm.objects.get(n_gramm=orig[i], lang=langout)
            if change:
                sum_max_prob += log(Translation.objects.get(
                    orig=n_gramm, lang_orig=langout,
                    trans=orig[i], lang_trans=langin).probability, 2)
            else:
                sum_max_prob += log(Translation.objects.get(
                    orig=orig[i], lang_orig=langin,
                    trans=n_gramm, lang_trans=langout).probability, 2)
        except Exception:
            sum_max_prob += -99999

    power = -1 * (e_log + sum_max_prob / len(trans))
    if power > 10:
        return power
    return pow(2, power)
def cross_entropy(text, langout, size):
    """Average base-2 log-probability of `text` under the `langout` n-gram model."""
    total = 0.0  # renamed from `sum` to avoid shadowing the builtin
    text = make_string(text)
    words = split_n_gramm(text)  # split into higher-order n-grams
    i = 0
    while i < len(words):
        try:
            if i + size < len(words):
                total += log(n_gramm_estimation(words[i:i + size], langout, size), 2)
            else:
                total += log(n_gramm_estimation(words[i:len(words)], langout, len(words) - i), 2)
        except Exception:
            # Unseen n-gram or zero estimate: apply a large penalty instead of failing.
            total += -99999
        i += size
    return total / len(words)
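# A minimal usage sketch; the tokens and language codes below are hypothetical
# and only illustrate the call signatures. Running it requires the Ngramm and
# Translation tables to be populated for the chosen language pair.
def _example_usage():
    source = ['кот', 'сидит']     # source-language n-grams (made-up data)
    candidate = ['cat', 'sits']   # candidate translation in the target language

    # Average base-2 log-probability of the candidate under the English model.
    ce = cross_entropy(candidate, 'en', size=3)

    # Translation score: change=False looks probabilities up in the
    # orig -> trans direction; exponents above 10 are returned as-is,
    # everything else as 2 ** exponent.
    u = uncertainty(source, 'ru', candidate, 'en', change=False)
    return ce, u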
def get_n(self):
    # Order of the n-gram, i.e. the number of tokens it contains.
    return len(split_n_gramm(self.n_gramm))
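# Hedged sketch of the Django models these functions appear to query. Only the
# fields referenced above (n_gramm, lang, orig, trans, lang_orig, lang_trans,
# probability) are grounded in this code; the field types and lengths are
# assumptions, and get_n would live on Ngramm as shown.
#
# from django.db import models
#
# class Ngramm(models.Model):
#     n_gramm = models.CharField(max_length=255)  # the n-gram text itself
#     lang = models.CharField(max_length=8)       # language code
#
#     def get_n(self):
#         return len(split_n_gramm(self.n_gramm))
#
# class Translation(models.Model):
#     orig = models.CharField(max_length=255)
#     trans = models.CharField(max_length=255)
#     lang_orig = models.CharField(max_length=8)
#     lang_trans = models.CharField(max_length=8)
#     probability = models.FloatField()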