# q_max dictionary feature: for each of the (up to `limit`) longest target-language
# words, take the best translation probability offered by the source sentence and
# average the logs. Earlier variant without the fv switch; a later definition below
# supersedes it.
def feature_dict_qmax(slwords, tlwords, dict_stot, normalize_by_length, treat_oovs, dict_ttos, limit=20):
    logresult = 0
    # Split source words into alphabetic words known to the dictionary and
    # non-alphabetic tokens; alphabetic OOVs are dropped.
    slwords_s_a = set()
    slwords_s_n = set()
    for i in slwords:
        if regex_alpha.match(i):
            if i in dict_stot.d:
                slwords_s_a.add(i)
        else:
            slwords_s_n.add(i)
    tlwords2 = list(tlwords)
    tlwords2.sort(key=len, reverse=True)
    if treat_oovs:
        for tlword in tlwords2[0:limit]:
            if tlword not in dict_ttos:
                pass  # OOV target words are silently skipped (no penalty)
            else:
                t = [dict_stot.get_prob_alpha(slword, tlword) for slword in slwords_s_a]
                t.extend([dict_stot.get_prob_nonalpha(slword, tlword) for slword in slwords_s_n])
                prob = max(t, default=dict_stot.smooth)
                logresult += math.log(prob)
    else:
        for tlword in tlwords2[0:limit]:
            t = [dict_stot.get_prob_alpha(slword, tlword) for slword in slwords_s_a]
            t.extend([dict_stot.get_prob_nonalpha(slword, tlword) for slword in slwords_s_n])
            prob = max(t, default=dict_stot.smooth)
            logresult += math.log(prob)
    if normalize_by_length:
        logresult = float(logresult) / float(max(len(tlwords), limit))
    return math.exp(logresult)
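# Illustrative only: a minimal sketch of the probabilistic-dictionary interface
# that feature_dict_qmax and the variants below rely on (.d for membership
# tests, .smooth as the fallback probability, and the two get_prob_* lookups).
# _ToyProbDict and its toy probabilities are made up for the examples in this
# file; the real dictionary class is defined elsewhere in the project.
class _ToyProbDict:
    def __init__(self, probs, smooth=0.001):
        self.probs = probs      # maps (sl_word, tl_word) -> p(tl_word | sl_word)
        self.smooth = smooth    # probability used when a pair is not in the table
        self.d = {sl for sl, _ in probs}  # source-side vocabulary

    def get_prob_alpha(self, slword, tlword):
        return self.probs.get((slword, tlword), self.smooth)

    def get_prob_nonalpha(self, slword, tlword):
        return self.probs.get((slword, tlword), self.smooth)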
# No-smoothing, no-limit, frequency-weighted variant: every target word is scored,
# and its best translation probability is raised to (1 - relative corpus frequency),
# so very frequent target words contribute less to the feature.
def feature_dict_qmax_nosmooth_nolimit_freq(slwords, tlwords, dict_stot, normalize_by_length, tlwordfreqs, fv):
    logresult = 0
    slwords_s_a = set()
    slwords_s_n = set()
    for i in slwords:
        if regex_alpha.match(i):
            if i in dict_stot.d:
                slwords_s_a.add(i)
        else:
            slwords_s_n.add(i)
    slwords_s_n.add("NULL")
    tlwords2 = list(tlwords)
    tlwords2.sort(key=len, reverse=True)

    # Relative frequency of each target word within the sentence's frequency mass,
    # inverted so that rare words get exponents close to 1 and frequent ones close to 0.
    tlword_freqs = {}
    total_tlword_freq = 0
    for tlword in tlwords2:
        freq = tlwordfreqs.get_word_freq(tlword)
        total_tlword_freq += freq
        tlword_freqs[tlword] = freq
    tlword_norm_freqs_inverse = {w: 1 - (f / total_tlword_freq) for w, f in tlword_freqs.items()}

    for tlword in tlwords2:
        t = [dict_stot.get_prob_alpha(slword, tlword) for slword in slwords_s_a]
        t.extend([dict_stot.get_prob_nonalpha(slword, tlword) for slword in slwords_s_n])
        prob = max(t, default=dict_stot.smooth) ** tlword_norm_freqs_inverse[tlword]
        logresult += math.log(prob)
        logging.debug("\t" + str(prob) + "\t" + str(logresult))
    #logging.debug(str(logresult)+"\t"+str(float(logresult) / float(len(tlwords)))+"\t"+str(math.exp(float(logresult) / float(len(tlwords)))))

    if normalize_by_length:
        if fv >= 2:
            # the max prevents division by zero when the tl sentence is empty
            logresult = float(logresult) / float(max(1, len(tlwords)))
        else:
            # old behavior (it was a bug)
            logresult = float(logresult) / float(len(tlwords))
    return math.exp(logresult)
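# Illustrative only: how the (1 - relative frequency) exponents computed above
# behave. _demo_inverse_freq_weights is a hypothetical helper, not part of the
# original module, and the toy counts stand in for tlwordfreqs.get_word_freq().
def _demo_inverse_freq_weights():
    tlword_freqs = {"the": 900, "giraffe": 100}
    total = sum(tlword_freqs.values())
    weights = {w: 1 - (f / total) for w, f in tlword_freqs.items()}
    # weights ~= {"the": 0.1, "giraffe": 0.9}: the same max translation
    # probability of 0.2 contributes 0.2 ** 0.1 ~= 0.85 for the frequent word
    # but 0.2 ** 0.9 ~= 0.23 for the rare one, so rare target words dominate.
    return weights, 0.2 ** weights["the"], 0.2 ** weights["giraffe"]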
# No-smoothing variant: only the `limit` longest target words are scored, and a
# target word with no dictionary evidence (prob == 0.0) contributes nothing
# instead of producing math.log(0).
def feature_dict_qmax_nosmooth(slwords, tlwords, dict_stot, normalize_by_length, fv, limit=20):
    logresult = 0
    slwords_s_a = set()
    slwords_s_n = set()
    for i in slwords:
        if regex_alpha.match(i):
            if i in dict_stot.d:
                slwords_s_a.add(i)
        else:
            slwords_s_n.add(i)
    slwords_s_n.add("NULL")
    tlwords2 = list(tlwords)
    tlwords2.sort(key=len, reverse=True)
    for tlword in tlwords2[0:limit]:
        t = [dict_stot.get_prob_alpha(slword, tlword) for slword in slwords_s_a]
        t.extend([dict_stot.get_prob_nonalpha(slword, tlword) for slword in slwords_s_n])
        prob = max(t, default=dict_stot.smooth)
        if prob > 0.0:
            logresult += math.log(prob)
            logging.debug("\t" + str(prob) + "\t" + str(logresult))
    # the max(1, ...) below guards the debug trace against empty tl sentences
    logging.debug(str(logresult) + "\t" + str(float(logresult) / float(max(1, len(tlwords)))) + "\t"
                  + str(math.exp(float(logresult) / float(max(1, len(tlwords))))))
    if normalize_by_length:
        if fv >= 2:
            # the max prevents division by zero when the tl sentence is empty
            logresult = float(logresult) / float(max(1, min(len(tlwords), limit)))
        else:
            # old behavior (it was a bug)
            logresult = float(logresult) / float(max(len(tlwords), limit))
    return math.exp(logresult)
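# Illustrative only: a hypothetical call showing the no-smoothing behavior,
# using the _ToyProbDict sketch defined above. With smooth=0.0, a target word
# that has no dictionary evidence ("xyzzy") yields prob == 0.0 and is simply
# skipped by the `prob > 0.0` guard instead of raising on math.log(0).
def _demo_feature_dict_qmax_nosmooth():
    toy = _ToyProbDict({("casa", "house"): 0.7}, smooth=0.0)
    return feature_dict_qmax_nosmooth(["casa"], ["house", "xyzzy"], toy,
                                      normalize_by_length=True, fv=2)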
# Cumulated-probability variant: for each target word, sum the translation
# probabilities of all in-dictionary source words and normalize by the source
# length; target words covered only by non-alphabetic entries (incl. "NULL")
# contribute log(1.0) = 0, i.e. no penalty.
def feature_dict_qmax_nosmooth_nolimit_cummulated_prob(slwords, tlwords, dict_stot, normalize_by_length, fv):
    logresult = 0
    slwords_s_a = set()
    slwords_s_n = set()
    for i in slwords:
        if regex_alpha.match(i):
            if i in dict_stot.d:
                slwords_s_a.add(i)
        else:
            slwords_s_n.add(i)
    slwords_s_n.add("NULL")
    tlwords2 = list(tlwords)
    tlwords2.sort(key=len, reverse=True)
    for tlword in tlwords2:
        t = [dict_stot.get_prob_alpha(slword, tlword) for slword in slwords_s_a]
        if len(t) == 0:
            t.extend([dict_stot.get_prob_nonalpha(slword, tlword) for slword in slwords_s_n])
            if len(t) > 0:
                prob = 1.0
                logresult += math.log(prob)
        else:
            prob = sum(t) / float(len(slwords))
            logresult += math.log(prob)
    if normalize_by_length:
        if fv >= 2:
            # the max prevents division by zero when the tl sentence is empty
            logresult = float(logresult) / float(max(1, len(tlwords)))
        else:
            # old behavior (it was a bug)
            logresult = float(logresult) / float(len(tlwords))
    return math.exp(logresult)
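# Illustrative only: a hypothetical call for the cumulated-probability variant,
# again using the _ToyProbDict sketch. For "house" the accumulated probability
# is (0.6 + 0.1) / 3 over the three source tokens; "la" is dropped because it
# is alphabetic but absent from the toy dictionary.
def _demo_feature_dict_qmax_cummulated_prob():
    toy = _ToyProbDict({("casa", "house"): 0.6, ("blanca", "house"): 0.1,
                        ("blanca", "white"): 0.7}, smooth=0.0)
    return feature_dict_qmax_nosmooth_nolimit_cummulated_prob(
        ["la", "casa", "blanca"], ["white", "house"], toy,
        normalize_by_length=True, fv=2)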
# fv-aware version of feature_dict_qmax: with fv >= 2, OOV target words are
# penalized and the log sum is normalized by min(len(tlwords), limit); the
# older max()-based normalization is kept behind fv < 2 for compatibility.
def feature_dict_qmax(slwords, tlwords, dict_stot, normalize_by_length, treat_oovs, dict_ttos, fv, limit=20):
    logresult = 0
    slwords_s_a = set()
    slwords_s_n = set()
    for i in slwords:
        if regex_alpha.match(i):
            if i in dict_stot.d:
                slwords_s_a.add(i)
        else:
            slwords_s_n.add(i)
    tlwords2 = list(tlwords)
    tlwords2.sort(key=len, reverse=True)
    if treat_oovs:
        for tlword in tlwords2[0:limit]:
            if tlword not in dict_ttos:
                if fv >= 2:
                    logresult += math.log(0.0000001)
                else:
                    pass  # old behavior (it was a bug)
            else:
                t = [dict_stot.get_prob_alpha(slword, tlword) for slword in slwords_s_a]
                t.extend([dict_stot.get_prob_nonalpha(slword, tlword) for slword in slwords_s_n])
                prob = max(t, default=dict_stot.smooth)
                logresult += math.log(prob)
    else:
        for tlword in tlwords2[0:limit]:
            t = [dict_stot.get_prob_alpha(slword, tlword) for slword in slwords_s_a]
            t.extend([dict_stot.get_prob_nonalpha(slword, tlword) for slword in slwords_s_n])
            prob = max(t, default=dict_stot.smooth)
            logresult += math.log(prob)
    if normalize_by_length:
        if fv >= 2:
            # the max prevents division by zero when the tl sentence is empty
            logresult = float(logresult) / float(max(1, min(len(tlwords), limit)))
        else:
            # old behavior (it was a bug)
            logresult = float(logresult) / float(max(len(tlwords), limit))
    return math.exp(logresult)
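# Illustrative only: a hypothetical comparison of the fv switch using the
# _ToyProbDict sketch from above. With fv=1 the OOV target word "xyzzy" is
# ignored and the log sum is divided by max(len(tlwords), limit) == 20, which
# pulls short sentences toward 1.0; with fv=2 the OOV word is penalized and
# the sum is divided by min(len(tlwords), limit) == 3.
def _demo_feature_dict_qmax_fv():
    toy = _ToyProbDict({("casa", "house"): 0.7, ("perro", "dog"): 0.6})
    tl_vocab = {"house", "dog"}  # any container supporting `in` works for dict_ttos
    score_old = feature_dict_qmax(["casa", "perro"], ["house", "dog", "xyzzy"], toy,
                                  normalize_by_length=True, treat_oovs=True,
                                  dict_ttos=tl_vocab, fv=1)
    score_new = feature_dict_qmax(["casa", "perro"], ["house", "dog", "xyzzy"], toy,
                                  normalize_by_length=True, treat_oovs=True,
                                  dict_ttos=tl_vocab, fv=2)
    return score_old, score_new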