def __init__(self, n, tra_filename):
    self.tra_filename = tra_filename
    self.n = n
    self.tra_letter_list = []
    self.ngram_list = []
    self.n1gram_list = []
    with open(TRA_SET_ROOT + tra_filename, "r") as file:
        for sentence in file:
            sentence = sentence.replace('\n', '')
            # Pad each sentence so n-grams spanning its edges are counted.
            sentence_letter_list = list(nltk.pad_sequence(
                sentence, self.n, pad_left=True, pad_right=True,
                left_pad_symbol='<s>', right_pad_symbol='</s>'))
            self.tra_letter_list.extend(sentence_letter_list)
            self.ngram_list.extend(nltk.ngrams(sentence_letter_list, self.n))
            self.n1gram_list.extend(nltk.ngrams(sentence_letter_list, self.n - 1))
    self.V = len(set(self.tra_letter_list))  # vocabulary size
    self.ngram_cfd = nltk.FreqDist(self.ngram_list)
    self.n1gram_cfd = nltk.FreqDist(self.n1gram_list)
    if gl_smoothing_type == SmoothingType.NLTK_KNESER_NEY:
        self.kneser_ney_prob_dist = nltk.KneserNeyProbDist(
            self.ngram_cfd, bins=None, discount=KNESER_NEY_DISCOUNT)
    if gl_smoothing_type == SmoothingType.IMPROVED_KNESER_NEY:
        self.n2gram_cfd = nltk.FreqDist(self.tra_letter_list)
        # Continuation statistics for the hand-rolled smoothing: token counts
        # and distinct-type counts after each (n-1)-gram and (n-2)-gram
        # context. The unpacking below assumes n == 3.
        self.letter_count_after_n1gram = {}
        self.letter_typenum_after_n1gram = {}
        self.letter_count_after_n2gram = {}
        self.letter_typenum_after_n2gram = {}
        for w0, w1, w2 in self.ngram_cfd:
            if (w0, w1) not in self.letter_count_after_n1gram:
                self.letter_count_after_n1gram[(w0, w1)] = 0
                self.letter_typenum_after_n1gram[(w0, w1)] = 0
            self.letter_count_after_n1gram[(w0, w1)] += self.ngram_cfd[(w0, w1, w2)]
            self.letter_typenum_after_n1gram[(w0, w1)] += 1
        for w0, w1 in self.n1gram_cfd:
            if w0 not in self.letter_count_after_n2gram:
                self.letter_count_after_n2gram[w0] = 0
                self.letter_typenum_after_n2gram[w0] = 0
            self.letter_count_after_n2gram[w0] += self.n1gram_cfd[(w0, w1)]
            self.letter_typenum_after_n2gram[w0] += 1
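# A minimal, hedged reproduction of the constructor's pipeline above on a toy
# string: pad, extract character trigrams, count them, smooth with Kneser-Ney.
# The sentence, n=3, and the 0.75 discount are invented for illustration; only
# the NLTK calls mirror the snippet.
import nltk

n = 3
sentence = "abracadabra"
letters = list(nltk.pad_sequence(sentence, n, pad_left=True, pad_right=True,
                                 left_pad_symbol='<s>', right_pad_symbol='</s>'))
trigram_fd = nltk.FreqDist(nltk.ngrams(letters, n))
kn = nltk.KneserNeyProbDist(trigram_fd, bins=None, discount=0.75)
print(kn.prob(('a', 'b', 'r')))  # smoothed P('r' | 'a', 'b')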
def kneser_ney(tri_grams):
    # tri_grams: a mapping from trigram tuples to their counts.
    freq_dist = nltk.probability.FreqDist([*tri_grams])
    for k in freq_dist:
        freq_dist[k] = tri_grams[k]  # replace the dummy counts with the real ones
    KN = nltk.KneserNeyProbDist(freq_dist)
    KNDict = {}
    for i in KN.samples():
        KNDict[i] = KN.prob(i)
    return KNDict
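# A hedged usage sketch for kneser_ney() above, assuming tri_grams is a
# collections.Counter of trigram tuples (any trigram -> count mapping would
# do); the toy sentence is invented for illustration.
import nltk
from collections import Counter

tokens = "the cat sat on the mat and the cat ran".split()
tri_grams = Counter(nltk.trigrams(tokens))
kn_dict = kneser_ney(tri_grams)
print(kn_dict[('the', 'cat', 'sat')])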
def getDictionaryProb(self):
    DictionaryProbDict = {}
    # DictionaryDist = nltk.LaplaceProbDist(self.TrigramProb)  # Laplace alternative
    DictionaryDist = nltk.KneserNeyProbDist(self.TrigramProb)
    for i in DictionaryDist.samples():
        DictionaryProbDict[i] = DictionaryDist.prob(i)
    return DictionaryProbDict
def biKneserNeyBackup(nGram, palabra1, palabra2):
    fdist = nltk.FreqDist(nGram)
    kneser_ney = nltk.KneserNeyProbDist(fdist)
    prob_sum = 0
    limiter = 0
    for i in kneser_ney.samples():
        if i[0] == palabra1 and i[1] == palabra2:
            prob_sum += kneser_ney.prob(i)
            print("{0}:{1}".format(i, kneser_ney.prob(i)))
            print(prob_sum)
            limiter += 1
            if limiter > 50:
                break
import operator

def biKneserNey(nGram, palabra1, palabra2):
    fdist = nltk.FreqDist(nGram)
    kneser_ney = nltk.KneserNeyProbDist(fdist)
    dicPalabras3 = {}
    for i in kneser_ney.samples():
        # Collect every third word seen after the (palabra1, palabra2) context.
        if i[0] == palabra1 and i[1] == palabra2:
            dicPalabras3[i[2]] = kneser_ney.prob(i)
    if dicPalabras3 == {}:
        return '[END] no more iterations'
    # Return the most probable continuation.
    return sorted(dicPalabras3.items(), key=operator.itemgetter(1), reverse=True)[0][0]
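# A hedged usage sketch for biKneserNey() above, assuming nGram is an iterable
# of trigram tuples such as nltk.trigrams() yields; the sentence is invented.
import nltk

tokens = "el gato come pescado y el gato duerme mucho".split()
print(biKneserNey(list(nltk.trigrams(tokens)), "el", "gato"))
# -> the most probable word after "el gato" ("come" or "duerme" here)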
def kneser_ney_smoothing(kneser_train, kneser_input):
    """
    Kneser-Ney smoothing computes the probability of a trigram given its
    two-word prefix. NLTK's KneserNeyProbDist() works only on trigrams, so
    both dictionaries must map keys to lists of trigram tuples. For each
    input document, the training model yielding the lowest perplexity wins.

    :param kneser_train: A dictionary mapping each training key to its trigrams.
    :param kneser_input: A dictionary mapping each input key to its trigrams.
    """
    for input_key in kneser_input.keys():
        lowest_perplexity = None
        result_lang = None
        for train_key in kneser_train.keys():
            probability = 1
            freq_dist_train = nltk.FreqDist(kneser_train[train_key])
            kneser_ney_train = nltk.KneserNeyProbDist(freq_dist_train, bins=None,
                                                      discount=0.75)
            for input_ngrams in kneser_input[input_key]:
                prob_kn = kneser_ney_train.prob(input_ngrams)
                if prob_kn == 0:
                    prob_kn = 0.1  # floor unseen trigrams so the product stays nonzero
                probability *= prob_kn
            perplexity = probability ** (-1 / len(kneser_input[input_key]))
            if lowest_perplexity is None or perplexity < lowest_perplexity:
                lowest_perplexity = perplexity
                result_lang = train_key
        print(str(input_key) + "\t" + str(result_lang) + "\t" + str(lowest_perplexity))
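# A hedged sketch of calling kneser_ney_smoothing() on toy data; the language
# keys and sentences are invented, and real use would need far more text.
import nltk

kneser_train = {
    "en": list(nltk.trigrams("the cat sat on the mat".split())),
    "es": list(nltk.trigrams("el gato se sienta en la alfombra".split())),
}
kneser_input = {"doc1": list(nltk.trigrams("the cat sat on the mat".split()))}
kneser_ney_smoothing(kneser_train, kneser_input)
# prints something like: doc1    en    <perplexity>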
def triKneserNey(nGram, palabra1, palabra2, palabra3):
    fdist = nltk.FreqDist(nGram)
    kneser_ney = nltk.KneserNeyProbDist(fdist)
    prob_sum = 0
    limiter = 0
    for i in kneser_ney.samples():
        if i[0] == palabra1 and i[1] == palabra2 and i[2] == palabra3:
            prob_sum += kneser_ney.prob(i)
            print("{0}:{1}".format(i, kneser_ney.prob(i)))
            print(prob_sum)
            limiter += 1
            if limiter > 50:
                break
    # Fall back to the single most probable trigram in the model.
    return kneser_ney.max()
dist_ugs = Counter(all_tokens)  # unigram counts
bgs = nltk.bigrams(all_tokens)
dist_bgs = nltk.FreqDist(bgs)
tgs = nltk.trigrams(all_tokens)
dist_tgs = nltk.FreqDist(tgs)
fgs = nltk.ngrams(all_tokens, 4)
dist_fgs = nltk.FreqDist(fgs)
del all_tokens

# Maximum-likelihood bigram table: P(w2 | w1) = count(w1, w2) / count(w1).
prob_table_bi = defaultdict(dict)
for key, value in dist_bgs.items():
    prob_table_bi[key[0]][key[1]] = dist_bgs[key] / dist_ugs[key[0]]
del dist_bgs

# Kneser-Ney trigram table: P(w3 | w1, w2), keyed by the bigram context.
kn = nltk.KneserNeyProbDist(dist_tgs)
prob_table_kn2 = defaultdict(dict)
for gram in kn.samples():
    prob_table_kn2[gram[:2]][gram[2]] = kn.prob(gram)
del kn

# Keep only the four most probable words per context; this converts each
# nested dict into a sorted list of (word, probability) pairs.
for key, value in prob_table_kn2.items():
    prob_table_kn2[key] = sorted(value.items(), key=lambda x: x[1], reverse=True)[:4]
for key, value in prob_table_bi.items():
    prob_table_bi[key] = sorted(value.items(), key=lambda x: x[1], reverse=True)[:4]
def kneserNeyProbDist(freqDist):
    # Thin wrapper around NLTK's trigram Kneser-Ney distribution.
    return nltk.KneserNeyProbDist(freqDist)
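# A minimal usage sketch for the wrapper above; the sentence is invented, and
# the input must be a trigram FreqDist since KneserNeyProbDist is trigram-only.
import nltk

fd = nltk.FreqDist(nltk.trigrams("to be or not to be".split()))
kn = kneserNeyProbDist(fd)
print(kn.max())       # the trigram with the highest smoothed probability
print(kn.discount())  # the discount in use (NLTK's default is 0.75)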
def build_tri_gram(line_list):
    '''
    Count padded trigrams from one tokenized line into the global freq_dist.
    :param line_list: tokens of a single line
    :return: None
    '''
    start1, start2, end1, end2 = '<start-tag1>', '<start-tag2>', '</start-tag1>', '</start-tag2>'
    line_list.insert(0, start2)
    line_list.insert(0, start1)
    line_list.append(end1)
    line_list.append(end2)
    for index in range(2, len(line_list)):
        first = line_list[index - 2]
        second = line_list[index - 1]
        current = line_list[index]
        freq_dist[(first, second, current)] += 1

# Debug: dump the trigrams counted so far.
for x in freq_dist.items():
    w, y, z = x[0]
    print(w, y, z)

for path in data_path:
    with open(path, 'r') as file:
        line = file.readline()
        while line:
            line = line.strip()
            arr_line = preprocessing(line)
            build_tri_gram(arr_line)
            line = file.readline()

kneser_ney = nltk.KneserNeyProbDist(freq_dist)
pickle.dump(kneser_ney, dict_bi_gram_saver)
def __init__(self, *args):
    super(KneserNeyModel, self).__init__(*args)
    self.model = nltk.KneserNeyProbDist(self.ngrams)
freq_1gram = nltk.FreqDist(train_corpus)
len_brown = len(train_corpus)
vocab = len(set(train_corpus))
trigram = list(ngrams(train_corpus, 3, pad_left=True, pad_right=True,
                      left_pad_symbol='<s>', right_pad_symbol='</s>'))
trigrams_as_bigrams = [((t[0], t[1]), t[2]) for t in trigram]
cfreq_3gram = nltk.ConditionalFreqDist(trigrams_as_bigrams)
cprob_3gram = nltk.KneserNeyProbDist(nltk.FreqDist(trigram))

def trigram_prob(w1, w2, w3):
    return cprob_3gram.prob((w1, w2, w3))

def entropy(n, text):
    # Average per-token negative log-probability of `text` under the n-gram
    # model; relies on a logprob(token, context) helper defined elsewhere.
    entropy = 0.0
    text = ["<s>"] + text + ["</s>"]
    for i in range(n - 1, len(text)):
        context = text[i - n + 1:i]
        token = text[i]
        entropy += logprob(token, context)
    return entropy / float(len(text) - (n - 1))
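# A self-contained, hedged sketch of the trigram_prob() query above, with an
# invented toy corpus standing in for train_corpus.
import nltk
from nltk import ngrams

train_corpus = "the quick brown fox jumps over the lazy dog".split()
trigram = list(ngrams(train_corpus, 3, pad_left=True, pad_right=True,
                      left_pad_symbol='<s>', right_pad_symbol='</s>'))
cprob_3gram = nltk.KneserNeyProbDist(nltk.FreqDist(trigram))
print(cprob_3gram.prob(("the", "quick", "brown")))  # seen trigram, prob > 0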
for k in remove:
    del trigram_freq[k]

dev_sentences_tokenized_trigram_flattened = [
    val for sublist in dev_sent_tokenized_trigram for val in sublist
]
trigrams_dev = list(nltk.trigrams(dev_sentences_tokenized_trigram_flattened))
trigram_dev_freq = nltk.FreqDist(trigrams_dev)
remove = [k for k in trigram_dev_freq.keys() if k[2] in ['start1', 'start2']]
for k in remove:
    del trigram_dev_freq[k]

sum_prob = 0
trigram_cnt = 0
g = 0  # number of dev trigrams assigned zero probability
kn_tri = nltk.KneserNeyProbDist(trigram_freq)
for itm in trigrams_dev:
    if kn_tri.prob(itm) != 0:
        sum_prob += math.log2(kn_tri.prob(itm))
    else:
        g += 1
    trigram_cnt += 1

# Cross-entropy H = -(1/N) * sum(log2 P); perplexity = 2^H.
HC = -sum_prob / trigram_cnt
perpl = math.pow(2, HC)
print("Cross Entropy: {0:.3f}".format(HC))
print("Perplexity: {0:.3f}".format(perpl))
print("Zero-probability trigrams: {0} ({1:.3%} of dev)".format(g, g / len(trigrams_dev)))
knownCPPFile = open(knownCpp)
knownCPPString = ""
for line in knownCPPFile:
    knownCPPString += line
knownCPPGram = ngrams(knownCPPString.split(' '), 3)
knownCPPHashFreq = nltk.FreqDist(knownCPPGram)

#############################################################################################
# Section 2: calculate trigram probabilities
#############################################################################################
kneserJava = nltk.KneserNeyProbDist(knownJavaHashFreq)
kneserCPP = nltk.KneserNeyProbDist(knownCPPHashFreq)
kneserJavaHash = convertProbListToHash(kneserJava)
kneserCPPHash = convertProbListToHash(kneserCPP)

cpp = 0
java = 0
totalCppWithTag = 0
totalJavaWithTag = 0
totalJavaTags = 0
totalCppTags = 0
totalEval = 0
resultsFile = open('Results.txt', 'a')
codeFile = open('Code.txt', 'a')