def distance_matrix(word_label_pairs):
    words, labels = zip(*word_label_pairs)
    # Materialize the pairs: a bare zip() is an iterator and would be
    # exhausted after the first pass through the loop below.
    unked_word_label_pairs = list(zip(unk(words), labels))
    conditions = set(labels)
    divergences = []
    for c1, c2 in pair_generator(conditions):
        fd1 = nltk.FreqDist([w for w, c in unked_word_label_pairs if c == c1])
        fd2 = nltk.FreqDist([w for w, c in unked_word_label_pairs if c == c2])
        P = nltk.MLEProbDist(fd1)
        Q = nltk.MLEProbDist(fd2)
        divergences.append(jensen_shannon_divergence(P, Q))
    n_conditions = len(conditions)
    distances = list(zip(divergences, pair_generator(conditions)))
    divergences = np.array(divergences).reshape((n_conditions, n_conditions))
    # plot that matrix
    cmap = plt.get_cmap('Blues')
    plt.pcolor(divergences, cmap=cmap)
    plt.xticks([x + .5 for x in range(n_conditions)], list(conditions), rotation=90)
    plt.yticks([x + .5 for x in range(n_conditions)], list(conditions))
    plt.title('Jensen-Shannon Divergence of conditional distributions')
    plt.ylabel('P()')
    plt.xlabel('Q()')
    plt.show()
    return distances
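# A minimal sketch of the jensen_shannon_divergence helper the function above
# relies on (its real implementation, like pair_generator and unk, lives
# elsewhere in the project): JSD(P, Q) = 0.5*KL(P||M) + 0.5*KL(Q||M), where M
# is the pointwise average of P and Q.
import math

def jensen_shannon_divergence(P, Q):
    support = set(P.samples()) | set(Q.samples())

    def kl_to_mixture(A):
        # KL(A || M) in bits, summed over the joint support.
        total = 0.0
        for s in support:
            p = A.prob(s)
            m = 0.5 * (P.prob(s) + Q.prob(s))
            if p > 0:
                total += p * math.log(p / m, 2)
        return total

    return 0.5 * kl_to_mixture(P) + 0.5 * kl_to_mixture(Q)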
def _create_probabilities(self):
    self.probs_by_features = {
        features: nltk.MLEProbDist(freq_dist)
        for (features, freq_dist) in self.freqs_by_features.items()
    }
    if USE_NEXT_TAG:
        self.probs_by_tags = {
            features: nltk.MLEProbDist(freq_dist)
            for (features, freq_dist) in self.freqs_by_tags.items()
        }
    self.probs_by_prev_tag = {
        prev_features: nltk.MLEProbDist(freq_dist)
        for (prev_features, freq_dist) in self.freqs_by_prev_tag.items()
    }
    self.probs_by_tag = {
        tag: nltk.MLEProbDist(freq_dist)
        for (tag, freq_dist) in self.freqs_by_tag.items()
    }
def get_next(self, current, weighted_by_probability=False):
    next_freq = self.transition_probabilities[current]
    if weighted_by_probability:
        # Sample successors in proportion to their observed frequency.
        prob_dist = nltk.MLEProbDist(next_freq)
    else:
        # Give every observed successor equal weight.
        prob_dist = nltk.UniformProbDist(next_freq)
    return prob_dist.generate()
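# A quick illustration of the two branches above, with hypothetical counts:
# MLEProbDist weights generate() by observed frequency, while UniformProbDist
# spreads probability evenly over the observed successors.
import nltk

next_freq = nltk.FreqDist({'cat': 9, 'dog': 1})
print(nltk.MLEProbDist(next_freq).prob('cat'))      # 0.9
print(nltk.UniformProbDist(next_freq).prob('cat'))  # 0.5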
def entropy(self, condition, base=None):
    """Return the entropy of the distribution of a given condition.

    If base is None (the default), the log base is the number of
    possible outcomes in the distribution, which normalizes the
    entropy to the range [0, 1]."""
    if condition == 'Ø' and 'Ø' not in self.conditions():
        condition = '#ALL#'
    prob_dist = nltk.MLEProbDist(self[condition])
    probs = [prob_dist.prob(bin_) for bin_ in prob_dist.samples()]
    if not base:
        base = len(self._possible_outcomes())
    return stats.entropy(probs, base=base)
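# A hedged illustration of the base choice above: using the number of possible
# outcomes as the log base normalizes entropy into [0, 1], making conditions
# with different numbers of outcomes comparable.
from scipy import stats

print(stats.entropy([0.5, 0.5], base=2))            # 1.0: uniform over 2 outcomes
print(stats.entropy([0.25] * 4, base=4))            # 1.0: uniform over 4 outcomes
print(stats.entropy([0.7, 0.1, 0.1, 0.1], base=4))  # < 1.0: skewed distribution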
def rep(self):
    scores = self.get_scores()
    # sums = dict((i, sum([t[1] for t in scores[i]])) for i in scores.keys())
    sums = dict()
    for k in scores.keys():
        sums[k] = nltk.MLEProbDist(nltk.FreqDist(dict(scores[k])))

    def rate(cand):
        r = 0
        for ngramlength, dat in scores.items():
            for c, s in dat:
                if c == cand:
                    r += ngramlength * sums[ngramlength].prob(c)
        return r

    return rate
def f(nick):
    # For each message
    bigram_frequency = defaultdict(nltk.FreqDist)
    for message in self.nicks[nick].messages:
        # Compute bigrams for the message
        bigrams = list(nltk.bigrams(nltk.word_tokenize(message)))
        if len(bigrams) < 1:
            continue
        # Pad with 0 sentinels marking the start and end of the message.
        bigrams = [(0, bigrams[0][0])] + bigrams + [(bigrams[-1][1], 0)]
        # Put bigrams into frequency distribution
        for bigram in bigrams:
            bigram_frequency[bigram[0]][bigram[1]] += 1
    for word, freq in bigram_frequency.items():
        self.nicks[nick].bigram_distribution[word] = nltk.MLEProbDist(freq)
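# A hedged sketch (method name hypothetical) of how the per-nick distribution
# built above could generate a message: start from the 0 start-sentinel and
# sample successors until the end-sentinel is drawn.
def generate_message(self, nick, max_len=30):
    dist = self.nicks[nick].bigram_distribution
    word, out = 0, []
    for _ in range(max_len):
        word = dist[word].generate()
        if word == 0:  # end-of-message sentinel
            break
        out.append(word)
    return ' '.join(out)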
def main():
    sents = create_tokens(CORPUS_FILENAME)
    train_corpus, test_corpus = train_test_split(sents)

    fd_1gram = ngram_freq_dist(train_corpus, ngram=1)
    cpd_1gram = nltk.MLEProbDist(fd_1gram)

    freq_dist2 = ngram_freq_dist(train_corpus, 2)
    print('Nations', freq_dist2["nations"])

    # Conditional frequency / probability distributions for bigrams
    cfd_2gram = ngram_freq_dist(train_corpus, ngram=2)
    cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)

    # ... and for trigrams
    cfd_3gram = ngram_freq_dist(train_corpus, ngram=3)
    cpd_3gram = nltk.ConditionalProbDist(cfd_3gram, nltk.MLEProbDist)

    pws_2gram = probable_words('united states', cpd_2gram, 2)
    pws_3gram = probable_words('donald trump', cpd_3gram, 3)
    print('Probable words for donald trump using 3 gram model', pws_3gram)

    test_sent1 = 'donald president is trump'
    test_sent2 = 'donald trump is president'
    prob_1gram = find_sent_prob(test_sent2, cpd_1gram, ngram=1)
    print('Sentence probability of {}'.format(test_sent2), prob_1gram)

    print('Entropy of 1 gram model', entropy(cpd_1gram, test_corpus, 1))
    print('Entropy of 2 gram model', entropy(cpd_2gram, test_corpus, 2))
    print('Entropy of 3 gram model', entropy(cpd_3gram, test_corpus, 3))
    print('Perplexity of 1 gram model', perplexity(cpd_1gram, test_corpus, 1))
    print('Perplexity of 2 gram model', perplexity(cpd_2gram, test_corpus, 2))
    print('Perplexity of 3 gram model', perplexity(cpd_3gram, test_corpus, 3))

    text_wiki = generate_txt_bigram_model(cpd_2gram, 'trump', numwords=10)
    print('Test sentence for trump:', text_wiki)
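# A plausible sketch of the probable_words helper used above (its real
# signature and the CFD's condition format are assumptions): rank the
# successors of the seed's trailing context under the conditional
# distribution.
def probable_words(seed, cpd, ngram, topn=5):
    tokens = seed.split()
    # Bigram CFDs are assumed to condition on a single word, higher orders
    # on a tuple of the preceding (ngram - 1) words.
    context = tokens[-1] if ngram == 2 else tuple(tokens[-(ngram - 1):])
    dist = cpd[context]
    return sorted(dist.samples(), key=dist.prob, reverse=True)[:topn]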
# Review the text files for cleanliness
# Describe this in the document
bbTokens = bbTokens[114:]  # no need to do this for King of the Wind

# Now we need to convert the tokens to all lowercase
bbWords = [w.lower() for w in bbTokens]
kwWords = [w.lower() for w in kwTokens]

# Check the length of the list of words
bbDict['Tokens'] = len(bbWords)
kwDict['Tokens'] = len(kwWords)

# Frequency and MLE probability distributions over bigrams and trigrams
bbBigram = ngrams(bbTokens, 2)
freq_dist = nltk.FreqDist(bbBigram)
prob_dist = nltk.MLEProbDist(freq_dist)
numBigrams = freq_dist.N()

bbTrigram = ngrams(bbTokens, 3)
Tfreq_dist = nltk.FreqDist(bbTrigram)
Tprob_dist = nltk.MLEProbDist(Tfreq_dist)
numTrigrams = Tfreq_dist.N()

# Bigrams - Black Beauty
bbBigramList = list(nltk.bigrams(bbWords))
print(bbBigramList[:30])

# Bigrams - King of the Wind
kwBigramList = list(nltk.bigrams(kwWords))
print(kwBigramList[:30])

# Trigrams - Black Beauty
def main():
    with open(FILE_PATH, 'r') as f:
        data = f.read().lower().replace('\n', ' ')
    sents = tokenized_words(data)
    rev_sents = tokenized_rev_words(data)
    train_corpus = [word for sent in sents for word in sent]
    rev_train_corpus = [word for sent in rev_sents for word in sent]

    # Forward and reverse bigram models, plus a unigram model
    cfd_2gram = ngram_freq_dist(train_corpus, 2)
    cfd_2gram_rev = ngram_freq_dist(rev_train_corpus, 2)
    cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)
    cpd_2gram_rev = nltk.ConditionalProbDist(cfd_2gram_rev, nltk.MLEProbDist)
    cfd_1gram = ngram_freq_dist(train_corpus)
    cpd_1gram = nltk.MLEProbDist(cfd_1gram)

    random_sentences = []
    random_pos_tags = []
    random_word_pos_tags = []
    # Generate 5000 sentences randomly
    for _ in range(5000):
        sent = generate_txt_bigram_model_random(cpd_2gram, cpd_2gram_rev, 'education', 9)
        word_pos_tags = nltk.pos_tag(sent.split())
        pos_tags = [x[1] for x in word_pos_tags]
        random_word_pos_tags.append(word_pos_tags)
        random_sentences.append(sent)
        random_pos_tags.append(pos_tags)

    # RULES:
    # 1. A determiner always comes before a noun.
    # 2. A noun can be followed by another noun phrase.
    # 3. Modals (could, will) can follow nouns.
    # ...
    pos_template_dict = {
        'NN': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'VBZ', 'NNS'],
        'NNS': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'NN'],
        'NNP': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'NNS'],
        'NNPS': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN'],
        'DT': ['NN', 'NNS', 'NNP', 'NNPS', 'VBP', 'JJ'],
        'JJ': ['CC'],
        'CC': ['NN', 'NNS', 'NNP', 'NNPS'],
        'VB': ['NN', 'DT', 'TO'],
        'VBD': ['NN', 'TO'],
        'VBG': ['IN', 'TO'],
        'VBP': ['VBG', 'RB', 'TO'],
        'VBN': ['RB', 'PRP', 'TO'],
        'VBZ': ['VBN'],
        'MD': ['VB', 'PRP'],
        'IN': ['DT', 'JJ'],
        'RB': ['NN', 'NNS'],
        'PRP': ['MD', 'VBD'],
        'TO': ['VB'],
    }

    filtered_sent = filter_sentences(random_pos_tags, random_sentences, pos_template_dict)
    # print_filtered_sent(filtered_sent)
    # print('------------------------------------------------------------------------------')

    dict_of_probs = sent_prob(filtered_sent, cpd_1gram, cpd_2gram)
    top_five = get_top_five(dict_of_probs)
    print('Top five tweets:\n')
    for tweet in top_five:
        print(tweet)
        print('=============')
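# A plausible sketch of the filter_sentences helper assumed above: keep a
# sentence only if every adjacent POS-tag pair is licensed by the template.
def filter_sentences(pos_tag_lists, sentences, template):
    kept = []
    for tags, sent in zip(pos_tag_lists, sentences):
        # A tag absent from the template licenses nothing, rejecting the sentence.
        if all(nxt in template.get(cur, []) for cur, nxt in zip(tags, tags[1:])):
            kept.append(sent)
    return kept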
def assignProbabilities(self, person):
    fd = nltk.FreqDist(self.personMessageDict[person])
    probDist = nltk.MLEProbDist(fd)
    for y in probDist.samples():
        self.probList[y] = probDist.prob(y)
j = j.replace("--", "")
j = j.replace("_", "")
book = book + j

print(len(book))
print()

#region unigrams
words = nltk.word_tokenize(book)
unigram = nltk.ngrams(words, 1)
freq_dist_un = nltk.FreqDist(unigram)
prob_dist_un = nltk.MLEProbDist(freq_dist_un)
# number_of_unigrams = prob_dist_un.N()
if False:
    for i in freq_dist_un:
        print(i, " ", freq_dist_un[i], " ", prob_dist_un.prob(i))
#endregion

#region bigrams
sentences = nltk.sent_tokenize(book)
tokenized = map(nltk.tokenize.word_tokenize, sentences)
bigrams = map(ngrams_wrapper, tokenized)
bigram = list(itertools.chain.from_iterable(bigrams))
def train(self, training_data):
    '''
    Trains an n-gram model.
    '''
    if self.status != 0:
        self.clear()

    # Parse training data, counting 1- through 5-gram (grapheme, phoneme)
    # alignments; '<' pads the left context and '>' marks the end.
    for alignment in training_data:
        graphs = ['<', '<', '<', '<']
        graphs.extend(alignment[0])
        graphs.append('>')
        phons = ['<', '<', '<', '<']
        phons.extend(alignment[1])
        phons.append('>')
        for i in range(4, len(phons)):
            self.uni[(graphs[i], phons[i])] += 1
            self.bi[((graphs[i - 1], graphs[i]),
                     (phons[i - 1], phons[i]))] += 1
            self.tri[((graphs[i - 2], graphs[i - 1], graphs[i]),
                      (phons[i - 2], phons[i - 1], phons[i]))] += 1
            self.quad[((graphs[i - 3], graphs[i - 2], graphs[i - 1], graphs[i]),
                       (phons[i - 3], phons[i - 2], phons[i - 1], phons[i]))] += 1
            self.quin[((graphs[i - 4], graphs[i - 3], graphs[i - 2], graphs[i - 1], graphs[i]),
                       (phons[i - 4], phons[i - 3], phons[i - 2], phons[i - 1], phons[i]))] += 1
            self.N[((graphs[i - 4], graphs[i - 3], graphs[i - 2], graphs[i - 1], graphs[i]),
                    (phons[i - 4], phons[i - 3], phons[i - 2], phons[i - 1], phons[i]))] += 1

    # Convert the raw counts to MLE probability estimates (no smoothing).
    self.uni = nltk.MLEProbDist(self.uni)
    self.bi = nltk.MLEProbDist(self.bi)
    self.tri = nltk.MLEProbDist(self.tri)
    self.quad = nltk.MLEProbDist(self.quad)
    self.quin = nltk.MLEProbDist(self.quin)

    # Lambda estimation by deleted interpolation: credit each 5-gram's count
    # to the highest-order model that assigns it the largest probability.
    for ngram in self.N:
        four_gram = ((ngram[0][1], ngram[0][2], ngram[0][3], ngram[0][4]),
                     (ngram[1][1], ngram[1][2], ngram[1][3], ngram[1][4]))
        three_gram = ((ngram[0][2], ngram[0][3], ngram[0][4]),
                      (ngram[1][2], ngram[1][3], ngram[1][4]))
        two_gram = ((ngram[0][3], ngram[0][4]),
                    (ngram[1][3], ngram[1][4]))
        one_gram = (ngram[0][4], ngram[1][4])
        count = self.N.freq(ngram) * self.N.N()  # raw count of this 5-gram
        p5 = self.quin.prob(ngram)
        p4 = self.quad.prob(four_gram)
        p3 = self.tri.prob(three_gram)
        p2 = self.bi.prob(two_gram)
        p1 = self.uni.prob(one_gram)
        if p5 >= p4 and p5 >= p3 and p5 >= p2 and p5 >= p1:
            self.lambda5 += count
        elif p4 >= p3 and p4 >= p2 and p4 >= p1:
            self.lambda4 += count
        elif p3 >= p2 and p3 >= p1:
            self.lambda3 += count
        elif p2 >= p1:
            self.lambda2 += count
        else:
            self.lambda1 += count

    # Normalize the lambdas so they sum to one.
    self.lambda5 = self.lambda5 / self.N.N()
    self.lambda4 = self.lambda4 / self.N.N()
    self.lambda3 = self.lambda3 / self.N.N()
    self.lambda2 = self.lambda2 / self.N.N()
    self.lambda1 = self.lambda1 / self.N.N()

    # set status
    self.status = 1
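# Not part of the original snippet: a sketch (method name hypothetical) of how
# the interpolation weights estimated above would typically be applied at
# prediction time, mixing all five model orders.
def interpolated_prob(self, five_gram):
    four_gram = (five_gram[0][1:], five_gram[1][1:])
    three_gram = (five_gram[0][2:], five_gram[1][2:])
    two_gram = (five_gram[0][3:], five_gram[1][3:])
    one_gram = (five_gram[0][4], five_gram[1][4])
    return (self.lambda5 * self.quin.prob(five_gram)
            + self.lambda4 * self.quad.prob(four_gram)
            + self.lambda3 * self.tri.prob(three_gram)
            + self.lambda2 * self.bi.prob(two_gram)
            + self.lambda1 * self.uni.prob(one_gram))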
with open('{}/test_plain.txt'.format(args.data_id), 'r') as f:
    data = f.read()
test_plain = data.split('\n')
if '' in test_plain:
    test_plain.remove('')

test_data = []
for c_sent, p_sent in zip(test_cipher, test_plain):
    sent_tuples = [(c_sent[i], p_sent[i]) for i in range(len(c_sent))]
    test_data.append(sent_tuples)

# Choose the estimator the HMM trainer will apply to each frequency
# distribution: Laplace (add-one) smoothing or plain MLE.
if args.laplace:
    estim = lambda fd, bins: nltk.LaplaceProbDist(fd, bins)
else:
    estim = lambda fdist, bins: nltk.MLEProbDist(fdist)

# Train HMM on POS tagging instead of ciphers
if args.pos:
    # nltk.download('brown')
    # nltk.download('universal_tagset')
    from nltk.corpus import brown

    # List of (list of (str, str)): each top-level list is a sentence,
    # containing (word, tag) pairs.
    brown_news_tagged = brown.tagged_sents(categories='news', tagset='universal')[:2000]
    n = len(brown_news_tagged)

    # Clean up sentences from brown and build sets of states and symbols
    tag_re = re.compile(r'[*]|--|[^+*-]+')
    tag_set = set()
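# A sketch (not in the original fragment) of where `estim` is consumed: nltk's
# HMM trainer calls the estimator with each FreqDist and a bin count when
# building its transition and emission tables. The `states`/`symbols` sets are
# derived from the tagged corpus here purely for illustration.
from nltk.tag import hmm

symbols = list({w for sent in brown_news_tagged for (w, _) in sent})
states = list({t for sent in brown_news_tagged for (_, t) in sent})
trainer = hmm.HiddenMarkovModelTrainer(states=states, symbols=symbols)
tagger = trainer.train_supervised(brown_news_tagged, estimator=estim)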
def get_word_dist(text):
    words = nltk.word_tokenize(text)
    # MLEProbDist expects a FreqDist, not the list of (word, count) pairs
    # returned by most_common(); rebuild a FreqDist restricted to the top 200.
    freq = nltk.FreqDist(dict(nltk.FreqDist(words).most_common(200)))
    dist = nltk.MLEProbDist(freq)
    return dist
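# Hedged usage sketch of the function above with a toy text: prob() returns
# the relative frequency and generate() draws a frequency-weighted sample.
dist = get_word_dist("the cat sat on the mat")
print(dist.prob("the"))  # 2/6, since "the" is 2 of the 6 tokens
print(dist.generate())   # a random token, weighted by frequency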