from text_models.dataset import TokenCount
from text_models.utils import TStatistic, LikelihoodRatios
from text_models import Vocabulary
from wordcloud import WordCloud as WC
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
from glob import glob

# Text model producing bigrams (-2) and words (-1); n-grams are joined with "~"
tm = TokenCount.textModel(token_list=[-2, -1])
tm.tokenize("Good afternoon, we will make some nice word clouds.")
## SHOW - bigrams "w_{i}~w_{i+1}"

# Count every word and bigram in the corpus
token = TokenCount(tokenizer=tm.tokenize)
for fname in glob("books/*.txt"):
    txt = open(fname, encoding="utf-8").read()
    token.process([txt])
token.counter
## SHOW - number of times each bigram and word appear

# Keep only the bigrams, i.e., the tokens that contain the "~" separator
bigrams = {k: v for k, v in token.counter.items() if k.count("~")}
cnt = Counter(bigrams)
cnt.most_common(5)
# [('of~the', 14615),
#  ('in~the', 9913),
#  ('to~the', 7339),
#  ('on~the', 4883),
#  ('and~the', 4843)]
bigrams
## SHOW the selection

# Word-cloud
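# A minimal sketch of the word cloud announced above, assuming the bigrams
# dictionary just built (keys keep the "~" separator). generate_from_frequencies
# takes a mapping from token to weight; the rendering choices below
# (interpolation, hiding the axes) are ours, not taken from the original.
wc = WC().generate_from_frequencies(bigrams)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()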
import numpy as np

# Maximum-likelihood n-gram language model over a TokenCount counter.
# The opening of this class falls outside this excerpt; __init__ and the
# first lines of log_prob are a reconstruction consistent with the rest of
# the code: with words=True the model is a unigram model normalized by the
# corpus size, otherwise each n-gram is conditioned on its (n-1)-gram prefix.
class LM:
    def __init__(self, counter, words: bool = False):
        self.counter = counter
        self.words = words
        if words:
            self.N = sum(counter.values())   # total number of tokens
        else:
            self.N = defaultdict(int)        # counts of the (n-1)-gram prefixes
            for k, v in counter.items():
                self.N["~".join(k.split("~")[:-1])] += v

    def log_prob(self, ngram: str) -> float:
        c1 = self.counter.get(ngram, 0)      # count of the full n-gram
        if self.words:
            c2 = self.N                      # unigram: normalize by corpus size
        else:
            words = "~".join(ngram.split("~")[:-1])
            c2 = self.N[words]               # condition on the (n-1)-gram prefix
        if c1 and c2:
            return np.log(c1) - np.log(c2)
        raise ValueError("ngram %s not found" % ngram)

    def prob(self, ngram: str) -> float:
        return np.exp(self.log_prob(ngram))


# Trigram model; token_list=[-3] produces "w_i~w_{i+1}~w_{i+2}" tokens
tm = TokenCount.textModel(token_list=[-3])
token = TokenCount(tokenizer=tm.tokenize)
# Read is assumed to be defined earlier in the document: it yields the
# training texts through read() and keeps a held-out test_set
read = Read(glob("books/*.txt"), n_gram=tm.token_list[0] * -1)
token.process(read.read())
lm = LM(token.counter, words=tm.token_list[0] == -1)

# Accumulate the negative log-likelihood of the held-out texts
logp = 0
max_logp, cnt = 0, 0
N = 0
for txt in read.test_set:
    for ngram in tm.tokenize(txt):
        N += 1
        try:
            _ = lm.log_prob(ngram)
            if _ < max_logp:     # track the worst (most negative) log-probability
                max_logp = _
            logp -= _
        except ValueError:
            # The excerpt is cut off at this point; a plausible completion
            # penalizes the unseen n-gram with the worst observed
            # log-probability and counts the miss
            logp -= max_logp
            cnt += 1
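# From the accumulated negative log-likelihood one can report a perplexity
# estimate for the held-out texts. A minimal sketch, assuming the logp, N,
# and cnt variables as filled in by the loop above (the original section
# ends before this point):
print("OOV n-grams: %d / %d" % (cnt, N))
print("Perplexity: %0.2f" % np.exp(logp / N))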