# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict, Counter
from glob import glob

import numpy as np
from matplotlib import pylab as plt
from wordcloud import WordCloud as WC

from text_models import Vocabulary
from text_models.dataset import TokenCount
from text_models.utils import TStatistic, LikelihoodRatios

# Text model emitting bigrams (-2) and unigrams (-1); tokens are joined
# with "~", e.g. "w_{i}~w_{i+1}" for a bigram.
tm = TokenCount.textModel(token_list=[-2, -1])
tm.tokenize("Good afternoon, we will make some nice word clouds.")
## SHOW - bigrams "w_{i}~w_{i+1}"

# Count token frequencies over every text file in books/.
token = TokenCount(tokenizer=tm.tokenize)
for fname in glob("books/*.txt"):
    # Use a context manager so each file is closed promptly; the original
    # left one unclosed handle per file in the loop.
    with open(fname, encoding="utf-8") as fh:
        txt = fh.read()
    token.process([txt])
token.counter
## SHOW - number of times each bigram and word appear

# Bigram keys are exactly those containing the "~" separator.
bigrams = {k: v for k, v in token.counter.items() if k.count("~")}
cnt = Counter(bigrams)
cnt.most_common(5)
# Example output:
# [('of~the', 14615),
#  ('in~the', 9913),
# NOTE(review): log_prob/prob take `self` and read self._data / self._words /
# self.N — they appear to be methods of a language-model class (presumably the
# `LM` constructed below, whose `class` header is outside this view); confirm
# against the full file.
def log_prob(self, ngram: str) -> float:
    """Return the log-probability of *ngram* under this model.

    Raises ValueError when the ngram (or its conditioning prefix) was
    never observed, i.e. either count is zero/missing.
    """
    # Count of the full ngram.
    c1 = self._data[ngram]
    if self._words:
        # Unigram model: normalize by the total token count N (a scalar).
        c2 = self.N
    else:
        # Conditional model: normalize by the count of the (n-1)-gram
        # prefix, obtained by dropping the last "~"-separated token.
        words = "~".join(ngram.split("~")[:-1])
        c2 = self.N[words]
    if c1 and c2:
        # log P = log(count(ngram)) - log(count(prefix) or N)
        return np.log(c1) - np.log(c2)
    raise ValueError("ngram %s not found" % ngram)

def prob(self, ngram: str) -> float:
    """Return the probability of *ngram* (exp of log_prob)."""
    return np.exp(self.log_prob(ngram))

# --- Script: evaluate a trigram model on a held-out test set ---
# Trigram-only text model (token_list=[-3]); n_gram = 3.
tm = TokenCount.textModel(token_list=[-3])
token = TokenCount(tokenizer=tm.tokenize)
# `Read` is defined elsewhere in this file/package — presumably splits the
# corpus into training text and a test_set; TODO confirm.
read = Read(glob("books/*.txt"), n_gram=tm.token_list[0] * -1)
token.process(read.read())
# words=True only for a unigram model (token_list[0] == -1); False here.
lm = LM(token.counter, words=tm.token_list[0] == -1)
logp = 0
max_logp, cnt = 0, 0
N = 0
for txt in read.test_set:
    for ngram in tm.tokenize(txt):
        N += 1
        try:
            _ = lm.log_prob(ngram)
            # NOTE(review): source is truncated here — the body of this
            # `if` (and the rest of the loop) is not visible in this chunk.
            if _ < max_logp: