def test_TokenCount_single_co_occurrence():
    from text_models.dataset import TokenCount

    tcount = TokenCount.single_co_ocurrence()
    tcount.process_line("buenos xxx dias")
    assert tcount.counter["dias~xxx"] == 1
    assert tcount.counter["xxx"] == 1

def test_TokenCount_process():
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount

    tcount = TokenCount.bigrams()
    tcount.process(tweet_iterator(TWEETS))
    print(tcount.counter.most_common(10))
    assert tcount.counter["in~the"] == 313

def test_TokenCount_process_line():
    from text_models.dataset import TokenCount

    tcount = TokenCount.bigrams()
    tcount.process_line("buenos dias xx la dias xx")
    counter = tcount.counter
    print(counter)
    assert counter["dias~xx"] == 2 and tcount.num_documents == 1

def test_TokenCount_clean():
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount

    tcount = TokenCount.single_co_ocurrence()
    tcount.process(tweet_iterator(TWEETS))
    ant = len(tcount.counter)  # vocabulary size before cleaning
    tcount.clean()
    act = len(tcount.counter)  # vocabulary size after cleaning
    assert ant > act

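# The test above only checks that clean() shrinks the vocabulary; the exact
# pruning criterion TokenCount uses is not part of this excerpt. As a rough,
# hypothetical illustration only (not TokenCount's implementation), a
# frequency-threshold prune over a plain Counter looks like this:
def prune_low_freq(counter, min_count=2):
    """Drop tokens seen fewer than min_count times (sketch only)."""
    from collections import Counter
    return Counter({k: v for k, v in counter.items() if v >= min_count})

# e.g. prune_low_freq(Counter({"the": 9, "xxzz": 1})) -> Counter({'the': 9})
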
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from text_models.dataset import TokenCount
from glob import glob
from text_models.utils import TStatistic, LikelihoodRatios
from wordcloud import WordCloud as WC
from matplotlib import pylab as plt
from collections import defaultdict, Counter
from text_models import Vocabulary

tm = TokenCount.textModel(token_list=[-2, -1])
tm.tokenize("Good afternoon, we will make some nice word clouds.")
## SHOW - bigrams "w_{i}~w_{i+1}"

token = TokenCount(tokenizer=tm.tokenize)
for fname in glob("books/*.txt"):
    txt = open(fname, encoding="utf-8").read()
    token.process([txt])
token.counter
## SHOW - number of times each bigram and word appear

bigrams = {k: v for k, v in token.counter.items() if k.count("~")}
cnt = Counter(bigrams)
cnt.most_common(5)
# [('of~the', 14615),
#  ('in~the', 9913),
#  ...
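
# Because token_list=[-2, -1] stores bigrams ("w1~w2") and single words in
# the same counter, a conditional estimate P(w2 | w1) can be read directly
# off the counts. A minimal sketch; cond_prob is a hypothetical helper, not
# part of text_models:
def cond_prob(counter, bigram):
    """Estimate P(w2 | w1) as count(w1~w2) / count(w1) (sketch only)."""
    w1 = bigram.split("~")[0]
    if not counter[w1]:
        raise ValueError("unseen word: %s" % w1)
    return counter[bigram] / counter[w1]

cond_prob(token.counter, "of~the")  # relative frequency of "the" after "of"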
def log_prob(self, ngram: str) -> float:
    c1 = self._data[ngram]
    if self._words:
        c2 = self.N
    else:
        words = "~".join(ngram.split("~")[:-1])
        c2 = self.N[words]
    if c1 and c2:
        return np.log(c1) - np.log(c2)
    raise ValueError("ngram %s not found" % ngram)

def prob(self, ngram: str) -> float:
    return np.exp(self.log_prob(ngram))


tm = TokenCount.textModel(token_list=[-3])
token = TokenCount(tokenizer=tm.tokenize)
read = Read(glob("books/*.txt"), n_gram=tm.token_list[0] * -1)
token.process(read.read())
lm = LM(token.counter, words=tm.token_list[0] == -1)

logp = 0
max_logp, cnt = 0, 0
N = 0
for txt in read.test_set:
    for ngram in tm.tokenize(txt):
        N += 1
        try:
            _ = lm.log_prob(ngram)
            if _ < max_logp:
                # reconstructed from here on (excerpt truncated in the source)
                max_logp = _  # track the smallest log-probability seen
            logp += _
        except ValueError:
            cnt += 1  # n-grams absent from the model raise ValueError
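
# A hedged sketch of a typical wrap-up (assumptions: perplexity is the target
# metric and each out-of-vocabulary n-gram is penalized with max_logp, the
# smallest log-probability observed on the test set; the source excerpt does
# not show this step):
logp += cnt * max_logp
perplexity = np.exp(-logp / N)
print("Perplexity: %0.4f (OOV n-grams: %d of %d)" % (perplexity, cnt, N))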