Example #1
def test_TokenCount_single_co_occurrence():
    from text_models.dataset import TokenCount
    # Count single tokens and co-occurring pairs; pair keys are the two
    # tokens sorted alphabetically and joined with "~"
    tcount = TokenCount.single_co_ocurrence()
    tcount.process_line("buenos xxx dias")
    assert tcount.counter["dias~xxx"] == 1
    assert tcount.counter["xxx"] == 1
Example #2
def test_TokenCount_process():
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount
    tcount = TokenCount.bigrams()
    # TWEETS is the path to the test corpus, defined at module level in
    # the original test file
    tcount.process(tweet_iterator(TWEETS))
    print(tcount.counter.most_common(10))
    assert tcount.counter["in~the"] == 313
Example #3
def test_TokenCount_process_line():
    from text_models.dataset import TokenCount
    tcount = TokenCount.bigrams()
    tcount.process_line("buenos dias xx la dias xx")
    counter = tcount.counter
    print(counter)
    assert counter["dias~xx"] == 2 and tcount.num_documents == 1
Example #4
def test_TokenCount_clean():
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount
    tcount = TokenCount.single_co_ocurrence()
    tcount.process(tweet_iterator(TWEETS))
    ant = len(tcount.counter)
    tcount.clean()
    act = len(tcount.counter)
    # clean() prunes entries from the counter, so the vocabulary shrinks
    assert ant > act
Example #5
import numpy as np
from text_models.dataset import TokenCount
from glob import glob
from text_models.utils import TStatistic, LikelihoodRatios
from wordcloud import WordCloud as WC
from matplotlib import pylab as plt
from collections import defaultdict, Counter
from text_models import Vocabulary

tm = TokenCount.textModel(token_list=[-2, -1])
tm.tokenize("Good afternoon, we will make some nice word clouds.")
## SHOW - bigrams "w_{i}~w_{i+1}"
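# e.g., ['good~afternoon', 'afternoon~we', ..., 'good', 'afternoon', ...]
# (illustrative output; token_list=[-2, -1] produces word bigrams joined
# with "~" followed by the individual words)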

token = TokenCount(tokenizer=tm.tokenize)
for fname in glob("books/*.txt"):
    with open(fname, encoding="utf-8") as fh:
        token.process([fh.read()])
token.counter
## SHOW - number of times each bigram and word appear

# Keep only the keys that contain "~", i.e., the bigrams
bigrams = {k: v for k, v in token.counter.items() if k.count("~")}
cnt = Counter(bigrams)
cnt.most_common(5)
# [('of~the', 14615),
#  ('in~the', 9913),
#  ...]
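
# A possible next step, sketched here (not part of the original fragment):
# draw a word cloud from the raw bigram frequencies, using the WordCloud
# and pylab imports above.
wc = WC().generate_from_frequencies(bigrams)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()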
Example #6
    def log_prob(self, ngram: str) -> float:
        # Method of the LM class; the constructor (not part of this
        # fragment) stores the ngram counts in self._data and the
        # normalizing counts in self.N
        c1 = self._data[ngram]
        if self._words:
            # Unigram model: normalize by the total number of tokens
            c2 = self.N
        else:
            # Conditional model: normalize by the count of the prefix
            # w_1 ... w_{n-1}
            words = "~".join(ngram.split("~")[:-1])
            c2 = self.N[words]
        if c1 and c2:
            return np.log(c1) - np.log(c2)
        raise ValueError("ngram %s not found" % ngram)

    def prob(self, ngram: str) -> float:
        return np.exp(self.log_prob(ngram))
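
# Note: log_prob is the maximum-likelihood estimate
#   log P(w_n | w_1..w_{n-1}) = log c(w_1..w_n) - log c(w_1..w_{n-1}),
# and prob simply exponentiates it.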


tm = TokenCount.textModel(token_list=[-3])
token = TokenCount(tokenizer=tm.tokenize)
# Read is a helper defined earlier in the original example (not shown in
# this fragment); it exposes the training text via read() and a held-out
# test_set
read = Read(glob("books/*.txt"), n_gram=tm.token_list[0] * -1)
token.process(read.read())

lm = LM(token.counter, words=tm.token_list[0] == -1)
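# words is True only for a unigram model (token_list == [-1]); with
# token_list == [-3] the model conditions each word on its 2-word prefix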

logp = 0
max_logp, cnt = 0, 0
N = 0
for txt in read.test_set:
    for ngram in tm.tokenize(txt):
        N += 1
        try:
            _ = lm.log_prob(ngram)
            # Track the most negative log-probability seen; it is used
            # below as the penalty for unseen ngrams (the fragment is
            # truncated here, so this completion is an assumption)
            if _ < max_logp:
                max_logp = _
            logp += _
        except ValueError:
            # The ngram was not observed in the training counts
            cnt += 1
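
# Assumed continuation (not part of the truncated fragment): charge each
# of the cnt unseen ngrams the most negative observed log-probability and
# report the perplexity over the N test ngrams
logp += cnt * max_logp
perplexity = np.exp(-logp / N)
print(perplexity)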