Example #1
def test_doc2weight():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    assert len(sp.doc2weight(text.tokenize('odio odio los los'))) == 3
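    # doc2weight returns a 3-tuple whose first element holds the term ids
    # (see test_getitem below); naming the other two arrays tf and weights
    # is an assumption, not something these tests confirm
    ids, tf, weights = sp.doc2weight(text.tokenize('odio odio los los'))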
Example #2
def test_tfidf_corpus():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    docs = [text.tokenize(d) for d in docs]
    counter = Counter()
    for x in docs:
        counter.update(set(x))
    tfidf = TFIDF(docs)
    tfidf2 = TFIDF.counter(counter)
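    # TFIDF.counter builds the model from document frequencies (each
    # document contributed as a set above), so both constructions must agree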
    assert tfidf.num_terms == tfidf2.num_terms
    assert tfidf._ndocs == tfidf2._ndocs
    for k in tfidf2.word2id.keys():
        assert k in tfidf.word2id
    for k, v in tfidf.word2id.items():
        id2 = tfidf2.word2id[k]
        v = tfidf.wordWeight[v]
        v2 = tfidf2.wordWeight[id2]
        print(v, v2, k)
        assert_almost_equals(v, v2)
Example #3
def test_getitem():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    tok = text.tokenize('buenos dias')
    bow = sp.doc2weight(tok)
    ids = bow[0]
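    # sp[tok] yields one (id, weight) pair per distinct known token
    # (Example #6 unpacks these pairs), hence its length matches ids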
    assert len(ids) == len(sp[tok])
Example #4
def test_lang():
    from microtc.textmodel import TextModel

    text = [
        "Hi :) :P XD",
        "excelente dia xc",
        "el alma de la fiesta XD"
    ]
    model = TextModel(text, **{
        "del_dup1": True,
        "emo_option": "group",
        "lc": True,
        "num_option": "group",
        "strip_diac": False,
        "token_list": [
            (2, 1),
            (2, 2),
            -1,
            # 5,
        ],
        "url_option": "group",
        "usr_option": "group",
    })
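    # token_list semantics exercised below: (2, 1) and (2, 2) produce
    # two-word skip-grams skipping one and two words respectively,
    # and -1 keeps the individual words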
    text = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    a = model.tokenize(text)
    b = ['el~de', 'alma~la', 'de~fiesta', 'la~_pos', 'fiesta~conociendo', '_pos~la', 'conociendo~maquinaria', 'la~_usr', 'maquinaria~bebiendo', '_usr~nunca',
         'bebiendo~manches', 'nunca~que', 'manches~onda', 'el~la', 'alma~fiesta', 'de~_pos', 'la~conociendo', 'fiesta~la', '_pos~maquinaria', 'conociendo~_usr',
         'la~bebiendo', 'maquinaria~nunca', '_usr~manches', 'bebiendo~que', 'nunca~onda', 'el', 'alma', 'de', 'la', 'fiesta', '_pos',
         'conociendo', 'la', 'maquinaria', '_usr', 'bebiendo', 'nunca', 'manches', 'que', 'onda']
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
Example #5
def test_space():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
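    # every term id in the vocabulary must carry exactly one weight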
    assert len(sp.wordWeight) == len(sp._w2id)
Example #6
def test_tfidf_corpus2():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    tm = TextModel(token_list=[-1, 3])
    docs = [tm.tokenize(d) for d in docs]
    counter = Counter()
    for x in docs:
        counter.update(set(x))
    tfidf = TFIDF(docs, token_min_filter=1)
    tfidf2 = TFIDF.counter(counter, token_min_filter=1)
    id2w2 = {v: k for k, v in tfidf2.word2id.items()}
    for text in docs:
        tokens = tm.tokenize(text)
        fm = {k: v for k, v in tfidf[tokens]}
        for k, v in tfidf2[tokens]:
            assert_almost_equals(fm[tfidf.word2id[id2w2[k]]], v)
Example #7
def test_textmodel_compute_tokens():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    tm = TextModel(token_list=[-2, -1])
    text = tm.text_transformations(tw[0]['text'])
    L = tm.compute_tokens(text)
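    # compute_tokens returns one token list per entry in token_list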
    assert len(L) == 2
    r = []
    for x in L:
        r += x
    for a, b in zip(tm.tokenize(tw[0]), r):
        assert a == b
Example #8
def test_entropy():
    from microtc.textmodel import TextModel
    from microtc.weighting import Entropy, TFIDF
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = Entropy(docs, X=tw)
    print(sp.wordWeight)
    tfidf = TFIDF(docs)
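    # Entropy must weight at least one term differently from TFIDF;
    # reaching the final assert means the two schemes were identical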
    for k in sp.wordWeight.keys():
        if sp.wordWeight[k] != tfidf.wordWeight[k]:
            return
    # print(sp.w)
    assert False
Example #9
from glob import glob
from typing import Callable, Iterable

from nltk.stem.porter import PorterStemmer
from microtc.params import OPTION_NONE
from microtc.textmodel import TextModel
from microtc.utils import Counter

tm = TextModel(num_option=OPTION_NONE,
               usr_option=OPTION_NONE,
               url_option=OPTION_NONE,
               emo_option=OPTION_NONE,
               hashtag_option=OPTION_NONE,
               ent_option=OPTION_NONE,
               lc=False,
               del_dup=False,
               del_punc=False,
               del_diac=False,
               token_list=[-1])

tm.tokenize("Hello good morning")

# Count the number of tokens (N) and distinct types (|V|) per file


def N_tokens_types(fname: str, counter: Counter,
                   tm: Callable[[str], Iterable[str]]):
    with open(fname) as fpt:
        txt = fpt.read()
    tokens = tm(txt)
    counter.update(tokens)
    N = sum(counter.values())
    return N, len(counter)


counter = Counter()
# heaps collects one (N, |V|) pair per book (cf. the loop in Example #10)
heaps = [N_tokens_types(fname, counter, tm.tokenize)
         for fname in glob("../books/*.txt")]
Example #10
import spacy
import matplotlib.pyplot as plt
from glob import glob
from microtc.params import OPTION_NONE
from microtc.textmodel import TextModel
from microtc.utils import Counter

# 'en_core_web_sm' is an assumption here; any installed spaCy pipeline
# with sentence segmentation works
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. U.S. is another sentence.")
sentences = [x for x in doc.sents]

tm = TextModel(num_option=OPTION_NONE,
               usr_option=OPTION_NONE,
               url_option=OPTION_NONE,
               emo_option=OPTION_NONE,
               hashtag_option=OPTION_NONE,
               ent_option=OPTION_NONE,
               lc=True,
               del_dup=False,
               del_punc=False,
               del_diac=False,
               token_list=[-1])

tm.tokenize("Hello good morning")

heaps = []
counter = Counter()
for fname in glob("../books/*.txt"):
    with open(fname) as fpt:
        txt = fpt.read()
    tokens = tm.tokenize(txt)
    counter.update(tokens)
    V = len(counter)
    N = sum(counter.values())
    heaps.append([N, V])

plt.plot([x for x, _ in heaps], [x for _, x in heaps])
plt.grid()
plt.xlabel("N")
plt.ylabel("|V|")
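
The N-vs-|V| curve plotted above traces Heaps' law, |V| ≈ k·N^β. A minimal sketch of estimating k and β with a least-squares fit in log-log space, assuming the heaps list built above is non-empty:

import numpy as np

N = np.array([n for n, _ in heaps])
V = np.array([v for _, v in heaps])
beta, log_k = np.polyfit(np.log(N), np.log(V), 1)  # slope, intercept
print("Heaps fit: V ~ {:.2f} * N^{:.2f}".format(np.exp(log_k), beta))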