def test_doc2weight():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    from os.path import join
    import os

    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    # doc2weight returns a three-element structure for the tokenized document
    assert len(sp.doc2weight(text.tokenize('odio odio los los'))) == 3
def test_tfidf_corpus():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    import numpy as np

    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    docs = [text.tokenize(d) for d in docs]
    counter = Counter()
    [counter.update(set(x)) for x in docs]
    tfidf = TFIDF(docs)
    tfidf2 = TFIDF.counter(counter)
    assert tfidf.num_terms == tfidf2.num_terms
    assert tfidf._ndocs == tfidf2._ndocs
    # both models must share the same vocabulary
    for k in tfidf2.word2id.keys():
        assert k in tfidf.word2id
    # and assign (almost) the same weight to every term
    for k, v in tfidf.word2id.items():
        id2 = tfidf2.word2id[k]
        w = tfidf.wordWeight[v]
        w2 = tfidf2.wordWeight[id2]
        print(w, w2, k)
        assert_almost_equals(w, w2)
def test_getitem():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    from os.path import join
    import os

    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    tok = text.tokenize('buenos dias')
    bow = sp.doc2weight(tok)
    ids = bow[0]
    # __getitem__ yields one (id, weight) pair per known token
    assert len(ids) == len(sp[tok])
def test_lang():
    from microtc.textmodel import TextModel

    text = [
        "Hi :) :P XD",
        "excelente dia xc",
        "el alma de la fiesta XD"
    ]
    model = TextModel(text, **{
        "del_dup1": True,
        "emo_option": "group",
        "lc": True,
        "num_option": "group",
        "strip_diac": False,
        "token_list": [
            (2, 1),
            (2, 2),
            -1,
            # 5,
        ],
        "url_option": "group",
        "usr_option": "group",
    })
    text = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    a = model.tokenize(text)
    b = ['el~de', 'alma~la', 'de~fiesta', 'la~_pos', 'fiesta~conociendo', '_pos~la',
         'conociendo~maquinaria', 'la~_usr', 'maquinaria~bebiendo', '_usr~nunca',
         'bebiendo~manches', 'nunca~que', 'manches~onda', 'el~la', 'alma~fiesta',
         'de~_pos', 'la~conociendo', 'fiesta~la', '_pos~maquinaria', 'conociendo~_usr',
         'la~bebiendo', 'maquinaria~nunca', '_usr~manches', 'bebiendo~que', 'nunca~onda',
         'el', 'alma', 'de', 'la', 'fiesta', '_pos', 'conociendo', 'la', 'maquinaria',
         '_usr', 'bebiendo', 'nunca', 'manches', 'que', 'onda']
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
def test_space():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    import os

    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    # every weighted term must have an identifier, and vice versa
    assert len(sp.wordWeight) == len(sp._w2id)
def test_tfidf_corpus2():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    import numpy as np

    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    tm = TextModel(token_list=[-1, 3])
    docs = [tm.tokenize(d) for d in docs]
    counter = Counter()
    [counter.update(set(x)) for x in docs]
    tfidf = TFIDF(docs, token_min_filter=1)
    tfidf2 = TFIDF.counter(counter, token_min_filter=1)
    id2w2 = {v: k for k, v in tfidf2.word2id.items()}
    # both models must produce the same weights for every document
    for text in docs:
        tokens = tm.tokenize(text)
        fm = {k: v for k, v in tfidf[tokens]}
        for k, v in tfidf2[tokens]:
            assert_almost_equals(fm[tfidf.word2id[id2w2[k]]], v)
def test_textmodel_compute_tokens():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os

    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    tm = TextModel(token_list=[-2, -1])
    text = tm.text_transformations(tw[0]['text'])
    L = tm.compute_tokens(text)
    # one list of tokens per entry in token_list
    assert len(L) == 2
    r = []
    for x in L:
        r += x
    for a, b in zip(tm.tokenize(tw[0]), r):
        assert a == b
def test_entropy():
    from microtc.textmodel import TextModel
    from microtc.weighting import Entropy, TFIDF
    from microtc.utils import tweet_iterator
    from os.path import join
    import os

    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = Entropy(docs, X=tw)
    print(sp.wordWeight)
    tfidf = TFIDF(docs)
    # Entropy weights must differ from TF-IDF weights for at least one term
    for k in sp.wordWeight.keys():
        if sp.wordWeight[k] != tfidf.wordWeight[k]:
            return
    # print(sp.w)
    assert False
from nltk.stem.porter import PorterStemmer
from typing import Callable, Iterable
from microtc.textmodel import TextModel
from microtc.params import OPTION_NONE
from microtc.utils import Counter

tm = TextModel(num_option=OPTION_NONE, usr_option=OPTION_NONE,
               url_option=OPTION_NONE, emo_option=OPTION_NONE,
               hashtag_option=OPTION_NONE, ent_option=OPTION_NONE,
               lc=False, del_dup=False, del_punc=False, del_diac=False,
               token_list=[-1])
tm.tokenize("Hello good morning")


# Count the number of tokens (N) and token types (|V|) accumulated in counter
def N_tokens_types(fname: str, counter: Counter,
                   tm: Callable[[str], Iterable[str]]):
    txt = open(fname).read()
    tokens = tm(txt)
    counter.update(tokens)
    N = sum([v for v in counter.values()])
    return N, len(counter)


counter = Counter()
heaps = []
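# The snippet above stops right after creating `counter` and `heaps`; the lines
# below are a minimal sketch of how N_tokens_types could be used to fill `heaps`,
# mirroring the loop in the next snippet. The "../books/*.txt" glob pattern is
# taken from there and is an assumption about where the corpus lives.
from glob import glob

for fname in glob("../books/*.txt"):
    N, V = N_tokens_types(fname, counter, tm.tokenize)
    heaps.append([N, V])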
import spacy

# sentence segmentation with spaCy; the pipeline name below is an assumption,
# any installed English pipeline with a parser or sentencizer works
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. U.S. is another sentence.")
[x for x in doc.sents]

from glob import glob
import matplotlib.pyplot as plt
from microtc.textmodel import TextModel
from microtc.params import OPTION_NONE
from microtc.utils import Counter

tm = TextModel(num_option=OPTION_NONE, usr_option=OPTION_NONE,
               url_option=OPTION_NONE, emo_option=OPTION_NONE,
               hashtag_option=OPTION_NONE, ent_option=OPTION_NONE,
               lc=True, del_dup=False, del_punc=False, del_diac=False,
               token_list=[-1])
tm.tokenize("Hello good morning")

# vocabulary size |V| as a function of the number of tokens N,
# accumulated over a collection of books
heaps = []
counter = Counter()
for fname in glob("../books/*.txt"):
    txt = open(fname).read()
    tokens = tm.tokenize(txt)
    counter.update(tokens)
    V = len(counter)
    N = sum([v for v in counter.values()])
    heaps.append([N, V])

plt.plot([x for x, _ in heaps], [x for _, x in heaps])
plt.grid()
plt.xlabel("N")
plt.ylabel("|V|")
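# Optional follow-up (not part of the original snippet): Heaps' law predicts
# |V| ~ k * N^beta, so a straight-line fit in log-log space gives rough
# estimates of k and beta from the curve computed above. The use of
# numpy.polyfit here is an illustrative assumption, not the original code.
import numpy as np

N = np.array([n for n, _ in heaps])
V = np.array([v for _, v in heaps])
beta, log_k = np.polyfit(np.log(N), np.log(V), 1)
k = np.exp(log_k)
print("Heaps' law fit: |V| ~ {0:.2f} * N^{1:.2f}".format(k, beta))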