def test_tfidf_corpus():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    import numpy as np
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    docs = [text.tokenize(d) for d in docs]
    counter = Counter()
    [counter.update(set(x)) for x in docs]
    tfidf = TFIDF(docs)
    tfidf2 = TFIDF.counter(counter)
    assert tfidf.num_terms == tfidf2.num_terms
    assert tfidf._ndocs == tfidf2._ndocs
    for k in tfidf.word2id.keys():
        assert k in tfidf2.word2id
    for k, v in tfidf.word2id.items():
        id2 = tfidf2.word2id[k]
        v = tfidf.wordWeight[v]
        v2 = tfidf2.wordWeight[id2]
        print(v, v2, k)
        assert_almost_equals(v, v2)
def test_lang():
    from microtc.textmodel import TextModel

    text = [
        "Hi :) :P XD",
        "excelente dia xc",
        "el alma de la fiesta XD"
    ]
    model = TextModel(text, **{
        "del_dup1": True,
        "emo_option": "group",
        "lc": True,
        "num_option": "group",
        "strip_diac": False,
        "token_list": [
            (2, 1),
            (2, 2),
            -1,
            # 5,
        ],
        "url_option": "group",
        "usr_option": "group",
    })
    text = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    a = model.tokenize(text)
    b = ['el~de', 'alma~la', 'de~fiesta', 'la~_pos', 'fiesta~conociendo',
         '_pos~la', 'conociendo~maquinaria', 'la~_usr', 'maquinaria~bebiendo',
         '_usr~nunca', 'bebiendo~manches', 'nunca~que', 'manches~onda',
         'el~la', 'alma~fiesta', 'de~_pos', 'la~conociendo', 'fiesta~la',
         '_pos~maquinaria', 'conociendo~_usr', 'la~bebiendo', 'maquinaria~nunca',
         '_usr~manches', 'bebiendo~que', 'nunca~onda',
         'el', 'alma', 'de', 'la', 'fiesta', '_pos', 'conociendo', 'la',
         'maquinaria', '_usr', 'bebiendo', 'nunca', 'manches', 'que', 'onda']
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
def fit_predict(self, X):
    # fit
    # Corpus
    textmodel = TextModel().fit(X)  # text model
    X = textmodel.transform(X)
    nmf = NMF(n_components=6, max_iter=500).fit(X)  # dimensionality reduction
    nmf_features = nmf.transform(X)
    # Topics
    topics = self.topics
    X_topics = textmodel.transform(topics)  # through the text model
    nmf_topics = nmf.transform(X_topics)  # dimensionality reduction
    K = cosine_similarity(nmf_features, nmf_topics)  # similarities
    n, p = K.shape
    # predict
    prob = K
    cat = self.categories
    labels = []
    for i in range(prob.shape[0]):
        xx = prob[i, :]
        indx = self.maxin(xx)
        lab = cat[indx]
        labels.append(lab)
    return np.array(labels)
def test_textmodel_num_terms():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(token_list=[-2, -1, 3, 4]).fit(tw)
    assert text.num_terms is not None
    text.transform(["buenos"])
    print(text.num_terms)
    assert text.num_terms == text.model.num_terms
def test_doc2weight():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    assert len(sp.doc2weight(text.tokenize('odio odio los los'))) == 3
def test_space():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    assert len(sp.wordWeight) == len(sp._w2id)
def test_textmodel_compute_tokens():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    tm = TextModel(token_list=[-2, -1])
    text = tm.text_transformations(tw[0]['text'])
    L = tm.compute_tokens(text)
    assert len(L) == 2
    # concatenate the token lists produced for each token_list entry
    r = []
    for x in L:
        r += x
    for a, b in zip(tm.tokenize(tw[0]), r):
        assert a == b
def test_textmodel_transform_tonp():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import LabelEncoder
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel().fit(tw)
    X = text.transform(tw)
    le = LabelEncoder().fit([x['klass'] for x in tw])
    y = le.transform([x['klass'] for x in tw])
    m = LinearSVC().fit(text.tonp(X), y)
    assert len(m.predict(text.tonp(X))) == len(y)
def test_textmodel_token_min_filter():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, token_min_filter=1, token_list=[-2, -1, 3, 4])
    print(len(text.model._w2id), 'hh', text.token_min_filter, text.token_max_filter)
    assert len(text.model._w2id) == 28
    text = TextModel(tw, token_min_filter=0.01, token_list=[-2, -1, 3, 4])
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 28
    text = TextModel(tw, token_min_filter=1)
def test_textmodel():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel([x['text'] for x in tw])
    # print(text.tokenize("hola amiguitos gracias por venir :) http://hello.com @chanfle"))
    # assert False
    assert isinstance(text[tw[0]['text']], list)
    assert len(text[tw[0]]) == 3
    text = TextModel(token_list=[3]).fit(tw)
    print(text.model.word2id)
    for k, _ in text.model.word2id.items():
        assert len(k) == 5
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    predY = np.zeros(len(self.y))
    # X = np.array(self.X)
    for train, test in self.kfolds.split(self.X, self.y):
        # A = X[train]
        A = [self.X[i] for i in train]
        if len(self.Xstatic) > 0:
            A.extend(self.Xstatic)
        trainY = self.y[train]
        if len(self.ystatic) > 0:
            trainY = np.hstack((trainY, self.ystatic))
        textmodel = TextModel(A, **conf)
        # textmodel = TextModel([X[i] for i in train], **conf)
        trainX = [textmodel[x] for x in A]
        c = self.create_classifier()
        try:
            c.fit(trainX, trainY)
        except ValueError:
            conf["_error"] = "this configuration produces an empty matrix"
            conf["_score"] = 0.0
            return conf
        testX = [textmodel[self.X[i]] for i in test]
        predY[test] = c.predict(testX)
    self.compute_score(conf, predY)
    conf['_time'] = (time() - st) / self.nfolds
    return conf
def test_getitem():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    tok = text.tokenize('buenos dias')
    bow = sp.doc2weight(tok)
    ids = bow[0]
    assert len(ids) == len(sp[tok])
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    predY = np.zeros(len(self.y))
    # X = np.array(self.X)
    for train, test in self.kfolds:
        # A = X[train]
        A = [self.X[i] for i in train]
        if len(self.Xstatic) > 0:
            A.extend(self.Xstatic)
        textmodel = TextModel(A, **conf)
        # textmodel = TextModel([X[i] for i in train], **conf)
        trainX = [textmodel[x] for x in A]
        trainY = self.y[train]
        if len(self.ystatic) > 0:
            trainY = np.hstack((trainY, self.ystatic))
        c = self.create_classifier()
        c.fit(trainX, trainY)
        testX = [textmodel[self.X[i]] for i in test]
        predY[test] = c.predict(testX)
    self.compute_score(conf, predY)
    conf['_time'] = (time() - st) / self.nfolds
    return conf
def test_textmodel_weighting_key():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    for w in ['tfidf', 'tf', 'entropy']:
        TextModel(token_list=[-2, -1], weighting=w).fit(tw)
def test_textmodel_save_load():
    import os
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator, save_model, load_model
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    tm = TextModel().fit(tw)
    save_model(tm, 't.model')
    assert isinstance(load_model('t.model'), TextModel)
    os.unlink('t.model')
def test_entropy():
    from microtc.textmodel import TextModel
    from microtc.weighting import Entropy, TFIDF
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = Entropy(docs, X=tw)
    print(sp.wordWeight)
    tfidf = TFIDF(docs)
    # the test passes as soon as one entropy weight differs from the TF-IDF weight
    for k in sp.wordWeight.keys():
        if sp.wordWeight[k] != tfidf.wordWeight[k]:
            return
    # print(sp.w)
    assert False
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    textmodel = TextModel(self.train_corpus, **conf)
    train_X = [textmodel[doc] for doc in self.train_corpus]
    c = self.create_classifier()
    c.fit(train_X, self.train_y)
    test_X = [textmodel[doc] for doc in self.test_corpus]
    pred_y = c.predict(test_X)
    self.compute_score(conf, pred_y)
    conf['_time'] = (time() - st)
    return conf
def params(cls):
    """
    Parameters

    >>> from b4msa.textmodel import TextModel
    >>> TextModel.params()
    ['docs', 'threshold', 'lang', 'negation', 'stemming', 'stopwords', 'kwargs', 'docs', 'text', 'num_option', 'usr_option', 'url_option', 'emo_option', 'hashtag_option', 'ent_option', 'lc', 'del_dup', 'del_punc', 'del_diac', 'token_list', 'token_min_filter', 'token_max_filter', 'select_ent', 'select_suff', 'select_conn', 'weighting']
    """
    import inspect
    r = mTCTextModel.params()
    sig = inspect.signature(cls)
    params = sig.parameters.keys()
    return list(params) + list(r)
def test_tfidf_corpus2():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    import numpy as np
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    tm = TextModel(token_list=[-1, 3])
    docs = [tm.tokenize(d) for d in docs]
    counter = Counter()
    [counter.update(set(x)) for x in docs]
    tfidf = TFIDF(docs, token_min_filter=1)
    tfidf2 = TFIDF.counter(counter, token_min_filter=1)
    id2w2 = {v: k for k, v in tfidf2.word2id.items()}
    for text in docs:
        tokens = tm.tokenize(text)
        fm = {k: v for k, v in tfidf[tokens]}
        for k, v in tfidf2[tokens]:
            assert_almost_equals(fm[tfidf.word2id[id2w2[k]]], v)
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    model_klass = os.environ.get("TEXTMODEL_KLASSES", None)
    if model_klass:
        model_klass = self.le.transform(model_klass.split(','))
        _train = [self.train_corpus[i]
                  for i in range(len(self.train_corpus))
                  if self.train_y[i] in model_klass]
        textmodel = TextModel(_train, **conf)
        if conf['dist_vector'] != OPTION_NONE:
            _train_y = [self.train_y[i]
                        for i in range(len(self.train_corpus))
                        if self.train_y[i] in model_klass]
            textmodel = DistTextModel(textmodel, _train, _train_y,
                                      self.le.classes_.shape[0], conf['dist_vector'])
    else:
        textmodel = TextModel(self.train_corpus, **conf)
        if conf['dist_vector'] != OPTION_NONE:
            textmodel = DistTextModel(textmodel, self.train_corpus, self.train_y,
                                      self.le.classes_.shape[0], conf['dist_vector'])
    train_X = [textmodel[doc] for doc in self.train_corpus]
    c = self.create_classifier()
    c.fit(train_X, self.train_y)
    test_X = [textmodel[doc] for doc in self.test_corpus]
    pred_y = c.predict(test_X)
    self.compute_score(conf, pred_y)
    conf['_time'] = (time() - st)
    return conf
def test_textmodel_entropy():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, weighting='microtc.weighting.Entropy', token_list=[-1, 3])
    # print(text.tokenize("hola amiguitos gracias por venir :) http://hello.com @chanfle"))
    # assert False
    assert isinstance(text[tw[0]['text']], list)
    _ = text[tw[0]]
    print(_)
    for k, v in _:
        assert text.model.wordWeight[k] == v
def test_params():
    import os
    import itertools
    from microtc.params import BASIC_OPTIONS
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator

    params = dict(strip_diac=[True, False],
                  usr_option=BASIC_OPTIONS,
                  url_option=BASIC_OPTIONS)
    params = sorted(params.items())
    fname = os.path.dirname(__file__) + '/text.json'
    tw = [x for x in tweet_iterator(fname)]
    text = [x['text'] for x in tw]
    for x in itertools.product(*[x[1] for x in params]):
        args = dict(zip([x[0] for x in params], x))
        ins = TextModel(text, **args)
        assert isinstance(ins[text[0]], list)
def test_predict_from_file():
    from microtc.wrappers import ClassifierWrapper
    from microtc.textmodel import TextModel
    from microtc.utils import read_data_labels
    from sklearn.preprocessing import LabelEncoder
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus, labels = read_data_labels(fname)
    t = TextModel(corpus)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    c = ClassifierWrapper()
    X = [t[x] for x in corpus]
    c.fit(X, y)
    hy = le.inverse_transform(c.predict(X))
    for i in hy:
        assert i in ['POS', 'NEU', 'NEG']
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    textmodel = TextModel(self.train_corpus, **conf)
    train_X = [textmodel[doc] for doc in self.train_corpus]
    c = self.create_classifier()
    # c.fit(train_X, self.train_y)
    try:
        c.fit(train_X, self.train_y)
    except ValueError:
        conf["_error"] = "this configuration produces an empty matrix"
        conf["_score"] = 0.0
        return conf
    test_X = [textmodel[doc] for doc in self.test_corpus]
    pred_y = c.predict(test_X)
    self.compute_score(conf, pred_y)
    conf['_time'] = (time() - st)
    return conf
from microtc.textmodel import TextModel
from microtc.params import OPTION_NONE
from glob import glob
from collections import Counter
import numpy as np
from scipy.optimize import minimize
from matplotlib import pylab as plt
from nltk.stem.porter import PorterStemmer
from typing import Callable, Iterable

tm = TextModel(num_option=OPTION_NONE, usr_option=OPTION_NONE,
               url_option=OPTION_NONE, emo_option=OPTION_NONE,
               hashtag_option=OPTION_NONE, ent_option=OPTION_NONE,
               lc=False, del_dup=False, del_punc=False,
               del_diac=False, token_list=[-1])
tm.tokenize("Hello good morning")


# Count the number of words
def N_tokens_types(fname: str, counter: Counter,
                   tm: Callable[[str], Iterable[str]]):
    txt = open(fname).read()
    tokens = tm(txt)
    # the original snippet is truncated here; a plausible completion is to
    # accumulate the tokens in the counter and report tokens (N) and types (V)
    counter.update(tokens)
    return len(tokens), len(counter)
def __create_models(self):
    models = []
    models_fit = []
    # for _params in self.model_params:
    _params = {}
    for k, v in self.params.items():
        if k.startswith('_'):
            continue
        _params[k] = v
    self.textModels = dict(
        mtc=TextModel(**_params).fit(self.train),
        # charEmb=DocumentPoolEmbeddings([CharacterEmbeddings()]),
        # charLangEmb=DocumentPoolEmbeddings([CharacterEmbeddings(), BytePairEmbeddings(self.lang)]),
        # charMultiEmb=DocumentPoolEmbeddings([CharacterEmbeddings(), BytePairEmbeddings('multi')]),
        langEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang)]),
        charLangMultiEmb=DocumentPoolEmbeddings([CharacterEmbeddings(),
                                                 BytePairEmbeddings(self.lang),
                                                 BytePairEmbeddings('multi')]),
        langMultiEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang),
                                             BytePairEmbeddings('multi')]),
        bytePairEMB=DocumentPoolEmbeddings([BytePairEmbeddings('multi')]),
        # flairEmbF=DocumentPoolEmbeddings([FlairEmbeddings('multi-forward')]),
        # flairEmbB=DocumentPoolEmbeddings([FlairEmbeddings('multi-backward')]),
        # bertEMB=DocumentPoolEmbeddings([TransformerWordEmbeddings('bert-base-uncased', layers='-1')])
    )
    for km, tmodel in self.textModels.items():
        models.append({'name': km})
        models_fit.append({'name': km})
        if km == 'mtc':
            xt = tmodel.transform(self.train)
            xv = tmodel.transform(self.validation)
            X = tmodel.transform(self.data)
        else:
            sentences_train = [Sentence(txt) for txt in self.train]
            tmodel.embed(sentences_train)
            xt = np.array([e.get_embedding().cpu().detach().numpy()
                           for e in sentences_train])
            sentences_val = [Sentence(txt) for txt in self.validation]
            tmodel.embed(sentences_val)
            xv = np.array([e.get_embedding().cpu().detach().numpy()
                           for e in sentences_val])
            sentences = [Sentence(txt) for txt in self.data]
            tmodel.embed(sentences)
            X = np.array([e.get_embedding().cpu().detach().numpy()
                          for e in sentences])
        models[-1]['xv'] = xv
        models[-1]['xt'] = xt
        models_fit[-1]['xt'] = X
        # max_iter = 5000
        # if km == 'mtc': max_iter = 1000
        # if km == 'langMulti': max_iter = 5000
        # self.models[-1]['clf'] = LinearSVC(max_iter=max_iter).fit(xt, self.yt)
        # yp = self.models[-1]['clf'].decision_function(xv)
        # scaler = Normalizer().fit(yp)
        # self.models[-1]['macroF1'] = f1_score(self.yv, np.argmax(scaler.transform(yp), axis=1), average='weighted')
        # self.models[-1]['weightedF1'] = f1_score(self.yv, np.argmax(scaler.transform(yp), axis=1), average='weighted')
        # self.models[-1]['score'] = f1_score(self.yv, np.argmax(yp, axis=1), average='weighted')
        # self.models[-1]['probas'] = scaler.transform(yp)
        # Fit model with all available data
        # self.models_fit[-1]['clf'] = LinearSVC(max_iter=max_iter).fit(X, self.y)
    print('Fitting Ensemble')
    # self.models = Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models)
    # self.models_fit = Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models_fit)
    self.models, self.models_fit = [], []
    for md, mdf in zip(models, models_fit):
        self.models.append(self._train_model(md))
        self.models_fit.append(self._train_model(mdf))
from EvoMSA import base
from EvoMSA.model import Bernoulli
from EvoMSA.utils import LabelEncoder, bootstrap_confidence_interval
from microtc.textmodel import TextModel
from microtc.utils import tweet_iterator
from os.path import join, dirname
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np

tweets = join(dirname(base.__file__), 'tests', 'tweets.json')
D = list(tweet_iterator(tweets))
y = [x['klass'] for x in D]
le = LabelEncoder().fit(y)
y = le.transform(y)

tm = TextModel(token_list=[-1]).fit(D)
X = tm.transform(D)
m = Bernoulli().fit(X, y)
print((y == m.predict(X)).mean())
# 0.724

_ = train_test_split(D, y, test_size=0.2)
Xtrain, Xtest, ytrain, ytest = _
tm = TextModel(token_list=[-1]).fit(Xtrain)
m = Bernoulli().fit(tm.transform(Xtrain), ytrain)
hy = m.predict(tm.transform(Xtest))
print((ytest == hy).mean())
# 0.55

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
hy = np.empty_like(y)