def fit_predict(self, X):
    """Label every document in ``X`` with the category of its closest topic.

    A ``TextModel`` is fitted on ``X`` and used to vectorize both the corpus
    and ``self.topics``. Both are then projected with an ``NMF`` fitted on the
    vectorized corpus (6 components, at most 500 iterations), and each
    document is compared against every topic with cosine similarity.

    Parameters
    ----------
    X
        Corpus accepted by ``TextModel.fit`` / ``TextModel.transform``.

    Returns
    -------
    numpy.ndarray
        One entry per row of the similarity matrix: ``self.categories[i]``,
        where ``i`` is the index chosen by ``self.maxin`` for that row.
    """
    # Fit: text model on the corpus, then vectorize the corpus with it.
    textmodel = TextModel().fit(X)
    X = textmodel.transform(X)

    # Dimensionality reduction, fitted on the corpus only.
    nmf = NMF(n_components=6, max_iter=500).fit(X)
    nmf_features = nmf.transform(X)

    # Topics go through the same text model and the same reduction so that
    # they live in the same space as the documents.
    X_topics = textmodel.transform(self.topics)
    nmf_topics = nmf.transform(X_topics)

    # Similarities: one row per document, one column per topic.
    K = cosine_similarity(nmf_features, nmf_topics)

    # Predict: self.maxin selects the column index for each row
    # (presumably the arg-max of the similarities -- confirm against maxin).
    cat = self.categories
    return np.array([cat[self.maxin(row)] for row in K])
def test_textmodel_num_terms():
    """Check that ``TextModel.num_terms`` is set and mirrors the inner model.

    The model is fitted on the ``text.json`` fixture located next to this
    test module. ``num_terms`` must not be ``None`` after ``fit`` and, after a
    ``transform`` call, must equal ``text.model.num_terms``.
    """
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os

    # os.path.join instead of string concatenation: when dirname() returns ''
    # (relative __file__), '' + '/text.json' would point at the filesystem
    # root rather than at the fixture beside this module.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    text = TextModel(token_list=[-2, -1, 3, 4]).fit(tw)
    assert text.num_terms is not None
    text.transform(["buenos"])
    print(text.num_terms)
    assert text.num_terms == text.model.num_terms
def test_textmodel_transform_tonp():
    """Check that ``TextModel.tonp`` output can train and feed a ``LinearSVC``.

    The ``text.json`` fixture located next to this test module is vectorized
    with a default ``TextModel``; its ``klass`` fields are label-encoded and
    used as targets. The classifier must return one prediction per sample.
    """
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import LabelEncoder
    import os

    # os.path.join instead of string concatenation: when dirname() returns ''
    # (relative __file__), '' + '/text.json' would point at the filesystem
    # root rather than at the fixture beside this module.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    text = TextModel().fit(tw)
    X = text.transform(tw)

    # Collect the labels once and reuse them for both fit and transform.
    klasses = [x['klass'] for x in tw]
    le = LabelEncoder().fit(klasses)
    y = le.transform(klasses)

    # Convert once; the same matrix is used for training and prediction.
    X_np = text.tonp(X)
    m = LinearSVC().fit(X_np, y)
    assert len(m.predict(X_np)) == len(y)
from EvoMSA.utils import LabelEncoder, bootstrap_confidence_interval from microtc.textmodel import TextModel from microtc.utils import tweet_iterator from os.path import join, dirname from sklearn.model_selection import train_test_split import numpy as np from sklearn.model_selection import StratifiedKFold tweets = join(dirname(base.__file__), 'tests', 'tweets.json') D = list(tweet_iterator(tweets)) y = [x['klass'] for x in D] le = LabelEncoder().fit(y) y = le.transform(y) tm = TextModel(token_list=[-1]).fit(D) X = tm.transform(D) m = Bernoulli().fit(X, y) print((y == m.predict(X)).mean()) # 0.724 _ = train_test_split(D, y, test_size=0.2) Xtrain, Xtest, ytrain, ytest = _ tm = TextModel(token_list=[-1]).fit(Xtrain) m = Bernoulli().fit(tm.transform(Xtrain), ytrain) hy = m.predict(tm.transform(Xtest)) print((ytest == hy).mean()) # 0.55 folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0) hy = np.empty_like(y) for tr, ts in folds.split(D, y):