import numpy as np
from microtc.textmodel import TextModel
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity


def fit_predict(self, X):
    # fit
    # corpus
    textmodel = TextModel().fit(X)  # text model
    X = textmodel.transform(X)

    nmf = NMF(n_components=6, max_iter=500).fit(X)  # dimensionality reduction
    nmf_features = nmf.transform(X)

    # topics
    topics = self.topics
    X_topics = textmodel.transform(topics)  # project the topics with the text model
    nmf_topics = nmf.transform(X_topics)  # dimensionality reduction

    K = cosine_similarity(nmf_features, nmf_topics)  # document-topic similarities

    # predict
    prob = K
    cat = self.categories
    labels = []
    for i in range(prob.shape[0]):
        xx = prob[i, :]
        indx = self.maxin(xx)  # index of the most similar topic
        lab = cat[indx]
        labels.append(lab)

    return np.array(labels)
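
The method above relies on class attributes (self.topics, self.categories) and a helper self.maxin that are not shown. The following is a minimal, self-contained sketch of the same zero-shot idea with made-up documents, topic descriptions, and category names; it also assumes that this microtc version's TextModel.transform returns a sparse matrix, as the method above does.

from microtc.textmodel import TextModel
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

# Illustrative data only: documents, topic descriptions, and categories are made up.
docs = ["el equipo gano el partido", "nueva vacuna contra la gripe",
        "la bolsa cerro a la baja", "el delantero anoto dos goles",
        "sintomas del resfriado comun", "suben las tasas de interes"]
topics = ["deportes futbol partido gol", "salud medicina vacuna enfermedad",
          "economia dinero bolsa mercado"]
categories = ["sports", "health", "economy"]

tm = TextModel().fit(docs)                       # text model fitted on the corpus
X = tm.transform(docs)                           # term-weight matrix of the documents
nmf = NMF(n_components=3, max_iter=500).fit(X)   # shared low-dimensional space
sim = cosine_similarity(nmf.transform(X),
                        nmf.transform(tm.transform(topics)))
print([categories[i] for i in sim.argmax(axis=1)])  # closest topic per document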
Example #2
def test_textmodel_num_terms():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(token_list=[-2, -1, 3, 4]).fit(tw)
    assert text.num_terms is not None
    text.transform(["buenos"])
    print(text.num_terms)
    assert text.num_terms == text.model.num_terms
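
A side note on the token_list parameter used above: as I understand microtc's convention, negative entries select word n-grams (-1 unigrams, -2 bigrams) and positive entries select character q-grams; the short sketch below rests on that assumption.

from microtc.textmodel import TextModel

# Assumption: negative token_list entries -> word n-grams, positive ones -> character q-grams.
tm = TextModel(token_list=[-1, 3]).fit(["buenos dias", "buenas noches"])
print(tm.tokenize("buenos dias"))  # expected: word unigrams plus character 3-grams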
Example #3
def test_textmodel_transform_tonp():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import LabelEncoder
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel().fit(tw)
    X = text.transform(tw)
    le = LabelEncoder().fit([x['klass'] for x in tw])
    y = le.transform([x['klass'] for x in tw])
    m = LinearSVC().fit(text.tonp(X), y)
    assert len(m.predict(text.tonp(X))) == len(y)
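
For context on tonp: in older microtc releases transform returned, per document, a list of (term index, weight) pairs, and tonp packs that output into a SciPy sparse matrix that scikit-learn estimators accept. A small check continuing the test above, under that assumption:

# Continues from the test above; assumes tonp returns a scipy.sparse matrix
# with one row per document.
Xs = text.tonp(X)
assert Xs.shape[0] == len(tw)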
Example #4
from EvoMSA import base
from EvoMSA.utils import LabelEncoder, bootstrap_confidence_interval
from microtc.textmodel import TextModel
from microtc.utils import tweet_iterator
from os.path import join, dirname
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np

tweets = join(dirname(base.__file__), 'tests', 'tweets.json')
D = list(tweet_iterator(tweets))
y = [x['klass'] for x in D]
le = LabelEncoder().fit(y)
y = le.transform(y)

tm = TextModel(token_list=[-1]).fit(D)
X = tm.transform(D)
m = Bernoulli().fit(X, y)  # Bernoulli: a Naive Bayes classifier assumed to be defined earlier in the source
print((y == m.predict(X)).mean())
# 0.724

_ = train_test_split(D, y, test_size=0.2)
Xtrain, Xtest, ytrain, ytest = _
tm = TextModel(token_list=[-1]).fit(Xtrain)
m = Bernoulli().fit(tm.transform(Xtrain), ytrain)
hy = m.predict(tm.transform(Xtest))
print((ytest == hy).mean())
# 0.55
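
If the Bernoulli class is not available in your environment (it seems to be defined elsewhere in the source), a roughly comparable run with scikit-learn's BernoulliNB, reusing the variables above; this assumes tm.transform returns a sparse matrix (otherwise convert it with tm.tonp first):

from sklearn.naive_bayes import BernoulliNB  # stand-in for the Bernoulli class above

nb = BernoulliNB().fit(tm.transform(Xtrain), ytrain)
print((ytest == nb.predict(tm.transform(Xtest))).mean())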

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
hy = np.empty_like(y)
for tr, ts in folds.split(D, y):