def test_tfidf_corpus():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    import numpy as np
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    docs = [text.tokenize(d) for d in docs]
    counter = Counter()
    [counter.update(set(x)) for x in docs]
    tfidf = TFIDF(docs)
    tfidf2 = TFIDF.counter(counter)
    assert tfidf.num_terms == tfidf2.num_terms
    assert tfidf._ndocs == tfidf2._ndocs
    for k in tfidf.word2id.keys():
        assert k in tfidf2.word2id
    for k, v in tfidf.word2id.items():
        id2 = tfidf2.word2id[k]
        v = tfidf.wordWeight[v]
        v2 = tfidf2.wordWeight[id2]
        print(v, v2, k)
        assert_almost_equals(v, v2)
def test_lang():
    from microtc.textmodel import TextModel

    text = [
        "Hi :) :P XD",
        "excelente dia xc",
        "el alma de la fiesta XD"
    ]
    model = TextModel(text, **{
        "del_dup1": True,
        "emo_option": "group",
        "lc": True,
        "num_option": "group",
        "strip_diac": False,
        "token_list": [
            (2, 1),
            (2, 2),
            -1,
            # 5,
        ],
        "url_option": "group",
        "usr_option": "group",
    })
    text = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    a = model.tokenize(text)
    b = ['el~de', 'alma~la', 'de~fiesta', 'la~_pos', 'fiesta~conociendo',
         '_pos~la', 'conociendo~maquinaria', 'la~_usr', 'maquinaria~bebiendo',
         '_usr~nunca', 'bebiendo~manches', 'nunca~que', 'manches~onda',
         'el~la', 'alma~fiesta', 'de~_pos', 'la~conociendo', 'fiesta~la',
         '_pos~maquinaria', 'conociendo~_usr', 'la~bebiendo', 'maquinaria~nunca',
         '_usr~manches', 'bebiendo~que', 'nunca~onda',
         'el', 'alma', 'de', 'la', 'fiesta', '_pos', 'conociendo', 'la',
         'maquinaria', '_usr', 'bebiendo', 'nunca', 'manches', 'que', 'onda']
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
def fit_predict(self, X):
    # fit
    # Corpus
    textmodel = TextModel().fit(X)  # text model
    X = textmodel.transform(X)
    nmf = NMF(n_components=6, max_iter=500).fit(X)  # dimensionality reduction
    nmf_features = nmf.transform(X)
    # Topics
    topics = self.topics
    X_topics = textmodel.transform(topics)  # through the text model
    nmf_topics = nmf.transform(X_topics)  # dimensionality reduction
    K = cosine_similarity(nmf_features, nmf_topics)  # similarities
    n, p = K.shape
    # predict
    prob = K
    cat = self.categories
    labels = []
    for i in range(prob.shape[0]):
        xx = prob[i, :]
        indx = self.maxin(xx)
        lab = cat[indx]
        labels.append(lab)
    return np.array(labels)
def test_textmodel_num_terms():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(token_list=[-2, -1, 3, 4]).fit(tw)
    assert text.num_terms is not None
    text.transform(["buenos"])
    print(text.num_terms)
    assert text.num_terms == text.model.num_terms
def test_doc2weight():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    assert len(sp.doc2weight(text.tokenize('odio odio los los'))) == 3
def test_space():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    assert len(sp.wordWeight) == len(sp._w2id)
def test_textmodel_compute_tokens():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    tm = TextModel(token_list=[-2, -1])
    text = tm.text_transformations(tw[0]['text'])
    L = tm.compute_tokens(text)
    assert len(L) == 2
    # concatenate the token lists produced for each token_list entry
    r = []
    for x in L:
        r += x
    for a, b in zip(tm.tokenize(tw[0]), r):
        assert a == b
def test_textmodel_transform_tonp():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import LabelEncoder
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel().fit(tw)
    X = text.transform(tw)
    le = LabelEncoder().fit([x['klass'] for x in tw])
    y = le.transform([x['klass'] for x in tw])
    m = LinearSVC().fit(text.tonp(X), y)
    assert len(m.predict(text.tonp(X))) == len(y)
def test_textmodel_token_min_filter():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, token_min_filter=1, token_list=[-2, -1, 3, 4])
    print(len(text.model._w2id), 'hh', text.token_min_filter, text.token_max_filter)
    assert len(text.model._w2id) == 28
    text = TextModel(tw, token_min_filter=0.01, token_list=[-2, -1, 3, 4])
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 28
    text = TextModel(tw, token_min_filter=1)
def test_textmodel():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel([x['text'] for x in tw])
    # print(text.tokenize("hola amiguitos gracias por venir :) http://hello.com @chanfle"))
    # assert False
    assert isinstance(text[tw[0]['text']], list)
    assert len(text[tw[0]]) == 3
    text = TextModel(token_list=[3]).fit(tw)
    print(text.model.word2id)
    for k, _ in text.model.word2id.items():
        assert len(k) == 5
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    predY = np.zeros(len(self.y))
    # X = np.array(self.X)
    for train, test in self.kfolds.split(self.X, self.y):
        # A = X[train]
        A = [self.X[i] for i in train]
        if len(self.Xstatic) > 0:
            A.extend(self.Xstatic)
        trainY = self.y[train]
        if len(self.ystatic) > 0:
            trainY = np.hstack((trainY, self.ystatic))
        textmodel = TextModel(A, **conf)
        # textmodel = TextModel([X[i] for i in train], **conf)
        trainX = [textmodel[x] for x in A]
        c = self.create_classifier()
        try:
            c.fit(trainX, trainY)
        except ValueError:
            conf["_error"] = "this configuration produces an empty matrix"
            conf["_score"] = 0.0
            return conf
        testX = [textmodel[self.X[i]] for i in test]
        predY[test] = c.predict(testX)
    self.compute_score(conf, predY)
    conf['_time'] = (time() - st) / self.nfolds
    return conf
def test_getitem():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    tok = text.tokenize('buenos dias')
    bow = sp.doc2weight(tok)
    ids = bow[0]
    assert len(ids) == len(sp[tok])
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    predY = np.zeros(len(self.y))
    # X = np.array(self.X)
    for train, test in self.kfolds:
        # A = X[train]
        A = [self.X[i] for i in train]
        if len(self.Xstatic) > 0:
            A.extend(self.Xstatic)
        textmodel = TextModel(A, **conf)
        # textmodel = TextModel([X[i] for i in train], **conf)
        trainX = [textmodel[x] for x in A]
        trainY = self.y[train]
        if len(self.ystatic) > 0:
            trainY = np.hstack((trainY, self.ystatic))
        c = self.create_classifier()
        c.fit(trainX, trainY)
        testX = [textmodel[self.X[i]] for i in test]
        predY[test] = c.predict(testX)
    self.compute_score(conf, predY)
    conf['_time'] = (time() - st) / self.nfolds
    return conf
def test_textmodel_weighting_key():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    for w in ['tfidf', 'tf', 'entropy']:
        TextModel(token_list=[-2, -1], weighting=w).fit(tw)
def test_textmodel_save_load():
    import os
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator, save_model, load_model
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    tm = TextModel().fit(tw)
    save_model(tm, 't.model')
    assert isinstance(load_model('t.model'), TextModel)
    os.unlink('t.model')
def test_entropy():
    from microtc.textmodel import TextModel
    from microtc.weighting import Entropy, TFIDF
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = Entropy(docs, X=tw)
    print(sp.wordWeight)
    tfidf = TFIDF(docs)
    # the test passes as soon as one entropy weight differs from the TF-IDF weight
    for k in sp.wordWeight.keys():
        if sp.wordWeight[k] != tfidf.wordWeight[k]:
            return
    # print(sp.w)
    assert False
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    textmodel = TextModel(self.train_corpus, **conf)
    train_X = [textmodel[doc] for doc in self.train_corpus]
    c = self.create_classifier()
    c.fit(train_X, self.train_y)
    test_X = [textmodel[doc] for doc in self.test_corpus]
    pred_y = c.predict(test_X)
    self.compute_score(conf, pred_y)
    conf['_time'] = (time() - st)
    return conf
def params(cls):
    """
    Parameters

    >>> from b4msa.textmodel import TextModel
    >>> TextModel.params()
    ['docs', 'threshold', 'lang', 'negation', 'stemming', 'stopwords', 'kwargs', 'docs', 'text', 'num_option', 'usr_option', 'url_option', 'emo_option', 'hashtag_option', 'ent_option', 'lc', 'del_dup', 'del_punc', 'del_diac', 'token_list', 'token_min_filter', 'token_max_filter', 'select_ent', 'select_suff', 'select_conn', 'weighting']
    """
    import inspect
    r = mTCTextModel.params()
    sig = inspect.signature(cls)
    params = sig.parameters.keys()
    return list(params) + list(r)
def test_tfidf_corpus2():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    from os.path import join
    import os
    import numpy as np
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    tm = TextModel(token_list=[-1, 3])
    docs = [tm.tokenize(d) for d in docs]
    counter = Counter()
    [counter.update(set(x)) for x in docs]
    tfidf = TFIDF(docs, token_min_filter=1)
    tfidf2 = TFIDF.counter(counter, token_min_filter=1)
    id2w2 = {v: k for k, v in tfidf2.word2id.items()}
    for text in docs:
        tokens = tm.tokenize(text)
        fm = {k: v for k, v in tfidf[tokens]}
        for k, v in tfidf2[tokens]:
            assert_almost_equals(fm[tfidf.word2id[id2w2[k]]], v)
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    model_klass = os.environ.get("TEXTMODEL_KLASSES", None)
    if model_klass:
        model_klass = self.le.transform(model_klass.split(','))
        _train = [self.train_corpus[i]
                  for i in range(len(self.train_corpus))
                  if self.train_y[i] in model_klass]
        textmodel = TextModel(_train, **conf)
        if conf['dist_vector'] != OPTION_NONE:
            _train_y = [self.train_y[i]
                        for i in range(len(self.train_corpus))
                        if self.train_y[i] in model_klass]
            textmodel = DistTextModel(textmodel, _train, _train_y,
                                      self.le.classes_.shape[0], conf['dist_vector'])
    else:
        textmodel = TextModel(self.train_corpus, **conf)
        if conf['dist_vector'] != OPTION_NONE:
            textmodel = DistTextModel(textmodel, self.train_corpus, self.train_y,
                                      self.le.classes_.shape[0], conf['dist_vector'])
    train_X = [textmodel[doc] for doc in self.train_corpus]
    c = self.create_classifier()
    c.fit(train_X, self.train_y)
    test_X = [textmodel[doc] for doc in self.test_corpus]
    pred_y = c.predict(test_X)
    self.compute_score(conf, pred_y)
    conf['_time'] = (time() - st)
    return conf
def test_textmodel_entropy():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, weighting='microtc.weighting.Entropy', token_list=[-1, 3])
    # print(text.tokenize("hola amiguitos gracias por venir :) http://hello.com @chanfle"))
    # assert False
    assert isinstance(text[tw[0]['text']], list)
    _ = text[tw[0]]
    print(_)
    for k, v in _:
        assert text.model.wordWeight[k] == v
def test_params():
    import os
    import itertools
    from microtc.params import BASIC_OPTIONS
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator

    params = dict(strip_diac=[True, False],
                  usr_option=BASIC_OPTIONS,
                  url_option=BASIC_OPTIONS)
    params = sorted(params.items())
    fname = os.path.dirname(__file__) + '/text.json'
    tw = [x for x in tweet_iterator(fname)]
    text = [x['text'] for x in tw]
    for x in itertools.product(*[x[1] for x in params]):
        args = dict(zip([x[0] for x in params], x))
        ins = TextModel(text, **args)
        assert isinstance(ins[text[0]], list)
def test_predict_from_file():
    from microtc.wrappers import ClassifierWrapper
    from microtc.textmodel import TextModel
    from microtc.utils import read_data_labels
    from sklearn.preprocessing import LabelEncoder
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus, labels = read_data_labels(fname)
    t = TextModel(corpus)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    c = ClassifierWrapper()
    X = [t[x] for x in corpus]
    c.fit(X, y)
    hy = le.inverse_transform(c.predict(X))
    for i in hy:
        assert i in ['POS', 'NEU', 'NEG']
def __call__(self, conf_code):
    conf, code = conf_code
    st = time()
    textmodel = TextModel(self.train_corpus, **conf)
    train_X = [textmodel[doc] for doc in self.train_corpus]
    c = self.create_classifier()
    # c.fit(train_X, self.train_y)
    try:
        c.fit(train_X, self.train_y)
    except ValueError:
        conf["_error"] = "this configuration produces an empty matrix"
        conf["_score"] = 0.0
        return conf
    test_X = [textmodel[doc] for doc in self.test_corpus]
    pred_y = c.predict(test_X)
    self.compute_score(conf, pred_y)
    conf['_time'] = (time() - st)
    return conf
from microtc.textmodel import TextModel
from microtc.params import OPTION_NONE
from glob import glob
from collections import Counter
import numpy as np
from scipy.optimize import minimize
from matplotlib import pylab as plt
from nltk.stem.porter import PorterStemmer
from typing import Callable, Iterable

tm = TextModel(num_option=OPTION_NONE, usr_option=OPTION_NONE,
               url_option=OPTION_NONE, emo_option=OPTION_NONE,
               hashtag_option=OPTION_NONE, ent_option=OPTION_NONE,
               lc=False, del_dup=False, del_punc=False,
               del_diac=False, token_list=[-1])
tm.tokenize("Hello good morning")


# Count the number of words
def N_tokens_types(fname: str, counter: Counter,
                   tm: Callable[[str], Iterable[str]]):
    txt = open(fname).read()
    tokens = tm(txt)
    # the original snippet is truncated here; a plausible completion is to
    # accumulate the tokens in the counter and report tokens (N) and types (V)
    counter.update(tokens)
    return len(tokens), len(counter)
def __create_models(self):
    models = []
    models_fit = []
    # for _params in self.model_params:
    _params = {}
    for k, v in self.params.items():
        if k.startswith('_'):
            continue
        _params[k] = v
    self.textModels = dict(
        mtc=TextModel(**_params).fit(self.train),
        # charEmb=DocumentPoolEmbeddings([CharacterEmbeddings()]),
        # charLangEmb=DocumentPoolEmbeddings([CharacterEmbeddings(), BytePairEmbeddings(self.lang)]),
        # charMultiEmb=DocumentPoolEmbeddings([CharacterEmbeddings(), BytePairEmbeddings('multi')]),
        langEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang)]),
        charLangMultiEmb=DocumentPoolEmbeddings([CharacterEmbeddings(),
                                                 BytePairEmbeddings(self.lang),
                                                 BytePairEmbeddings('multi')]),
        langMultiEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang),
                                             BytePairEmbeddings('multi')]),
        bytePairEMB=DocumentPoolEmbeddings([BytePairEmbeddings('multi')]),
        # flairEmbF=DocumentPoolEmbeddings([FlairEmbeddings('multi-forward')]),
        # flairEmbB=DocumentPoolEmbeddings([FlairEmbeddings('multi-backward')]),
        # bertEMB=DocumentPoolEmbeddings([TransformerWordEmbeddings('bert-base-uncased', layers='-1')])
    )
    for km, tmodel in self.textModels.items():
        models.append({'name': km})
        models_fit.append({'name': km})
        if km == 'mtc':
            xt = tmodel.transform(self.train)
            xv = tmodel.transform(self.validation)
            X = tmodel.transform(self.data)
        else:
            sentences_train = [Sentence(txt) for txt in self.train]
            tmodel.embed(sentences_train)
            xt = np.array([e.get_embedding().cpu().detach().numpy()
                           for e in sentences_train])
            sentences_val = [Sentence(txt) for txt in self.validation]
            tmodel.embed(sentences_val)
            xv = np.array([e.get_embedding().cpu().detach().numpy()
                           for e in sentences_val])
            sentences = [Sentence(txt) for txt in self.data]
            tmodel.embed(sentences)
            X = np.array([e.get_embedding().cpu().detach().numpy()
                          for e in sentences])
        models[-1]['xv'] = xv
        models[-1]['xt'] = xt
        models_fit[-1]['xt'] = X
        # max_iter = 5000
        # if km == 'mtc': max_iter = 1000
        # if km == 'langMulti': max_iter = 5000
        # self.models[-1]['clf'] = LinearSVC(max_iter=max_iter).fit(xt, self.yt)
        # yp = self.models[-1]['clf'].decision_function(xv)
        # scaler = Normalizer().fit(yp)
        # self.models[-1]['macroF1'] = f1_score(self.yv, np.argmax(scaler.transform(yp), axis=1), average='weighted')
        # self.models[-1]['weightedF1'] = f1_score(self.yv, np.argmax(scaler.transform(yp), axis=1), average='weighted')
        # self.models[-1]['score'] = f1_score(self.yv, np.argmax(yp, axis=1), average='weighted')
        # self.models[-1]['probas'] = scaler.transform(yp)
        # Fit model with all available data
        # self.models_fit[-1]['clf'] = LinearSVC(max_iter=max_iter).fit(X, self.y)
    print('Fitting Ensemble')
    # self.models = Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models)
    # self.models_fit = Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models_fit)
    self.models, self.models_fit = [], []
    for md, mdf in zip(models, models_fit):
        self.models.append(self._train_model(md))
        self.models_fit.append(self._train_model(mdf))
from EvoMSA import base
from EvoMSA.model import Bernoulli
from EvoMSA.utils import LabelEncoder, bootstrap_confidence_interval
from microtc.textmodel import TextModel
from microtc.utils import tweet_iterator
from os.path import join, dirname
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np

tweets = join(dirname(base.__file__), 'tests', 'tweets.json')
D = list(tweet_iterator(tweets))
y = [x['klass'] for x in D]
le = LabelEncoder().fit(y)
y = le.transform(y)

tm = TextModel(token_list=[-1]).fit(D)
X = tm.transform(D)
m = Bernoulli().fit(X, y)
print((y == m.predict(X)).mean())
# 0.724

_ = train_test_split(D, y, test_size=0.2)
Xtrain, Xtest, ytrain, ytest = _
tm = TextModel(token_list=[-1]).fit(Xtrain)
m = Bernoulli().fit(tm.transform(Xtrain), ytrain)
hy = m.predict(tm.transform(Xtest))
print((ytest == hy).mean())
# 0.55

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
hy = np.empty_like(y)