Beispiel #1
0
def test_projection():
    """Check that ``projection`` returns a square two-dimensional array.

    Both emoji spaces are set up through ``StoreDelete`` context managers
    for the duration of the check.
    """
    from EvoMSA.align import projection
    from EvoMSA.utils import download

    es_space = 'emo-static-es.evoemo'
    en_space = 'emo-static-en.evoemo'
    # NOTE(review): the second context uses the lowercase name
    # ``labeledDataSet`` while the first uses
    # ``LabeledDataSet.create_space`` -- confirm this is intentional.
    with StoreDelete(LabeledDataSet.create_space, TWEETS, es_space) as sd2, \
            StoreDelete(labeledDataSet, TWEETS, en_space) as sd1:
        res = projection(download(es_space), download(en_space),
                         _read_words('es'), _read_words('en'))
        assert res.ndim == 2
        assert res.shape[0] == res.shape[1]
def test_BagOfWords_init():
    """Compare a default ``BagOfWords`` with one built from explicit tokens.

    The explicit tokens are the ``word2id`` keys of the ``b4msa_Es.tm``
    model; both instances must end up with vocabularies of the same size.
    """
    from microtc.utils import load_model
    from EvoMSA.utils import download

    default_bow = BagOfWords()
    text_model = load_model(download("b4msa_Es.tm"))
    token_list = list(text_model.model.word2id.keys())
    explicit_bow = BagOfWords(tokens=token_list)
    default_size = len(default_bow.tokenize.vocabulary)
    explicit_size = len(explicit_bow.tokenize.vocabulary)
    assert default_size == explicit_size
Beispiel #3
0
    def common_words(self, quantile: float = None, bigrams=True):
        """Words used frequently; these correspond to py:attr:`EvoMSA.base.EvoMSA(B4MSA=True)`
        In the case quantile is given, these words and bigrams correspond to
        the most frequent.

        :param quantile: Cumulative relative frequency to cover. When
            ``None`` the ``word2id`` mapping of the ``b4msa_<lang>.tm``
            model is returned instead.
        :param bigrams: Whether to append the most frequent bigrams
            (vocabulary keys containing ``~``) after the words.
        :return: The ``word2id`` mapping when ``quantile`` is ``None``;
            otherwise a list with the selected words followed by the
            selected bigrams, each group ordered by decreasing frequency.
        """

        if quantile is None:
            from EvoMSA.utils import download
            return load_model(download("b4msa_%s.tm" %
                                       self._lang)).model.word2id

        def _most_frequent(entries):
            # Shortest prefix (by decreasing frequency) whose cumulative
            # relative frequency exceeds ``quantile``.
            total = sum(count for _, count in entries)
            if not total:
                # Nothing to rank (e.g. vocabulary without bigrams).
                return []
            ranked = sorted(((token, count / total)
                             for token, count in entries),
                            key=lambda pair: pair[1], reverse=True)
            cum, size = 0, 0
            # The bound on ``size`` prevents running past the end when
            # ``quantile`` is >= the (floating point) total mass.
            while size < len(ranked) and cum <= quantile:
                cum += ranked[size][1]
                size += 1
            return [token for token, _ in ranked[:size]]

        items = list(self.voc.items())
        output = _most_frequent([(k, v) for k, v in items
                                 if k.count("~") == 0])
        if bigrams:
            output += _most_frequent([(k, v) for k, v in items
                                      if k.count("~")])
        return output
Beispiel #4
0
    def tm_words(self):
        """
        Text model words

        Loads the ``b4msa_<lang>.tm`` model and keeps the ``word2id`` keys
        that do not start with ``q:``, contain no ``~`` and are not in the
        collection returned by ``load_emojis``. Each kept key is passed
        through ``text_transformations``; the results are sorted and used
        as keys (all mapped to ``True``).

        :rtype: dict
        """

        transform = self.text_transformations
        emojis = self.load_emojis()
        model_path = download("b4msa_%s.tm" % self._lang)
        vocabulary = load_model(model_path).model.word2id
        selected = []
        for token in vocabulary.keys():
            if token[:2] == "q:" or token.count("~") != 0 or token in emojis:
                continue
            selected.append(transform(token))
        return OrderedDict((word, True) for word in sorted(selected))
Beispiel #5
0
 def __init__(self, tokens: Union[str, List[str]] = "Es"):
     """Build the tokenizer from a token list or a language identifier.

     :param tokens: Either the list of tokens to use, or a string that is
         interpolated into ``b4msa_%s.tm`` to download and load a text
         model whose ``word2id`` keys become the tokens.
     """
     from microtc.utils import load_model
     from EvoMSA.utils import download

     def squeeze(cdn):
         # Drop the empty pieces produced by leading, trailing or
         # repeated "~" separators.
         return "~".join(piece for piece in cdn.split("~") if piece)

     tokenizer = Tokenize()
     if not isinstance(tokens, list):
         text_model = load_model(download("b4msa_%s.tm" % tokens))
         vocabulary = list(text_model.model.word2id.keys())
         tokenizer.textModel = text_model
     else:
         vocabulary = tokens

     # Partition the vocabulary into "~"-joined keys, plain keys and
     # "q:"-prefixed keys (prefix removed).
     joined, plain, prefixed = [], [], []
     for key in vocabulary:
         if key[:2] == "q:":
             prefixed.append(squeeze(key[2:]))
         elif key.count("~"):
             joined.append(squeeze(key))
         else:
             plain.append(squeeze(key))

     # Fit order matters: joined keys first, then plain, then prefixed.
     tokenizer.fit(joined)
     tokenizer.fit(plain)
     tokenizer.fit([item for item in prefixed
                    if item.count("~") == 0 and len(item) >= 2])
     self._tokenize = tokenizer
     self._text = "text"
Beispiel #6
0
def test_EvoMSA_regression():
    """Fit ``EvoMSA`` with ``classifier=False`` and check output shapes.

    The labels are encoded with ``LabelEncoderWrapper`` and shifted by
    1.5; ``decision_function`` and ``predict`` must both return
    one-dimensional outputs with one entry per input.
    """
    from EvoMSA.base import LabelEncoderWrapper
    from EvoMSA.utils import download

    texts, labels = get_data()
    X = [{"text": text} for text in texts]
    encoder = LabelEncoderWrapper().fit(labels)
    target = encoder.transform(labels) - 1.5

    stacked_args = dict(popsize=10,
                        early_stopping_rounds=10,
                        time_limit=5,
                        n_estimators=2)
    model_spec = [[download("emo_Es.tm"), 'EvoMSA.model.Identity']]
    evo = EvoMSA(stacked_method_args=stacked_args,
                 classifier=False,
                 models=model_spec,
                 TR=False,
                 n_jobs=1).fit(X, target)
    assert evo

    df = evo.decision_function(X)
    print(df.shape, df.ndim)
    assert df.shape[0] == len(X)
    assert df.ndim == 1

    df = evo.predict(X)
    assert df.shape[0] == len(X)
    assert df.ndim == 1
Beispiel #7
0
def test_node_performance():
    """Check ``Node.performance`` over a two-fold split.

    One score per fold is expected, and the mean score must lie strictly
    between 0.40 and 1.
    """
    from EvoMSA.utils import download

    X, y = get_data()
    folds = KFold(n_splits=2, random_state=1, shuffle=True)
    models = {
        0: [download("b4msa_Es.tm"), "sklearn.svm.LinearSVC"],
        1: ["b4msa.textmodel.TextModel", "EvoMSA.model.Bernoulli"],
        2: ["EvoMSA.model.AggressivenessEs", "EvoMSA.model.Identity"],
    }
    cache_path = os.path.join("tm", "NB")
    node = Node([0],
                models,
                metric=lambda y, hy: f1_score(y, hy, average="macro"),
                split_dataset=folds,
                aggregate=lambda x: x,
                cache=cache_path)
    scores = node.performance(X, y)
    assert len(scores) == 2
    print(scores)
    mean_score = np.mean(scores)
    assert 0.40 < mean_score < 1