def test_projection():
    """Check that ``projection`` yields a square two-dimensional array.

    The Spanish and English ``.evoemo`` resources are wrapped in
    ``StoreDelete`` context managers for the duration of the call.
    """
    from EvoMSA.align import projection
    from EvoMSA.utils import download

    es_space = 'emo-static-es.evoemo'
    en_space = 'emo-static-en.evoemo'
    # NOTE(review): the second manager receives ``labeledDataSet`` while the
    # first receives ``LabeledDataSet.create_space`` -- confirm that both
    # names exist in this module and that the difference is intentional.
    with StoreDelete(LabeledDataSet.create_space, TWEETS, es_space), \
            StoreDelete(labeledDataSet, TWEETS, en_space):
        res = projection(download(es_space),
                         download(en_space),
                         _read_words('es'),
                         _read_words('en'))
        assert res.ndim == 2
        n_rows, n_cols = res.shape
        assert n_rows == n_cols
def test_BagOfWords_init():
    """Compare a default ``BagOfWords`` with one built from explicit tokens.

    The explicit tokens are the ``word2id`` keys of the ``b4msa_Es.tm``
    model; both instances must end up with vocabularies of the same size.
    """
    from microtc.utils import load_model
    from EvoMSA.utils import download

    default_bow = BagOfWords()
    spanish_model = load_model(download("b4msa_Es.tm"))
    vocabulary = list(spanish_model.model.word2id.keys())
    explicit_bow = BagOfWords(tokens=vocabulary)

    default_size = len(default_bow.tokenize.vocabulary)
    explicit_size = len(explicit_bow.tokenize.vocabulary)
    assert default_size == explicit_size
def common_words(self, quantile: float = None, bigrams=True):
    """Words used frequently; these correspond to
    py:attr:`EvoMSA.base.EvoMSA(B4MSA=True)`.

    In the case *quantile* is given, these words and bigrams correspond
    to the most frequent ones found in ``self.voc``.

    :param quantile: Cumulative relative frequency to cover. Tokens are
        taken in decreasing order of frequency until the accumulated
        relative frequency exceeds this value, or until there are no
        tokens left. When ``None`` the ``word2id`` mapping of the
        ``b4msa_<lang>.tm`` model is returned instead.
    :param bigrams: Whether the most frequent bigrams (tokens containing
        ``~``) are appended after the words. Only used when *quantile*
        is given.
    :return: ``word2id`` of the downloaded model when *quantile* is
        ``None``; otherwise a list of tokens, words first.
    """
    if quantile is None:
        from EvoMSA.utils import download

        return load_model(download("b4msa_%s.tm" % self._lang)).model.word2id

    def top_tokens(want_bigram):
        """Most frequent tokens of one kind (words or bigrams)."""
        counts = [
            (token, count)
            for token, count in self.voc.items()
            if ("~" in token) == want_bigram
        ]
        total = sum(count for _, count in counts)
        # ``sorted`` is stable, so ties keep the iteration order of ``voc``.
        ranked = sorted(
            ((token, count / total) for token, count in counts),
            key=lambda pair: pair[1],
            reverse=True,
        )
        cum, taken = 0, 0
        # The length guard prevents an IndexError when the quantile is never
        # exceeded (e.g. quantile >= 1, float rounding) or ``ranked`` is empty.
        while taken < len(ranked) and cum <= quantile:
            cum += ranked[taken][1]
            taken += 1
        return [token for token, _ in ranked[:taken]]

    output = top_tokens(False)
    if bigrams:
        output += top_tokens(True)
    return output
def tm_words(self):
    """
    Text model words

    Takes the ``word2id`` keys of the ``b4msa_<lang>.tm`` model, discards
    keys starting with ``q:``, keys containing ``~`` and keys found in
    ``self.load_emojis()``, and applies ``self.text_transformations`` to
    the rest. The results are sorted and used as keys of the returned
    mapping, every value being ``True``.

    :rtype: dict
    """
    transform = self.text_transformations
    emojis = self.load_emojis()
    model = load_model(download("b4msa_%s.tm" % self._lang))

    kept = []
    for key in model.model.word2id.keys():
        if key[:2] == "q:":
            continue
        if "~" in key:
            continue
        if key in emojis:
            continue
        kept.append(transform(key))

    return OrderedDict.fromkeys(sorted(kept), True)
def __init__(self, tokens: Union[str, List[str]] = "Es"):
    """Build the tokenizer from a list of tokens or a language code.

    :param tokens: Either the list of tokens to use, or a string that is
        interpolated into ``b4msa_%s.tm`` to download a text model whose
        ``word2id`` keys provide the tokens.
    """
    from microtc.utils import load_model
    from EvoMSA.utils import download

    def strip_empty(cdn):
        # Remove the empty pieces left by leading/trailing/repeated "~".
        return "~".join([piece for piece in cdn.split("~") if len(piece)])

    tokenizer = Tokenize()
    if isinstance(tokens, list):
        vocabulary = tokens
    else:
        text_model = load_model(download("b4msa_%s.tm" % tokens))
        vocabulary = list(text_model.model.word2id.keys())
        tokenizer.textModel = text_model

    # Partition the vocabulary once; each group keeps the original order.
    bigram_tokens, word_tokens, qgram_tokens = [], [], []
    for key in vocabulary:
        if key[:2] == "q:":
            qgram_tokens.append(strip_empty(key[2:]))
        elif key.count("~"):
            bigram_tokens.append(strip_empty(key))
        else:
            word_tokens.append(strip_empty(key))

    # The tokenizer is fitted in this order: bigrams, words, q-grams.
    tokenizer.fit(bigram_tokens)
    tokenizer.fit(word_tokens)
    tokenizer.fit([q for q in qgram_tokens
                   if q.count("~") == 0 and len(q) >= 2])

    self._tokenize = tokenizer
    self._text = "text"
def test_EvoMSA_regression():
    """Fit ``EvoMSA`` with ``classifier=False`` and check the output shapes.

    The labels are encoded with ``LabelEncoderWrapper`` and shifted by 1.5
    before fitting; ``decision_function`` and ``predict`` must both return
    one-dimensional outputs with one entry per input.
    """
    from EvoMSA.base import LabelEncoderWrapper
    from EvoMSA.utils import download

    X, y = get_data()
    X = [{"text": x} for x in X]
    encoder = LabelEncoderWrapper().fit(y)
    y = encoder.transform(y) - 1.5

    stacked_args = dict(popsize=10, early_stopping_rounds=10,
                        time_limit=5, n_estimators=2)
    models = [[download("emo_Es.tm"), 'EvoMSA.model.Identity']]
    evo = EvoMSA(stacked_method_args=stacked_args,
                 classifier=False,
                 models=models,
                 TR=False,
                 n_jobs=1).fit(X, y)
    assert evo

    n_samples = len(X)
    df = evo.decision_function(X)
    print(df.shape, df.ndim)
    assert df.shape[0] == n_samples
    assert df.ndim == 1

    df = evo.predict(X)
    assert df.shape[0] == n_samples
    assert df.ndim == 1
def test_node_performance():
    """Check ``Node.performance`` on a two-fold split.

    One score per fold is expected, and the mean macro-F1 must lie
    strictly between 0.40 and 1.
    """
    from EvoMSA.utils import download

    def macro_f1(y, hy):
        return f1_score(y, hy, average="macro")

    X, y = get_data()
    folds = KFold(n_splits=2, random_state=1, shuffle=True)
    models = {
        0: [download("b4msa_Es.tm"), "sklearn.svm.LinearSVC"],
        1: ["b4msa.textmodel.TextModel", "EvoMSA.model.Bernoulli"],
        2: ["EvoMSA.model.AggressivenessEs", "EvoMSA.model.Identity"],
    }
    cache_path = os.path.join("tm", "NB")
    node = Node([0], models,
                metric=macro_f1,
                split_dataset=folds,
                aggregate=lambda x: x,
                cache=cache_path)
    # node.fit(X[:500], y[:500])
    perf = node.performance(X, y)
    assert len(perf) == 2
    print(perf)
    perf = np.mean(perf)
    assert 0.40 < perf < 1