Example #1
    def _init(self, data):
        """
        Process the :py:attr:`data` to create a :py:class:`microtc.utils.Counter` 
        """
        def sum_vocs(vocs):
            voc = vocs[0]
            for v in vocs[1:]:
                voc = voc + v
            return voc

        if isinstance(data, list):
            vocs = [
                download_tokens(day, lang=self._lang, country=self._country)
                for day in data
            ]
            vocs = [load_model(x) for x in vocs]
            if isinstance(vocs[0], Counter):
                voc = sum_vocs(vocs)
            elif not self._states:
                vocs = [sum_vocs([v for _, v in i]) for i in vocs]
                voc = sum_vocs(vocs)
            else:
                voc = {k: v for k, v in vocs[0]}
                for v in vocs[1:]:
                    for k, d in v:
                        try:
                            voc[k] = voc[k] + d
                        except KeyError:
                            voc[k] = d
            self._data = voc
        else:
            self.voc = load_model(
                download_tokens(data, lang=self._lang, country=self._country))
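The list branch above merges per-day vocabularies in one of two ways: plain Counters are simply added, while per-region lists of (key, Counter) pairs are merged key by key. A minimal sketch of both strategies using the standard-library Counter (microtc's Counter supports the same addition); the day data below is made up:

from collections import Counter

day1 = [("MX-AGU", Counter(hola=3)), ("MX-BCN", Counter(hola=1, mar=2))]
day2 = [("MX-AGU", Counter(hola=2, sol=1))]

# states is False: collapse the regions, then add the days together
flat = sum((voc for _, voc in day1 + day2), Counter())
assert flat["hola"] == 6

# states is True: keep one Counter per region, merging day by day
voc = {}
for key, cnt in day1 + day2:
    voc[key] = voc.get(key, Counter()) + cnt
assert voc["MX-AGU"]["hola"] == 5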
Example #2
def projection(model_from, model_to, text_from, text_to):
    """
    Compute the coefficients to project the output of an Emoji Space in the origin language onto the objective language

    :param model_from: Origin model
    :type model_from: str
    :param model_to: Objective model
    :type model_to: str [ar|en|es]
    :param text_from: Text in the origin language
    :type text_from: list
    :param text_to: Text in the objective language
    :type text_to: list
    """

    from microtc.utils import load_model
    from tqdm import tqdm
    import numpy as np
    from sklearn.neighbors import KDTree
    model_from = load_model(model_from)
    model_to = load_model(model_to)
    vec_from = model_from.transform(text_from)
    vec_to = model_to.transform(text_to)
    done = set()
    output = []
    X = []
    kdtree = KDTree(vec_to, metric='manhattan')
    ss = kdtree.query(vec_from)[1].flatten()
    for k, j in tqdm(enumerate(ss)):
        if j in done:
            continue
        X.append(vec_from[k])
        output.append(vec_to[j])
        done.add(j)
    output = np.stack(output)
    X = np.stack(X)
    return np.linalg.lstsq(X, output, rcond=None)[0]
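Since projection returns the least-squares coefficient matrix, mapping new origin-language vectors into the objective space is a single matrix product. A hypothetical usage sketch, assuming transform yields a dense array (as the KDTree call above requires); the file names and text lists are placeholders:

coef = projection('emo_Es.model', 'emo_En.model', text_es, text_en)
model_es = load_model('emo_Es.model')
vec_en = model_es.transform(new_text_es) @ coef  # vectors now live in the objective space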
Example #3
 def predict(fname, ds, tm, emoji):
     D = []
     for key, tweets in load_model(fname).items():
         labels = [ds.klass(x['text']) for x in tweets]
         _ = [[x['text'], label] for label, x in zip(labels, tweets)
              if len(klasses.intersection(label))]
         D.extend(_)
     X = tm.transform([x for x, _ in D])
     y = [y for _, y in D]
     hy = []
     for k, emo in enumerate(emoji):
         output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
         m = load_model(f'{output}.LinearSVC')
         hy.append(m.predict(X))
     return y, hy
Example #4
def recall_emo(lang='zh', n_jobs=1):
    def predict(fname, ds, tm, emoji):
        D = []
        for key, tweets in load_model(fname).items():
            labels = [ds.klass(x['text']) for x in tweets]
            _ = [[x['text'], label] for label, x in zip(labels, tweets)
                 if len(klasses.intersection(label))]
            D.extend(_)
        X = tm.transform([x for x, _ in D])
        y = [y for _, y in D]
        hy = []
        for k, emo in enumerate(emoji):
            output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
            m = load_model(f'{output}.LinearSVC')
            hy.append(m.predict(X))
        return y, hy

    def performance(emo, y, hy):
        y_emo = [emo in i for i in y]
        perf = recall_score(y_emo, hy > 0, pos_label=True)
        return perf, sum(y_emo) / len(y)

    info = load_model(join('models', f'{lang}_emo.info'))
    info = [[k, v] for k, (v, _) in enumerate(info.most_common())
            if _ >= 2**10]
    klasses = set([v for k, v in info])
    fnames = glob(join('data', lang, 'test', '*.gz'))
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    dd = load_model(join('models', f'{lang}_emo.info'))
    emoji = [x for x, v in dd.most_common() if v >= 2**10]
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    predictions = Parallel(n_jobs=n_jobs)(
        delayed(predict)(fname, ds, tm, emoji) for fname in fnames)
    y = []
    for x, _ in predictions:
        y.extend(x)
    hys = np.vstack([np.vstack(hy).T for _, hy in predictions])
    _ = Parallel(n_jobs=n_jobs)(delayed(performance)(emo, y, hy)
                                for emo, hy in zip(emoji, hys.T))
    output = {
        emo: {
            'recall': perf,
            'ratio': ratio
        }
        for emo, (perf, ratio) in zip(emoji, _)
    }
    save_model(output, join('models', f'{lang}_emo.perf'))
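A toy illustration of what the performance helper computes: the recall of one emoji's binary classifier, thresholding its decision function at zero, together with the emoji's ratio in the gold labels. The values below are made up:

from sklearn.metrics import recall_score
import numpy as np

y = [{'😀'}, {'😢'}, {'😀', '😢'}]  # gold emoji sets per tweet
hy = np.array([1.2, -0.3, 0.8])     # decision function of the '😀' classifier
y_emo = ['😀' in i for i in y]
print(recall_score(y_emo, hy > 0, pos_label=True))  # 1.0
print(sum(y_emo) / len(y))                          # 0.666...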
Example #5
def data_bow(lang='zh', size=2**19):
    num_tweets = {
        '{year}{month:02d}{day:02d}.gz'.format(**k): v
        for k, v in num_tweets_language(lang=lang)
    }
    files = [[num_tweets[basename(x)], x]
             for x in glob(join('data', lang, '*.gz'))]
    files.sort(key=lambda x: x[0])
    files = [x[1] for x in files]
    per_file = size / len(files)
    output = []
    for k, file in tqdm(enumerate(files), total=len(files)):
        tweets = load_model(file)
        for key in tweets:
            shuffle(tweets[key])
        cnt = [[key, len(tweets[key])] for key in tweets]
        cnt.sort(key=lambda x: x[1])
        per_place = int(np.ceil(per_file / len(cnt)))  # true division; // would make the ceil a no-op
        prev = len(output)
        for i, (key, n) in enumerate(cnt):
            _ = [x['text'] for x in tweets[key][:per_place]]
            output.extend(_)
            if len(_) < per_place and i < len(cnt) - 1:
                per_place += int(
                    np.ceil((per_place - len(_)) / (len(cnt) - (i + 1))))
        inc = len(output) - prev
        if inc < per_file and k < len(files) - 1:
            per_file += (per_file - inc) / (len(files) - (k + 1))
    shuffle(output)
    return output
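The two quota updates spread any shortfall over the files or places that remain, so the overall budget is still met when early sources run dry. The same arithmetic on a made-up example of three places with an initial quota of 10 tweets each:

import numpy as np

counts = [4, 9, 30]        # tweets available per place, sorted ascending
per_place, taken = 10, []
for i, n in enumerate(counts):
    got = min(n, per_place)
    taken.append(got)
    if got < per_place and i < len(counts) - 1:
        # push the missing tweets onto the remaining places
        per_place += int(np.ceil((per_place - got) / (len(counts) - (i + 1))))
print(taken, sum(taken))   # [4, 9, 17] 30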
Example #6
def count_emo(lang='zh'):
    fnames = glob(join('data', lang, 'emo', '*.gz'))
    cnt = Counter()
    for fname in fnames:
        for key, data in load_model(fname).items():
            for x in data:
                if len(x['klass']) == 1:
                    cnt.update(x['klass'])
    return cnt
Example #7
 def main(self):
     self.data = self.parser.parse_args()
     svc = load_model(self.data.model)
     X = [svc.model[x] for x in read_data(self.data.test_set)]
     output = self.get_output()
     if output.endswith('.gz'):
         gzip_flag = True
         output = gzip.open(output, 'wb')
     else:
         gzip_flag = False
         output = open(output, 'w')
     with output as fpt:
         if not self.data.decision_function:
             hy = svc.predict(X)
             for tweet, klass in zip(tweet_iterator(self.data.test_set), hy):
                 tweet['klass'] = str(klass)
                 cdn = json.dumps(tweet)+"\n"
                 cdn = bytes(cdn, encoding='utf-8') if gzip_flag else cdn
                 fpt.write(cdn)
         else:
             hy = svc.decision_function(X)
             for tweet, klass in zip(tweet_iterator(self.data.test_set), hy):
                 try:
                     o = klass.tolist()
                 except AttributeError:
                     o = klass
                 tweet['decision_function'] = o
                 cdn = json.dumps(tweet)+"\n"
                 cdn = bytes(cdn, encoding='utf-8') if gzip_flag else cdn
                 fpt.write(cdn)
Example #8
def test_BagOfWords_init():
    from microtc.utils import load_model
    from EvoMSA.utils import download
    tm = BagOfWords()
    xx = list(load_model(download("b4msa_Es.tm")).model.word2id.keys())
    tm2 = BagOfWords(tokens=xx)
    assert len(tm.tokenize.vocabulary) == len(tm2.tokenize.vocabulary)
Example #9
def test_download_tokens():
    from text_models.utils import download_tokens
    from microtc.utils import load_model
    from os.path import isfile
    from os import unlink

    fname = download_tokens(dict(year=2020, month=2, day=14))
    assert isfile(fname)
    model = load_model(fname)
    print(model.most_common(10), model.update_calls)
    unlink(fname)
    fname = download_tokens(dict(year=2020, month=2, day=14), country="MX")
    assert isfile(fname)
    model2 = load_model(fname)
    assert len(model) != len(model2[0][1])
    unlink(fname)
Example #10
    def common_words(self, quantile: float = None, bigrams=True):
        """Words used frequently; these correspond to py:attr:`EvoMSA.base.EvoMSA(B4MSA=True)`
        In the case quantile is given the these words and bigrams correspond to 
        the most frequent.
        """

        if quantile is None:
            from EvoMSA.utils import download
            return load_model(download("b4msa_%s.tm" %
                                       self._lang)).model.word2id
        words_N = sum([v for k, v in self.voc.items() if k.count("~") == 0])
        score = [[k, v / words_N] for k, v in self.voc.items()
                 if k.count("~") == 0]
        score.sort(key=lambda x: x[1], reverse=True)
        cum, k = 0, 0
        while cum <= quantile:
            cum += score[k][1]
            k += 1
        output = [k for k, _ in score[:k]]
        if bigrams:
            bigrams_N = sum([v for k, v in self.voc.items() if k.count("~")])
            score_bi = [[k, v / bigrams_N] for k, v in self.voc.items()
                        if k.count("~")]
            score_bi.sort(key=lambda x: x[1], reverse=True)
            cum, k = 0, 0
            while cum <= quantile:
                cum += score_bi[k][1]
                k += 1
            output += [k for k, _ in score_bi[:k]]
        return output
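The quantile loop walks the frequency-sorted list until the cumulative relative frequency exceeds the threshold. The same selection on a made-up vocabulary:

voc = {"de": 50, "la": 30, "el": 15, "sol": 5}
N = sum(voc.values())
score = sorted(([k, v / N] for k, v in voc.items()),
               key=lambda x: x[1], reverse=True)
cum, k = 0, 0
while cum <= 0.8:          # quantile = 0.8
    cum += score[k][1]
    k += 1
print([w for w, _ in score[:k]])  # ['de', 'la', 'el'], 95% of the mass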
Example #11
def test_evo_test_set():
    from EvoMSA.base import EvoMSA
    sys.argv = ['EvoMSA', '--evodag-kw={"popsize": 10, "early_stopping_rounds": 10, "time_limit": 5, "n_estimators": 5}',
                '-ot.model', '--test_set', TWEETS, '-n2', TWEETS]
    train(output=True)
    evo = load_model('t.model')
    assert isinstance(evo, EvoMSA)
    os.unlink('t.model')
Example #12
def test_bug_two_cons_klass():
    from EvoMSA.utils import download
    from microtc.utils import load_model
    from os.path import dirname, join
    _ = join(dirname(__file__), "..", "data", "country.ds")
    dset = load_model(_)
    r = dset.klass("mexico y usa")
    assert len(r.intersection(set(["US", "MX"]))) == 2
Example #13
 def load_model(fname):
     from microtc.utils import load_model
     if os.path.isfile(fname):
         return load_model(fname)
     else:
         cls = CommandLine.get_class(fname)
         ins = cls()
         return ins
Example #14
 def main(self):
     self.data = self.parser.parse_args()
     svc = load_model(self.data.model)
     with open(self.get_output(), 'w') as fpt:
         for tw in tweet_iterator(self.data.test_set):
             extra = dict([(int(a), float(b)) for a, b in svc.model[tw['text']]]
                          + [('num_terms', svc.num_terms)])
             tw.update(extra)
             fpt.write(json.dumps(tw) + "\n")
Example #15
def test_evo_kwargs():
    from EvoMSA.base import EvoMSA
    sys.argv = ['EvoMSA', '--kw={"stacked_method_args": {"popsize": 10, "early_stopping_rounds": 10, "time_limit": 5, "n_estimators": 5}, "b4msa_args": {"del_dup":false}}',
                '-ot.model',
                '-n2', TWEETS]
    train(output=True)
    evo = load_model('t.model')
    assert isinstance(evo, EvoMSA)
    os.unlink('t.model')
Example #16
def test_textmodel_save_load():
    import os
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator, save_model, load_model
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    tm = TextModel().fit(tw)
    save_model(tm, 't.model')
    assert isinstance(load_model('t.model'), TextModel)
    os.unlink('t.model')
Example #17
 def __init__(
         self,
         fnames: Union[list, str],
         reader: Callable[[str], Iterable[dict]] = tweet_iterator) -> None:
     self._fnames = fnames if isinstance(fnames, list) else [fnames]
     self._reader = reader
     self._label = BoundingBox().label
     self._data = defaultdict(TokenCount.single_co_ocurrence)
     _ = join(dirname(__file__), "data", "state.dict")
     self._states = load_model(_)
Example #18
def vector_space(args):
    k, t, X, output = args
    if output is not None and os.path.isfile(output):
        return k, load_model(output)
    try:
        res = t.transform(X)
    except AttributeError:
        res = t.tonp([t[_] for _ in X])
    if output is not None:
        save_model(res, output)
    return k, res
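vector_space acts as a compute-or-load cache: when output names an existing file the transformed matrix is read back, otherwise it is computed and saved for next time. A usage sketch; the text model and cache path here are hypothetical:

# The second call with the same cache file skips the transform entirely.
k, res = vector_space((0, text_model, ["buenos días"], "tm0.cache"))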
Example #19
def test_counter():
    from microtc.utils import Counter, save_model, load_model
    import os
    c = Counter()
    c.update([1, 2, 3, 1])
    c.update([3])
    assert c[1] == 2
    print(c.update_calls)
    assert c.update_calls == 2
    save_model(c, "t.voc")
    cc = load_model("t.voc")
    os.unlink("t.voc")
    print(cc.update_calls, "**")
    assert cc.update_calls == 2
Example #20
 def model(self, X):
     if not isinstance(X[0], list):
         X = [X]
     m = []
     kwargs = self._b4msa_args
     self._logger.info("Starting TextModel")
     self._logger.info(str(kwargs))
     for x in X:
         for tm, cl in self.models:
             if isinstance(tm, str):
                 m.append(load_model(tm))
             else:
                 m.append(tm(x, **kwargs))
     self._textModel = m
Example #21
def get_model(basename, data, labels, args):
    modelfile = get_filename(args, os.path.join("models", os.path.basename(basename)))

    if not os.path.exists(modelfile):

        if not os.path.isdir("models"):
            os.mkdir("models")

        args['docs'] = data
        model = TextModel(**args)
        save_model(model, modelfile)
    else:
        model = load_model(modelfile)
    return model
Example #22
def emo(k, lang='zh', size=2**19):
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
    dd = load_model(join('models', f'{lang}_emo.info'))
    _ = [x for x, v in dd.most_common() if v >= 2**10]
    tot = sum([v for x, v in dd.most_common() if v >= 2**10])
    if k >= len(_):
        return
    pos = _[k]
    neg = set([x for i, x in enumerate(_) if i != k])
    POS, NEG, ADD = [], [], []
    for fname in glob(join('data', lang, 'emo', '*.gz')):
        for key, data in load_model(fname).items():
            for d in data:
                klass = d['klass']
                if len(klass) == 1:
                    klass = klass.pop()
                    if klass == pos:
                        POS.append(ds.process(d['text']))
                    elif klass in neg:
                        NEG.append(ds.process(d['text']))
                elif tot < size:
                    if pos not in klass and len(klass.intersection(neg)):
                        ADD.append(ds.process(d['text']))
    shuffle(POS)
    shuffle(NEG)
    shuffle(ADD)
    size2 = size // 2
    POS = POS[:size2]
    if len(NEG) < size2:
        NEG.extend(ADD)
    NEG = NEG[:size2]
    y = [1] * len(POS)
    y.extend([-1] * len(NEG))
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    X = tm.transform(POS + NEG)
    m = LinearSVC().fit(X, y)
    save_model(m, f'{output}.LinearSVC')
Example #23
 def fit_svm(self, Xvs, y):
     svc_models = []
     for (_, cl), X, output in zip(self.models, Xvs, self.cache.ml_train()):
         if output is not None and os.path.isfile(output):
             svc_models.append(load_model(output))
             continue
         try:
             c = cl(random_state=self._seed)
         except TypeError:
             c = cl()
         c.fit(X, y)
         svc_models.append(c)
         if output is not None:
             save_model(c, output)
     self._svc_models = svc_models
Example #24
def get_model(basename, data, labels, args):
    modelfile = get_filename(
        args, os.path.join("models", os.path.basename(basename)))

    if not os.path.exists(modelfile):

        if not os.path.isdir("models"):
            os.mkdir("models")

        args['docs'] = data
        model = TextModel(**args)
        save_model(model, modelfile)
    else:
        model = load_model(modelfile)
    return model
Example #25
    def postal_code_names(self):
        """
        Dictionary containing a description of a postal code

        >>> from text_models.place import CP
        >>> cp = CP()
        >>> cp.postal_code_names["58000"]
        ['16', 'Michoacán de Ocampo', '053', 'Morelia']
        """
        try:
            return self._pc_names
        except AttributeError:
            path = join(dirname(__file__), "data", "CP.desc")
            self._pc_names = load_model(path)
        return self._pc_names
Example #26
 def __init__(self, day=None, window=30, end=None,
              data: Callable[[str], str]=download_geo,
              countries: Union[set, None]=None):
     path = join(dirname(__file__), "data", "state.dict")
     self._states = load_model(path)
     path = join(dirname(__file__), "data", "bbox_country.dict")
     self._n_states = load_model(path)
     self._bbox = BoundingBox()
     self._dates = list()
     delta = datetime.timedelta(days=1)
     init = datetime.datetime(year=2015, month=12, day=16)
     day = self.__handle_day(day)
     if end is not None:
         end = self.__handle_day(end)
         if end > day:
             end, day = day, end
         window = (day - end).days + 1
     days = []
     while len(days) < window and day >= init:
         try:
             fname = data("%s%02i%02i.travel" % (str(day.year)[-2:],
                                                 day.month,
                                                 day.day))
         except Exception:
             day = day - delta
             continue
         self._dates.append(day)
         day = day - delta
         _ = load_model(fname)
         _[0] = self.keep_only(_[0], countries)
         days.append(_)
     self._days = [x for x, _ in days]
     self.num_users = [x for _, x in days]
     self._days.reverse()
     self.num_users.reverse()
     self._dates.reverse()
Example #27
    def tm_words(self):
        """
        Text model words

        :rtype: dict
        """

        tm = self.text_transformations
        emos = self.load_emojis()
        textModel = load_model(download("b4msa_%s.tm" % self._lang))
        words = [
            tm(k) for k in textModel.model.word2id.keys()
            if k[:2] != "q:" and k.count("~") == 0 and k not in emos
        ]
        words.sort()
        _ = OrderedDict([(w, True) for w in words])
        return _
Example #28
 def __init__(self, tokens: Union[str, List[str]] = "Es"):
     from microtc.utils import load_model
     from EvoMSA.utils import download
     tok = Tokenize()
     if isinstance(tokens, list):
         xx = tokens
     else:
         textModel = load_model(download("b4msa_%s.tm" % tokens))
         xx = list(textModel.model.word2id.keys())
         tok.textModel = textModel
     f = lambda cdn: "~".join([x for x in cdn.split("~") if len(x)])
     tok.fit([f(k) for k in xx if k.count("~") and k[:2] != "q:"])
     tok.fit([f(k) for k in xx if k.count("~") == 0 and k[:2] != "q:"])
     qgrams = [f(k[2:]) for k in xx if k[:2] == "q:"]
     tok.fit([x for x in qgrams if x.count("~") == 0 if len(x) >= 2])
     self._tokenize = tok
     self._text = "text"
Example #29
 def __init__(self,
              data,
              lang: str = "Es",
              country: str = 'nogeo',
              states: bool = False) -> None:
     self._lang = lang
     self._country = country
     self._states = states
     if isinstance(data, dict) and len(data) > 3:
         self._data = data
     elif isinstance(data, str) and isfile(data):
         self.voc = load_model(data)
     else:
         self.date = data
         self._init(data)
     if not states:
         self._n_words = sum(
             [v for k, v in self.voc.items() if k.count("~") == 0])
         self._n_bigrams = sum(
             [v for k, v in self.voc.items() if k.count("~")])
Example #30
    def textModels(self):
        """Text Models

        :rtype: list
        """

        # Performing lazy loading
        # If the outputs are in the cache,
        # there is no need to load the model into memory
        solve = [(i, tm)
                 for (i, tm), cache in zip(enumerate(self._textModel),
                                           self.cache)
                 if isinstance(tm, str) and
                 (cache is None or not os.path.isfile(cache))]
        for i, tm in solve:
            _ = load_model(tm)
            if isinstance(_, EvoMSA):
                _ = EvoMSAWrapper(evomsa=_)
            self._textModel[i] = _
        return self._textModel
Example #31
def test_train_exogenous():
    from EvoMSA.base import EvoMSA
    import json
    with open('ex.json', 'w') as fpt:
        for x in tweet_iterator(TWEETS):
            x['decision_function'] = x['q_voc_ratio']
            fpt.write(json.dumps(x) + '\n')
    sys.argv = ['EvoMSA', '-ot.model', '-n2',
                '--exogenous', 'ex.json', 'ex.json',
                '--evodag-kw={"popsize": 10, "early_stopping_rounds": 10, "time_limit": 5, "n_estimators": 5}',
                TWEETS]
    train(output=True)
    evo = load_model('t.model')
    assert isinstance(evo, EvoMSA)
    os.unlink('t.model')
    m = evo._evodag_model._m.models[0]
    os.unlink('ex.json')
    print(m.nvar)
    assert m.nvar == 6
    assert evo.n_jobs == 2