def _init(self, data):
    """
    Process :py:attr:`data` to create the vocabulary.

    When ``data`` is a list, each element is passed to ``download_tokens``,
    the resulting files are loaded and merged, and the merge is stored in
    ``self._data``.  Otherwise a single file is downloaded, loaded and
    assigned to ``self.voc``.
    """

    def merge(items):
        # Fold with ``+`` starting from the first element.
        total = items[0]
        for item in items[1:]:
            total = total + item
        return total

    if not isinstance(data, list):
        fname = download_tokens(data, lang=self._lang, country=self._country)
        self.voc = load_model(fname)
        return

    fnames = [
        download_tokens(day, lang=self._lang, country=self._country)
        for day in data
    ]
    loaded = [load_model(fname) for fname in fnames]
    if isinstance(loaded[0], Counter):
        merged = merge(loaded)
    elif not self._states:
        # Each loaded object iterates as (key, value) pairs; collapse every
        # one of them first and then merge across the elements of ``data``.
        per_item = [merge([value for _, value in pairs]) for pairs in loaded]
        merged = merge(per_item)
    else:
        # Keep one entry per key and add the values key by key.
        merged = {}
        for key, value in loaded[0]:
            merged[key] = value
        for pairs in loaded[1:]:
            for key, value in pairs:
                if key in merged:
                    merged[key] = merged[key] + value
                else:
                    merged[key] = value
    self._data = merged
def projection(model_from, model_to, text_from, text_to):
    """
    Compute the coefficients to project the output of an Emoji Space
    in the origin language to the objective language

    :param model_from: Origin model (argument given to ``load_model``)
    :type model_from: str
    :param model_to: Objective model (argument given to ``load_model``)
    :type model_to: str
    :param text_from: Text in the origin language
    :type text_from: list
    :param text_to: Text in the objective language
    :type text_to: list
    :return: Least-squares solution returned by ``np.linalg.lstsq``
    """
    from microtc.utils import load_model
    import numpy as np
    from sklearn.neighbors import KDTree

    source_model = load_model(model_from)
    target_model = load_model(model_to)
    source_vecs = source_model.transform(text_from)
    target_vecs = target_model.transform(text_to)

    # Nearest target vector (Manhattan distance) for every source vector.
    tree = KDTree(target_vecs, metric='manhattan')
    nearest = tree.query(source_vecs)[1].flatten()

    # Each target vector is paired only with the first source that reaches it.
    used = set()
    targets = []
    sources = []
    for src_idx, dst_idx in tqdm(enumerate(nearest)):
        if dst_idx in used:
            continue
        sources.append(source_vecs[src_idx])
        targets.append(target_vecs[dst_idx])
        used.add(dst_idx)
    targets = np.stack(targets)
    sources = np.stack(sources)
    return np.linalg.lstsq(sources, targets, rcond=None)[0]
def predict(fname, ds, tm, emoji):
    """
    Predict, with one stored LinearSVC per entry of ``emoji``, the texts in
    ``fname`` whose labels intersect ``klasses``.

    NOTE(review): ``klasses`` and ``lang`` are not parameters; they are
    expected to come from an enclosing scope.

    :return: the labels of the selected texts and the list of predictions
        (one element per entry of ``emoji``)
    """
    texts = []
    gold = []
    for _key, tweets in load_model(fname).items():
        labels = [ds.klass(tweet['text']) for tweet in tweets]
        for label, tweet in zip(labels, tweets):
            if len(klasses.intersection(label)):
                texts.append(tweet['text'])
                gold.append(label)
    X = tm.transform(texts)
    hy = []
    for k, _emo in enumerate(emoji):
        output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
        classifier = load_model(f'{output}.LinearSVC')
        hy.append(classifier.predict(X))
    return gold, hy
def recall_emo(lang='zh', n_jobs=1):
    """
    Measure the recall of the per-emoji classifiers on the test files and
    store the result in ``models/{lang}_emo.perf``.

    :param lang: Language code used to build the data and model paths
    :type lang: str
    :param n_jobs: Number of jobs given to ``Parallel``
    :type n_jobs: int

    The saved object is a dictionary ``{emoji: {'recall': ..., 'ratio': ...}}``
    where ``ratio`` is the fraction of test texts labelled with the emoji.
    """

    def predict(fname, ds, tm, emoji):
        # Keep only the texts sharing at least one label with ``klasses``.
        D = []
        for _key, tweets in load_model(fname).items():
            labels = [ds.klass(x['text']) for x in tweets]
            D.extend([x['text'], label]
                     for label, x in zip(labels, tweets)
                     if len(klasses.intersection(label)))
        X = tm.transform([x for x, _ in D])
        y = [label for _, label in D]
        hy = []
        for k, _emo in enumerate(emoji):
            output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
            m = load_model(f'{output}.LinearSVC')
            hy.append(m.predict(X))
        return y, hy

    def performance(emo, y, hy):
        y_emo = [emo in i for i in y]
        perf = recall_score(y_emo, hy > 0, pos_label=True)
        return perf, sum(y_emo) / len(y)

    # The info model is read once; both the ordered list of emojis and the
    # set of accepted labels come from the same ``most_common`` filter.
    dd = load_model(join('models', f'{lang}_emo.info'))
    emoji = [x for x, v in dd.most_common() if v >= 2**10]
    klasses = set(emoji)
    fnames = glob(join('data', lang, 'test', '*.gz'))
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    predictions = Parallel(n_jobs=n_jobs)(
        delayed(predict)(fname, ds, tm, emoji) for fname in fnames)
    y = [label for labels, _ in predictions for label in labels]
    # One row per text and one column per emoji.
    hys = np.vstack([np.vstack(hy).T for _, hy in predictions])
    scores = Parallel(n_jobs=n_jobs)(
        delayed(performance)(emo, y, hy) for emo, hy in zip(emoji, hys.T))
    output = {
        emo: {'recall': perf, 'ratio': ratio}
        for emo, (perf, ratio) in zip(emoji, scores)
    }
    save_model(output, join('models', f'{lang}_emo.perf'))
def data_bow(lang='zh', size=2**19):
    """
    Sample texts from the files in ``data/{lang}/*.gz``.

    The budget ``size`` is split among the files (visited from the one with
    fewest tweets, as reported by ``num_tweets_language``) and, inside a
    file, among its keys (visited from the one with fewest tweets).  When a
    key or a file provides fewer texts than its share, the shortfall is
    redistributed among the remaining ones.

    :return: shuffled list of texts
    """
    num_tweets = {
        '{year}{month:02d}{day:02d}.gz'.format(**k): v
        for k, v in num_tweets_language(lang=lang)
    }
    files = sorted(glob(join('data', lang, '*.gz')),
                   key=lambda path: num_tweets[basename(path)])
    per_file = size / len(files)
    output = []
    for k, file in tqdm(enumerate(files), total=len(files)):
        tweets = load_model(file)
        for key in tweets:
            shuffle(tweets[key])
        cnt = sorted(([key, len(tweets[key])] for key in tweets),
                     key=lambda pair: pair[1])
        # NOTE(review): ``//`` already floors, so ``np.ceil`` has no effect
        # here -- confirm whether ``/`` was intended.
        per_place = int(np.ceil(per_file // len(cnt)))
        prev = len(output)
        for i, (key, _n) in enumerate(cnt):
            taken = [x['text'] for x in tweets[key][:per_place]]
            output.extend(taken)
            remaining_places = len(cnt) - (i + 1)
            if len(taken) < per_place and i < len(cnt) - 1:
                per_place += int(
                    np.ceil((per_place - len(taken)) / remaining_places))
        inc = len(output) - prev
        if inc < per_file and k < len(files) - 1:
            per_file += (per_file - inc) / (len(files) - (k + 1))
    shuffle(output)
    return output
def count_emo(lang='zh'):
    """
    Count the labels of the examples in ``data/{lang}/emo/*.gz`` that have
    exactly one label.

    :return: counter updated with the ``klass`` of each single-label example
    """
    cnt = Counter()
    for fname in glob(join('data', lang, 'emo', '*.gz')):
        for _key, data in load_model(fname).items():
            for example in data:
                if len(example['klass']) == 1:
                    cnt.update(example['klass'])
    return cnt
def main(self):
    """
    Predict the test set with the stored model and write one JSON object
    per line to the output; the output is gzip-compressed when its name
    ends with ``.gz``.

    Each tweet gets either the key ``klass`` (string of the prediction) or,
    when ``decision_function`` is set in the arguments, the key
    ``decision_function``.
    """
    self.data = self.parser.parse_args()
    svc = load_model(self.data.model)
    X = [svc.model[x] for x in read_data(self.data.test_set)]
    output = self.get_output()
    gzip_flag = output.endswith('.gz')
    if gzip_flag:
        output = gzip.open(output, 'wb')
    else:
        output = open(output, 'w')
    with output as fpt:
        use_df = bool(self.data.decision_function)
        hy = svc.decision_function(X) if use_df else svc.predict(X)
        for tweet, klass in zip(tweet_iterator(self.data.test_set), hy):
            if use_df:
                # Arrays are converted to lists so they can be serialised.
                try:
                    value = klass.tolist()
                except AttributeError:
                    value = klass
                tweet['decision_function'] = value
            else:
                tweet['klass'] = str(klass)
            cdn = json.dumps(tweet)+"\n"
            if gzip_flag:
                # The gzip file is opened in binary mode.
                cdn = bytes(cdn, encoding='utf-8')
            fpt.write(cdn)
def test_BagOfWords_init():
    """BagOfWords built from an explicit token list matches the default."""
    from microtc.utils import load_model
    from EvoMSA.utils import download

    default_bow = BagOfWords()
    text_model = load_model(download("b4msa_Es.tm"))
    tokens = list(text_model.model.word2id.keys())
    explicit_bow = BagOfWords(tokens=tokens)
    n_default = len(default_bow.tokenize.vocabulary)
    n_explicit = len(explicit_bow.tokenize.vocabulary)
    assert n_default == n_explicit
def test_download_tokens():
    """Download the tokens of one day, with and without a country."""
    from text_models.utils import download_tokens
    from microtc.utils import load_model
    from os.path import isfile
    from os import unlink

    # Default arguments.
    fname = download_tokens({"year": 2020, "month": 2, "day": 14})
    assert isfile(fname)
    model = load_model(fname)
    print(model.most_common(10), model.update_calls)
    unlink(fname)

    # Same day restricted to a country.
    fname = download_tokens({"year": 2020, "month": 2, "day": 14},
                            country="MX")
    assert isfile(fname)
    model2 = load_model(fname)
    assert len(model) != len(model2[0][1])
    unlink(fname)
def common_words(self, quantile: float = None, bigrams=True):
    """Words used frequently.

    When ``quantile`` is ``None`` the ``word2id`` mapping of the downloaded
    ``b4msa_{lang}.tm`` model is returned.  Otherwise the most frequent
    words of :py:attr:`voc` are returned: tokens are ranked by relative
    frequency and taken until the accumulated frequency exceeds
    ``quantile`` (or the tokens are exhausted).  Tokens containing ``~``
    are bigrams; they are ranked separately and appended when ``bigrams``
    is true.

    :param quantile: Accumulated relative frequency to cover
    :type quantile: float
    :param bigrams: Whether to append the most frequent bigrams
    :type bigrams: bool
    :rtype: list (``word2id`` mapping when ``quantile`` is ``None``)
    """
    if quantile is None:
        from EvoMSA.utils import download
        return load_model(download("b4msa_%s.tm" % self._lang)).model.word2id

    def most_frequent(pairs):
        # Rank the tokens by relative frequency, highest first.
        total = sum([v for _, v in pairs])
        score = [[k, v / total] for k, v in pairs]
        score.sort(key=lambda x: x[1], reverse=True)
        cum, n = 0, 0
        # The bound on ``n`` prevents indexing past the end when the
        # accumulated frequency never exceeds ``quantile`` (e.g. 1.0) or
        # when there are no tokens at all.
        while n < len(score) and cum <= quantile:
            cum += score[n][1]
            n += 1
        return [k for k, _ in score[:n]]

    items = list(self.voc.items())
    output = most_frequent([(k, v) for k, v in items if k.count("~") == 0])
    if bigrams:
        output += most_frequent([(k, v) for k, v in items if k.count("~")])
    return output
def test_evo_test_set():
    """Train from the command line giving a test set and reload the model."""
    from EvoMSA.base import EvoMSA

    evodag_kw = ('--evodag-kw={"popsize": 10, "early_stopping_rounds": 10, '
                 '"time_limit": 5, "n_estimators": 5}')
    sys.argv = ['EvoMSA', evodag_kw, '-ot.model',
                '--test_set', TWEETS, '-n2', TWEETS]
    train(output=True)
    model = load_model('t.model')
    assert isinstance(model, EvoMSA)
    os.unlink('t.model')
def test_bug_two_cons_klass():
    """Both countries must be found in a text that mentions the two."""
    from EvoMSA.utils import download
    from microtc.utils import load_model
    from os.path import dirname, join

    path = join(dirname(__file__), "..", "data", "country.ds")
    dset = load_model(path)
    found = dset.klass("mexico y usa")
    expected = set(["US", "MX"])
    assert len(found.intersection(expected)) == 2
def load_model(fname):
    """
    Load the model stored in ``fname``; when ``fname`` is not an existing
    file it is resolved with ``CommandLine.get_class`` and a new instance
    of the resulting class is returned.
    """
    from microtc.utils import load_model as _load_from_file

    if not os.path.isfile(fname):
        cls = CommandLine.get_class(fname)
        return cls()
    return _load_from_file(fname)
def main(self):
    """
    Transform each tweet of the test set with the stored model and write it,
    one JSON object per line, to the output file.

    The pairs given by ``svc.model[text]`` are added to the tweet as
    ``int -> float`` entries together with the key ``num_terms``.
    """
    self.data = self.parser.parse_args()
    svc = load_model(self.data.model)
    with open(self.get_output(), 'w') as fpt:
        for tw in tweet_iterator(self.data.test_set):
            extra = {int(a): float(b) for a, b in svc.model[tw['text']]}
            extra['num_terms'] = svc.num_terms
            tw.update(extra)
            fpt.write(json.dumps(tw) + "\n")
def test_evo_kwargs():
    """Train from the command line giving the keywords with ``--kw``."""
    from EvoMSA.base import EvoMSA

    kw = ('--kw={"stacked_method_args": {"popsize": 10, '
          '"early_stopping_rounds": 10, "time_limit": 5, '
          '"n_estimators": 5}, "b4msa_args": {"del_dup":false}}')
    sys.argv = ['EvoMSA', kw, '-ot.model', '-n2', TWEETS]
    train(output=True)
    model = load_model('t.model')
    assert isinstance(model, EvoMSA)
    os.unlink('t.model')
def test_textmodel_save_load():
    """A fitted TextModel survives a save/load round trip."""
    import os
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator, save_model, load_model

    fname = os.path.dirname(__file__) + '/text.json'
    tweets = list(tweet_iterator(fname))
    fitted = TextModel().fit(tweets)
    save_model(fitted, 't.model')
    restored = load_model('t.model')
    assert isinstance(restored, TextModel)
    os.unlink('t.model')
def __init__(
        self,
        fnames: Union[list, str],
        reader: Callable[[str], Iterable[dict]] = tweet_iterator) -> None:
    """
    :param fnames: File name or list of file names
    :param reader: Function that iterates over the elements of a file
    """
    if isinstance(fnames, list):
        self._fnames = fnames
    else:
        # A single name is stored as a one-element list.
        self._fnames = [fnames]
    self._reader = reader
    self._label = BoundingBox().label
    self._data = defaultdict(TokenCount.single_co_ocurrence)
    states_path = join(dirname(__file__), "data", "state.dict")
    self._states = load_model(states_path)
def vector_space(args):
    """
    Transform ``X`` with ``t``, using ``output`` as a cache file.

    :param args: tuple ``(k, t, X, output)``; ``output`` may be ``None``
    :return: ``(k, transformed)``

    ``t.transform(X)`` is tried first; on ``AttributeError`` the fallback
    is ``t.tonp([t[x] for x in X])``.
    """
    k, t, X, output = args
    use_cache = output is not None
    if use_cache and os.path.isfile(output):
        return k, load_model(output)
    try:
        transformed = t.transform(X)
    except AttributeError:
        transformed = t.tonp([t[item] for item in X])
    if use_cache:
        save_model(transformed, output)
    return k, transformed
def test_counter():
    """``update_calls`` is tracked and kept after a save/load round trip."""
    from microtc.utils import Counter, save_model, load_model
    import os

    counter = Counter()
    counter.update([1, 2, 3, 1])
    counter.update([3])
    assert counter[1] == 2
    print(counter.update_calls)
    assert counter.update_calls == 2

    save_model(counter, "t.voc")
    restored = load_model("t.voc")
    os.unlink("t.voc")
    print(restored.update_calls, "**")
    assert restored.update_calls == 2
def model(self, X):
    """
    Create the text models and store them in ``self._textModel``.

    ``X`` is wrapped in a list when its first element is not a list.  For
    every element of ``X`` and every pair in ``self.models`` the first
    component is either loaded (when it is a string) or called with the
    element and ``self._b4msa_args``.
    """
    if not isinstance(X[0], list):
        X = [X]
    kwargs = self._b4msa_args
    self._logger.info("Starting TextModel")
    self._logger.info(str(kwargs))
    self._textModel = [
        load_model(tm) if isinstance(tm, str) else tm(x, **kwargs)
        for x in X
        for tm, _cl in self.models
    ]
def get_model(basename, data, labels, args):
    """
    Return the TextModel associated with ``args``, using ``models/`` as cache.

    The file name comes from ``get_filename``; when the file exists the
    model is loaded, otherwise it is created with ``data`` as ``docs`` and
    saved.  ``labels`` is not used.

    NOTE(review): ``args`` is modified in place (key ``docs``).
    """
    prefix = os.path.join("models", os.path.basename(basename))
    modelfile = get_filename(args, prefix)
    if os.path.exists(modelfile):
        return load_model(modelfile)
    if not os.path.isdir("models"):
        os.mkdir("models")
    args['docs'] = data
    model = TextModel(**args)
    save_model(model, modelfile)
    return model
def emo(k, lang='zh', size=2**19):
    """
    Train a LinearSVC for the ``k``-th most common emoji against the rest
    and save it in ``models/{lang}_emo_{k}_mu{version}.LinearSVC``.

    Only the labels with at least ``2**10`` occurrences in the info model
    are considered; nothing is done when ``k`` is outside that list.

    :param k: Index of the positive label
    :param lang: Language code used to build the paths
    :param size: Maximum number of examples (half positive, half negative)
    """
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
    dd = load_model(join('models', f'{lang}_emo.info'))
    frequent = [(x, v) for x, v in dd.most_common() if v >= 2**10]
    labels = [x for x, _ in frequent]
    tot = sum([v for _, v in frequent])
    if k >= len(labels):
        return
    pos = labels[k]
    neg = set([x for i, x in enumerate(labels) if i != k])
    POS, NEG, ADD = [], [], []
    for fname in glob(join('data', lang, 'emo', '*.gz')):
        for _key, data in load_model(fname).items():
            for d in data:
                klass = d['klass']
                if len(klass) == 1:
                    klass = klass.pop()
                    if klass == pos:
                        POS.append(ds.process(d['text']))
                    elif klass in neg:
                        NEG.append(ds.process(d['text']))
                    continue
                if tot >= size:
                    continue
                # Multi-label examples without the positive label are
                # kept as extra negatives.
                if pos not in klass and len(klass.intersection(neg)):
                    ADD.append(ds.process(d['text']))
    shuffle(POS)
    shuffle(NEG)
    shuffle(ADD)
    size2 = size // 2
    POS = POS[:size2]
    if len(NEG) < size2:
        NEG.extend(ADD)
    NEG = NEG[:size2]
    y = [1] * len(POS) + [-1] * len(NEG)
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    X = tm.transform(POS + NEG)
    m = LinearSVC().fit(X, y)
    save_model(m, f'{output}.LinearSVC')
def fit_svm(self, Xvs, y):
    """
    Fit one classifier per model and store them in ``self._svc_models``.

    A classifier whose cache file (given by ``self.cache.ml_train()``)
    exists is loaded instead of fitted; a fitted classifier is saved when
    a cache file name is available.
    """
    fitted = []
    for (_, cl), X, output in zip(self.models, Xvs, self.cache.ml_train()):
        has_cache_name = output is not None
        if has_cache_name and os.path.isfile(output):
            fitted.append(load_model(output))
            continue
        # Not every classifier accepts ``random_state``.
        try:
            classifier = cl(random_state=self._seed)
        except TypeError:
            classifier = cl()
        classifier.fit(X, y)
        fitted.append(classifier)
        if has_cache_name:
            save_model(classifier, output)
    self._svc_models = fitted
def get_model(basename, data, labels, args):
    """
    Return the TextModel associated with ``args``, using ``models/`` as cache.

    The file name comes from ``get_filename``; when the file exists the
    model is loaded, otherwise it is created with ``data`` as ``docs`` and
    saved.  ``labels`` is not used.

    NOTE(review): ``args`` is modified in place (key ``docs``).
    """
    prefix = os.path.join("models", os.path.basename(basename))
    modelfile = get_filename(args, prefix)
    if os.path.exists(modelfile):
        return load_model(modelfile)
    if not os.path.isdir("models"):
        os.mkdir("models")
    args['docs'] = data
    model = TextModel(**args)
    save_model(model, modelfile)
    return model
def postal_code_names(self):
    """
    Dictionary containing a description of a postal code

    The dictionary is loaded from ``data/CP.desc`` on first use and kept
    in ``self._pc_names``.

    >>> from text_models.place import CP
    >>> cp = CP()
    >>> cp.postal_code_names["58000"]
    ['16', 'Michoacán de Ocampo', '053', 'Morelia']
    """
    if not hasattr(self, "_pc_names"):
        path = join(dirname(__file__), "data", "CP.desc")
        self._pc_names = load_model(path)
    return self._pc_names
def __init__(self, day=None, window=30, end=None,
             data: Callable[[str], str]=download_geo,
             countries: Union[set, None]=None):
    """
    Load the ``.travel`` files of the ``window`` days ending at ``day``.

    :param day: Last day of the window
    :param window: Number of days to load
    :param end: Other extreme of the period; when given, ``window`` is
        computed from ``day`` and ``end``
    :param data: Function that receives a file name and returns its path
    :param countries: Argument given to ``keep_only``

    Days for which ``data`` raises an exception are skipped; no day
    before 2015-12-16 is requested.
    """
    states_path = join(dirname(__file__), "data", "state.dict")
    self._states = load_model(states_path)
    bbox_path = join(dirname(__file__), "data", "bbox_country.dict")
    self._n_states = load_model(bbox_path)
    self._bbox = BoundingBox()
    self._dates = []
    delta = datetime.timedelta(days=1)
    init = datetime.datetime(year=2015, month=12, day=16)
    day = self.__handle_day(day)
    if end is not None:
        end = self.__handle_day(end)
        if end > day:
            end, day = day, end
        window = (day - end).days + 1
    days = []
    # Walk backwards from ``day`` until the window is full.
    while len(days) < window and day >= init:
        current = day
        day = day - delta
        name = "%s%02i%02i.travel" % (str(current.year)[-2:],
                                      current.month, current.day)
        try:
            fname = data(name)
        except Exception:
            continue
        self._dates.append(current)
        record = load_model(fname)
        record[0] = self.keep_only(record[0], countries)
        days.append(record)
    # Stored in chronological order.
    self._days = [x for x, _ in reversed(days)]
    self.num_users = [x for _, x in reversed(days)]
    self._dates.reverse()
def tm_words(self):
    """
    Text model words

    Words of the ``b4msa_{lang}.tm`` vocabulary, after applying
    ``self.text_transformations``, excluding the tokens that start with
    ``q:``, contain ``~`` or are in the emojis.

    :rtype: dict
    """
    transform = self.text_transformations
    emos = self.load_emojis()
    textModel = load_model(download("b4msa_%s.tm" % self._lang))
    words = []
    for token in textModel.model.word2id.keys():
        if token[:2] == "q:" or token.count("~") != 0 or token in emos:
            continue
        words.append(transform(token))
    return OrderedDict((w, True) for w in sorted(words))
def __init__(self, tokens: Union[str, List[str]] = "Es"):
    """
    :param tokens: Language of the ``b4msa_{tokens}.tm`` model to download,
        or the list of tokens to use
    """
    from microtc.utils import load_model
    from EvoMSA.utils import download

    def drop_empty(cdn):
        # Remove the empty pieces produced by consecutive "~".
        return "~".join([x for x in cdn.split("~") if len(x)])

    tok = Tokenize()
    if isinstance(tokens, list):
        vocabulary = tokens
    else:
        textModel = load_model(download("b4msa_%s.tm" % tokens))
        vocabulary = list(textModel.model.word2id.keys())
        tok.textModel = textModel

    not_qgram = [k for k in vocabulary if k[:2] != "q:"]
    # Bigrams first, then words, and finally the q-grams.
    tok.fit([drop_empty(k) for k in not_qgram if k.count("~")])
    tok.fit([drop_empty(k) for k in not_qgram if k.count("~") == 0])
    qgrams = [drop_empty(k[2:]) for k in vocabulary if k[:2] == "q:"]
    tok.fit([x for x in qgrams if x.count("~") == 0 and len(x) >= 2])
    self._tokenize = tok
    self._text = "text"
def __init__(self, data, lang: str = "Es", country: str = 'nogeo',
             states: bool = False) -> None:
    """
    :param data: Dictionary with more than three entries (used as is),
        path of an existing file (loaded), or anything else, which is
        kept in ``self.date`` and given to ``self._init``
    :param lang: Language
    :param country: Country
    :param states: When false, the number of words and bigrams is computed
    """
    self._lang = lang
    self._country = country
    self._states = states
    if isinstance(data, dict) and len(data) > 3:
        self._data = data
    elif isinstance(data, str) and isfile(data):
        self.voc = load_model(data)
    else:
        self.date = data
        self._init(data)
    if states:
        return
    # Tokens containing "~" are counted as bigrams, the rest as words.
    n_words = 0
    n_bigrams = 0
    for token, freq in self.voc.items():
        if token.count("~"):
            n_bigrams += freq
        else:
            n_words += freq
    self._n_words = n_words
    self._n_bigrams = n_bigrams
def textModels(self):
    """Text Models

    :rtype: list
    """
    # Lazy loading: a model given as a string is loaded only when its
    # output is not already in the cache.
    pending = []
    for (index, tm), cache in zip(enumerate(self._textModel), self.cache):
        if not isinstance(tm, str):
            continue
        if cache is not None and os.path.isfile(cache):
            continue
        pending.append((index, tm))
    for index, tm in pending:
        loaded = load_model(tm)
        if isinstance(loaded, EvoMSA):
            loaded = EvoMSAWrapper(evomsa=loaded)
        self._textModel[index] = loaded
    return self._textModel
def test_train_exogenous():
    """Train from the command line using exogenous models."""
    from EvoMSA.base import EvoMSA
    import json

    # The exogenous file stores ``q_voc_ratio`` as the decision function.
    with open('ex.json', 'w') as fpt:
        for tweet in tweet_iterator(TWEETS):
            tweet['decision_function'] = tweet['q_voc_ratio']
            fpt.write(json.dumps(tweet) + '\n')
    evodag_kw = ('--evodag-kw={"popsize": 10, "early_stopping_rounds": 10, '
                 '"time_limit": 5, "n_estimators": 5}')
    sys.argv = ['EvoMSA', '-ot.model', '-n2',
                '--exogenous', 'ex.json', 'ex.json',
                evodag_kw, TWEETS]
    train(output=True)
    evo = load_model('t.model')
    assert isinstance(evo, EvoMSA)
    os.unlink('t.model')
    first = evo._evodag_model._m.models[0]
    os.unlink('ex.json')
    print(first.nvar)
    assert first.nvar == 6
    assert evo.n_jobs == 2