def test_add():
    """Check that ``Dataset.add`` fills ``klasses``.

    A fresh ``Dataset`` starts with no klasses; adding the emojis makes the
    collection non-empty, and adding the words afterwards never yields more
    klasses than the number of words plus the klasses already present.
    """
    dataset = Dataset()
    assert len(dataset.klasses) == 0

    emojis = dataset.load_emojis()
    dataset.add(emojis)
    n_after_emojis = len(dataset.klasses)
    assert n_after_emojis > 0

    tm_words = dataset.tm_words()
    dataset.add(tm_words)
    n_total = len(dataset.klasses)
    print(n_total, len(tm_words), n_after_emojis)
    # Upper bound only: the two sources may share entries.
    assert n_total <= len(tm_words) + n_after_emojis
def test_remove():
    """Check that ``Dataset.remove`` stops a klass from being reported.

    With emojis and words loaded, the first sentence yields two klasses;
    after removing ``"~good~"`` a sentence that still contains "good" and
    "morning" yields only one.
    """
    dataset = Dataset()
    # Same order as loading emojis first and words second.
    for loader in (dataset.load_emojis, dataset.tm_words):
        dataset.add(loader())

    before = dataset.klass("xxx good morning xxx asdfa")
    print(before)
    assert len(before) == 2

    dataset.remove("~good~")

    after = dataset.klass("xxx good xxx morning xxx")
    print(after)
    assert len(after) == 1
def test_process():
    """Check the pieces returned by ``Dataset.process`` for a given klass.

    Two cases are exercised: a textual klass (``"~x~"``) and an emoji klass
    obtained from ``convert_emoji('1F600')``.  The comparisons go through
    ``zip``, so only positions present in both the result and the expected
    list are compared.
    """
    from microtc.emoticons import convert_emoji

    dataset = Dataset()
    dataset.add(dataset.load_emojis())
    dataset.add(dataset.tm_words())

    pieces = dataset.process("xxx good 9 morning xxx fax x la", "~x~")
    expected = ["~xxx~good~9~morning~xxx~fax~", "~la~", "~la~"]
    for got, want in zip(pieces, expected):
        print(got, want)
        assert got == want

    emoji = convert_emoji('1F600')
    text = 'xxx good {} morning xxx fax x la'.format(emoji)
    pieces = dataset.process(text, emoji)
    print(pieces)
    expected = ["~xxx~good~", "~morning~xxx~fax~x~la~"]
    for got, want in zip(pieces, expected):
        assert got == want
def recall_emo(lang='zh', n_jobs=1):
    """Measure the recall of every per-emoji classifier and store the result.

    The emojis considered are those whose count in
    ``models/{lang}_emo.info`` is at least ``2**10``, in ``most_common()``
    order.  Every ``data/{lang}/test/*.gz`` file is labelled with a
    ``Dataset`` holding the emojis, the tweets that contain at least one of
    the considered emojis are vectorised with the stored text model, and
    each ``models/{lang}_emo_{k}_mu{version}.LinearSVC`` model predicts on
    them.  The output, a dict ``{emoji: {'recall': ..., 'ratio': ...}}``,
    is saved to ``models/{lang}_emo.perf``.

    :param lang: language code used to build the data and model paths.
    :param n_jobs: forwarded to ``Parallel`` for both the per-file
        prediction step and the per-emoji scoring step.
    :returns: ``None``; the result is written with ``save_model``.
    """

    def predict(fname, ds, tm, emoji):
        """Return ``(label sets, per-emoji predictions)`` for one file."""
        texts, gold = [], []
        for _key, tweets in load_model(fname).items():
            labels = [ds.klass(tweet['text']) for tweet in tweets]
            for label, tweet in zip(labels, tweets):
                # Keep only tweets carrying at least one considered emoji.
                if len(klasses.intersection(label)):
                    texts.append(tweet['text'])
                    gold.append(label)
        X = tm.transform(texts)
        hy = []
        # Models are stored by the emoji's rank in ``emoji``, not its value.
        for k, _emo in enumerate(emoji):
            prefix = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
            model = load_model(f'{prefix}.LinearSVC')
            hy.append(model.predict(X))
        return gold, hy

    def performance(emo, y, hy):
        """Return ``(recall, ratio of tweets containing emo)`` for one emoji."""
        y_emo = [emo in labels for labels in y]
        # A positive decision (> 0) counts as predicting the emoji.
        perf = recall_score(y_emo, hy > 0, pos_label=True)
        return perf, sum(y_emo) / len(y)

    min_count = 2**10
    # Load the frequency table once; it provides both the ordered emoji
    # list (which fixes the model index) and the set used for filtering.
    counts = load_model(join('models', f'{lang}_emo.info'))
    emoji = [emo for emo, freq in counts.most_common() if freq >= min_count]
    klasses = set(emoji)

    fnames = glob(join('data', lang, 'test', '*.gz'))
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))

    predictions = Parallel(n_jobs=n_jobs)(
        delayed(predict)(fname, ds, tm, emoji) for fname in fnames)

    y = []
    for gold, _ in predictions:
        y.extend(gold)
    # Per file: stack the per-emoji predictions and transpose so rows follow
    # the tweets (assumes each model returns one value per row of X); then
    # concatenate the files in the same order used to build ``y``.
    hys = np.vstack([np.vstack(hy).T for _, hy in predictions])

    scores = Parallel(n_jobs=n_jobs)(
        delayed(performance)(emo, y, hy) for emo, hy in zip(emoji, hys.T))
    output = {
        emo: {'recall': perf, 'ratio': ratio}
        for emo, (perf, ratio) in zip(emoji, scores)
    }
    save_model(output, join('models', f'{lang}_emo.perf'))
def emo_data(lang='zh'):
    """Extract the emoji-bearing tweets of every ``data/{lang}/*.gz`` file.

    Each tweet is labelled with a ``Dataset`` that holds only the emojis.
    Tweets with a non-empty label get it stored under the ``'klass'`` key
    and are kept, grouped by their original key; keys left without tweets
    are dropped.  Files that keep at least one tweet are written, under the
    same base name, into an ``emo`` directory next to the input file (the
    directory is created when missing).

    :param lang: language code selecting the ``data/{lang}`` directory.
    :returns: ``None``; results are written with ``save_model``.
    """
    paths = glob(join('data', lang, '*.gz'))
    labeler = Dataset(text_transformations=False)
    labeler.add(labeler.load_emojis())

    for path in paths:
        kept = {}
        for key, tweets in load_model(path).items():
            labels = [labeler.klass(tweet['text']) for tweet in tweets]
            labelled = []
            for tweet, label in zip(tweets, labels):
                if len(label) > 0:
                    # The loaded tweet is annotated in place.
                    tweet['klass'] = label
                    labelled.append(tweet)
            if labelled:
                kept[key] = labelled

        if not kept:
            # Nothing to store for this file.
            continue

        target_dir = join(dirname(path), 'emo')
        if not isdir(target_dir):
            os.mkdir(target_dir)
        save_model(kept, join(target_dir, basename(path)))
def emo(k, lang='zh', size=2**19):
    """Train and save the classifier for the ``k``-th considered emoji.

    The considered emojis are those counted at least ``2**10`` times in
    ``models/{lang}_emo.info``, in ``most_common()`` order; nothing is done
    when ``k`` is not below their number.  Tweets from
    ``data/{lang}/emo/*.gz`` with exactly one label become positives (label
    equals the target emoji) or negatives (label is another considered
    emoji).  Multi-label tweets are collected as extra negatives only when
    the summed count of the considered emojis is below ``size``, they do not
    contain the target, and they contain another considered emoji.  Each
    class is capped at ``size // 2`` examples; the extras are appended only
    when the negatives are fewer than that.  The fitted ``LinearSVC`` is
    saved to ``models/{lang}_emo_{k}_mu{version}.LinearSVC``.

    :param k: index of the target emoji in the considered list.
    :param lang: language code used to build the data and model paths.
    :param size: upper bound on the training-set size (half per class).
    :returns: ``None``.
    """
    dataset = Dataset(text_transformations=False)
    dataset.add(dataset.load_emojis())
    model_prefix = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')

    counts = load_model(join('models', f'{lang}_emo.info'))
    frequent = [(emoji, freq) for emoji, freq in counts.most_common()
                if freq >= 2**10]
    emojis = [emoji for emoji, _ in frequent]
    total = sum(freq for _, freq in frequent)
    if k >= len(emojis):
        return

    pos = emojis[k]
    neg = {emoji for i, emoji in enumerate(emojis) if i != k}
    use_extras = total < size

    positives, negatives, extras = [], [], []
    for fname in glob(join('data', lang, 'emo', '*.gz')):
        for _key, data in load_model(fname).items():
            for d in data:
                klass = d['klass']
                if len(klass) != 1:
                    # Multi-label (or empty) tweets can only serve as extras.
                    if (use_extras and pos not in klass
                            and len(klass.intersection(neg))):
                        extras.append(dataset.process(d['text']))
                    continue
                # Single label: take it out of the loaded collection.
                single = klass.pop()
                if single == pos:
                    positives.append(dataset.process(d['text']))
                elif single in neg:
                    negatives.append(dataset.process(d['text']))

    # NOTE(review): ``shuffle`` is presumably an in-place shuffle such as
    # ``random.shuffle`` -- its return value is ignored; confirm the import.
    shuffle(positives)
    shuffle(negatives)
    shuffle(extras)

    half = size // 2
    positives = positives[:half]
    if len(negatives) < half:
        negatives.extend(extras)
    negatives = negatives[:half]

    y = [1] * len(positives) + [-1] * len(negatives)
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    X = tm.transform(positives + negatives)
    m = LinearSVC().fit(X, y)
    save_model(m, f'{model_prefix}.LinearSVC')
def test_load_emojis():
    """``Dataset.load_emojis`` returns a dict with more than 1000 entries.

    The method is called on the class itself, without an instance.
    """
    loaded = Dataset.load_emojis()
    assert len(loaded) > 1000
    assert isinstance(loaded, dict)