Example #1
from text_models.dataset import Dataset  # assumed home of Dataset in these examples


def test_add():
    dset = Dataset()
    assert len(dset.klasses) == 0
    dset.add(dset.load_emojis())
    cnt = len(dset.klasses)
    assert cnt > 0
    words = dset.tm_words()
    dset.add(words)
    print(len(dset.klasses), len(words), cnt)
    assert len(dset.klasses) <= len(words) + cnt
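Note that the last assertion uses <= rather than ==: some of the tokens returned by tm_words() may already be present in klasses, so adding them can grow klasses by at most len(words).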
Example #2
from text_models.dataset import Dataset


def test_remove():
    dset = Dataset()
    dset.add(dset.load_emojis())
    dset.add(dset.tm_words())
    xx = dset.klass("xxx good morning xxx asdfa")
    print(xx)
    assert len(xx) == 2
    dset.remove("~good~")
    xx = dset.klass("xxx good xxx morning xxx")
    print(xx)
    assert len(xx) == 1
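The first klass call finds two classes (good and morning); after remove("~good~") the same kind of text matches only one.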
Example #3
from text_models.dataset import Dataset


def test_process():
    from microtc.emoticons import convert_emoji
    dset = Dataset()
    dset.add(dset.load_emojis())
    dset.add(dset.tm_words())
    xx = dset.process("xxx good 9 morning xxx fax x la", "~x~")
    for a, b in zip(xx, ["~xxx~good~9~morning~xxx~fax~", "~la~", "~la~"]):
        print(a, b)
        assert a == b
    txt = 'xxx good {} morning xxx fax x la'.format(convert_emoji('1F600'))
    xx = dset.process(txt, convert_emoji('1F600'))
    print(xx)
    for a, b in zip(xx, ["~xxx~good~", "~morning~xxx~fax~x~la~"]):
        assert a == b
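As the expected values suggest, process normalizes the text and splits it wherever the token passed as the second argument occurs, dropping that token from the output segments.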
Example #4
from glob import glob
from os.path import join
import numpy as np
import microtc
from joblib import Parallel, delayed
from microtc.utils import load_model, save_model
from sklearn.metrics import recall_score
from text_models.dataset import Dataset


def recall_emo(lang='zh', n_jobs=1):
    def predict(fname, ds, tm, emoji):
        D = []
        for key, tweets in load_model(fname).items():
            labels = [ds.klass(x['text']) for x in tweets]
            # keep only the tweets whose label set intersects the frequent emojis
            D.extend([x['text'], label] for label, x in zip(labels, tweets)
                     if len(klasses.intersection(label)))
        X = tm.transform([x for x, _ in D])
        y = [y for _, y in D]
        hy = []
        for k, emo in enumerate(emoji):
            output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
            # one binary LinearSVC per frequent emoji (trained in Example #6)
            m = load_model(f'{output}.LinearSVC')
            hy.append(m.predict(X))
        return y, hy

    def performance(emo, y, hy):
        # ground truth: does the tweet's label set contain this emoji?
        y_emo = [emo in i for i in y]
        # LinearSVC predicts labels in {-1, 1}, so hy > 0 marks the positives
        perf = recall_score(y_emo, hy > 0, pos_label=True)
        return perf, sum(y_emo) / len(y)

    info = load_model(join('models', f'{lang}_emo.info'))
    # keep only the emojis that appear at least 2**10 times
    emoji = [x for x, v in info.most_common() if v >= 2**10]
    klasses = set(emoji)
    fnames = glob(join('data', lang, 'test', '*.gz'))
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    predictions = Parallel(n_jobs=n_jobs)(
        delayed(predict)(fname, ds, tm, emoji) for fname in fnames)
    y = []
    for labels, _ in predictions:
        y.extend(labels)
    hys = np.vstack([np.vstack(hy).T for _, hy in predictions])
    perfs = Parallel(n_jobs=n_jobs)(delayed(performance)(emo, y, hy)
                                    for emo, hy in zip(emoji, hys.T))
    output = {
        emo: {
            'recall': perf,
            'ratio': ratio
        }
        for emo, (perf, ratio) in zip(emoji, perfs)
    }
    save_model(output, join('models', f'{lang}_emo.perf'))
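Since recall_emo only persists its results, the per-emoji scores have to be read back from disk. A minimal sketch, assuming recall_emo has already been run for lang='zh' and that load_model comes from microtc.utils as above:

from os.path import join
from microtc.utils import load_model

perf = load_model(join('models', 'zh_emo.perf'))
for emo, stats in perf.items():
    # 'recall' is the per-emoji LinearSVC recall; 'ratio' is the share of
    # test examples whose label set contains this emoji
    print(emo, stats['recall'], stats['ratio'])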
Example #5
import os
from glob import glob
from os.path import join, dirname, isdir, basename
from microtc.utils import load_model, save_model
from text_models.dataset import Dataset


def emo_data(lang='zh'):
    fnames = glob(join('data', lang, '*.gz'))
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    for fname in fnames:
        output = dict()
        for key, tweets in load_model(fname).items():
            labels = [ds.klass(x['text']) for x in tweets]
            inner = []
            for tweet, label in zip(tweets, labels):
                # skip tweets where no emoji class was found
                if len(label) == 0:
                    continue
                tweet['klass'] = label
                inner.append(tweet)
            if len(inner):
                output[key] = inner
        if len(output) == 0:
            continue
        output_fname = join(dirname(fname), 'emo')
        if not isdir(output_fname):
            os.mkdir(output_fname)
        output_fname = join(output_fname, basename(fname))
        save_model(output, output_fname)
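emo_data only writes a file when at least one tweet received a non-empty label, so data/{lang}/emo/ mirrors the input files that contained emojis. A quick way to inspect the output; a sketch assuming the same load_model helper and that at least one file was produced:

from glob import glob
from os.path import join
from microtc.utils import load_model

fname = glob(join('data', 'zh', 'emo', '*.gz'))[0]
for key, tweets in load_model(fname).items():
    for tweet in tweets[:3]:
        # each kept tweet carries its emoji label set under 'klass'
        print(key, tweet['klass'], tweet['text'])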
Example #6
from glob import glob
from os.path import join
from random import shuffle
import microtc
from microtc.utils import load_model, save_model
from sklearn.svm import LinearSVC
from text_models.dataset import Dataset


def emo(k, lang='zh', size=2**19):
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
    dd = load_model(join('models', f'{lang}_emo.info'))
    # frequent emojis: those appearing at least 2**10 times
    emojis = [x for x, v in dd.most_common() if v >= 2**10]
    tot = sum([v for x, v in dd.most_common() if v >= 2**10])
    if k >= len(emojis):
        return
    # one-vs-rest: the k-th frequent emoji is the positive class
    pos = emojis[k]
    neg = set([x for i, x in enumerate(emojis) if i != k])
    POS, NEG, ADD = [], [], []
    for fname in glob(join('data', lang, 'emo', '*.gz')):
        for key, data in load_model(fname).items():
            for d in data:
                klass = d['klass']
                if len(klass) == 1:
                    klass = klass.pop()
                    if klass == pos:
                        POS.append(ds.process(d['text']))
                    elif klass in neg:
                        NEG.append(ds.process(d['text']))
                elif tot < size:
                    # when data is scarce, multi-label tweets that do not
                    # mention the positive emoji serve as extra negatives
                    if pos not in klass and len(klass.intersection(neg)):
                        ADD.append(ds.process(d['text']))
    shuffle(POS)
    shuffle(NEG)
    shuffle(ADD)
    # balance the classes at no more than size // 2 examples each
    size2 = size // 2
    POS = POS[:size2]
    if len(NEG) < size2:
        NEG.extend(ADD)
    NEG = NEG[:size2]
    y = [1] * len(POS)
    y.extend([-1] * len(NEG))
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    X = tm.transform(POS + NEG)
    m = LinearSVC().fit(X, y)
    save_model(m, f'{output}.LinearSVC')
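Because emo returns early when k exceeds the number of frequent emojis, the per-emoji classifiers can be trained with a blind sweep over k. A sketch using joblib (already a dependency above); the upper bound 128 is an arbitrary assumption made safe by that early return:

from joblib import Parallel, delayed

Parallel(n_jobs=4)(delayed(emo)(k, lang='zh') for k in range(128))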
Example #7
from text_models.dataset import Dataset


def test_load_emojis():
    emojis = Dataset.load_emojis()
    assert len(emojis) > 1000
    assert isinstance(emojis, dict)