Example #1
0
def test_process():
    """Compare ``Dataset.process`` output with expected strings for two calls.

    The first call passes the literal ``"~x~"`` as second argument; the
    second passes the emoji built from code point ``1F600``. Only the pairs
    produced by ``zip`` are compared, so a difference in length between the
    output and the expected list goes unnoticed.
    """
    from microtc.emoticons import convert_emoji

    dataset = Dataset()
    dataset.add(dataset.load_emojis())
    dataset.add(dataset.tm_words())

    # Call 1: plain-text second argument.
    observed = dataset.process("xxx good 9 morning xxx fax x la", "~x~")
    expected = ["~xxx~good~9~morning~xxx~fax~", "~la~", "~la~"]
    for got, want in zip(observed, expected):
        print(got, want)
        assert got == want

    # Call 2: the same emoji is embedded in the text and passed as argument.
    emoji = convert_emoji('1F600')
    text = 'xxx good {} morning xxx fax x la'.format(emoji)
    observed = dataset.process(text, emoji)
    print(observed)
    expected = ["~xxx~good~", "~morning~xxx~fax~x~la~"]
    for got, want in zip(observed, expected):
        assert got == want
def emo(k, lang='zh', size=2**19):
    """Fit a ``LinearSVC`` for the ``k``-th frequent label and save it.

    Label counts are loaded from ``models/{lang}_emo.info``; labels whose
    count is at least ``2**10`` are kept in ``most_common()`` order. The
    label at index ``k`` is the positive class (target ``1``) and the other
    kept labels form the negative class (target ``-1``). Nothing is trained
    and ``None`` is returned when ``k`` is past the end of the kept list.

    :param k: Index of the positive label within the kept labels.
    :param lang: Language code used to build the data and model paths.
    :param size: Training-set budget; each class is cut to ``size // 2``.
    """
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')

    counts = load_model(join('models', f'{lang}_emo.info'))
    frequent = [(label, cnt) for label, cnt in counts.most_common()
                if cnt >= 2**10]
    labels = [label for label, _ in frequent]
    tot = sum(cnt for _, cnt in frequent)
    if k >= len(labels):
        return

    pos = labels[k]
    neg = {label for i, label in enumerate(labels) if i != k}

    positives, negatives, extras = [], [], []
    for fname in glob(join('data', lang, 'emo', '*.gz')):
        for _key, records in load_model(fname).items():
            for record in records:
                klass = record['klass']
                if len(klass) != 1:
                    # Records without exactly one label are only used as
                    # spare negatives, and only when the kept labels' total
                    # count is below the budget.
                    if (tot < size and pos not in klass
                            and len(klass.intersection(neg))):
                        extras.append(ds.process(record['text']))
                    continue
                label = klass.pop()
                if label == pos:
                    positives.append(ds.process(record['text']))
                elif label in neg:
                    negatives.append(ds.process(record['text']))

    for bucket in (positives, negatives, extras):
        shuffle(bucket)

    half = size // 2
    positives = positives[:half]
    if len(negatives) < half:
        # Top up the negatives with the spare examples before trimming.
        negatives.extend(extras)
    negatives = negatives[:half]

    y = [1] * len(positives) + [-1] * len(negatives)
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    X = tm.transform(positives + negatives)
    model = LinearSVC().fit(X, y)
    save_model(model, f'{output}.LinearSVC')
Example #3
0
# Bucket each fully-qualified sequence under the key remove_components
# returns for it.
for x in emojis['fully-qualified']:
    key = remove_components(x)
    emojis_filter[key].append(x)

# 'FE0F' is registered before m_qual is built and before the lookups below
# (presumably remove_components consults `components` -- confirm).
components.add('FE0F')
m_qual = {remove_components(x): x for x in emojis_filter}

# Attach the remaining sequences to the bucket their reduced key maps to;
# a reduced key that is missing from m_qual raises KeyError.
for group in ('minimally-qualified', 'unqualified'):
    for x in emojis[group]:
        key = remove_components(x)
        value = m_qual[key]
        emojis_filter[value].append(x)

# Map every converted variant to the converted form of its bucket key.
output = {}
for k, v in emojis_filter.items():
    ident = convert_emoji(k).strip()
    for item in v:
        output[convert_emoji(item).strip()] = ident

save_model(output, 'emojis.dict')

ds = Dataset()
ds.add(output)

ds.process(
    'buenos xx 12 dias. {} todos! acción'.format(convert_emoji('1F44B 1F3FC'))
)