def test_process():
    """Check the pieces returned by ``Dataset.process`` for two labels.

    The dataset is loaded with the entries from ``load_emojis()`` and
    ``tm_words()``.  ``process`` is then called twice on the same kind of
    sentence: once with the label ``"~x~"`` and once with the emoji built
    by ``convert_emoji('1F600')``.

    NOTE: both comparisons go through ``zip``, which stops at the shorter
    of its arguments, so only the leading pairs are actually compared.
    """
    from microtc.emoticons import convert_emoji

    dataset = Dataset()
    dataset.add(dataset.load_emojis())
    dataset.add(dataset.tm_words())

    # Case 1: the label is the literal string "~x~".
    word_pieces = dataset.process("xxx good 9 morning xxx fax x la", "~x~")
    word_expected = ["~xxx~good~9~morning~xxx~fax~", "~la~", "~la~"]
    for got, want in zip(word_pieces, word_expected):
        print(got, want)
        assert got == want

    # Case 2: the label is an emoji embedded in the sentence.
    text = 'xxx good {} morning xxx fax x la'.format(convert_emoji('1F600'))
    emoji_pieces = dataset.process(text, convert_emoji('1F600'))
    print(emoji_pieces)
    emoji_expected = ["~xxx~good~", "~morning~xxx~fax~x~la~"]
    for got, want in zip(emoji_pieces, emoji_expected):
        assert got == want
def emo(k, lang='zh', size=2**19):
    """Fit and save a binary ``LinearSVC`` for the ``k``-th frequent label.

    Labels and their counts are read from ``models/{lang}_emo.info``; only
    labels whose count is at least ``2**10`` are kept, in ``most_common()``
    order.  The ``k``-th of them is the positive class and the remaining
    ones form the negative set.

    Training texts come from every ``data/{lang}/emo/*.gz`` file:

    * records with exactly one label go to the positive or the negative
      pool when that label is the positive one or one of the negatives;
    * records with several labels are kept as extra negatives, but only
      when the summed count of the kept labels is below ``size``, the
      positive label is absent and at least one negative label is present.

    Each pool is shuffled and cut to ``size // 2`` items (extras are added
    to the negatives only when these are fewer than ``size // 2``).  The
    texts are vectorised with the model stored in
    ``models/{lang}_{microtc.__version__}.microtc`` and the fitted
    classifier is saved to
    ``models/{lang}_emo_{k}_mu{microtc.__version__}.LinearSVC``.

    :param k: index of the positive label among the kept labels
    :param lang: language code used to build the data and model paths
    :param size: upper bound for the total number of training texts
    :return: ``None``; nothing is trained when ``k`` is not smaller than
        the number of kept labels
    """
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
    counts = load_model(join('models', f'{lang}_emo.info'))

    min_count = 2**10
    labels = [label for label, freq in counts.most_common()
              if freq >= min_count]
    tot = sum(freq for _label, freq in counts.most_common()
              if freq >= min_count)
    if k >= len(labels):
        return

    pos = labels[k]
    neg = {label for idx, label in enumerate(labels) if idx != k}

    positives, negatives, extras = [], [], []
    for fname in glob(join('data', lang, 'emo', '*.gz')):
        for _key, records in load_model(fname).items():
            for record in records:
                klass = record['klass']
                if len(klass) == 1:
                    # Single-label record: pop() removes the label from the
                    # record's own collection, as the data is not reused.
                    single = klass.pop()
                    if single == pos:
                        positives.append(ds.process(record['text']))
                    elif single in neg:
                        negatives.append(ds.process(record['text']))
                    continue
                if tot >= size:
                    # Enough single-label data is expected; skip extras.
                    continue
                if pos in klass or not len(klass.intersection(neg)):
                    continue
                extras.append(ds.process(record['text']))

    # The return values are ignored, so shuffle is assumed to work in place
    # (as random.shuffle does) -- TODO confirm which shuffle is imported.
    shuffle(positives)
    shuffle(negatives)
    shuffle(extras)

    half = size // 2
    positives = positives[:half]
    if len(negatives) < half:
        negatives.extend(extras)
    negatives = negatives[:half]

    # +1 for the positive texts, -1 for the negative ones, in that order.
    y = [1] * len(positives) + [-1] * len(negatives)
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    X = tm.transform(positives + negatives)
    model = LinearSVC().fit(X, y)
    save_model(model, f'{output}.LinearSVC')
# Build a lookup from every emoji variant to one representative emoji, save
# it as 'emojis.dict' and run it once through a Dataset.
# NOTE(review): `emojis`, `emojis_filter`, `components` and
# `remove_components` are defined before this fragment; `emojis_filter` is
# used as if missing keys start out as empty lists -- confirm.

# Group every fully-qualified entry under the key returned by
# remove_components.
for x in emojis['fully-qualified']:
    key = remove_components(x)
    emojis_filter[key].append(x)
# 'FE0F' is added only after the first pass.
# NOTE(review): presumably remove_components consults `components`, so the
# calls below also drop FE0F (the variation selector) -- confirm.
components.add('FE0F')
# Map the reduced form of each existing group key back to that key.
m_qual = {remove_components(x): x for x in emojis_filter.keys()}
# Attach each minimally-qualified entry to the group its reduced form points
# to; m_qual[key] raises KeyError when there is no such group.
for x in emojis['minimally-qualified']:
    key = remove_components(x)
    value = m_qual[key]
    emojis_filter[value].append(x)
# Same treatment for the unqualified entries.
for x in emojis['unqualified']:
    key = remove_components(x)
    value = m_qual[key]
    emojis_filter[value].append(x)
# Flatten the groups: every member, converted with convert_emoji and
# stripped, maps to the converted and stripped group key.
output = dict()
for k, v in emojis_filter.items():
    ident = convert_emoji(k).strip()
    for item in v:
        output[convert_emoji(item).strip()] = ident
save_model(output, 'emojis.dict')
# Load the mapping into a Dataset and process one sample sentence; the
# result is discarded, so this looks like a manual smoke check.
ds = Dataset()
ds.add(output)
ds.process('buenos xx 12 dias. {} todos! acción'.format(
    convert_emoji('1F44B 1F3FC')))