Beispiel #1
0
    def predict(self, text, suggest=False, k=5, max_k=200):
        """Flag likely-wrong tokens in ``text`` and optionally rank fixes.

        The detector model scores every token position; positions scoring
        above 0.5 are treated as incorrect. The first and last token
        positions are never reported (presumably they hold the tokenizer's
        start/end markers, which would also explain why reported indices
        are shifted by -1 -- TODO confirm against the tokenizer setup).

        Args:
            text: Input string to check.
            suggest: When false, only report the flagged positions. When
                true, additionally query the corrector model for
                replacement candidates.
            k: Maximum number of candidates kept per flagged position.
            max_k: Forwarded to ``topK``; presumably the number of top
                corrector predictions fetched per position before
                re-ranking -- TODO confirm.

        Returns:
            With ``suggest=False``: a list of ``(index, token)`` tuples,
            where ``token`` is the tokenizer's token at the flagged
            position.
            With ``suggest=True``: a dict mapping ``(index, original_text)``
            to a list of at most ``k`` ``(candidate, score)`` tuples sorted
            by descending score. ``score`` is the corrector probability,
            plus 1 when the candidate's pinyin equals the pinyin of the
            original text span.

        Raises:
            ValueError: If the text tokenizes to more than ``MAX_LEN``
                tokens.
        """
        tokenized = self.tokenizer.encode(text)
        tokens = tokenized.tokens
        if len(tokens) > MAX_LEN:
            # Report the limit that is actually enforced.
            raise ValueError(
                'The text is too long (>{}) to process'.format(MAX_LEN))

        token_ids = np.array([tokenized.ids])
        segment_ids = np.array([tokenized.type_ids])
        probas = self.detector.predict(token_ids, segment_ids)[0][0]
        incorrect_ids = np.where(probas > 0.5)[0]

        # Apply the same boundary filter in both modes so that
        # suggest=False can no longer report index -1 or the trailing
        # position. np.where yields NumPy integers; convert them so callers
        # receive ordinary Python ints.
        last = len(tokens) - 1
        reportable = [int(i) for i in incorrect_ids if i != 0 and i != last]

        if not suggest:
            return [(i - 1, tokens[i]) for i in reportable]
        if not reportable:
            # Nothing to correct: skip the corrector model entirely.
            return {}

        # Only the suggestion path needs the offset mapping and the masked
        # input, so build them here.
        mapping = rematch(tokenized.offsets)
        token_ids[0, incorrect_ids] = self.mask_id
        probas = self.corrector.predict(token_ids, segment_ids)[0][0]
        sorted_probas, sort_indexs = topK(probas, max_k)

        ret = {}
        for i in reportable:
            # Recover the original characters covered by this token.
            current_token = text[mapping[i][0]:mapping[i][-1] + 1]
            current_pinyin = ' '.join(xmnlp.pinyin(current_token))
            # NOTE(review): assumes decode() yields exactly one
            # whitespace-separated piece per id, so that zip() keeps tokens
            # aligned with their probabilities -- TODO confirm.
            candidates = self.tokenizer.decode(sort_indexs[i]).split()
            cands = []
            for proba, token in zip(sorted_probas[i], candidates):
                # Same-sounding candidates get a +1 bonus so they outrank
                # every candidate that differs in pronunciation.
                same_sound = ' '.join(xmnlp.pinyin(token)) == current_pinyin
                cands.append((token, proba + (1 if same_sound else 0)))
            cands.sort(key=lambda x: x[1], reverse=True)
            ret[(i - 1, current_token)] = cands[:k]
        return ret
Beispiel #2
0
def test_pinyin():
    """xmnlp.pinyin should return one pinyin syllable per input character."""
    expected = ['ren', 'gong', 'zhi', 'neng']
    actual = xmnlp.pinyin('人工智能')
    assert expected == actual
Beispiel #3
0
# Example script: convert a Chinese sentence to pinyin with xmnlp, two ways.
import sys

# Put ".." on the module search path -- presumably so a copy of xmnlp that
# lives one directory up can be imported.
sys.path.append("..")

# Python 2 only: reload sys and switch the default string encoding to UTF-8.
is_py2 = sys.version_info[0] == 2
if is_py2:
    reload(sys)
    sys.setdefaultencoding('utf8')

# Short banner describing the demo (text to pinyin / trie tree).
descr = "\n  文本转拼音\n/ trie tree / \n"
print(descr)

# Sample sentence used by both usages below.
doc = '面朝大海,春暖花开'

print('\n++++++++++++++++++++++++ usage 1 ++++++++++++++++++++++++\n')
# 1. Go through the XmNLP class: wrap the text, then call its pinyin().
from xmnlp import XmNLP
xm = XmNLP(doc)
print('Text: \n', doc)
class_pinyin = xm.pinyin()
print('PinYin: \n', class_pinyin)

print('\n++++++++++++++++++++++++ usage 2 ++++++++++++++++++++++++\n')

# 2. Call the module-level pinyin() function directly on the text.
import xmnlp
print('Text: \n', doc)
module_pinyin = xmnlp.pinyin(doc)
print('PinYin: \n', module_pinyin)