def predict(self, text, suggest=False, k=5, max_k=200, threshold=0.5):
    """Detect (and optionally correct) misspelled characters in ``text``.

    Two-stage pipeline: a detector model flags suspicious token positions,
    then (when ``suggest`` is True) a corrector model proposes replacement
    tokens for the flagged, masked positions.

    Args:
        text: input sentence; must tokenize to at most ``MAX_LEN`` tokens.
        suggest: when False, only report flagged positions/tokens; when
            True, also rank correction candidates for each error.
        k: number of correction candidates kept per error (suggest mode).
        max_k: number of top vocabulary ids considered before re-ranking.
        threshold: detector probability above which a token is flagged
            as incorrect (default 0.5, matching previous behavior).

    Returns:
        suggest=False -> list of ``(index, token)`` pairs (index is the
            token position minus 1, compensating for the leading [CLS]).
        suggest=True  -> dict mapping ``(index, original_text)`` to a list
            of ``(candidate_token, score)`` pairs, best first.

    Raises:
        ValueError: if the tokenized text is longer than ``MAX_LEN``.
    """
    tokenized = self.tokenizer.encode(text)
    if len(tokenized.tokens) > MAX_LEN:
        # Keep the message in sync with the actual limit instead of a
        # hard-coded 512.
        raise ValueError('The text is too long (>%d) to process' % MAX_LEN)
    token_ids = tokenized.ids
    segment_ids = tokenized.type_ids
    mapping = rematch(tokenized.offsets)
    token_ids, segment_ids = np.array([token_ids]), np.array([segment_ids])

    # Stage 1: detector emits a per-token probability of being wrong.
    probas = self.detector.predict(token_ids, segment_ids)[0][0]
    incorrect_ids = np.where(probas > threshold)[0]
    # Mask the suspicious positions so the corrector can fill them in.
    token_ids[0, incorrect_ids] = self.mask_id
    if not suggest:
        # NOTE(review): unlike the suggest branch below, this branch does
        # not skip the [CLS]/[SEP] positions — preserved as-is; confirm
        # whether that is intentional.
        return [(i - 1, tokenized.tokens[i]) for i in incorrect_ids]

    # Stage 2: corrector scores vocabulary candidates at each masked slot.
    probas = self.corrector.predict(token_ids, segment_ids)[0][0]
    sorted_probas, sort_indexs = topK(probas, max_k)
    ret = {}
    for i in incorrect_ids:
        # Skip the special [CLS] (first) and [SEP] (last) positions.
        if i == 0 or i == len(tokenized.tokens) - 1:
            continue
        current_token = text[mapping[i][0]:mapping[i][-1] + 1]
        current_pinyin = ' '.join(xmnlp.pinyin(current_token))
        cands = []
        for proba, token in zip(
                sorted_probas[i],
                self.tokenizer.decode(sort_indexs[i]).split()):
            pinyin = ' '.join(xmnlp.pinyin(token))
            # Boost candidates that sound the same as the original token:
            # homophone substitutions are the most common Chinese typo.
            score = 1 if current_pinyin == pinyin else 0
            cands.append((token, proba + score))
        cands.sort(key=lambda x: x[1], reverse=True)
        ret[(i - 1, current_token)] = cands[:k]
    # ``ret`` is already a dict; the previous ``dict(ret)`` copy was redundant.
    return ret
def test_pinyin():
    """Sanity-check pinyin conversion on a known phrase."""
    expected = ['ren', 'gong', 'zhi', 'neng']
    assert xmnlp.pinyin('人工智能') == expected
import sys

# Make the package importable when run from the examples directory.
sys.path.append("..")

# Python 2 legacy shim: force the default encoding to UTF-8.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')

descr = """ 文本转拼音 / trie tree / """
print(descr)

doc = """面朝大海,春暖花开"""

print('\n++++++++++++++++++++++++ usage 1 ++++++++++++++++++++++++\n')

# Usage 1: operate through the XmNLP class.
from xmnlp import XmNLP

xm = XmNLP(doc)
print('Text: \n', doc)
print('PinYin: \n', xm.pinyin())

print('\n++++++++++++++++++++++++ usage 2 ++++++++++++++++++++++++\n')

# Usage 2: operate through the module-level helper.
import xmnlp

print('Text: \n', doc)
print('PinYin: \n', xmnlp.pinyin(doc))