Example #1
0
def token_indexing(idx, encoding_type, return_type, max_len=8):
    """
    Map a word id to the character ids of its Wubi or pinyin encoding.

    The word is looked up in ``dict_word['idx2word']``; ``'<eos>'`` is
    replaced by ``'。'``. The word is then encoded with ``wubi`` or
    ``pinyin`` (falling back to the word itself when the encoder returns
    an empty result) and the encoding is right-padded with ``'。'`` to
    exactly ``max_len`` characters.

    :param idx: a single key into ``dict_word['idx2word']``.
        NOTE(review): presumably the caller applies this function
        element-wise over a (seq_len, batch_size) array -- confirm.
    :param encoding_type: ``'wubi'`` or ``'pinyin'``.
    :param return_type: ``'tokens'`` to get the character ids; any other
        value returns the padding mask.
    :param max_len: fixed length of the padded encoding (default 8).
    :return: a list of ``max_len`` character ids, or a list of ``max_len``
        booleans that are True at real (non-padding) positions.
    :raises NotImplementedError: if ``encoding_type`` is not supported.
    :raises AssertionError: if the encoding is longer than ``max_len``.
    """
    word = dict_word['idx2word'][idx]
    if word == '<eos>':
        word = '。'

    # Call the encoder once and reuse its result; each branch only differs
    # in how the first code is extracted and which vocabulary is used.
    if encoding_type == 'wubi':
        codes = wubi(word)
        encoding = codes[0] if codes else word
        vocab = dict_wubi
    elif encoding_type == 'pinyin':
        codes = pinyin(word)
        encoding = codes[0][0] if codes else word
        vocab = dict_pinyin
    else:
        raise NotImplementedError(
            'unsupported encoding_type: %r' % (encoding_type,))

    # A non-positive repeat count yields '', so an over-long encoding is
    # left unchanged here and rejected by the length check below.
    enc_len = len(encoding)
    full_encoding = encoding + '。' * (max_len - enc_len)
    if len(full_encoding) != max_len:
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped under `python -O`; the exception type is kept as
        # AssertionError for callers that may rely on it.
        raise AssertionError(full_encoding)

    char2idx = vocab['char2idx']
    tokens = [char2idx[ch] for ch in full_encoding]
    # True for positions holding a real encoding character, False for padding.
    length = [pos < enc_len for pos in range(max_len)]
    return tokens if return_type == 'tokens' else length
def wubi_convert(word):
    """
    Build an abbreviated Wubi code for a multi-character word.

    The code is assembled from prefixes of the per-entry codes returned by
    ``wubi(word)``:

    * 2 characters: first two letters of each code;
    * 3 characters: first letter of the first two codes, first two
      letters of the third;
    * 4 or more characters: first letter of the first, second, third and
      last codes.

    Words shorter than two characters yield an empty string.

    NOTE(review): the indexing assumes ``wubi(word)`` returns at least one
    entry per character of ``word`` -- confirm for non-Chinese input.

    :param word: the word to encode.
    :return: the abbreviated code, or ``''`` for words of length 0 or 1.
    """
    n_chars = len(word)
    codes = wubi(word)

    if n_chars < 2:
        return ''

    # Each pair is (index into `codes`, number of leading letters to keep).
    if n_chars == 2:
        picks = ((0, 2), (1, 2))
    elif n_chars == 3:
        picks = ((0, 1), (1, 1), (2, 2))
    else:
        picks = ((0, 1), (1, 1), (2, 1), (-1, 1))

    return ''.join(codes[pos][:keep] for pos, keep in picks)
Example #3
0
def convert_to_wubi(text):
    """Return the entries of ``wubi(text)`` joined by single spaces."""
    codes = wubi(text)
    separator = ' '
    return separator.join(codes)
def convert_wubi(s):
    """Return the entries of ``wubi(s)`` concatenated with no separator."""
    return ''.join(wubi(s))