Exemple #1
0
    def reset(self):
        """Build a fresh quiz of fill-in-the-blank poem questions.

        Resets ``self.question_list`` and ``self.question_id``, shuffles
        ``self.poem_list`` in place and tries to build one question from each
        of the first ten poems.  For every poem a couplet is picked at random,
        one of its comma-separated sentences is blanked out, and up to three
        distractors are drawn from ``self.sentence_list`` (shuffled in place):
        a distractor must have the same length as the blanked sentence and its
        last character must share the same pinyin final.  Missing distractors
        are replaced by a fixed filler string.

        Each question is a dict with the keys ``"question"``,
        ``"choice_list"`` (four options) and ``"answer"`` (one of ``"ABCD"``).
        """
        self.question_list = []
        self.question_id = 0

        # Pick the poems that will be examined.
        random.shuffle(self.poem_list)
        poem_list = self.poem_list[:10]

        # Couplets containing any of these characters are not used.
        banned_chars = set(u"()()?!:?;")

        for poem in poem_list:
            # Break the poem into couplets.
            pair_list = []
            for pair in poem["content"].replace("\n", u"。").split(u"。"):
                pair = pair.strip()
                if not pair:
                    continue
                if banned_chars & set(pair):
                    continue
                if u"," not in pair:
                    continue
                pair_list.append(pair)

            if not pair_list:
                continue

            # Pick one couplet and blank out one of its sentences.
            pair = random.choice(pair_list)
            sentence_list = pair.split(u",")
            id_ = random.randint(0, len(sentence_list) - 1)
            target_sentence = sentence_list[id_]
            sentence_list[id_] = "_" * 14

            # Collect sentences similar to the blanked one as distractors.
            target_stripped = target_sentence.strip()
            target_length = len(target_sentence)
            # The final of the target's last character does not depend on the
            # candidate, so it is computed at most once per question (and only
            # when a candidate actually gets that far).
            target_final = None
            similar_sentence_list = []
            random.shuffle(self.sentence_list)
            for s in self.sentence_list:
                if s.strip() == target_stripped:
                    continue
                if len(s) != target_length:
                    continue
                if target_final is None:
                    target_final = get_finals(
                        lazy_pinyin(target_sentence[-1])[0], True)
                # The last characters must share the same final.
                if get_finals(lazy_pinyin(s[-1])[0], True) != target_final:
                    continue
                similar_sentence_list.append(s)
                if len(similar_sentence_list) == 3:
                    break
            if len(similar_sentence_list) < 3:
                similar_sentence_list += ["达拉崩吧公主米亚幸福的像个童话"
                                          ] * (3 - len(similar_sentence_list))
            insert_index = random.randint(0, 3)
            similar_sentence_list.insert(insert_index, target_sentence)

            # Add the question to the paper.
            self.question_list.append({
                "question": u",".join(sentence_list),
                "choice_list": similar_sentence_list,
                "answer": "ABCD"[insert_index]
            })
Exemple #2
0
def pypinyin_g2p_phone(text) -> List[str]:
    """Split ``text`` into a flat list of pinyin initials and finals.

    ``text`` is converted with ``pinyin(..., style=Style.TONE3)`` and the first
    reading of every item is used.  For each reading the initial comes first,
    followed by:

    * the final of the reading without its last character plus that last
      character, when the reading ends in a digit;
    * the final of the whole reading, when it ends in another alphanumeric
      character;
    * the reading itself otherwise.

    Empty strings and strings made only of digits are dropped.
    """
    from pypinyin import Style, pinyin
    from pypinyin.style._utils import get_finals, get_initials

    phones = []
    for readings in pinyin(text, style=Style.TONE3):
        syllable = readings[0]
        initial = get_initials(syllable, strict=True)
        last_char = syllable[-1]
        if last_char.isdigit():
            final = get_finals(syllable[:-1], strict=True) + last_char
        elif last_char.isalnum():
            final = get_finals(syllable, strict=True)
        else:
            final = syllable
        for unit in (initial, final):
            # Skip empty parts and bare tone digits.
            if len(unit) == 0 or unit.isdigit():
                continue
            phones.append(unit)
    return phones
def to_finals(pinyin, strict=True, v_to_u=False):
    """Convert pinyin in :py:attr:`~pypinyin.Style.TONE`,
    :py:attr:`~pypinyin.Style.TONE2`,
    :py:attr:`~pypinyin.Style.TONE3` or
    :py:attr:`~pypinyin.Style.NORMAL` style to
    :py:attr:`~pypinyin.Style.FINALS` style.

    :param pinyin: pinyin in :py:attr:`~pypinyin.Style.TONE`,
                   :py:attr:`~pypinyin.Style.TONE2`,
                   :py:attr:`~pypinyin.Style.TONE3` or
                   :py:attr:`~pypinyin.Style.NORMAL` style
    :param strict: whether initials and finals are handled strictly
                   according to the Scheme for the Chinese Phonetic Alphabet,
                   see :ref:`strict`
    :param v_to_u: whether to use ``ü`` in place of ``v``;
                   when False the result uses ``v`` to represent ``ü``
    :return: pinyin in :py:attr:`~pypinyin.Style.FINALS` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import to_finals
      >>> to_finals('zhōng')
      'ong'

    """
    toneless = replace_symbol_to_no_symbol(pinyin)
    # Work on the ``ü`` spelling internally; ``_fix_v_u`` decides the output.
    normalized = toneless.replace('v', 'ü')
    result = get_finals(normalized, strict=strict)
    return _fix_v_u(result, result, v_to_u)
def get_rhyme(word):
    """Return the 1-based index of the rhyme group matching ``word``.

    The final of the first pinyin syllable of ``word`` (non-strict mode) is
    looked up in the groups returned by ``read_rhyme_finals()``.  The position
    of the first group containing it is returned, counting from 1; ``0`` means
    no group matched.
    """
    final = get_finals(lazy_pinyin(word)[0], strict=False)
    groups = read_rhyme_finals()
    return next(
        (position
         for position, group in enumerate(groups, start=1)
         if final in group),
        0,
    )
def generate_words_by_rhyme(input_word):
    """Print up to three words that rhyme with ``input_word``.

    The rhyme key is built from the non-strict finals of every syllable of
    ``input_word``, joined with ``'``.  Candidates are read from the column of
    that name in ``./dataset/rhymes-table.csv``; each non-NaN cell is the
    ``repr`` of a ``[word, frequency]`` pair.  Words are sampled without
    replacement with probability proportional to their frequency.

    The previous implementation computed
    ``freq - min / max - min`` (an operator-precedence slip) and passed the
    result as ``p`` to ``np.random.choice``, which requires probabilities that
    sum to 1 and therefore raised ``ValueError``.

    :param input_word: the word to find rhymes for
    :return: ``None``; results are printed
    """
    rhymes_df = pd.read_csv('./dataset/rhymes-table.csv',
                            sep=',',
                            header=0,
                            encoding='utf-8')
    # The first column is the index written by ``to_csv``.
    rhymes_df = rhymes_df.iloc[:, 1:]

    word_rhyme = "'".join(
        get_finals(syllable, strict=False)
        for syllable in lazy_pinyin(input_word))

    if word_rhyme not in rhymes_df.columns:
        print('Sorry! 好像没有找到押韵的词语哦~')
        return

    # ``cell == cell`` is False only for NaN padding cells.
    cells = [cell for cell in rhymes_df[word_rhyme] if cell == cell]
    if not cells:
        print('Sorry! 好像没有找到押韵的词语哦~')
        return
    np.random.shuffle(cells)

    # Parse every cell once instead of once per extracted field.
    entries = [ast.literal_eval(cell) for cell in cells]
    candidates = [entry[0] for entry in entries]
    frequencies = np.asarray([entry[1] for entry in entries], dtype=float)

    sample_num = min(3, len(candidates))
    total = frequencies.sum()
    if total > 0 and np.all(frequencies >= 0):
        probabilities = frequencies / total
        # Sampling without replacement cannot draw more items than there are
        # entries with a non-zero probability.
        sample_num = min(sample_num, int(np.count_nonzero(probabilities)))
    else:
        # No usable frequency information: fall back to uniform sampling.
        probabilities = None

    words = np.random.choice(candidates,
                             size=sample_num,
                             replace=False,
                             p=probabilities)
    print('匹配的韵脚是:')
    for word in words:
        print(word)
Exemple #6
0
def part(text, pinyin2cmu_dict):
    """Map space-separated pinyin tokens to CMU phones and per-phone tones.

    Non-ASCII characters are removed from ``text`` first.  Empty tokens and
    tokens containing ``#`` are skipped.  For every other token the first
    digit run is taken as the tone (``5`` when there is none) and shifted by
    3; the initial and the final of the remaining string are looked up
    (upper-cased) in ``pinyin2cmu_dict``.  Every hit appends the mapped phone
    string to the phone list and one tone entry per space-separated phone.
    Each token is closed with the marker ``"$"`` and the tone ``"8"``.

    Debug information is printed along the way.

    :param text: space-separated pinyin string
    :param pinyin2cmu_dict: mapping from upper-case initial/final to a
        space-separated CMU phone string
    :return: ``(phone_list, tone_list)``
    """
    phone_list = []
    tone_list = []
    ascii_text = re.sub(r'[^\x00-\x7F]+', '', text)
    for token in ascii_text.split(" "):
        if not token or token == " " or "#" in token:
            continue

        digit_runs = re.findall(r"\d+\.?\d*", token)
        tone = int(digit_runs[0] if digit_runs else "5") + 3
        tone_text = str(tone)

        pinyin = token.replace(str(tone - 3), "")
        print(pinyin, len(pinyin))

        known = pinyin2cmu_dict.keys()
        initial = get_initials(pinyin, False)
        initial_missing = initial.upper() not in known
        print('test', initial.upper(), initial_missing)
        final = get_finals(pinyin, False)
        final_missing = final.upper() not in known
        print('tste', final.upper(), final_missing)
        print(initial_missing and final_missing)
        print("===================")

        for pin_part in (initial, final):
            print("pin_part", pin_part)
            key = pin_part.upper()
            if key not in known:
                continue
            cmu = pinyin2cmu_dict[key]
            phone_list.append(cmu)
            # One tone entry per phone in the mapped string.
            tone_list.extend(tone_text for _ in cmu.split(" "))
            print("cmu", cmu)

        # Syllable boundary marker.
        phone_list.append("$")
        tone_list.append(str(8))
    print(phone_list)
    return phone_list, tone_list
Exemple #7
0
    def to_finals_tone2(self, pinyin, **kwargs):
        """Return the finals part of ``pinyin`` with tones written as digits.

        When the ``strict`` keyword is truthy the input is first passed
        through ``convert_finals``.  If the (possibly converted) pinyin has no
        finals, the digit-toned pinyin is returned unchanged.
        """
        source = convert_finals(pinyin) if kwargs.get('strict') else pinyin
        finals_present = has_finals(source)

        # Express the tone with a digit.
        numbered = replace_symbol_to_number(source)
        if finals_present:
            # Keep only the finals part.
            return get_finals(numbered, strict=False)
        return numbered
Exemple #8
0
    def to_finals(self, pinyin, **kwargs):
        """Return the finals part of ``pinyin`` without tone marks.

        When the ``strict`` keyword is truthy the input is first passed
        through ``convert_finals``.  If the (possibly converted) pinyin has no
        finals, the tone-stripped pinyin is returned unchanged.
        """
        source = convert_finals(pinyin) if kwargs.get('strict') else pinyin
        finals_present = has_finals(source)

        # Replace tone-marked characters with their plain counterparts.
        toneless = replace_symbol_to_no_symbol(source)
        if finals_present:
            # Keep only the finals part.
            return get_finals(toneless, strict=False)
        return toneless
Exemple #9
0
    def to_finals_tone2(self, pinyin, **kwargs):
        """Extract the finals of ``pinyin``, using digits for the tone.

        ``convert_finals`` is applied first when ``strict`` is passed and
        truthy.  Pinyin without finals is returned in digit-toned form as is.
        """
        strict = kwargs.get('strict')
        if strict:
            pinyin = convert_finals(pinyin)
        # Must be decided before the tone marks are rewritten below.
        keep_whole = not has_finals(pinyin)

        # Use a digit to represent the tone.
        digit_toned = replace_symbol_to_number(pinyin)
        return digit_toned if keep_whole else get_finals(digit_toned,
                                                         strict=False)
Exemple #10
0
    def to_finals(self, pinyin, **kwargs):
        """Extract the toneless finals of ``pinyin``.

        ``convert_finals`` is applied first when ``strict`` is passed and
        truthy.  Pinyin without finals is returned in tone-stripped form as
        is.
        """
        strict = kwargs.get('strict')
        if strict:
            pinyin = convert_finals(pinyin)
        # Must be decided before the tone marks are removed below.
        keep_whole = not has_finals(pinyin)

        # Swap tone-marked characters for unmarked ones.
        plain = replace_symbol_to_no_symbol(pinyin)
        return plain if keep_whole else get_finals(plain, strict=False)
def pypinyin_g2p_phone(text) -> List[str]:
    """Split ``text`` into a flat list of pinyin initials and finals.

    ``text`` is converted with ``pinyin(..., style=Style.TONE3)``; for the
    first reading of every item the strict initial and the strict final are
    emitted in that order, skipping empty parts.
    """
    from pypinyin import pinyin
    from pypinyin import Style
    from pypinyin.style._utils import get_finals
    from pypinyin.style._utils import get_initials

    phones = []
    for readings in pinyin(text, style=Style.TONE3):
        syllable = readings[0]
        initial = get_initials(syllable, strict=True)
        final = get_finals(syllable, strict=True)
        for unit in (initial, final):
            if len(unit) != 0:
                phones.append(unit)
    return phones
Exemple #12
0
    def to_finals_tone3(self, pinyin, **kwargs):
        """Return the finals part of ``pinyin`` with the tone digit at the end.

        When the ``strict`` keyword is truthy the input is first passed
        through ``convert_finals``.  If the (possibly converted) pinyin has no
        finals, the rewritten pinyin is returned unchanged.
        """
        source = convert_finals(pinyin) if kwargs.get('strict') else pinyin
        finals_present = has_finals(source)

        # Express the tone with a digit, then move that digit to the end.
        numbered = replace_symbol_to_number(source)
        tone_last = RE_TONE3.sub(r'\1\3\2', numbered)

        if finals_present:
            # Keep only the finals part.
            return get_finals(tone_last, strict=False)
        return tone_last
Exemple #13
0
    def to_finals_tone3(self, pinyin, **kwargs):
        """Extract the finals of ``pinyin``, with the tone digit moved last.

        ``convert_finals`` is applied first when ``strict`` is passed and
        truthy.  Pinyin without finals is returned in rewritten form as is.
        """
        strict = kwargs.get('strict')
        if strict:
            pinyin = convert_finals(pinyin)
        # Must be decided before the tone marks are rewritten below.
        keep_whole = not has_finals(pinyin)

        # Use a digit to represent the tone ...
        digit_toned = replace_symbol_to_number(pinyin)
        # ... and move that digit to the end of the syllable.
        reordered = RE_TONE3.sub(r'\1\3\2', digit_toned)

        return reordered if keep_whole else get_finals(reordered,
                                                       strict=False)
Exemple #14
0
def part2(text, pinyin2cmu_dict):
    """Map space-separated pinyin tokens to a flat list of CMU phone strings.

    Non-ASCII characters are removed from ``text`` first.  Then, per token:

    * tokens containing ``#`` are copied to the output unchanged;
    * tokens whose upper-cased initial and final are both absent from
      ``pinyin2cmu_dict`` are copied unchanged as well;
    * otherwise the mapped initial (if any) is appended, followed by the
      mapped final.  When the final carries a digit run, that run is removed
      before the lookup and re-attached to the mapped phone string.  A
      ``" "`` separator closes the token.

    Compared with the earlier version, the unused ``phone_list`` /
    ``tone_list`` locals and a tone computation whose result was never read
    (but whose ``int()`` could raise on digit runs such as ``"1.5"``) have
    been removed, and ``#`` tokens are handled before any pinyin splitting.

    :param text: space-separated pinyin string
    :param pinyin2cmu_dict: mapping from upper-case initial/final to a CMU
        phone string
    :return: list of phone strings, pass-through tokens and ``" "`` separators
    :raises KeyError: when exactly one of a token's initial/final is missing
        from ``pinyin2cmu_dict`` (unchanged from the original behaviour)
    """
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    new_phone_list = []
    for pinyin in text.split(" "):
        # Marker tokens are passed through verbatim.
        if "#" in pinyin:
            new_phone_list.append(pinyin)
            continue

        head = get_initials(pinyin, False).upper()
        tail = get_finals(pinyin, False).upper()

        # NOTE(review): ``tail`` still includes any tone digits here, so a
        # token with no known initial is passed through even if its
        # tone-stripped final is in the table -- confirm this is intended.
        if head not in pinyin2cmu_dict and tail not in pinyin2cmu_dict:
            new_phone_list.append(pinyin)
            continue

        if head != "":
            new_phone_list.append(pinyin2cmu_dict[head])
        if tail != "":
            tone = re.findall(r"\d+\.?\d*", tail)
            if tone:
                # Look the final up without its tone, then re-attach the tone.
                tail = tail.replace(tone[0], "")
                new_phone_list.append(pinyin2cmu_dict[tail] + tone[0])
            else:
                new_phone_list.append(pinyin2cmu_dict[tail])
        new_phone_list.append(" ")

    return new_phone_list
Exemple #15
0
def frontend(text):
    """Clean text and then convert to id sequence.

    ``text`` is converted to TONE3-style pinyin; the strict initial and final
    of every syllable are mapped through ``char_to_id``.  Units missing from
    the table trigger a warning print and are mapped to ``"<unk>"``.  The id
    ``idim - 1`` is appended as end-of-sequence, and the result is returned as
    a flat ``torch.LongTensor`` moved to ``device``.
    """
    text = [readings[0] for readings in pinyin(text, style=Style.TONE3)]
    print(f"Cleaned text: {text}")

    idseq = []
    for syllable in text:
        units = (get_initials(syllable, strict=True),
                 get_finals(syllable, strict=True))
        for c in units:
            if len(c) == 0:
                continue
            if c in char_to_id.keys():
                idseq.append(char_to_id[c])
            else:
                print(f"WARN: {c} is not included in dict.")
                idseq.append(char_to_id["<unk>"])
    idseq.append(idim - 1)  # <eos>
    return torch.LongTensor(idseq).view(-1).to(device)
def get_pinyin(content):
    """Convert ``content`` to a space-separated string of initials and finals.

    ``content`` is converted to TONE3-style pinyin and every syllable is split
    into its strict initial and final.  A set of rewrite rules is then applied
    to the pair so that the output matches CSMSC-style pinyin (see the
    comments below).  The rules match finals followed by a tone digit.

    The regular expressions are written as raw strings: the previous
    ``"iu\\d"``-style literals relied on an invalid escape sequence, which
    newer Python versions warn about.

    :param content: text to convert
    :return: initials and finals joined by single spaces
    """
    # Some special rules to match CSMSC pinyin
    text = pinyin(content, style=Style.TONE3)
    text = [c[0] for c in text]
    clean_content = []
    for c in text:
        c_init = get_initials(c, strict=True)
        c_final = get_finals(c, strict=True).replace("ü", "v")

        # "w" is not kept as an initial: fold it into the final as "u".
        if c_init == 'w':
            c_init = ''
            if c_final != 'u':
                c_final = 'u' + c_final

        # "y" is not kept as an initial: fold it into the final as "v"/"i".
        if c_init == 'y':
            c_init = ''
            if c_final.startswith("u"):
                c_final = c_final.replace('u', 'v')
            elif not c_final.startswith('i'):
                c_final = 'i' + c_final

        # Expand the abbreviated finals.
        if re.match(r"iu\d", c_final):
            c_final = c_final.replace("iu", "iou")
        if re.match(r"ui\d", c_final):
            c_final = c_final.replace("ui", "uei")
        if re.match(r"ue\d", c_final):
            c_final = c_final.replace("ue", "ve")

        # A bare "i" final is spelled differently after these initials.
        if re.match(r"i\d", c_final):
            if c_init in ['z', 'c', 's']:
                c_final = c_final.replace("i", "ii")
            elif c_init in ['zh', 'ch', 'sh', 'r']:
                c_final = c_final.replace("i", "iii")

        # After j/q/x/y a written "u" becomes "v"; elsewhere "un" is
        # expanded to "uen".
        if re.match(r"(u|un|uan)\d", c_final):
            if c_init in ['j', 'q', 'x', 'y']:
                c_final = c_final.replace("u", "v")
            else:
                if re.match(r"un\d", c_final):
                    c_final = c_final.replace("un", "uen")

        if c_init:
            clean_content.append(c_init)
        clean_content.append(c_final)
    return ' '.join(clean_content)
def get_rhymes_table(input_filepath, output_filepath):
    """Group the words of a pronunciation table by rhyme and save as CSV.

    The input is a tab-separated file without header, read as the columns
    ``words``, ``pronunciation`` and ``frequency``.  Each pronunciation is
    split on ``'``; the non-strict final of every part, minus its last
    character, is joined with ``'`` to form the rhyme key.  The output CSV has
    one column per rhyme key whose cells are ``[word, frequency]`` pairs.

    :param input_filepath: path of the tab-separated source table
    :param output_filepath: path of the CSV file to write
    """
    dataset = pd.read_csv(input_filepath,
                          header=None,
                          sep='\t',
                          encoding='utf-8',
                          names=['words', 'pronunciation', 'frequency'])

    rhymes = {}
    for _, row in dataset.iterrows():
        # ``[0:-1]`` drops the last character of each final.
        # presumably the trailing tone digit -- TODO confirm against the data
        finals = [
            get_finals(syllable, strict=False)[0:-1]
            for syllable in row['pronunciation'].split("'")
        ]
        key = "'".join(finals)
        rhymes.setdefault(key, []).append([row['words'], row['frequency']])

    columns = {key: pd.Series(entries) for key, entries in rhymes.items()}
    pd.DataFrame(columns).to_csv(output_filepath, sep=",", header=True)
def translate_pinyin(sentence):
    """Convert a Chinese sentence into the pinyin format the model expects.

    Only CJK characters and the full-width punctuation marks listed below are
    kept; everything else is dropped.  Punctuation is passed through as is,
    every other item is emitted as ``{initial final}`` with the tone digit
    attached to the final (neutral tone is written as ``5``).

    ``sentence`` may be UTF-8 encoded bytes or already-decoded text.  The
    previous version always called ``.decode()`` on the input and handed
    UTF-8 *bytes* to ``lazy_pinyin``, which only works on text.  Its filter
    also kept ASCII ``,!?`` although only full-width punctuation is
    recognised further down; both now use the same set.

    :param sentence: the Chinese sentence (text or UTF-8 bytes)
    :return: the converted pinyin, items separated by single spaces
    """
    # Decode at the boundary once; everything below works on text.
    if isinstance(sentence, bytes):
        sentence = sentence.decode('utf-8')

    # Full-width comma, full stop, question mark, exclamation mark and
    # enumeration comma.
    punctuation = (u'\uff0c', u'\u3002', u'\uff1f', u'\uff01', u'\u3001')

    # Keep only CJK characters and the punctuation above.
    keep_pattern = u'[\u2E80-\u9FFF' + u''.join(punctuation) + u']+'
    cleaned = u''.join(re.findall(keep_pattern, sentence))

    result = []
    for syllable in lazy_pinyin(cleaned, style=Style.TONE3):
        # Punctuation is copied through untouched.
        if syllable in punctuation:
            result.append(syllable)
            continue

        # Neutral-tone syllables carry no digit: mark them as tone 5.
        if syllable[-1] not in ['1', '2', '3', '4']:
            syllable = syllable + '5'
        # Replace the spelling of selected syllables as listed in
        # TRANSLATE_DICT, keeping the tone digit.
        if syllable[:-1] in TRANSLATE_DICT:
            syllable = TRANSLATE_DICT[syllable[:-1]] + syllable[-1]

        shengmu = get_initials(syllable, strict=False)
        yunmu = get_finals(syllable, strict=False)
        # Emit each syllable as "{initial final}".
        result.append('{' + shengmu + ' ' + yunmu + '}')

    return ' '.join(result)
Exemple #19
0
    def to_finals_tone(self, pinyin, **kwargs):
        """Return the finals part of ``pinyin``, leaving its tone as is.

        Pinyin for which ``has_finals`` is false is returned unchanged.  The
        ``strict`` keyword (``None`` when absent) is forwarded to
        ``get_finals``.
        """
        if has_finals(pinyin):
            # Keep only the finals part.
            return get_finals(pinyin, strict=kwargs.get('strict'))
        return pinyin
    # clean every line in transcription file first
    # Builds ``transcription_dict``: utterance id -> language tag followed by
    # the space-separated initials/finals of the utterance text.
    transcription_dict = {}
    with codecs.open(args.transcription_path, "r", "utf-8") as fid:
        for line in fid.readlines():
            # Each line is split on single spaces; field 0 is used for the
            # utterance id and field 1 as the text to convert.
            segments = line.split(" ")
            # First character of the transcription file's base name.
            lang_char = args.transcription_path.split("/")[-1][0]
            # NOTE(review): ``id`` shadows the builtin of the same name.
            id = args.spk + "_" + lang_char + segments[0]  # ex. TMF1_M10001
            content = segments[1].replace("\n", "")

            # Some special rules to match CSMSC pinyin
            text = pinyin(content, style=Style.TONE3)
            # Keep the first reading of every item.
            text = [c[0] for c in text]
            clean_content = []
            for c in text:
                c_init = get_initials(c, strict=True)
                c_final = get_finals(c, strict=True)
                # NOTE(review): the inner loop reuses the name ``c``; the
                # outer loop rebinds it on its next iteration.
                for c in [c_init, c_final]:
                    if len(c) == 0:
                        continue
                    # Spelling substitutions applied to both initial and final.
                    c = c.replace("ü", "v")
                    c = c.replace("ui", "uei")
                    c = c.replace("un", "uen")
                    c = c.replace("iu", "iou")

                    # Special rule: "e5n" -> "en5"
                    # i.e. move the tone digit 5 to the end of the unit.
                    if "5" in c:
                        c = c.replace("5", "") + "5"
                    clean_content.append(c)

            # Prefix the unit sequence with the "<lang_tag>" token.
            transcription_dict[id] = " ".join(["<" + args.lang_tag + ">"] +
                                              clean_content)
Exemple #21
0
    def to_finals_tone(self, pinyin, **kwargs):
        """Extract the finals of ``pinyin`` while keeping the tone untouched.

        When ``has_finals`` reports no finals the input is handed back as is;
        otherwise ``get_finals`` is called with the ``strict`` keyword
        (``None`` when it was not supplied).
        """
        no_finals = not has_finals(pinyin)
        if no_finals:
            return pinyin

        strict = kwargs.get('strict')
        # Keep only the finals part.
        return get_finals(pinyin, strict=strict)