Example #1
def prep_ph2num():
    kiritan_phone_mapping = {}
    with open(join(config["sinsy_dic"], "japanese.table"),
              encoding="UTF-8") as f:
        for l in f:
            s = l.strip().split()
            key = jaconv.hira2kata(s[0])
            kiritan_phone_mapping[key] = s[1:]
    sinsy_phone_mapping = {}
    with open(join(config["sinsy_dic"], "japanese.utf_8.table"),
              encoding="UTF-8") as f:
        for l in f:
            s = l.strip().split()
            key = jaconv.hira2kata(s[0])
            sinsy_phone_mapping[key] = s[1:]
    ph2num = {}
    counter = 0
    for p in ["sil", "pau", "br"]:
        ph2num[p] = counter
        counter += 1
    for k, v in sinsy_phone_mapping.items():
        for p in v:
            if p not in ph2num:
                ph2num[p] = counter
                counter += 1
    for k, v in kiritan_phone_mapping.items():
        for p in v:
            if p not in ph2num:
                ph2num[p] = counter
                counter += 1
    # undef
    ph2num["xx"] = counter

    return ph2num
def unify_text(texts):
    unification_dict = {}
    for text in texts:
        for i, word in enumerate(text):
            if jaconv.kata2hira(word) in unification_dict:
                text[i] = jaconv.kata2hira(word)
            elif jaconv.hira2kata(word) in unification_dict:
                text[i] = jaconv.hira2kata(word)
            else:
                unification_dict[word] = True

    return texts
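A minimal usage sketch for the mapping returned by prep_ph2num above; the phoneme sequence below is invented, and config["sinsy_dic"] must point at the Sinsy dictionary tables:

ph2num = prep_ph2num()
phonemes = ["sil", "a", "pau", "some_unknown_phone"]     # invented example sequence
ids = [ph2num.get(p, ph2num["xx"]) for p in phonemes]    # unknown phonemes fall back to "xx"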
Example #3
    def ja_parser(ja_recipe):
        # Clean up the Japanese recipes and prepare the data used for plotting
        # and for building word vectors.

        p1_data = ja_recipe
        p1_data = p1_data.set_index('recipename')

        # Strip stray characters and convert between strings and lists.
        kako = re.compile(r'([((].+?[))])')
        delid = re.compile(r'ID\d{7},')
        graphstr = re.compile(r'\*|〇|▲|□|◇|☆|●|◆|○|◎|┗|★')
        p1_data.loc[:, 'ingredientlist'] = (
            p1_data.loc[:, 'ingredient']
            .str.replace(kako, '')
            .str.replace(delid, '')
            .str.replace(graphstr, '')
            .str.replace(r'\n', '')
            .str.replace(r',,', ''))
        # jaconv.hira2kata expects a plain string, so apply it element-wise.
        p1_data.loc[:, 'ingredientlist'] = p1_data.loc[:, 'ingredientlist'].map(
            jaconv.hira2kata)
        p1_data.loc[:, 'ingredientlist'] = p1_data.loc[:, 'ingredientlist'].str.split(',')

        # Prepare the data used for the word cloud and the word vectors.
        bigsoup = (
            p1_data.loc[:, 'ingredientlist']
            .to_string(index=False, header=False)
            .replace('、', ',').replace('[', '').replace('\n', '')
            .replace(']', '').replace('  ', '').replace(' ', '')
            .replace('しょうが', '生姜').replace('☆', '')
            .replace('しょうゆ', '醤油').replace('オリーブ油', 'オリーブオイル')
            .replace('にんじん', '人参').replace('葱', 'ねぎ'))
        littlesoup = bigsoup.split(',')
        littlesoup = pd.Series(littlesoup)
        frequency = (littlesoup.value_counts()[:500]) / len(ja_recipe.index)

        # Convert hiragana index entries to katakana.
        kataindex = []
        for x in frequency.index:
            y = jaconv.hira2kata(x)
            kataindex.append(y)
        frequency.index = kataindex
        jacloudsource = frequency.to_dict()
        frequency = pd.Series(frequency)

        # Return the data for the ranking, the word cloud, and the word vectors.
        return frequency, jacloudsource, p1_data
Example #4
def get_yomi(word, yomi=''):
    def to_yomi(word):
        return nolmcb.parse(word).strip()

    word = re_symbol.sub('', word)
    word = jaconv.hira2kata(word)
    if re_katakana.search(word):
        return word
    elif re_katakana.search(yomi):
        return yomi
    yomi = jaconv.hira2kata(to_yomi(word))
    if re_katakana.search(yomi):
        return yomi
    return '*'
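get_yomi relies on a few module-level objects that the example does not show. A rough sketch of what they might look like; the regexes and the MeCab option are assumptions, not the original definitions:

import re
import MeCab

re_symbol = re.compile(r'[^ぁ-んァ-ヴー一-龥0-9A-Za-z]')   # strip non kana/kanji/alphanumeric characters (assumed)
re_katakana = re.compile(r'^[ァ-ヴー]+$')                   # matches a pure-katakana string (assumed)
nolmcb = MeCab.Tagger('-Oyomi')                             # tagger emitting readings only (assumed; depends on the installed dictionary)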
Example #5
def g2p(input_yomi):
    # Convert everything to full-width katakana
    input_yomi = jaconv.h2z(input_yomi)
    input_yomi = jaconv.hira2kata(input_yomi)

    output_yomi = []

    for i, item in enumerate(input_yomi):

        # A long-vowel mark at the start of the string is not read
        if i == 0 and (item == "ー" or item == "〜"):
            pass

        # When not at the end of the string, check whether the next character is a small (sutegana) kana
        elif i < len(input_yomi)-1:
            if input_yomi[i+1] in sutegana:
                youon = item+input_yomi[i+1]
                # Output the phonemes of the contracted sound (yoon)
                if youon in g2p_list:
                    output_yomi.append(g2p_list[youon])
                # Otherwise output the phonemes of the ordinary kana
                else:
                    output_yomi += nonyouon_before_st(input_yomi, i)
                    output_yomi += nonyouon_before_st(input_yomi, i+1)
            else:
                output_yomi += nonyouon(input_yomi, i, item)
        # End of the string
        else:
            output_yomi += nonyouon(input_yomi, i, item)

    output_str = " ".join(output_yomi)
    output_yomi = output_str.split()
    # Return the phonemes
    return output_yomi
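g2p depends on module-level data (sutegana, g2p_list) and helpers (nonyouon, nonyouon_before_st) that are not part of the example. Roughly, sutegana is the set of small kana and g2p_list maps kana or kana-plus-small-kana pairs to phoneme strings; the entries below are purely illustrative:

sutegana = "ァィゥェォャュョヮ"                              # small kana (assumed)
g2p_list = {"キャ": "ky a", "シュ": "sh u", "チョ": "ch o"}  # illustrative entries only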
def zen2han(input):
    # First map special characters that have no half-width kana form
    buf = []
    for x in input:
        if x in ('ゐ', 'ヰ'):
            y = '\u0010'
        elif x in ('ゑ', 'ヱ'):
            y = '\u0011'
        elif x == 'ヵ':
            y = '\u0012'
        elif x == 'ヶ':
            y = '\u0013'
        elif x in ('ゎ', 'ヮ'):
            y = '\u0014'
        else:
            y = x
        buf.append(y)
    output = "".join(buf)

    # Convert to half-width katakana
    output = jaconv.z2h(jaconv.hira2kata(output),
                        kana=True,
                        digit=True,
                        ascii=True)
    output = output.replace('゛', '゙')  # full-width dakuten -> half-width dakuten
    output = output.replace('゜', '゚')  # full-width handakuten -> half-width handakuten
    return output
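A quick usage sketch for zen2han above; the exact half-width output depends on jaconv's conversion tables:

print(zen2han('こんにちは'))   # -> 'ｺﾝﾆﾁﾊ' (half-width katakana)
print(zen2han('ゐど'))         # 'ゐ' becomes the placeholder '\x10', 'ど' becomes 'ﾄﾞ'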
Example #7
    async def on_message(self, message):
        if len(message.content) == 0:
            return

        content = message.content.lower()
        content_normalized = jaconv.hira2kata(content[0].upper() + content[1:])

        if content_normalized == 'No.???':
            return

        # filter pokemon list
        match = self.table[
            self.table.drop(columns='No.').applymap(
                lambda name: name == content_normalized
            ).any(axis=1)
        ]
        if len(match) != 0:
            # display 1st pokemon of filtered list (expected that only 1 item has been extracted)
            match = match.drop(columns='No.str')
            matchseq = match.iloc[0]
            embed = format_pokeinfo(zip(matchseq.index, matchseq))
            await message.channel.send(embed=embed)

        # filter move(waza) list
        content_normalized_lower = content_normalized.lower()
        match_moves = self.table_moves[
            self.table_moves_katakana_lower.applymap(
                lambda name: name == content_normalized_lower
            ).any(axis=1)
        ]
        if len(match_moves) != 0:
            # display 1st move of filtered list (expected that only 1 item has been extracted)
            matchseq_moves = match_moves.iloc[0]
            embed = format_pokeinfo(zip(matchseq_moves.index, matchseq_moves))
            await message.channel.send(embed=embed)
Example #8
    async def on_message(self, ctx):
        if ctx.author.bot:
            return
        if all([
                ctx.channel.id != cs.Zyanken_room,
                ctx.channel.id != cs.Test_room
        ]):
            return

        for hand in ["グー", "チョキ", "パー"]:
            # Check whether グー, チョキ, or パー (in that order) appears in the message
            if hand not in jaconv.hira2kata(jaconv.h2z(ctx.content)):
                continue
            # img, hand, msg, emoji1, emoji2 = zf.honda_to_zyanken(hand, ctx.author.id)
            img, hand, msg, emoji1, emoji2 = zf.honda_to_zyanken_breaktime(
                hand, ctx.author.id)
            if str(ctx.author.id) not in zf.No_reply:
                await ctx.add_reaction(emoji1)
                await ctx.add_reaction(emoji2)
                await ctx.channel.send(f"{ctx.author.mention} {hand}\n{msg}",
                                       file=discord.File(img),
                                       delete_after=5)
            if cs.Zyanken not in [roles.id for roles in ctx.author.roles]:
                guild = self.bot.get_guild(ctx.guild.id)
                await guild.get_member(ctx.author.id
                                       ).add_roles(get_role(guild, cs.Zyanken))
            """
            if emoji2 == "🎉" and len(zf.Former_winner) <= 5:
                guild = self.bot.get_guild(ctx.guild.id)
                await guild.get_member(ctx.author.id).add_roles(get_role(guild, cs.Winner))
                if ctx.author.id not in zf.Former_winner:
                    zf.Former_winner.append(ctx.author.id)
            """
            break
Example #9
def str_cleanUp(st):
    st = st.replace(" ", "")
    st = st.replace("・", "")
    st = st.replace("&", "アンド")
    st = jaconv.h2z(st, kana=True)
    st = jaconv.hira2kata(st)
    return st
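A small illustration of str_cleanUp (not from the original source):

str_cleanUp('ﾗｰﾒﾝ & つけ麺')   # -> 'ラーメンアンドツケ麺' (kanji is left untouched)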
Example #10
def reverse_hirakana(string):
    import jaconv
    if is_hiragana(string):
        string = jaconv.hira2kata(string)
    elif is_katakana(string):
        string = jaconv.kata2hira(string)
    return string
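The is_hiragana and is_katakana helpers are not shown in the example; a plausible sketch using simple Unicode-range checks:

import re

_HIRAGANA_RE = re.compile(r'^[ぁ-んー]+$')
_KATAKANA_RE = re.compile(r'^[ァ-ヴー]+$')

def is_hiragana(string):
    # True when the whole string is hiragana (long-vowel mark allowed)
    return bool(_HIRAGANA_RE.match(string))

def is_katakana(string):
    # True when the whole string is katakana (long-vowel mark allowed)
    return bool(_KATAKANA_RE.match(string))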
Example #11
def replaceName(string):
    string = hira2kata(z2h(string, digit=True, kana=False))
    for tmp_string in trans_string_table:
        string = string.replace(tmp_string[0], tmp_string[1])
    string = string.translate(trans_table)
    string = re.sub(replace_string, "", string)
    return string
Example #12
File: util.py Project: r9y9/nnsvs
def prep_ph2num(dic_path):
    if isdir(dic_path):
        _dic_path = join(dic_path, "japanese.utf_8.table")
    elif isfile(dic_path):
        _dic_path = dic_path

    phone_mapping = {}

    with open(_dic_path, encoding="UTF-8") as f:
        for label in f:
            s = label.strip().split()
            key = jaconv.hira2kata(s[0])
            phone_mapping[key] = s[1:]
    ph2num = {}
    counter = 0
    for p in ["sil", "pau", "br"]:
        ph2num[p] = counter
        counter += 1
    for _, v in phone_mapping.items():
        for p in v:
            if p not in ph2num:
                ph2num[p] = counter
                counter += 1
    # undef
    ph2num["xx"] = counter

    return ph2num
def tester():
    # only completes once the user correctly identifies all Kanji in learned_list
    mastered = {}
    while mastered != learned_list:
        for i in range(0, len(learned_list)):
            current_card = random.choice(list(learned_list.items()))
            if current_card[0] in mastered.keys():  # choose only non-mastered Kanji
                continue
            user_answer = input(
                'What does {} read as? Type in Romaji: '.format(current_card[0]))
            # using the jaconv module: convert Romaji to Hiragana and Katakana
            hiragana_answer = jaconv.alphabet2kana(user_answer)   # Romaji --> Hiragana
            katakana_answer = jaconv.hira2kata(hiragana_answer)   # Hiragana --> Katakana
            # check against both on and kun readings
            if hiragana_answer in current_card[1] or katakana_answer in current_card[1]:
                print('Correct!')
                mastered[current_card[0]] = current_card[1]
            else:
                print('Incorrect. This reads as {}'.format(current_card[1]))
    print('Congratulations! You have mastered the learned_list')
Example #14
def get_kanji(letter):
    base_url = "https://mojikiban.ipa.go.jp/mji/q"
    kanjis = []
    # hiragana part
    query = {"読み": letter}
    r = requests.get(url=base_url, params=query)
    if not r.json()["find"]:
        print("NO RESULT FOUND")
        results = []
    else:
        results = r.json()["results"]
    for i in results:
        kanji = chr(int(i['UCS']["対応するUCS"][2:], 16))
        kanjis.append([kanji, i["総画数"]])
    # katakana part
    another_letter = jaconv.hira2kata(letter)
    query = {"読み": another_letter}
    r = requests.get(url=base_url, params=query)
    if not r.json()["find"]:
        print("NO RESULT FOUND")
        results = []
    else:
        results = r.json()["results"]
    for i in results:
        kanji = chr(int(i['UCS']["対応するUCS"][2:], 16))
        kanjis.append([kanji, i["総画数"]])
    return kanjis
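Illustrative call (the function queries the IPA mojikiban API, so the actual results depend on the service):

get_kanji('かわ')   # -> e.g. [['川', 3], ['河', 8], ...] including matches for the katakana reading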
Example #15
def sentence_to_tokens(sentence, is_katakana=False):
    '''
    Parses one sentence into tokens using MeCab. Assumes UniDic CWJ
    2.2.0 version of the dictionary is set as default. If is_katakana
    is set, the katakana input is converted to hiragana before being
    passed to MeCab, and the change is reverted on the surface form of
    the returned tokens.
    '''
    unidic_features = [
        'pos1', 'pos2', 'pos3', 'pos4', 'cType', 'cForm', 'lForm', 'lemma',
        'orth', 'pron', 'orthBase', 'pronBase', 'goshu', 'iType', 'iForm',
        'fType', 'fForm', 'iConType', 'fConType', 'type', 'kana', 'kanaBase',
        'form', 'formBase', 'aType', 'aConType', 'aModType', 'lid', 'lemma_id'
    ]
    tokens = []

    if is_katakana:
        sentence = jaconv.kata2hira(sentence)

    with MeCab() as mecab:
        for node in mecab.parse(sentence, as_nodes=True):
            if not node.is_eos():
                token = dict(zip(unidic_features, node.feature.split(',')))

                token['pos'] = token['pos1']
                if token['pos2'] != '*':
                    token['pos'] += '-' + token['pos2']
                if token['pos3'] != '*':
                    token['pos'] += '-' + token['pos3']
                if token['pos4'] != '*':
                    token['pos'] += '-' + token['pos4']

                if len(token) == 7:  # OOV
                    if is_katakana:
                        token['orth'] = jaconv.hira2kata(node.surface)
                    else:
                        token['orth'] = node.surface
                    token['orthBase'] = node.surface
                    token['lemma'] = node.surface
                    token['oov'] = True
                    tokens.append(token)
                else:
                    if is_katakana:
                        token['orth'] = jaconv.hira2kata(token['orth'])
                    token['oov'] = False
                    tokens.append(token)
    return tokens
Example #16
def get_words_by_kana(kana_word):
    kana_word = jaconv.hira2kata(kana_word)  # convert hiragana to katakana
    words = words_list[words_list["読み"] == kana_word]

    # To keep a path from disappearing from the lattice for some inputs,
    # a single character that is not in the word list is returned as its
    # own reading with a sufficiently high cost.
    if len(kana_word) == 1 and len(words) == 0:
        words = create_empty_word(kana_word, 9999999)
    return words
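words_list and create_empty_word come from the surrounding module; a guess at their shape, where every column name except "読み" is invented:

import pandas as pd

words_list = pd.DataFrame({'表層形': ['東京'], '読み': ['トウキョウ'], 'コスト': [5000]})   # hypothetical word table

def create_empty_word(kana_word, cost):
    # one-row fallback entry that uses the reading itself as the surface form (hypothetical)
    return pd.DataFrame({'表層形': [kana_word], '読み': [kana_word], 'コスト': [cost]})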
Example #17
 def _kanji_to_kana(self, char):
     glyph = self.c.lookup_char(char).first()
     if glyph is None:
         return None
     romaji_on = glyph.kJapaneseOn.lower()
     romaji_kun = glyph.kJapaneseKun.lower()
     # on readings are returned in katakana, kun readings in hiragana
     jp_on = jaconv.hira2kata(jaconv.alphabet2kana(romaji_on)).split(' ')
     jp_kun = jaconv.alphabet2kana(romaji_kun).split(' ')
     return jp_on, jp_kun, glyph.kDefinition
Example #18
def open_ust(file_name):
    song = []
    instance = {}
    bpm = 0.0
    for strm in open(file_name, "r"):
        if strm.strip().startswith("["):
            if len(instance) > 0:
                if instance.get("Tempo", None):
                    bpm = float(".".join(instance["Tempo"].split(",")))
                if instance.get("Lyric", None):
                    if len(instance["Lyric"].split(" ")) > 1:
                        instance["Lyric"] = instance["Lyric"].split(" ")[-1]
                    if "R" in instance["Lyric"] or "息" in instance["Lyric"]:
                        instance["Lyric"] = ""
                    if hira_p.search(unicode(instance["Lyric"])):
                        instance["Lyric"] = re.sub("[A-Za-z]+", "",
                                                   instance["Lyric"])
                    else:
                        instance["Lyric"] = romkan.to_katakana(
                            instance["Lyric"])
                    instance["Lyric"] = re.sub(
                        u"[^ァ-ン]", "",
                        unicode(instance["Lyric"])).encode("utf8")
                instance["Tempo"] = str(bpm)
                if "NoteNum" in instance:
                    if instance["Lyric"] == "":
                        instance["NoteNum"] = "rest"
                    song.append(instance)
            instance = {"InstanceIdx": strm.strip().lstrip("[#").rstrip("]")}
        else:
            if len(strm.strip().split("=")) < 2:
                continue
            key, value = strm.strip().split("=")
            if key == "Lyric":
                value = jaconv.hira2kata(unicode(value)).encode("utf8")
            if key in ("NoteNum", "Lyric", "Length", "Tempo"):
                instance[key] = value
    if len(instance) > 0:
        if instance.get("Tempo", None):
            bpm = float(".".join(instance["Tempo"].split(",")))
        if instance.get("Lyric", None):
            if len(instance["Lyric"].split(" ")) > 1:
                instance["Lyric"] = instance["Lyric"].split(" ")[-1]
            if "R" in instance["Lyric"] or "息" in instance["Lyric"]:
                instance["Lyric"] = ""
            if hira_p.search(unicode(instance["Lyric"])):
                instance["Lyric"] = re.sub("[A-Za-z]+", "", instance["Lyric"])
            else:
                instance["Lyric"] = romkan.to_katakana(instance["Lyric"])
            instance["Lyric"] = re.sub(u"[^ァ-ン]", "", unicode(
                instance["Lyric"])).encode("utf8")
        instance["Tempo"] = str(bpm)
        if "NoteNum" in instance:
            if instance["Lyric"] == "":
                instance["NoteNum"] = "rest"
            song.append(instance)
    return song
Example #19
def get_names(filename: str, gender: str):
    with open(filename, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            name = row['name']
            name_simplified = simplify_swedish_name(row['name'])
            count = normalize_int(row['count'])
            katakana = jaconv.hira2kata(jaconv.alphabet2kana(name_simplified))
            if is_simple_katakana(katakana):
                yield { 'name': name, 'katakana': katakana, 'count': count, 'gender': gender }
Example #20
def add_ojisan(parsed: list):
    word_list = []
    for i, word in enumerate(parsed):

        if '人名' in word[3] and '固有名詞' in word[3]:
            word_list.append(word[0])
            word_list.append('チャン')

            if random.randint(0, 2) == 0:
                word_list.append(random_emote())

        elif '係助詞' in word[3]:
            word_list.append(word[0])
            word_list.append(random_emote())
            word_list.append('、')

        elif '助動詞' in word[3] and '特殊' not in word[4]:
            # catches ない so it is not converted to katakana
            word_list.append(jaconv.hira2kata(word[0]))
            word_list.append(random_emote())

        elif '終助詞' in word[3]:
            word_list.append(jaconv.hira2kata(word[0]))
            word_list.append(random_emote())

        elif '接尾' in word[3] and '人名' in word[3]:
            pass

        elif '接続助詞' in word[3]:
            word_list.append(word[0])
            word_list.append(random_emote())

        elif word[0] in ('?', '?'):
            word_list.append('❓')

        elif word[0] in ('!', '!'):
            word_list.append('❗')

        else:
            word_list.append(word[0])

    return ''.join(word_list)
Example #21
def convert_hira_to_kana(user_message_hira):
    re_hira = re.compile(r'[\u3041-\u3093]+')
    remove_bar = user_message_hira.replace('ー', '')

    if re_hira.fullmatch(remove_bar):
        converted_kana = jaconv.hira2kata(user_message_hira)
        print(converted_kana)
        return converted_kana
    else:
        print("ひらがなを入力してください")
        return None
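Usage sketch for convert_hira_to_kana (illustrative):

convert_hira_to_kana('らーめん')   # prints and returns 'ラーメン'
convert_hira_to_kana('ラーメン')   # not hiragana, so it prints the error message and returns None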
 def C(self, word):
     try:
         H = float(self.df_hira[word])
     except KeyError:
         H = 0.0
     try:
         K = float(self.df_kata[jaconv.hira2kata(word)])
     except KeyError:
         K = 0.0
     n = 10  #tuning parameter
     self.Cvalue = 2 / (1 + math.exp(-n * K / (H + 1))) - 1
     return self.Cvalue
Example #23
def talk():
    # Generate a reply from the received msg
    msg = jaconv.hira2kata(urllib.parse.unquote(request.args.get('msg')))
    msg_data = np.zeros((1, lstm.max_encoder_seq_length, lstm.num_encoder_tokens), dtype='float32')
    for i, char in enumerate(msg):
        msg_data[0, i, lstm.input_token_index[char]] = 1.
    response = lstm.decode_sequence(msg_data)
    response = re.sub(r'[「」]', '', response)
    response = jaconv.kata2hira(response)
    if response.find('…………') != -1:
        response = 'ぶわーーーー!=3 =3\n'
    return response
Example #24
def line_info_n_k_func(self):
    for look_up_file, f_name in zip(self.look_up_files, self.f_names):
        ono_lis = []
        with open(self.b_name, "r") as f:
            for line in f:
                ono_lis.append(line.rstrip("\n"))
        ono_lis = list(set(ono_lis))
        ono_lis = [jaconv.hira2kata(i) for i in ono_lis]
        ono_counter = {}
        ono_lis_st = "|".join(ono_lis)

        file_list = glob(look_up_file.rstrip("/"))
        ono = []
        print(file_list)

        for file in tqdm(file_list):
            story_list = file + "/*"
            for story in tqdm(glob(story_list)):
                for i in ono_lis:
                    ono_counter[i] = []
                for data in tqdm(glob(story + "/*")):
                    if f_name + story.lstrip(
                            look_up_file) + "line_info_n_k.csv" in glob(
                                f_name + "*.csv"):
                        pass
                    elif len(glob(story + "/*")) == 1:
                        ono_counter = search_inside_sentence(
                            data, ono_lis_st, ono_counter, "line_info_n_k",
                            True)
                    else:
                        if self.exclude_stirngs in data:
                            pass
                        else:
                            ono_counter = search_inside_sentence(
                                data, ono_lis_st, ono_counter, "line_info_n_k",
                                False)
                if f_name + story.lstrip(
                        look_up_file) + "line_info_n_k.csv" in glob(f_name +
                                                                    "*.csv"):
                    print(f_name + story.lstrip(look_up_file) +
                          "line_info_n_k.csv")
                    pass
                else:
                    df = pd.DataFrame(
                        [[i for i in ono_counter.values() if len(i) > 0]],
                        index=[story],
                        columns=[
                            i for i in ono_counter.keys()
                            if len(ono_counter[i]) > 0
                        ])
                    df.to_csv(f_name + story.lstrip(look_up_file) +
                              "line_info_n_k.csv",
                              index=False)
Example #25
def convert_kana(strings):
    """ kana """
    result = morphological_analysis(strings, _KBM_MODEL, _KYTEA_PATH)
    result_strings = result.split(' ')
    target_array = [s.split('/')[2] for s in result_strings]
    join_strings = ''.join(target_array)
    regex = re.compile('[\u3041-\u309F]+')
    process_strings = regex.findall(join_strings)
    hiragana = ''.join(process_strings)
    katakana = jaconv.hira2kata(hiragana)
    zenkaku = jaconv.h2z(katakana, digit=True, ascii=True)

    return zenkaku
Example #26
def talkA3rt(message):
    apikey = ''
    client = pya3rt.TalkClient(apikey)
    api_response = client.talk(message.body['text'])
    # Reply when the API response is ok
    if api_response['message'] == 'ok':
        reply_message = api_response['results'][0]['reply']
        message.reply(jaconv.hira2kata(reply_message) + '…ロボ')
        # Plain reply
        # message.reply(reply_message)
    # Return a message on API error
    else:
        message.reply('エラー、ウマク返事ガデキマセン [ERROR:' + api_response['message'] + ']')
Example #27
def checkSite(name, yomi, soup):
    if not (CheckKATAKANA.checkKatakana(name)):
        try:
            soup_yomi = soup.find('h2').span.string
            soup_yomi = soup_yomi.replace('−', '-')
            soup_yomi = soup_yomi.replace('-', '-')
            soup_yomi = jaconv.hira2kata(soup_yomi[soup_yomi.index('(') + 1:soup_yomi.index(')')])
        except ValueError:
            print('サイト側に読みがありません')
            return False
        except AttributeError:
            print('サイト側に読みがありません')
            return False
        yomi = jaconv.hira2kata(yomi)
        if yomi == soup_yomi:
            print('TRUE')
            return True
        else:
            print('FALSE')
            return False
    else:
        return True
def search_morpheme(
        m: MultiMorpheme,
        match_reading=True) -> List[Tuple[jmdict.JMDEntry, List[int]]]:
    pos = m.part_of_speech()
    has_kanji = re.search(kanji_re, m.surface())
    ids = set()
    entries: List[jmdict.JMDEntry] = []
    reading = m.reading_form()
    dict_reading = "".join(m.reading_form()
                           for m in parse(m.dictionary_form()))
    for entry in jmdict_lookup(m.dictionary_form()).entries:
        if entry.idseq not in ids:
            ids.add(entry.idseq)
            entries.append(entry)

    matches: List[Tuple[jmdict.JMDEntry, List[int]]] = []
    reading_matches: List[Tuple[jmdict.JMDEntry, List[int]]] = []
    for entry in entries:
        if match_reading and not any(
                jaconv.hira2kata(r.text) in (reading, dict_reading)
                for r in entry.kana_forms):
            continue

        match_senses = list()
        senses = list()
        reading_matches.append((entry, list(range(len(entry.senses)))))
        for i, sense in enumerate(entry.senses):
            if not sense.pos:
                senses.append(i)
            elif any(sudachi_jmdict_pos_match(pos, p) for p in sense.pos):
                senses.append(i)
                match_senses.append(i)

        def sense_key(i):
            sense = entry.senses[i]
            # prefer senses whose "usually written in kana" flag matches the surface form
            uk_match = (bool(has_kanji) !=
                        ("word usually written using kana alone" in sense.misc))
            common = any(
                ("common" in p or "futsuumeishi" in p) for p in sense.pos)
            has_pos = bool(sense.pos)
            return (uk_match, common, has_pos)

        senses.sort(key=sense_key, reverse=True)

        if match_senses:
            matches.append((entry, senses))

    if not matches:
        return reading_matches

    return matches
Example #29
def text_to_sequence(text, p=0.0):
    for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]:
        text = text.replace(c, "")
    text = text.replace("!", "!")
    text = text.replace("?", "?")

    text = normalize_delimitor(text)
    text = jaconv.normalize(text)
    if p > 0:
        text = mix_pronunciation(text, p)
    text = jaconv.hira2kata(text)
    text = add_punctuation(text)

    return [ord(c) for c in text] + [_eos]  # EOS
Example #30
    def sy2a(self, s, y):
        # preprocess strings
        s = s.strip()
        y = jaconv.normalize(y, "NFKC")
        y = jaconv.hira2kata(y)

        # encode
        s_np, y_np = self.encode_sy(s, y)
        s_np, y_np = self.add_batch_dim(s_np, y_np)

        # inference
        accent = self.infer(s_np, y_np)[0]
        yomi_and_accent = self.zip_ya(y, accent)
        return yomi_and_accent
Example #31
def convert(mozc_map, mozc_dir, output_dir):
    with open(os.path.join(output_dir, 'mozc.csv'), 'w') as out_fd:
        for f in glob.glob(os.path.join(mozc_dir, 'src/data/dictionary_oss/dictionary*.txt')):
            with open(f) as in_fd:
                for l in in_fd:
                    l = l.decode('utf8').strip().split('\t')
                    (yomi, lid, rid, cost, surface) = l[:5]
                    if lid not in mozc_map:
                        continue
                    (new_id, pos) = mozc_map[lid]
                    yomi = jaconv.hira2kata(yomi)
                    line = ','.join([surface, new_id, new_id, '0', pos, surface, yomi, yomi])
                    line += '\n'
                    out_fd.write(line.encode('utf8', 'replace'))
Example #32
def encode_katakana(text):
    """I don't think this quite works yet."""
    encoded = []
    for char in text:
        if jaconv:
            # try to convert japanese text to half-katakanas
            char = jaconv.z2h(jaconv.hira2kata(char))
            # TODO: "the conversion may result in multiple characters"
            # If that really can happen (I am not really shure), than the string would have to be split and every single
            #  character has to passed through the following lines.

        if char in TXT_ENC_KATAKANA_MAP:
            encoded.append(TXT_ENC_KATAKANA_MAP[char])
        else:
            # TODO doesn't this discard all that is not in the map? Can we be sure that the input does contain only
            # encodable characters? We could at least throw an exception if encoding is not possible.
            pass
    return b"".join(encoded)
Example #33
 def normalize(s):
     s = jaconv.hira2kata(s).replace('・', '')
     s = re_symbol.sub('', s)
     return re_tyouon.sub('ー', s)
Example #34
def test_hira2kata():
    assert_equal(jaconv.hira2kata('ともえまみ'), 'トモエマミ')
    assert_equal(jaconv.hira2kata('まどまぎ', ignore='ど'), 'マどマギ')
    _compare(jaconv.hira2kata, HIRAGANA, FULL_KANA)
Example #35
 def normalize_yomi(self, yomi):
     yomi = jaconv.hira2kata(yomi)
     return yomi.replace('ウ゛', 'ヴ').replace(' ', '')
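For instance (illustrative, called on the owning object):

self.normalize_yomi('う゛ぁいおりん')   # -> 'ヴァイオリン'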