Example #1
 def test_kakasi_unknown_rule(self):
     with self.assertRaises(UnsupportedRomanRulesException):
         kakasi = pykakasi.kakasi()
         kakasi.setMode("H","a")
         kakasi.setMode("K","a")
         kakasi.setMode("J","a")
         kakasi.setMode("r","hogefuga")
Example #2
    def test_kakasi_kunrei(self):

        TESTS = [
            ("構成",         "Kousei"),
            ("好き",         "Suki"),
            ("大きい",       "Ookii"),
            ("かんたん",     "kantan"),
            ("にゃ",         "nya"),
            ("っき",         "kki"),
            ("っふぁ",       "ffa"),
            ("漢字とひらがな交じり文", "Kanzi tohiragana Maziri Bun"),
            ("Alphabet 123 and 漢字", "Alphabet 123 and Kanzi"),
            ("日経新聞", "Nikkeisinbun"),
            ("日本国民は、","Nihonkokumin ha,")
        ]

        kakasi = pykakasi.kakasi()
        kakasi.setMode("H","a")
        kakasi.setMode("K","a")
        kakasi.setMode("J","a")
        kakasi.setMode("r","Kunrei")
        kakasi.setMode("C",True)
        kakasi.setMode("s",True)
        kakasi.setMode("E","a")
        converter  = kakasi.getConverter()
        for case, result in TESTS:
            self.assertEqual(converter.do(case), result)
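Newer pykakasi releases (2.x) deprecate the setMode()/getConverter() API used throughout these examples in favour of a single convert() call that returns per-token readings. A rough equivalent of the Kunrei conversion above, assuming pykakasi >= 2.0:

import pykakasi

kks = pykakasi.kakasi()
for item in kks.convert(u"構成"):
    # each item is a dict holding the original token and its readings
    print(item["orig"], item["hira"], item["kunrei"])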
Example #3
    def test_kakasi_J2H(self):

        TESTS = [
            (u"", ""),
            (u"構成", u"こうせい"),
            (u"好き", u"すき"),
            (u"大きい", u"おおきい"),
            (u"かんたん", u"かんたん"),
            (u"にゃ", u"にゃ"),
            (u"っき", u"っき"),
            (u"っふぁ", u"っふぁ"),
            (u"漢字とひらがな交じり文", u"かんじとひらがなまじりぶん"),
            (u"Alphabet 123 and 漢字", u"Alphabet 123 and かんじ"),
            (u"日経新聞", u"にっけいしんぶん"),
            (u"日本国民は、", u"にほんこくみんは、"),
            (u"苦々しい", u"にがにがしい")
        ]

        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", None)
        kakasi.setMode("K", None)
        kakasi.setMode("J", "H")
        kakasi.setMode("s", False)
        kakasi.setMode("C", True)
        kakasi.setMode("E", None)
        kakasi.setMode("a", None)
        converter = kakasi.getConverter()
        for case, result in TESTS:
            self.assertEqual(converter.do(case), result)
Example #4
    def test_kakasi_hepburn(self):

        TESTS = [
            (u"", ""),
            (u"構成", "Kousei"),
            (u"好き", "Suki"),
            (u"大きい", "Ookii"),
            (u"かんたん", "kantan"),
            (u"にゃ", "nya"),
            (u"っき", "kki"),
            (u"っふぁ", "ffa"),
            (u"漢字とひらがな交じり文", "Kanji tohiragana Majiri Bun"),
            (u"Alphabet 123 and 漢字", "Alphabet 123 and Kanji"),
            (u"日経新聞", "Nikkeishinbun"),
            (u"日本国民は、", "Nihonkokumin ha,")
        ]

        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", "a")
        kakasi.setMode("K", "a")
        kakasi.setMode("J", "a")
        kakasi.setMode("r", "Hepburn")
        kakasi.setMode("s", True)
        kakasi.setMode("E", "a")
        kakasi.setMode("a", None)
        kakasi.setMode("C", True)
        converter = kakasi.getConverter()
        for case, result in TESTS:
            self.assertEqual(converter.do(case), result)
Example #5
 def test_kakasi_E2a_upper(self):
     TESTS = [
         (u"abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
     ]
     kakasi = pykakasi.kakasi()
     kakasi.setMode("E", "a")
     kakasi.setMode("U", True)
     converter = kakasi.getConverter()
     for case, result in TESTS:
         self.assertEqual(converter.do(case), result)
Example #6
 def __init__(self):
     self._load_codepoints('ja')
     self.kakasi = pykakasi.kakasi()
     self.kakasi.setMode("J","a")
     self.kakasi.setMode("E","a")
     self.kakasi.setMode("H","a")
     self.kakasi.setMode("K","a")
     self.kakasi.setMode("s", True)
     self.kakasi.setMode("C", True)
     self.conv=self.kakasi.getConverter()
Example #7
def get_reading_kakasi(word):
    """Gets reading for a given Japanese word by using kakasi. The reading in
       hiragana is returned by this function."""
    from pykakasi import kakasi
    kakasi = kakasi()
    kakasi.setMode("J", "H")
    kakasi.setMode("C", True)  # default: Separator
    kakasi.setMode("c", False)  # default: no Capitalize
    conv = kakasi.getConverter()
    result = conv.do(word)
    return result
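A minimal usage sketch for the helper above (the expected reading follows the J-to-H test cases elsewhere in this collection):

if __name__ == "__main__":
    print(get_reading_kakasi(u"漢字"))  # expected: かんじ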
Example #8
    def test_kakasi_K2H(self):

        TESTS = [
            (u"", ""),
            (u"カンタン", u"かんたん"),
            (u"ニャ", u"にゃ")
        ]
        kakasi = pykakasi.kakasi()
        kakasi.setMode("K", "H")
        converter = kakasi.getConverter()
        for case, result in TESTS:
            self.assertEqual(converter.do(case), result)
Example #9
    def test_kakasi_a2E(self):

        TESTS = [
            ("ABCDEFGHIJKLMNOPQRSTUVWXYZ", u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
            ("abcdefghijklmnopqrstuvwxyz", u"abcdefghijklmnopqrstuvwxyz"),
            ("!\"#$%&'()*+,-./_ {|}~", u"!"#$%&'()*+,-./_ {|}~")
        ]
        kakasi = pykakasi.kakasi()
        kakasi.setMode("a", "E")
        converter = kakasi.getConverter()
        for case, result in TESTS:
            self.assertEqual(converter.do(case), result)
Example #10
 def test_kakasi_J2a_upper(self):
     TESTS = [
         (u"かな漢字", "kana KANJI"),
     ]
     kakasi = pykakasi.kakasi()
     kakasi.setMode("J", "a")
     kakasi.setMode("H", "a")
     kakasi.setMode("s", True)
     kakasi.setMode("U", True)
     converter = kakasi.getConverter()
     for case, result in TESTS:
         self.assertEqual(converter.do(case), result)
Example #11
    def test_kakasi_numbers(self):

        TESTS = [
            (u"1234567890", "1234567890"),
            (u"一 二 三 四 五 六 七 八 九 〇", "ichi ni san shi go roku shichi hachi kyuu (maru)")
        ]
        kakasi = pykakasi.kakasi()
        kakasi.setMode("E", "a")
        kakasi.setMode("J", "a")
        converter = kakasi.getConverter()
        for case, result in TESTS:
            self.assertEqual(converter.do(case), result)
Example #12
 def test_kakasi_issues60(self):
     TESTS = [
         (u"市立", u"しりつ")
     ]
     kakasi = pykakasi.kakasi()
     kakasi.setMode("H", None)
     kakasi.setMode("K", None)
     kakasi.setMode("J", "H")
     kakasi.setMode("s", False)
     kakasi.setMode("C", True)
     kakasi.setMode("E", None)
     kakasi.setMode("a", None)
     converter = kakasi.getConverter()
     for case, result in TESTS:
         self.assertEqual(converter.do(case), result)
Example #13
 def test_katakana_furiagana(self):
     TESTS = [
         (u"変換前の漢字の脇に", u"変換前[ヘンカンマエ]の漢字[カンジ]の脇[ワキ]に")
     ]
     kakasi = pykakasi.kakasi()
     kakasi.setMode("H", None)
     kakasi.setMode("K", None)
     kakasi.setMode("J", "KF")
     kakasi.setMode("f", True)
     kakasi.setMode("s", False)
     kakasi.setMode("E", None)
     kakasi.setMode("a", None)
     converter = kakasi.getConverter()
     for case, result in TESTS:
         self.assertEqual(converter.do(case), result)
Example #14
 def test_hiragana_furiagana(self):
     TESTS = [
         (u"変換前の漢字の脇に", u"変換前[へんかんまえ]の漢字[かんじ]の脇[わき]に")
     ]
     kakasi = pykakasi.kakasi()
     kakasi.setMode("H", None)
     kakasi.setMode("K", None)
     kakasi.setMode("J", "HF")
     kakasi.setMode("f", True)
     kakasi.setMode("s", False)
     kakasi.setMode("E", None)
     kakasi.setMode("a", None)
     converter = kakasi.getConverter()
     for case, result in TESTS:
         self.assertEqual(converter.do(case), result)
Example #15
    def test_kakasi_constitution(self):

        original_text = u"日本国民は、正当に選挙された国会における代表者を通じて行動し、われらとわれらの子孫のために、諸国民との協和による成果と、わが国全土にわたつて自由のもたらす恵沢を確保し、政府の行為によつて再び戦争の惨禍が起ることのないやうにすることを決意し、ここに主権が国民に存することを宣言し、この憲法を確定する。そもそも国政は、国民の厳粛な信託によるものであつて、その権威は国民に由来し、その権力は国民の代表者がこれを行使し、その福利は国民がこれを享受する。これは人類普遍の原理であり、この憲法は、かかる原理に基くものである。われらは、これに反する一切の憲法、法令及び詔勅を排除する。"
        result = "Nihonkokumin ha, Seitou ni Senkyo sareta Kokkai niokeru Daihyousha wo Tsuuji te Koudou shi, wareratowarerano Shison notameni, Shokokumin tono Kyouwa niyoru Seika to, waga Kuni Zendo niwatatsute Jiyuu nomotarasu Keitaku wo Kakuho shi, Seifu no Koui niyotsute Futatabi Sensou no Sanka ga Okoru kotononaiyaunisurukotowo Ketsui shi, kokoni Shuken ga Kokumin ni Sonsu rukotowo Sengen shi, kono Kenpou wo Kakuteisu ru. somosomo Kokusei ha, Kokumin no Genshuku na Shintaku niyorumonodeatsute, sono Ken'i ha Kokumin ni Yurai shi, sono Kenryoku ha Kokumin no Daihyousha gakorewo Koushi shi, sono Fukuri ha Kokumin gakorewo Kyouju suru. koreha Jinruifuhen no Genri deari, kono Kenpou ha, kakaru Genri ni Motozuku monodearu. wareraha, koreni Hansu ru Issai no Kenpou, Hourei Oyobi Shouchoku wo Haijo suru."

        kakasi = pykakasi.kakasi()
        kakasi.setMode("H","a")
        kakasi.setMode("K","a")
        kakasi.setMode("J","a")
        kakasi.setMode("r","Hepburn")
        kakasi.setMode("C", True)
        kakasi.setMode("s", True)
        converter  = kakasi.getConverter()
        self.maxDiff = None
        self.assertEqual(converter.do(original_text), result)
Example #16
    def test_kakasi_passport(self):

        TESTS = [
            (u"", ""),
            (u"構成", "Kosei"),
            (u"大野", "Ono"),
            (u"斎藤", "Saito"),
            (u"菅野", "Kanno"),
            (u"本田", "Honda"),
            (u"一式", "Isshiki"),
            (u"別府", "Beppu"),
            (u"ジェ", "jie"),
            (u"チェ", "chie"),
            (u"ティ", "tei"),
            (u"ディ", "dei"),
            (u"デュ", "deyu"),
            (u"ファ", "fua"),
            (u"フィ", "fui"),
            (u"フェ", "fue"),
            (u"フォ", "fuo"),
            (u"ヴァ", "bua"),
            (u"ヴィ", "bui"),
            (u"ヴ", "bu"),
            (u"ヴェ", "bue"),
            (u"ヴォ", "buo"),
            (u"じぇ", "jie"),
            (u"ちぇ", "chie"),
            (u"てぃ", "tei"),
            (u"でぃ", "dei"),
            (u"でゅ", "deyu"),
            (u"ふぁ", "fua"),
            (u"ふぃ", "fui"),
            (u"ふぇ", "fue"),
            (u"ふぉ", "fuo")
        ]
        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", "a")
        kakasi.setMode("K", "a")
        kakasi.setMode("J", "a")
        kakasi.setMode("r", "Passport")
        kakasi.setMode("E", "a")
        kakasi.setMode("C", True)
        kakasi.setMode("a", None)
        converter = kakasi.getConverter()
        for case, result in TESTS:
            self.assertEqual(converter.do(case), result)
Example #17
    def test_kakasi_passport_specialcase(self):

        TESTS = [
            (u"えっちゅう", "etchu"),
            (u"はっちょう", "hatcho"),
            (u"エッチュウ", "etchu"),
            (u"ハッチョウ", "hatcho")
        ]
        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", "a")
        kakasi.setMode("K", "a")
        kakasi.setMode("J", "a")
        kakasi.setMode("r", "Passport")
        kakasi.setMode("E", "a")
        kakasi.setMode("a", None)
        converter = kakasi.getConverter()
        for case, result in TESTS:
            self.assertEqual(converter.do(case), result)
Example #18
    def test_kakasi_J2K(self):

        TESTS = [
            (u"", ""),
            (u"構成", u"コウセイ"),
            (u"好き", u"スキ"),
            (u"大きい", u"オオキイ"),
            (u"かんたん", u"かんたん"),
            (u"漢字とひらがな交じり文", u"カンジとひらがなマジリブン"),
            (u"Alphabet 123 and 漢字", u"Alphabet 123 and カンジ")
        ]

        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", None)
        kakasi.setMode("K", None)
        kakasi.setMode("J", "K")
        kakasi.setMode("s", False)
        kakasi.setMode("C", True)
        kakasi.setMode("E", None)
        kakasi.setMode("a", None)
        converter = kakasi.getConverter()
        for case, result in TESTS:
            self.assertEqual(converter.do(case), result)
Example #19
def changpoyin(data,files_dir,i):  # convert every long-vowel mark to 'u'; here i is the file id

    from pykakasi import kakasi  # used to convert words into phonemes
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()

    data_1 = copy.deepcopy(data)
    data_2 = []

    files_dir_1 = os.path.join(files_dir.replace('_yinsu', ''))
    data_danci = pipei.read_out(files_dir_1)  # word-level recognition results

    for danci in data_1:  # each iteration inspects one phoneme

        if ':' in danci[0]:  # when the result contains a colon ':', read the word-level result to decide whether it is 'u' or '-'

            zhenshu = danci[1][0]

            for danci_1 in data_danci:  # each iteration inspects one word

                if zhenshu >= danci_1[1][0] and zhenshu <= danci_1[1][1]:  # find the kanji this phoneme belongs to

                    # print('能找到')

                    tanngou = conv.do(danci_1[0])  # take that kanji and convert it

                    if danci_1[0] == tanngou:  # unchanged output means it is alphabetic

                        tanngou = ztok._make_kana_convertor(danci_1[0])

                    if tanngou[-1] == 'u' or tanngou=='nado':  # if it ends in 'u', turn the ':' in the recognition result into 'u'

                        fenjie = (danci[1][0] + danci[1][1])//2
                        danci_2 = copy.deepcopy(danci)
                        danci_2[1][1] = fenjie
                        danci_2[0] = danci_2[0].replace(':','')
                        danci_3 = copy.deepcopy(danci)
                        danci_3[1][0] = fenjie + 1
                        danci_3[0] = 'u'
                        data_2.append(danci_2)
                        data_2.append(danci_3)

                    # add this block back after the first round of experiments (handles a colon followed by 'i')
                    ################################################################
                    # elif tanngou[-1] == 'i':  # if it ends in 'i', turn the ':' in the recognition result into 'i'
                    #
                    #     fenjie = (danci[1][0] + danci[1][1])//2
                    #     danci_2 = copy.deepcopy(danci)
                    #     danci_2[1][1] = fenjie
                    #     danci_2[0] = danci_2[0].replace(':','')
                    #     danci_3 = copy.deepcopy(danci)
                    #     danci_3[1][0] = fenjie + 1
                    #     danci_3[0] = 'i'
                    #     data_2.append(danci_2)
                    #     data_2.append(danci_3)
                    ###############################################################

                    elif zifudingwei(tanngou,danci[0].replace(':',''),files_dir_1) == 'u':  # take the letter that follows the letter carrying the colon

                        fenjie = (danci[1][0] + danci[1][1])//2
                        danci_2 = copy.deepcopy(danci)
                        danci_2[1][1] = fenjie
                        danci_2[0] = danci_2[0].replace(':','')
                        danci_3 = copy.deepcopy(danci)
                        danci_3[1][0] = fenjie + 1
                        danci_3[0] = 'u'
                        data_2.append(danci_2)
                        data_2.append(danci_3)

                    else:
                        data_2.append(danci)

                    break

        else:
            data_2.append(danci)  # phonemes without a colon go straight into the new list

    # print(i)
    # print(data_danci)
    # print(data)
    # print(data_2)
    # os.system('pause')

    return data_2
Example #20
 def setup_converter(self):
     mykakasi = kakasi()
     mykakasi.setMode('H', 'a')
     mykakasi.setMode('K', 'a')
     mykakasi.setMode('J', 'a')
     self.converter = mykakasi.getConverter()
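No "r" mode is set here, so the converter falls back to the default Hepburn table (the inline comments in later examples state this default). A hedged usage sketch, where obj stands in for an instance of the surrounding class:

obj.setup_converter()
print(obj.converter.do(u"こんにちは"))  # expected: konnichiha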
Example #21
def dabiaoqian(path):
    from pykakasi import kakasi

    BASE_DIRS = path
    # batch directory

    name_tezheng = 'mizhichuli_log'
    # name of the directory that holds the feature-value files

    xinde = 'xinde_mizhichuli'
    # name of the directory the new feature-value files go into

    houzhui = '.wav.csv'
    # suffix of a feature file once the id is removed

    name = 'align1'
    # the file that records the CCCCSSSS flags

    shibiejieguo = {}
    # dictionary holding the recognition results

    symbolcidian = {}
    # the flag dictionary, shaped like this:
    # id: C001L_086
    # ['S', 'S', 'S', 'C', 'S', 'D', 'D', 'D', 'C']
    # id: C001L_087
    # ['S', 'D', 'D', 'C']
    # id: C001L_088
    # ['S', 'S', 'S', 'S', 'D', 'D', 'D', 'D', 'C', 'C']
    zhengjie = {}
    # dictionary of reference (correct) transcripts

    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()

    for per_dirs in os.listdir(BASE_DIRS):  # per_dirs = C001L,C001R...

        d_9 = os.path.join(BASE_DIRS, per_dirs, xinde)
        d = os.path.join(BASE_DIRS, per_dirs, xinde)
        mulu.mkdir(d)

        zhengjie, symbolcidian = zidian.zidian(per_dirs, BASE_DIRS)
        # load the flags from the flag file into the symbolcidian dictionary

        for id in os.listdir(
                os.path.join(BASE_DIRS, per_dirs,
                             name_tezheng)):  # id = file names under C001L, C001R, ...

            banyun_1 = []  # indices flagged 'C'
            banyun_2 = []  # the correctly recognized words

            banyun_3 = []  # indices not flagged 'C'
            banyun_4 = []  # romanized forms of the words that are wrong for now
            dianout = []

            id = id.replace(houzhui, '')  # strip .wav.csv from the file name, leaving only the id

            # print(id)
            # print(symbolcidian[id])

            enumerate(symbolcidian[id])

            banyun_1 = [i for i, x in enumerate(symbolcidian[id])
                        if x == 'C']  # indices of the 'C' flags
            banyun_3 = [i for i, x in enumerate(symbolcidian[id])
                        if x == 'S']  # indices of substitution errors

            t_file = os.path.join(BASE_DIRS, per_dirs, name_tezheng,
                                  id + houzhui)
            a = csv.reader(open(t_file, 'r', encoding='utf-8'))
            t_file_list = [i for i in a]

            # if len(banyun_1) == 0:  # if nothing at all is correct, label every row 1
            #     for i in range(len(t_file_list)):
            #         t_file_list[i].insert(0, '1')
            # print(banyun_1)
            # print(banyun_3)
            # os.system("pause")

            for u in banyun_1:  # banyun_1 holds the indices flagged 'C'

                if u + 1 <= len(zhengjie[id]):  # the reference transcript may have fewer words than there are flags
                    # print(banyun_1)
                    # print(zhengjie[id][u])
                    # print(zhengjie[id])
                    # print("已经把正确单词 %s 加入数组"%str(zhengjie[id][u]))
                    banyun_2.append(zhengjie[id][u])  # banyun_2 stores the correct words
                    # print("此时的banyun_2是")
                    # print(banyun_2)
                    # os.system('pause')
                else:  # a 'C' flag index beyond the reference words has to be fixed by hand
                    print("手动调一下这个文件吧%s" % id)
                    print("它的正确单词是")
                    print(banyun_2)
                    os.system("pause")
            # print(banyun_2)
            # os.system('pause')

            for w in banyun_3:  # banyun_3 holds the indices that are not 'C'

                if w + 1 <= len(zhengjie[id]):  # the reference transcript may have fewer words than there are flags

                    result = conv.do(zhengjie[id][w])
                    banyun_4.append(result)
                    # if result == zhengjie[id][w] and zhengjie[id][w] != '、':#如果是逗号,也按正常的单词处理
                    #
                    #     banyun_4.append(conv.do(_make_kana_convertor(strQ2B(zhengjie[id][w]))))#如果转化之后的值不变,就说明遇到了字母,把字母转化为半角,再再转化为片假名,之后再转化为罗马字加入列表中
                    # else:
                    # #     banyun_4.append(result)#存储暂时不正确的单词
                    # print("此时的banyun_4是")
                    # print(banyun_4)
                    # os.system('pause')

                else:  # a flag index beyond the reference words has to be fixed by hand
                    print("手动调一下这个文件吧%s" % id)
                    print("它的认识出现错误的单词是")
                    print(banyun_4)
                    os.system("pause")

                # print(banyun_2)
                # os.system("pause")

            # for p in symbolcidian[id]:
            #     os.system("pause")
            #     # while p == 'C':
            #     print(p.index('C'))

            dir_out = os.path.join(BASE_DIRS, per_dirs, 'keka', id + '.out')
            dianout = pi.read_out(dir_out)  # the extracted frame numbers match the julius recognition output
            # print(dianout)
            # os.system('pause')
            # final shape: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]

            # [  37   58]  0.562999  で+接続詞	[で]
            start = dianout.pop(0)[1][1]

            # print(start)

            for i in range(start + 1):
                t_file_list[i].insert(0, '9')  # label the leading silent frames 9 and treat them as correct

            for y in dianout:  # dianout maps each recognized word to its frame range

                # print("此时的单词是%s"%y)
                # print("此时的匹配结果是")
                # print(dianout)
                # os.system("pause")

                if y[1][1] + 1 <= len(t_file_list):  # does this word's frame range stay within the feature rows?

                    if y[0] == '':  # skip the leading silence
                        continue

                    if y[0] == dianout[-1][
                            0]:  # labels the trailing full-stop section 9; a similar block exists below
                        start, end = y[1]
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '9')
                        continue

                    if y[0] in banyun_2:  # if the word is in banyun_2, label its frame range 0
                        start, end = y[1]
                        print("正在为文件 %s 的单词 %s 打标签" %
                              (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                        banyun_2.remove(y[0])  # once labelled 0, drop the word from the list

                    elif conv.do(
                            y[0]) == y[0] and y[0] != '、':  # letters come back unchanged from the conversion

                        print("发现识别结果中的字母%s" % y[0])
                        print("它在文件%s" % dir_out)

                        try:
                            zhuanhuazhi = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    strQ2B.strQ2B(y[0])))

                        except:
                            zhuanhuazhi = conv.do(
                                make_kana_convertor._make_kana_convertor(y[0]))

                        if zhuanhuazhi in banyun_4:  # letters must first become katakana and then a reading
                            print("转化之后的字母为%s" % zhuanhuazhi)
                            # os.system('pause')
                            start, end = y[1]
                            print("正在为文件 %s 的单词 %s 打标签" %
                                  (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '0')
                            banyun_4.remove(
                                conv.do(zhuanhuazhi))  # once labelled 0, drop the word from the list
                        else:
                            start, end = y[1]
                            print("正在为文件 %s 的单词 %s 打标签" %
                                  (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '1')

                    elif conv.do(y[0]) in banyun_4:
                        start, end = y[1]
                        print("正在为文件 %s 的单词 %s 打标签" %
                              (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                        banyun_4.remove(conv.do(y[0]))  # once labelled 0, drop the word from the list

                    else:
                        start, end = y[
                            1]  # a word not in banyun_2 gets its frame range labelled 1
                        print("正在为文件 %s 的单词 %s 打标签" %
                              (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '1')

                elif y[1][1] + 1 > len(t_file_list):

                    if y[0] == '':
                        continue

                    if y[0] == dianout[-1][0]:
                        start = y[1][0]
                        end = len(t_file_list)
                        for i in range(
                                start, end
                        ):  # when y[1][1]+1 > len(t_file_list), end must not be incremented here
                            t_file_list[i].insert(0, '9')
                        continue
                    # labels the trailing full-stop section 9; a similar block exists above

                    if y[0] in banyun_2:
                        start = y[1][0]
                        end = len(
                            t_file_list)  # if the frame range exceeds the feature rows, use the row count as end
                        print("正在为文件 %s 的单词 %s 打标签" %
                              (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end):
                            t_file_list[i].insert(0, '0')
                        banyun_2.remove(y[0])

                    elif conv.do(
                            y[0]) == y[0] and y[0] != '、':  # letters come back unchanged from the conversion

                        if conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    y[0])) in banyun_4:  # letters must first become katakana and then a reading
                            start = y[1][0]
                            end = len(t_file_list)
                            print("正在为文件 %s 的单词 %s 打标签" %
                                  (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '0')
                            banyun_4.remove(
                                conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        y[0])))  # once labelled 0, drop the word from the list

                        else:
                            start = y[1][0]
                            end = len(t_file_list)
                            print("正在为文件 %s 的单词 %s 打标签" %
                                  (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '1')
                    else:
                        start = y[1][0]
                        end = len(t_file_list)
                        print("正在为文件 %s 的单词 %s 打标签" %
                              (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end):
                            t_file_list[i].insert(0, '1')

            with open(os.path.join(BASE_DIRS, per_dirs, xinde, id + '.csv'),
                      'w+',
                      encoding='utf-8') as mergen_file:
                for i in t_file_list:
                    mergen_file.write('%s\n' % ','.join(i))

        shanchu.shanchuhang(d_9)  # delete every feature row labelled 9
Example #22
from pykakasi import kakasi

with open('../imas-talk-maker/public/asset/icon_data.json',
          mode='r',
          encoding='utf-8') as f:
    old_icon_data: List[Dict[str, Union[str, List[str]]]] = json.load(f)

# create the directories we need
os.makedirs('./million', exist_ok=True)
os.makedirs('./cinderella', exist_ok=True)
os.makedirs('./other', exist_ok=True)

# read each entry and assemble the new dataset and files
new_icon_data = []
kakasi_instance = kakasi()
kakasi_instance.setMode('H', 'a')
kakasi_instance.setMode('K', 'a')
kakasi_instance.setMode('J', 'a')
kakasi_conv = kakasi_instance.getConverter()
for record in old_icon_data:
    category: str = record['category']
    icon_name_list: List[str] = record['image']
    kana: str = record['kana']
    name: str = record['name']
    short_name: str = record['short_name']
    icon_name_list_size = len(icon_name_list)
    kana_roma = kakasi_conv.do(kana.replace('、', '/').split('/')[0])
    new_icon_list = []
    for x in range(0, icon_name_list_size):
        old_name = icon_name_list[x]
Example #23
import pykakasi
import re
import hiragana
from typing import List
from itertools import zip_longest

# Initialize space converter
wakati = pykakasi.wakati()
spacer = wakati.getConverter()

# Initialize kanji & katakana converter
kakasi_kanji = pykakasi.kakasi()
kakasi_kanji.setMode('K', 'H')
kakasi_kanji.setMode('J', 'H')
kanji_replacer = kakasi_kanji.getConverter()

# Initialize romaji converter
kakasi_romanji = pykakasi.kakasi()
kakasi_romanji.setMode('H', 'a')
kakasi_romanji.setMode('K', 'a')
kakasi_romanji.setMode('J', 'a')
kakasi_romanji.setMode('r', 'Hepburn')
kakasi_romanji.setMode('s', True)

romanjifier = kakasi_romanji.getConverter()

def to_hiragana(word: str) -> str:
    hiragana_word = kanji_replacer.do(word)

    return hiragana_word
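The romaji converter built above (romanjifier) is not exercised in this excerpt; a small companion helper in the same style, assuming the module-level names defined above:

def to_romaji(word: str) -> str:
    # space-separated Hepburn romanization via the module-level romanjifier
    return romanjifier.do(word)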
Example #24
from pykakasi import kakasi
# read the file; without encoding='utf-8' an encoding error is raised
read_me = open('as_a_rule.txt', 'r', encoding='utf-8')  # input
read_to = open('output.txt', 'w')  # output

#  converter 1
bachongying = kakasi()
# convert to hiragana
bachongying.setMode('J', 'H')
Bachongying = bachongying.getConverter()

#  converter 2: hiragana and katakana to romaji
hequanshawu = kakasi()
hequanshawu.setMode('H', 'a')  # hiragana to romaji
hequanshawu.setMode('K', 'a')  # katakana to romaji
Hequanshawu = hequanshawu.getConverter()

# the converters are set up; from here on just read the file line by line
while True:
    s = read_me.readline()
    if s == '':
        break

# number: digits   jp: Japanese   jph: hiragana+katakana   jpa: romaji   cn: Chinese
    number = jp = jph = jpa = cn = ''
    index = 0
    for char in s:
        if index == 0:
            if char == '、':
                index = 1
            else:
Example #25
                                        ' ').replace(':',
                                                     ' ').replace('!', '')
    text = collapse_whitespace(text)
    return text


import MeCab
import codecs
import argparse
from pykakasi import kakasi
import re

re_hiragana = re.compile(r'^[あ-ん]+$')
re_katakana = re.compile(r'[\u30A1-\u30F4]+')
re_kanji = re.compile(r'^[\u4E00-\u9FD0]+$')
j2h = kakasi()
j2h.setMode('J', 'H')  # J(Kanji) to H(Hiragana)
conv = j2h.getConverter()

k2h = kakasi()
k2h.setMode('K', 'H')
conv2 = k2h.getConverter()
t = MeCab.Tagger('')
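A quick sanity sketch of what the two converters above produce (expected readings taken from the J2H/K2H test cases earlier in this collection; assumes pykakasi 1.x):

assert conv.do(u"漢字") == u"かんじ"        # kanji -> hiragana
assert conv2.do(u"カタカナ") == u"かたかな"  # katakana -> hiragana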


def japanese_cleaners(text):
    '''Pipeline for Japanese text.'''
    node = t.parse(text).replace("\t", ",").split("\n")
    res = []
    for i in node:
        #import pdb; pdb.set_trace()
Example #26
 def test_kakasi_invalid_flag_value(self):
     with self.assertRaises(InvalidFlagValueException):
         kakasi = pykakasi.kakasi()
         kakasi.setMode("H", "a")
         kakasi.setMode("K", "a")
         kakasi.setMode("s", "yes")
Example #27
 def test_kakasi_unknown_mode(self):
     with self.assertRaises(InvalidModeValueException):
         kakasi = pykakasi.kakasi()
         kakasi.setMode("H", "a")
         kakasi.setMode("K", "a")
         kakasi.setMode("J", "X")
import re
import sys
from ssl import SSLWantReadError

import requests
import keyboard
from pykakasi import kakasi, wakati
from bs4 import BeautifulSoup

REQUEST_SESSION = requests.session()
KAKASI = kakasi()
KAKASI.setMode("J", "H")
KAKASI_CONVERTER = KAKASI.getConverter()
WAKATI = wakati()
WAKATI_CONVERTER = WAKATI.getConverter()
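A brief sketch of what the two module-level converters return (the sample word and outputs are illustrative; assumes pykakasi 1.x):

# KAKASI_CONVERTER turns kanji into their hiragana reading,
# WAKATI_CONVERTER only inserts spaces between words:
# KAKASI_CONVERTER.do(u"漢字の読み")  -> roughly u"かんじのよみ"
# WAKATI_CONVERTER.do(u"漢字の読み")  -> roughly u"漢字 の 読み"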


def get_relevant_data(result_page):
    """Grabs the relevant information from the term's dictionary page and passes it to the hotkey
     function to be pasted into Anki."""
    kanji = result_page.find("div", class_="jp").text.replace("·", "")
    kanji = re.sub("(\(.{1,3}\))", "", kanji)
    try:
        kana = result_page.find("div", class_="furigana").text.replace(
            "[", "").replace("]", "").replace("·", "")
        kana = re.sub("(\(.{1,3}\))", "", kana)
    except AttributeError:
        kana = kanji
    romaji = result_page.find("div", class_="romaji hide").text
    term_definition = result_page.find(
        "div", class_="en").find("ol").text.rstrip().lstrip()
Example #29
def main():
    kakasi_ = kakasi()
    kakasi_.setMode('H', 'a')
    kakasi_.setMode('K', 'a')
    kakasi_.setMode('J', 'a')
    conv = kakasi_.getConverter()
    
    # load the cascade classifier features
    cascade = cv2.CascadeClassifier(cascade_path)
    color = (255, 255, 255) # white
    path = "./img/"
    label = str(input("人を判別する数字を入力してください ex.0:"))

    OUT_FILE_NAME = "./img/face_recognition.avi"
    FRAME_RATE=1
    w=224 #1280
    h=224 #960
    out = cv2.VideoWriter(OUT_FILE_NAME, \
          cv_fourcc('M', 'P', '4', 'V'), \
          FRAME_RATE, \
         (w, h), \
         True)    

    cap = cv2.VideoCapture(1)

    is_video = 'False'
    s=0.1
    model=model_definition()
    ## Use HGS創英角ゴシックポップ体標準 to write Japanese.
    fontpath ='C:\Windows\Fonts\HGRPP1.TTC' # on Windows 10 the fonts live under C:\Windows\Fonts\
    font = ImageFont.truetype(fontpath, 16) # font size 16
    font0 = cv2.FONT_HERSHEY_SIMPLEX
    sk=0
    while True:
        b,g,r,a = 0,255,0,0 # B (blue), G (green), R (red), A (alpha)
        timer = cv2.getTickCount()
        ret, frame = cap.read()
        sleep(s)
        fps = cv2.getTickFrequency() / (cv2.getTickCount() - timer);
        # Display FPS on frame
        cv2.putText(frame, "FPS : " + str(int(1000*fps)), (100,50), font0, 0.75, (50,170,50), 2);
        # convert to grayscale
        image_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        #image_gray = cv2.equalizeHist(image_gray)
        facerect = cascade.detectMultiScale(image_gray, scaleFactor=1.1, minNeighbors=2, minSize=(30, 30))
        #print(len(facerect))
        img=frame
        if len(facerect) > 0:
            # draw a rectangle around each detected face
            for rect in facerect:
                x=rect[0]
                y=rect[1]
                width=rect[2]
                height=rect[3]
                roi = img[y:y+height, x:x+width]  #frame[y:y+h, x:x+w]
                cv2.rectangle(img, tuple(rect[0:2]),tuple(rect[0:2]+rect[2:4]), color, thickness=2)
                
                try:
                    roi = cv2.resize(roi, (int(224), 224))
                    cv2.imshow('roi',roi)
                    txt, preds=yomikomi(model,roi)
                    print("txt, preds",txt,preds*100 ," %")
                    txt2=conv.do(txt)
                    cv2.imwrite(path+"/"+label+"/"+str(sk)+'_'+str(txt2)+'_'+str(int(preds*100))+'.jpg', roi)
                    img_pil = Image.fromarray(img) # convert the 8-bit (0-255) array to a PIL Image
                    draw = ImageDraw.Draw(img_pil) # create a draw instance
                    position = (x, y) # text position
                    draw.text(position, txt, font = font , fill = (b, g, r, a) ) # write the text; fill: colour BGRA
                    img = np.array(img_pil) # PIL を配列に変換
                except:
                    txt=""
                    continue
        cv2.imshow('test',img)
        sk +=1
        key = cv2.waitKey(1)&0xff
        
        if is_video=="True":
            img_dst = cv2.resize(img, (int(224), 224)) #1280x960
            out.write(img_dst)
            print(is_video)

        if key == ord('q'):   #113
            #cv2.destroyAllWindows()
            break
        elif key == ord('p'):
            s=0.5
            is_video = "True"
        elif key == ord('s'):
            s=0.1
            is_video = "True"   #"False"    
Example #30
from pykakasi import kakasi
import datetime

kk = kakasi()
kk.setMode('H', 'a')
kk.setMode('K', 'a')
kk.setMode('J', 'a')
conv = kk.getConverter()


def to_roma(word):
    return conv.do(word)


def to_shell(cmd_list, path, is_parallel=True):
    join_str = ' &\n' if is_parallel else '\n'
    if type(cmd_list) == list:
        text = join_str.join(cmd_list)
    else:
        text = cmd_list
    with open(path, 'w') as file:
        file.write(text)
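A short usage sketch for the two helpers above (the command list and output path are illustrative):

if __name__ == '__main__':
    print(to_roma(u'実験'))                      # expected: jikken
    # two commands joined with ' &\n' so they run in parallel
    to_shell(['echo a', 'echo b'], './run_example.sh')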
Example #31
 def __init_kakasi(self):
     _kakasi = kakasi()
     _kakasi.setMode('H', 'a')  # hiragana to roman
     _kakasi.setMode('K', 'a')  # katakana to roman
     self.conv = _kakasi.getConverter()
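Note that the snippet above only maps kana to roman letters; kanji pass through unchanged because no 'J' mode is set. If kanji should be romanized as well, one more mode call before getConverter() would be needed (a hypothetical addition, same legacy API):

_kakasi.setMode('J', 'a')  # kanji to roman as well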
Example #32
def calculate_name(filelist, romanize=True, ext='.xci'):
    from Fs import Nsp as squirrelNSP
    from Fs import Xci as squirrelXCI
    import re
    prlist = list()
    for filepath in filelist:
        if filepath.endswith('.nsp'):
            try:
                c = list()
                f = squirrelNSP(filepath)
                contentlist = f.get_content(False, False, True)
                f.flush()
                f.close()
                if len(prlist) == 0:
                    for i in contentlist:
                        prlist.append(i)
                else:
                    for j in range(len(contentlist)):
                        notinlist = False
                        for i in range(len(prlist)):
                            if contentlist[j][1] == prlist[i][1]:
                                if contentlist[j][6] > prlist[i][6]:
                                    del prlist[i]
                                    prlist.append(contentlist[j])
                                    notinlist = False
                                elif contentlist[j][6] == prlist[i][6]:
                                    notinlist = False
                            else:
                                notinlist = True
                        if notinlist == True:
                            prlist.append(contentlist[j])
            except BaseException as e:
                nutPrint.error('Exception: ' + str(e))
        if filepath.endswith('.xci'):
            try:
                c = list()
                f = squirrelXCI(filepath)
                contentlist = f.get_content(False, False, True)
                f.flush()
                f.close()
                if len(prlist) == 0:
                    for i in contentlist:
                        prlist.append(i)
                else:
                    for j in range(len(contentlist)):
                        notinlist = False
                        for i in range(len(prlist)):
                            if contentlist[j][1] == prlist[i][1]:
                                if contentlist[j][6] > prlist[i][6]:
                                    del prlist[i]
                                    prlist.append(contentlist[j])
                                    notinlist = False
                                elif contentlist[j][6] == prlist[i][6]:
                                    notinlist = False
                            else:
                                notinlist = True
                        if notinlist == True:
                            prlist.append(contentlist[j])
            except BaseException as e:
                nutPrint.error('Exception: ' + str(e))
    basecount = 0
    basename = ''
    basever = ''
    baseid = ''
    basefile = ''
    updcount = 0
    updname = ''
    updver = ''
    updid = ''
    updfile = ''
    dlccount = 0
    dlcname = ''
    dlcver = ''
    dlcid = ''
    dlcfile = ''
    ccount = ''
    bctag = ''
    updtag = ''
    dctag = ''
    for i in range(len(prlist)):
        if prlist[i][5] == 'BASE':
            basecount += 1
            if baseid == "":
                basefile = str(prlist[i][0])
                baseid = str(prlist[i][1])
                basever = '[v' + str(prlist[i][6]) + ']'
        if prlist[i][5] == 'UPDATE':
            updcount += 1
            endver = str(prlist[i][6])
            if updid == "":
                updfile = str(prlist[i][0])
                updid = str(prlist[i][1])
                updver = '[v' + str(prlist[i][6]) + ']'
        if prlist[i][5] == 'DLC':
            dlccount += 1
            if dlcid == "":
                dlcfile = str(prlist[i][0])
                dlcid = str(prlist[i][1])
                dlcver = '[v' + str(prlist[i][6]) + ']'
        if basecount != 0:
            bctag = str(basecount) + 'G'
        else:
            bctag = ''
        if updcount != 0:
            if bctag != '':
                updtag = '+' + str(updcount) + 'U'
            else:
                updtag = str(updcount) + 'U'
        else:
            updtag = ''
        if dlccount != 0:
            if bctag != '' or updtag != '':
                dctag = '+' + str(dlccount) + 'D'
            else:
                dctag = str(dlccount) + 'D'
        else:
            dctag = ''
        ccount = '(' + bctag + updtag + dctag + ')'
    if baseid != "":
        if basefile.endswith('.xci'):
            f = squirrelXCI(basefile)
        elif basefile.endswith('.nsp'):
            f = squirrelNSP(basefile)
        ctitl = f.get_title(baseid)
        f.flush()
        f.close()
        if ctitl == 'DLC' or ctitl == '-':
            ctitl = ''
    elif updid != "":
        if updfile.endswith('.xci'):
            f = squirrelXCI(updfile)
        elif updfile.endswith('.nsp'):
            f = squirrelNSP(updfile)
        ctitl = f.get_title(updid)
        f.flush()
        f.close()
        if ctitl == 'DLC' or ctitl == '-':
            ctitl = ''
    elif dlcid != "":
        if dlcfile.endswith('.xci'):
            f = squirrelXCI(dlcfile)
        elif dlcfile.endswith('.nsp'):
            f = squirrelNSP(dlcfile)
        ctitl = f.get_title(dlcid)
        f.flush()
        f.close()
    else:
        ctitl = 'UNKNOWN'
    baseid = '[' + baseid.upper() + ']'
    updid = '[' + updid.upper() + ']'
    dlcid = '[' + dlcid.upper() + ']'
    if ccount == '(1G)' or ccount == '(1U)' or ccount == '(1D)':
        ccount = ''
    if baseid != "[]":
        if updver != "":
            endname = ctitl + ' ' + baseid + ' ' + updver + ' ' + ccount
        else:
            endname = ctitl + ' ' + baseid + ' ' + basever + ' ' + ccount
    elif updid != "[]":
        endname = ctitl + ' ' + updid + ' ' + updver + ' ' + ccount
    else:
        endname = ctitl + ' ' + dlcid + ' ' + dlcver + ' ' + ccount
    if romanize == True:
        import pykakasi
        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", "a")
        kakasi.setMode("K", "a")
        kakasi.setMode("J", "a")
        kakasi.setMode("s", True)
        kakasi.setMode("E", "a")
        kakasi.setMode("a", None)
        kakasi.setMode("C", False)
        converter = kakasi.getConverter()
        endname = converter.do(endname)
        endname = endname[0].upper() + endname[1:]
    endname = (re.sub(r'[\/\\\:\*\?]+', '', endname))
    endname = re.sub(r'[™©®`~^´ªº¢#£€¥$ƒ±¬½¼♡«»±•²‰œæƳ☆<<>>|]', '', endname)
    endname = re.sub(r'[Ⅰ]', 'I', endname)
    endname = re.sub(r'[Ⅱ]', 'II', endname)
    endname = re.sub(r'[Ⅲ]', 'III', endname)
    endname = re.sub(r'[Ⅳ]', 'IV', endname)
    endname = re.sub(r'[Ⅴ]', 'V', endname)
    endname = re.sub(r'[Ⅵ]', 'VI', endname)
    endname = re.sub(r'[Ⅶ]', 'VII', endname)
    endname = re.sub(r'[Ⅷ]', 'VIII', endname)
    endname = re.sub(r'[Ⅸ]', 'IX', endname)
    endname = re.sub(r'[Ⅹ]', 'X', endname)
    endname = re.sub(r'[Ⅺ]', 'XI', endname)
    endname = re.sub(r'[Ⅻ]', 'XII', endname)
    endname = re.sub(r'[Ⅼ]', 'L', endname)
    endname = re.sub(r'[Ⅽ]', 'C', endname)
    endname = re.sub(r'[Ⅾ]', 'D', endname)
    endname = re.sub(r'[Ⅿ]', 'M', endname)
    endname = re.sub(r'[—]', '-', endname)
    endname = re.sub(r'[√]', 'Root', endname)
    endname = re.sub(r'[àâá@äå]', 'a', endname)
    endname = re.sub(r'[ÀÂÁÄÅ]', 'A', endname)
    endname = re.sub(r'[èêéë]', 'e', endname)
    endname = re.sub(r'[ÈÊÉË]', 'E', endname)
    endname = re.sub(r'[ìîíï]', 'i', endname)
    endname = re.sub(r'[ÌÎÍÏ]', 'I', endname)
    endname = re.sub(r'[òôóöø]', 'o', endname)
    endname = re.sub(r'[ÒÔÓÖØ]', 'O', endname)
    endname = re.sub(r'[ùûúü]', 'u', endname)
    endname = re.sub(r'[ÙÛÚÜ]', 'U', endname)
    endname = re.sub(r'[’]', "'", endname)
    endname = re.sub(r'[“”]', '"', endname)
    endname = re.sub(' {3,}', ' ', endname)
    endname = re.sub(' {2,}', ' ', endname)
    try:
        endname = endname.replace("( ", "(")
        endname = endname.replace(" )", ")")
        endname = endname.replace("[ ", "[")
        endname = endname.replace(" ]", "]")
        endname = endname.replace("[ (", "[(")
        endname = endname.replace(") ]", ")]")
        endname = endname.replace("[]", "")
        endname = endname.replace("()", "")
        endname = endname.replace('" ', '"')
        endname = endname.replace(' "', '"')
        endname = endname.replace(" !", "!")
        endname = endname.replace(" ?", "?")
        endname = endname.replace("  ", " ")
        endname = endname.replace("  ", " ")
        endname = endname.replace('"', '')
        endname = endname.replace(')', ') ')
        endname = endname.replace(']', '] ')
        endname = endname.replace("[ (", "[(")
        endname = endname.replace(") ]", ")]")
        endname = endname.replace("  ", " ")
    except:
        pass
    if endname[-1] == ' ':
        endname = endname[:-1]
    endname = endname + ext
    return endname, prlist
Example #33
 def __init__(self, bot: commands.Bot):
     print('Ktoba OK')
     self.bot = bot
     self.kakasi = kakasi()
Example #34
def conv_to_kana(word):
    responce = requests.post(
        'https://labs.goo.ne.jp/api/hiragana',
        data={
            "app_id":
            "c6d687dfedd172e4a2b30cc086513cfbb23f8c039c8c157fe08760b3df4092fa",
            "request_id": "test",
            "sentence": word,
            "output_type": "hiragana"
        })

    return responce.json()["converted"]


kakasi = kakasi()
kakasi.setMode('H', 'a')
kakasi.setMode('K', 'a')
kakasi.setMode('J', 'a')
converter = kakasi.getConverter()
boin = ['a', 'i', 'u', 'e', 'o']


def get_distance(wordx, wordy):
    wordx = converter.do(wordx)
    wordy = converter.do(wordy)

    #print(wordx + " " + wordy)
    dp = np.empty((len(wordx) + 1, len(wordy) + 1))

    for i in range(len(wordx) + 1):
Example #35
def dabiaoqian(path, guanjianzi_1, guanjianzi_2):

    from pykakasi import kakasi
    import csv, os

    name_tezheng = guanjianzi_1
    # name of the directory that holds the feature-value files

    xinde = guanjianzi_2
    # name of the directory the new feature-value files go into

    name1 = 'align1'
    name2 = 'symbol.txt'
    # name of the flag file; when align1 does not work, switch to symbol.txt (and adjust the code below accordingly)

    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()

    for i in os.listdir(path):

        path_1 = os.path.join(path, i)

        path_out = os.path.join(path_1, 'keka')

        path_tezheng = os.path.join(path_1, name_tezheng)

        #biaozhiwenjian = csv.reader(open(os.path.join(path_1, name1), 'r', encoding='EUC-JP'))  # read in the flag file
        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name2), 'r',
                 encoding='utf-8'))  # used when the flag file is a .txt file

        biaozhiwenjian_1 = [i for i in biaozhiwenjian
                            ]  # converted to a list (of lists)
        #[['id: l_8840_9810_T1_F_01'],['REF:  そう です か 、 はい 。 '],['HYP:  そう です か    はい 。 '],['EVAL: C    C    C  D  C    C  '],[],['id: l_10800_13190_T1_F_01']]

        # print(biaozhiwenjian_1)
        # os.system('pause')

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        for i in range(0, len(biaozhiwenjian_1)):  # each iteration labels one audio file

            try:
                biaozhi = biaozhiwenjian_1[i][0]

            except:

                continue

            if 'id:' in biaozhi:

                l_zhengjie_1 = []
                l_jieguo_1 = []

                ID = biaozhiwenjian_1[i][0].replace('id: ', '')

                l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
                l_zhengjie.pop(0)

                l_jieguo = biaozhiwenjian_1[i + 2][0].split()
                l_jieguo.pop(0)

                l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
                l_biaozhi.pop(0)

                # try:
                #     ID = biaozhiwenjian_1[i].replace('id: ', '')
                #
                #     l_zhengjie = biaozhiwenjian_1[i+1].split()
                #     l_zhengjie.pop(0)
                #
                #     l_jieguo = biaozhiwenjian_1[i+2].split()
                #     l_jieguo.pop(0)
                #
                #     l_biaozhi = biaozhiwenjian_1[i+3].split()
                #     l_biaozhi.pop(0)
                #
                # except:
                #     print(biaozhiwenjian_1[i])
                #     os.system("pause")

                # build strictly aligned reference words, recognition results and flags; a 'D' flag means the result is empty
                jishuqi_jieguo = 0
                jishuqi_zhengjie = 0
                jishuqi_biaozhi = 0

                for i in l_biaozhi:

                    if i == "D":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append('')
                        jishuqi_zhengjie += 1
                        jishuqi_biaozhi += 1

                    if i == "C":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])

                        # print('l_jieguo')
                        # print(l_jieguo)
                        # os.system('pause')
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1

                    if i == "I":
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        l_zhengjie_1.append('')
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1

                    if i == "S":  #如果是S的话特殊处理一下,转化为字母再比较,如果转化之后相等的话,把标志改为C
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])

                        zhengjie_hanzi = l_zhengjie[jishuqi_zhengjie]
                        jieguo_hanzi = l_jieguo[jishuqi_jieguo]

                        # handle the recognition result first
                        if conv.do(
                                jieguo_hanzi
                        ) == jieguo_hanzi and jieguo_hanzi != '、':  # check whether it is alphabetic

                            try:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(jieguo_hanzi)))

                            except:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        jieguo_hanzi))

                        else:
                            zhuanhuan_jieguo = conv.do(jieguo_hanzi)

                        # then handle the reference word
                        if conv.do(
                                zhengjie_hanzi
                        ) == zhengjie_hanzi and zhengjie_hanzi != '、':  # check whether it is alphabetic

                            try:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(zhengjie_hanzi)))

                            except:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        zhengjie_hanzi))

                        else:
                            zhuanhuan_zhengjie = conv.do(zhengjie_hanzi)

                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:

                            # print("正解list")
                            # print(l_zhengjie_1)
                            #
                            # print("识别结果list")
                            # print(l_jieguo_1)
                            #
                            # print("zhuanhuan_jieguo")
                            # print(zhuanhuan_jieguo)
                            # print("zhuanhuan_zhengjie")
                            # print(zhuanhuan_zhengjie)
                            # print("有标志被改了")
                            # print(ID)
                            # os.system("pause")

                            l_biaozhi[jishuqi_biaozhi] = 'C'

                        jishuqi_biaozhi += 1
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1

                # print(l_jieguo_1)
                # print(l_zhengjie_1)
                # print(l_biaozhi)
                # os.system('pause')

                path_out_1 = os.path.join(path_out, ID + '.out')  # read the .out file
                dianout = pi.read_out(path_out_1)
                start = dianout.pop(0)[1][1]  # label the leading silence 9; pop the first element
                start_1 = dianout[-1][1][0]  # label the trailing full stop 9
                # end_1 = dianout.pop(-1)[1][1]  # the last frame may be dropped during feature extraction, so use the length of t_file_list as end instead

                # print(dianout)
                # os.system('pause')
                # final shape: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]

                path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
                tezhengzhi = csv.reader(
                    open(path_tezheng_1, 'r', encoding='utf-8'))
                t_file_list = [i for i in tezhengzhi]

                end_1 = len(t_file_list) - 1

                if start < len(t_file_list):  # if the silent span exceeds the number of feature rows, skip the file

                    for i in range(start + 1):
                        t_file_list[i].insert(
                            0, '9')  # the leading silent frames all get label 9 and count as correct

                    for i in range(start_1, end_1 + 1):
                        t_file_list[i].insert(0, '9')

                    l_jieguo_1.pop(-1)  # the trailing full stop is already labelled, so pop it

                    print("ID")
                    print(ID)

                    print("l_biaozhi")
                    print(l_biaozhi)
                    print("l_jieguo_1")
                    print(l_jieguo_1)

                    print("dianout")
                    print(dianout)

                    dianout_chongzao = cz.chongzao(
                        l_biaozhi, l_jieguo_1, dianout,
                        ID)  # build the new dianout list; everything below relies on it

                    print('dianout_chongzao')
                    print(dianout_chongzao)

                    # use the new list to assign labels; its words are the recognition words from the scoring tool (i.e. re-assembled)
                    # [['災害', [3, 40], 'C'], ['で', [41, 48], 'C'], ['ござい', [49, 77], 'C'], ['ます', [78, 98], 'C'],['から', [99, 130], 'C'], ['、', [131, 152], 'C'], ['その', [153, 177], 'C'], ['場', [178, 190], 'C'],['で', [191, 209], 'C']]
                    for i in dianout_chongzao:

                        start, end = i[1]
                        if i[2] == 'C':

                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '0')

                        else:

                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '1')

                    path_xinde_tezhengzhi = os.path.join(
                        path_xinde, ID + '.csv')

                    with open(path_xinde_tezhengzhi, 'w+',
                              encoding='utf-8') as mergen_file:
                        for i in t_file_list:
                            mergen_file.write('%s\n' % ','.join(i))

        shanchu.shanchuhang(path_xinde)  # delete every feature row labelled 9
Example #36
 def __init__(self, bot):
     self.bot = bot
     self.jp_channels = [804115491047079966]
     self.KKS = pykakasi.kakasi()
Example #37
def convert_into_romaji(text):
    _kakasi = kakasi()
    _kakasi.setMode('H', 'a')
    _kakasi.setMode('K', 'a')
    _kakasi.setMode('J', 'a')
    conv = _kakasi.getConverter()

    mecab = MeCab.Tagger(r'-Ochasen -d ' + IPADIC_DIR)
    eng = MeCab.Tagger(r'-d ' + UNIDIC_DIR)
    normalized_text = normalize('NFKC', text)
    filename_romaji = ""

    # use ipadic and unidic together
    results = mecab.parse(normalized_text)
    for chunk in results.splitlines()[:-1]:
        engFlag = False
        eng_work = ""
        original = chunk.split('\t')
        isEng = eng.parse(original[0])
        for word in isEng.splitlines()[:-1]:
            work = word.split('\t')[1]
            comma = work.split(',')
            if len(comma) > 12 and comma[12] == '外':
                engFlag = True
                engTrance = comma[7]
                hyphen = engTrance.split('-')
                if len(hyphen) < 2:
                    engFlag = False
                    break
                else:
                    eng_work += hyphen[1] + " "
        if not engFlag:
            # should digits be converted to English? -> only 1-20 are handled
            # convert from the reading (hiragana)? -> English detection uses kanji; romaji conversion uses the katakana reading
            if original[0] in d.keys():
                filename_romaji += d[original[0]] + " "
            else:
                filename_romaji += conv.do(original[1]) + " "
        else:
            filename_romaji += eng_work
    """
    # unidicオンリー
    isEng = eng.parse(text)
    for word in isEng.splitlines()[:-1]:
        engFlag = False
        eng_work = ""
        original = word.split('\t')
        comma = original[1].split(',')
        if len(comma) > 12 and comma[12] == '外':
            engTrance = comma[7]
            hyphen = engTrance.split('-')
            if len(hyphen) > 1:
                engFlag = True
                eng_work += hyphen[1] + " "
        if not engFlag:
        # Should digits be converted to English words? -> only handle 1-20
            if original[0] in d.keys():
                filename_romaji += d[original[0]] + " "
            else:
                filename_romaji += conv.do(original[1]) + " "
        else:
            filename_romaji += eng_work
    """

    # Strip everything other than romaji and digits? -> Yes 75 / No 25
    filename_romaji = re.sub(r'[^a-zA-Z0-9_ ]*', "", filename_romaji)
    # Lowercase the result? -> Yes
    return filename_romaji.lower()
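For comparison, here is a minimal sketch of the fallback path above using pykakasi alone (no MeCab pass, no English handling, and no lookup dictionary d); the function name romaji_only is illustrative and not part of the original code:

from unicodedata import normalize
from pykakasi import kakasi
import re

def romaji_only(text):
    # Romanize with the legacy pykakasi API, then keep only ASCII letters and digits.
    k = kakasi()
    k.setMode('H', 'a')   # hiragana -> ascii
    k.setMode('K', 'a')   # katakana -> ascii
    k.setMode('J', 'a')   # kanji -> ascii
    k.setMode('s', True)  # insert separators between words
    conv = k.getConverter()
    romaji = conv.do(normalize('NFKC', text))
    return re.sub(r'[^a-zA-Z0-9_ ]', '', romaji).lower()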
Example #38
0
    def __init__(self, master=None):
        super().__init__(master)
        self.pack()
        # self.wRoot, self.hRoot = 390, 400
        self.wRoot, self.hRoot = 800, 500
        self.master.title(u"OpenCVの動画表示")       # window title
        self.master.geometry("{0}x{1}".format(self.wRoot, self.hRoot))     # window size (width x height)

        # create the Canvas
        self.canvas = tk.Canvas(self.master, highlightthickness=0)
        # bind a mouse event (left button click) to the Canvas
        self.canvas.bind('<Button-1>', self.canvas_click)
        # place the Canvas
        self.canvas.pack(expand=1, fill=tk.BOTH)

        # create the text entry field
        self.entry1 = tkinter.Entry(self.master, font=("", 20))
        self.entry1.focus_set()
        self.entry1.pack()

        # open the camera
        self.capture = cv2.VideoCapture(0)
        # keep the camera frame larger than the on-screen input form
        self.capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)  # set the camera frame width to 1920
        self.capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)  # set the camera frame height to 1080
        self.disp_id = None

        self.detector = htm.handDetectior(MaxHands=2, detectonCon=0.7)
        self.kakasi = pykakasi.kakasi()

        # basic settings
        # window size settings
        self.wCam, self.hCam = 1000, 700
        self.spaceH = 300
        self.spaceW = 150
        self.wVisal = self.wCam-self.spaceW
        self.hVisal = self.hCam-self.spaceH
        self.INPUT_TEXTS = u""
        self.INPUT_TEXTS_UI = u""
        self.KEYBOARDLIST = np.full((5, 5), False).tolist()
        self.KEYBOARDREMEN = True
        self.xx, self.yy = 0, 0
        self.INPUT_FLAG = False
        self.font_size = 50
        # self.font_Path = r"C:\Windows\Fonts\メイリオ\meiryo.ttc"
        # self.font_Path_Bold = r"C:\Windows\Fonts\メイリオ\meiryob.ttc"
        self.font_Path = r"C:\Windows\Fonts\游ゴシック\YuGothR.ttc"
        self.font_Path_Bold = r"C:\Windows\Fonts\游ゴシック\YuGothB.ttc"
        # start with the kana (hiragana) keyboard
        self.KEYBOARD = KEYBOARD_HIRA

        self.EVENT_Flag = 0
        self.search_text = ""

        self.Dict_num = 0
        self.Result_Button_pressed = [False]*4
        self.Detail_Button_pressed = [False]
        self.Books_num = -1

        self.savedIMG_Result = []
        self.savedIMG_Detail = []
Example #39
0
def dabiaoqian(path):

    from pykakasi import kakasi
    import csv, os

    name_tezheng = 'log'
    # name of the folder that holds the feature-value files

    xinde = 'xinde_log'
    # name of the folder for the newly labeled feature values

    houzhui = '.wav.csv'
    # suffix of a feature-value file after the utterance ID

    name = 'align1'
    # file that records the C/S/D/I (CCCCSSSS) alignment flags

    name1 = 'align1'
    name2 = 'align1.txt'

    shibiejieguo = {}
    # dict that holds the recognition results

    symbolcidian = {}
    # the flag dictionary, e.g.
    # id: C001L_086
    # ['S', 'S', 'S', 'C', 'S', 'D', 'D', 'D', 'C']
    # id: C001L_087
    # ['S', 'D', 'D', 'C']
    # id: C001L_088
    # ['S', 'S', 'S', 'S', 'D', 'D', 'D', 'D', 'C', 'C']
    zhengjie = {}
    # dict of the reference (correct) transcripts
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()

    for i in os.listdir(path):

        path_1 = os.path.join(path, i)

        path_out = os.path.join(path_1, 'keka')

        path_tezheng = os.path.join(path_1, name_tezheng)

        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name1), 'r',
                 encoding='EUC-JP'))  # read in the alignment-flag file
        # biaozhiwenjian = csv.reader(open(os.path.join(path_1, name2), 'r', encoding='utf-8'))  # use this if the flag file is a .txt file

        biaozhiwenjian_1 = [i for i in biaozhiwenjian
                            ]  # turn it into a list (of lists), e.g.
        #[['id: l_8840_9810_T1_F_01'],['REF:  そう です か 、 はい 。 '],['HYP:  そう です か    はい 。 '],['EVAL: C    C    C  D  C    C  '],[],['id: l_10800_13190_T1_F_01']]

        # print(biaozhiwenjian_1)
        # os.system('pause')

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        for i in range(0, len(biaozhiwenjian_1), 5):  # each block of five lines labels one audio file

            ID = ''
            l_biaozhi = []
            l_zhengjie = []
            l_zhengjie_1 = []
            l_jieguo = []
            l_jieguo_1 = []

            ID = biaozhiwenjian_1[i][0].replace('id: ', '')

            l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
            l_zhengjie.pop(0)

            l_jieguo = biaozhiwenjian_1[i + 2][0].split()
            l_jieguo.pop(0)

            l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
            l_biaozhi.pop(0)

            # try:
            #     ID = biaozhiwenjian_1[i].replace('id: ', '')
            #
            #     l_zhengjie = biaozhiwenjian_1[i+1].split()
            #     l_zhengjie.pop(0)
            #
            #     l_jieguo = biaozhiwenjian_1[i+2].split()
            #     l_jieguo.pop(0)
            #
            #     l_biaozhi = biaozhiwenjian_1[i+3].split()
            #     l_biaozhi.pop(0)
            #
            # except:
            #     print(biaozhiwenjian_1[i])
            #     os.system("pause")

            # build strictly aligned reference, hypothesis and flag lists; when the flag is D the hypothesis slot is empty
            jishuqi_jieguo = 0
            jishuqi_zhengjie = 0

            for i in l_biaozhi:

                if i == "D":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append('')
                    jishuqi_zhengjie += 1

                if i == "C":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    jishuqi_zhengjie += 1
                    jishuqi_jieguo += 1

                if i == "I":
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    l_zhengjie_1.append('')
                    jishuqi_jieguo += 1

                if i == "S":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    jishuqi_zhengjie += 1
                    jishuqi_jieguo += 1

            # print(l_jieguo_1)
            # print(l_zhengjie_1)
            # print(l_biaozhi)
            # os.system('pause')

            path_out_1 = os.path.join(path_out, ID + '.out')
            dianout = pi.read_out(path_out_1)

            # print(dianout)
            # os.system('pause')

            path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
            tezhengzhi = csv.reader(open(path_tezheng_1, 'r',
                                         encoding='utf-8'))
            t_file_list = [i for i in tezhengzhi]
            dimension = len(t_file_list[0])

            start = dianout.pop(0)[1][1]  # label the leading silent interval with 9

            for i in range(start + 1):
                t_file_list[i].insert(0, '9')  # every frame of the leading silence gets label 9 and is treated as correctly recognized

            zhenshubiao = {}  # build a frame-range entry for every word

            for i in dianout:
                zhenshubiao[i[0]] = i[1]  # every word in the .out file gets its own frame-range entry

            start, end = zhenshubiao['。']  # label the frames of the trailing period with 9

            for i in range(start, end + 1):
                t_file_list[i].insert(0, '9')

            # print(dianout)
            # os.system('pause')
            # final result: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]
            # while 'D' in l_biaozhi:
            #     l_biaozhi.remove('D')  # remove() deletes only one D at a time, hence the while loop

            l_biaozhi_1 = [i for i, x in enumerate(l_biaozhi)
                           if x == 'S']  # indices of the S flags
            # print(len(l_biaozhi_1))
            # os.system('pause')

            if len(l_biaozhi_1) != 0:  # if l_biaozhi_1 is empty, every word was recognized correctly

                # print('l_jieguo_1')
                # print(l_jieguo_1)
                #
                # print('l_biaozhi_1')
                # print(l_biaozhi_1)
                #
                # print('l_biaozhi')
                # print(l_biaozhi)
                #
                # print('l_zhengjie_1')
                # print(l_zhengjie_1)

                # print(l_jieguo_1)
                # print(l_zhengjie_1)
                # print(l_biaozhi)

                for y in l_biaozhi_1:  # for words flagged S, romanize both the reference and the hypothesis and compare them again
                    # print("current value of y")
                    # print(y)
                    #
                    # print('current l_jieguo_1[y]')
                    # print(l_jieguo_1[y])
                    # print(ID)
                    # os.system('pause')

                    # first handle the recognition result
                    if conv.do(
                            l_jieguo_1[y]
                    ) == l_jieguo_1[y] and l_jieguo_1[y] != '、':  # check whether it is already Latin text

                        try:
                            zhuanhuan_jieguo = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    strQ2B.strQ2B(l_jieguo_1[y])))

                        except:
                            zhuanhuan_jieguo = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    l_jieguo_1[y]))

                    else:
                        zhuanhuan_jieguo = conv.do(l_jieguo_1[y])

                    # then handle the reference text
                    if conv.do(
                            l_zhengjie_1[y]
                    ) == l_zhengjie_1[y] and l_zhengjie_1[y] != '、':  # check whether it is already Latin text

                        try:
                            zhuanhuan_zhengjie = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    strQ2B.strQ2B(l_zhengjie_1[y])))

                        except:
                            zhuanhuan_zhengjie = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    l_zhengjie_1[y]))

                    else:
                        zhuanhuan_zhengjie = conv.do(l_zhengjie_1[y])

                    # print('l_jieguo_1[y]')
                    # print(l_jieguo_1[y])
                    # os.system('pause')

                    guanjianzi = l_jieguo_1[y]  # take the word flagged S

                    # print('guanjianzi')
                    # print(guanjianzi)
                    # os.system('pause')
                    #
                    # print('zhenshubiao')
                    # print(zhenshubiao[guanjianzi])
                    # os.system('pause')

                    try:
                        start, end = zhenshubiao[guanjianzi]  # look up this word's frame range

                    except:
                        print('ID')
                        print(ID)
                        print('zhenshubiao')
                        print(zhenshubiao)
                        print('guanjianzi')
                        print(guanjianzi)
                        os.system('pause')

                    for i in range(start, end + 1):

                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:
                            t_file_list[i].insert(0, '0')

                        else:
                            t_file_list[i].insert(0, '1')

            jishuqi_tezhengzhi = 0

            for i in t_file_list:  # label the correctly recognized words with 0

                # if i[0] != '0' and i[0] != '1' and i[0] != '9':

                if len(i[0]) == dimension:
                    t_file_list[jishuqi_tezhengzhi].insert(0, '0')

                jishuqi_tezhengzhi += 1

            path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')

            with open(path_xinde_tezhengzhi, 'w+',
                      encoding='utf-8') as mergen_file:
                for i in t_file_list:
                    mergen_file.write('%s\n' % ','.join(i))

        shanchu.shanchuhang(path_xinde)  # drop every feature row labeled 9
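The heart of the flag loop above is expanding the reference and hypothesis word lists to equal length according to the C/S/D/I flags. A standalone sketch of just that step, with illustrative names that are not part of the original project:

def align_by_flags(flags, ref_words, hyp_words):
    """Expand ref/hyp into equal-length lists: D has no hypothesis word, I has no reference word."""
    ref_aligned, hyp_aligned = [], []
    ri = hi = 0
    for flag in flags:
        if flag == 'D':        # deletion: the recognizer dropped this word
            ref_aligned.append(ref_words[ri])
            hyp_aligned.append('')
            ri += 1
        elif flag == 'I':      # insertion: the recognizer produced an extra word
            ref_aligned.append('')
            hyp_aligned.append(hyp_words[hi])
            hi += 1
        else:                  # 'C' (correct) or 'S' (substitution)
            ref_aligned.append(ref_words[ri])
            hyp_aligned.append(hyp_words[hi])
            ri += 1
            hi += 1
    return ref_aligned, hyp_aligned

# align_by_flags(['C', 'S', 'D', 'I', 'C'],
#                ['そう', 'です', 'か', 'はい'],
#                ['そう', 'だ', 'ええ', 'はい'])
# -> (['そう', 'です', 'か', '', 'はい'], ['そう', 'だ', '', 'ええ', 'はい'])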
Example #40
0
    def buildVocabulary(self, filePath):
        print('Loading vocabulary from ' + filePath)
        df = pd.read_excel(filePath)
        print('Vocabulary file loaded')
        wordList = []
        partOfSpeechList = {}
        lessonList = {}
        kksi = kakasi()
        kksi.setMode("J", "H")
        for index, row in df.iterrows():
            if self.checkValidData(row):
                lesson_num = row['lesson']
                pos_list = self.parsePartOfSpeech(row['pos'])

                if row['intransitive'] == 't':
                    isTransitive = True
                elif row['intransitive'] == 'i':
                    isTransitive = False
                else:
                    isTransitive = None

                pre_japanese = self.convertNanToEmptyString(row['preJapanese'])
                pre_japanese_particle = self.convertNanToEmptyString(
                    row['preJapaneseParticle'])
                japanese_all_hiragana = row['japaneseAllHiragana']
                japanese = row['japanese']
                post_japanese = self.convertNanToEmptyString(
                    row['postJapanese'])
                pre_english = self.convertNanToEmptyString(row['preEnglish'])
                english = row['english']
                post_english = self.convertNanToEmptyString(row['postEnglish'])

                word = Word(lesson_num, pos_list, isTransitive, pre_japanese,
                            pre_japanese_particle, japanese_all_hiragana,
                            japanese, post_japanese, pre_english, english,
                            post_english)
                wordList.append(word)

                indexOfAddedWord = len(wordList) - 1
                if lesson_num not in lessonList:
                    indices = []
                else:
                    indices = lessonList[lesson_num]
                indices.append(indexOfAddedWord)
                lessonList[lesson_num] = indices
                for pos in pos_list:
                    if pos not in partOfSpeechList:
                        indices = []
                    else:
                        indices = partOfSpeechList[pos]
                    indices.append(indexOfAddedWord)
                    partOfSpeechList[pos] = indices

        # for posType in PartOfSpeech:
        #     for index in partOfSpeechList[posType]:
        #         assert posType in wordList[index].partOfSpeech
        print('Vocabulary built')
        # wordListJson = json.dumps([ob.__dict__ for ob in wordList], indent=4)
        # print(wordListJson)
        with open('../src/vocabulary.json', 'w') as outfile:
            json.dump([ob.__dict__ for ob in wordList], outfile, indent=4)
        print('Vocabulary made into json')

        with open('../src/poslist.json', 'w') as outfile:
            json.dump(partOfSpeechList, outfile, indent=4)

        with open('../src/lessonlist.json', 'w') as outfile:
            json.dump(lessonList, outfile, indent=4)

        quit()
        return wordList, partOfSpeechList, lessonList
      value = value.strip("-")
      pos_list.append(value)
    for p in pos_list:
      j = PartOfSpeech.get(p)
      if(j == True):
        plist.append(p)
    if(len(plist) == 0):
      return None
    else:
      return plist
  else:
    return None



k = kakasi()  # Generate kakasi instance
k.setMode('J', 'H') # convert kanji to hiragana
k.setMode('K', 'H')
conv = k.getConverter()

def English_to_Kana(str, fname):
  read = str.strip()
  english = re.compile('[a-zA-Z]+')
  words = english.findall(read)
  if((len(words) >= 1) & (fname == "EngDict_")):
    for w in words:
      if(len(w) == 1):
        furigana = alphabet.get(w.upper())
        read = read.replace(w, furigana)
      else:
        count = 0
 def __init__(self):
     self.kakasi = kakasi()
Example #43
0
File: rhyme.py Project: ucpr/rhyme
#!/usr/bin/env python3

import os
from typing import List
import marisa_trie
from pykakasi import kakasi

kakasi_py = kakasi()
kakasi_py.setMode('H', 'a')
kakasi_py.setMode('K', 'a')
kakasi_py.setMode('J', 'a')
conv = kakasi_py.getConverter()


def _converting_to_roman(word: str) -> str:
    """ 与えられた単語をローマ字に変換して返す """
    return conv.do(word)


def _fetch_vowel(word: str) -> str:
    """ 母音のみの文字列にして返す """
    vowel_str = ""
    for i in _converting_to_roman(word):
        if i in "aiueo":
            vowel_str += i
    return vowel_str


def _find_rhyme_words(word: str) -> List:
    """ 韻を踏んでいそうな単語を探索してリストで返す """
    files = os.listdir("./dictionary/")
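A short usage sketch of the two complete helpers above; the exact romaji depends on pykakasi's readings, so the outputs shown are only what one would expect under the H/K/J -> 'a' modes configured earlier:

print(_fetch_vowel("東京"))   # expected "ouou" ("toukyou")
print(_fetch_vowel("放送"))   # expected "ouou" ("housou")
print(_fetch_vowel("東京") == _fetch_vowel("放送"))  # expected True: matching vowel sequences suggest a rhyme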
Example #44
0
def test_kakasi_structured_constitution():

    original_text = "日本国民は、正当に選挙された国会における代表者を通じて行動し、われらとわれらの子孫のために、" \
                    "諸国民との協和による成果と、わが国全土にわたつて自由のもたらす恵沢を確保し、" \
                    "政府の行為によつて再び戦争の惨禍が起ることのないやうにすることを決意し、ここに主権が国民に存することを宣言し、" \
                    "この憲法を確定する。そもそも国政は、国民の厳粛な信託によるものであつて、その権威は国民に由来し、" \
                    "その権力は国民の代表者がこれを行使し、その福利は国民がこれを享受する。これは人類普遍の原理であり、" \
                    "この憲法は、かかる原理に基くものである。われらは、これに反する一切の憲法、法令及び詔勅を排除する。"

    expected = [
        {
            'orig': "日本国民",
            'kana': "ニホンコクミン",
            'hira': "にほんこくみん",
            'hepburn': "nihonkokumin",
            'kunrei': "nihonkokumin",
            'passport': "nihonkokumin"
        },
        {
            'orig': "は、",
            'kana': "ハ、",
            'hira': "は、",
            'hepburn': "ha,",
            'kunrei': "ha,",
            'passport': "ha,"
        },
        {
            'orig': "正当",
            'kana': "セイトウ",
            'hira': "せいとう",
            'hepburn': "seitou",
            'kunrei': "seitou",
            'passport': "seito"
        },
        {
            'orig': "に",
            'kana': "ニ",
            'hira': "に",
            'hepburn': "ni",
            'kunrei': "ni",
            'passport': "ni"
        },
        {
            'orig': "選挙",
            'kana': "センキョ",
            'hira': "せんきょ",
            'hepburn': "senkyo",
            'kunrei': "senkyo",
            'passport': "senkyo"
        },
        {
            'orig': "された",
            'kana': "サレタ",
            'hira': "された",
            'hepburn': "sareta",
            'kunrei': "sareta",
            'passport': "sareta"
        },
        {
            'orig': "国会",
            'kana': "コッカイ",
            'hira': "こっかい",
            'hepburn': "kokkai",
            'kunrei': "kokkai",
            'passport': "kokkai"
        },
        {
            'orig': "における",
            'kana': "ニオケル",
            'hira': "における",
            'hepburn': "niokeru",
            'kunrei': "niokeru",
            'passport': "niokeru"
        },
        {
            'orig': "代表者",
            'kana': "ダイヒョウシャ",
            'hira': "だいひょうしゃ",
            'hepburn': "daihyousha",
            'kunrei': "daihyousya",
            'passport': "daihyousha"
        },
        {
            'orig': "を",
            'kana': "ヲ",
            'hira': "を",
            'hepburn': "wo",
            'kunrei': "wo",
            'passport': "wo"
        },
        {
            'orig': "通じ",
            'kana': "ツウジ",
            'hira': "つうじ",
            'hepburn': "tsuuji",
            'kunrei': "tuuzi",
            'passport': "tsuuji"
        },
        {
            'orig': "て",
            'kana': "テ",
            'hira': "て",
            'hepburn': "te",
            'kunrei': "te",
            'passport': "te"
        },
        {
            'orig': "行動",
            'kana': "コウドウ",
            'hira': "こうどう",
            'hepburn': "koudou",
            'kunrei': "koudou",
            'passport': "kodou"
        },
        {
            'orig': "し、",
            'kana': "シ、",
            'hira': "し、",
            'hepburn': 'shi,',
            'kunrei': "si,",
            'passport': "shi,"
        },
        {
            'orig': "われらとわれらの",
            'kana': "ワレラトワレラノ",
            'hira': "われらとわれらの",
            'hepburn': "wareratowarerano",
            'kunrei': "wareratowarerano",
            'passport': "wareratowarerano"
        },
        {
            'orig': "子孫",
            'kana': "シソン",
            'hira': "しそん",
            'hepburn': "shison",
            'kunrei': "sison",
            'passport': "shison"
        },
        {
            'orig': "のために、",
            'kana': "ノタメニ、",
            'hira': "のために、",
            'hepburn': "notameni,",
            'kunrei': "notameni,",
            'passport': "notameni,"
        },
        {
            'orig': "諸国民",
            'kana': "ショコクミン",
            'hira': "しょこくみん",
            'hepburn': "shokokumin",
            'kunrei': "syokokumin",
            'passport': "shokokumin"
        },
        {
            'orig': "との",
            'kana': "トノ",
            'hira': "との",
            'hepburn': "tono",
            'kunrei': "tono",
            'passport': "tono"
        },
        {
            'orig': "協和",
            'kana': "キョウワ",
            'hira': "きょうわ",
            'hepburn': "kyouwa",
            'kunrei': "kyouwa",
            'passport': "kyouwa"
        },
        {
            'orig': "による",
            'kana': "ニヨル",
            'hira': "による",
            'hepburn': "niyoru",
            'kunrei': "niyoru",
            'passport': "niyoru"
        },
        {
            'orig': "成果",
            'kana': "セイカ",
            'hira': "せいか",
            'hepburn': "seika",
            'kunrei': "seika",
            'passport': "seika"
        },
        {
            'orig': "と、",
            'kana': "ト、",
            'hira': "と、",
            'hepburn': "to,",
            'kunrei': "to,",
            'passport': "to,"
        },
        {
            'orig': "わが",
            'kana': "ワガ",
            'hira': "わが",
            'hepburn': "waga",
            'kunrei': "waga",
            'passport': "waga"
        },
        {
            'orig': "国",
            'kana': "クニ",
            'hira': "くに",
            'hepburn': "kuni",
            'kunrei': "kuni",
            'passport': "kuni"
        },
        {
            'orig': "全土",
            'kana': "ゼンド",
            'hira': "ぜんど",
            'hepburn': "zendo",
            'kunrei': "zendo",
            'passport': "zendo"
        },
        {
            'orig': "にわたつて",
            'kana': "ニワタツテ",
            'hira': "にわたつて",
            'hepburn': "niwatatsute",
            'kunrei': "niwatatute",
            'passport': "niwatatsute"
        },
        {
            'orig': "自由",
            'kana': "ジユウ",
            'hira': "じゆう",
            'hepburn': "jiyuu",
            'kunrei': "ziyuu",
            'passport': "jiyuu"
        },
        {
            'orig': "のもたらす",
            'kana': "ノモタラス",
            'hira': 'のもたらす',
            'hepburn': "nomotarasu",
            'kunrei': "nomotarasu",
            'passport': "nomotarasu"
        },
        {
            'orig': "恵沢",
            'kana': "ケイタク",
            'hira': "けいたく",
            'hepburn': "keitaku",
            'kunrei': "keitaku",
            'passport': "keitaku"
        },
        {
            'orig': "を",
            'kana': "ヲ",
            'hira': "を",
            'hepburn': 'wo',
            'kunrei': 'wo',
            'passport': 'wo'
        },
        {
            'orig': "確保",
            'kana': "カクホ",
            'hira': "かくほ",
            'hepburn': "kakuho",
            'kunrei': "kakuho",
            'passport': "kakuho"
        },
        {
            'orig': "し、",
            'kana': "シ、",
            'hira': "し、",
            'hepburn': "shi,",
            'kunrei': "si,",
            'passport': "shi,"
        },
        {
            'orig': "政府",
            'kana': "セイフ",
            'hira': "せいふ",
            'hepburn': "seifu",
            'kunrei': "seifu",
            'passport': "seifu"
        },
        {
            'orig': "の",
            'kana': "ノ",
            'hira': "の",
            'hepburn': "no",
            'kunrei': "no",
            'passport': "no"
        },
        {
            'orig': "行為",
            'kana': "コウイ",
            'hira': "こうい",
            'hepburn': "koui",
            'kunrei': "koui",
            'passport': "koi"
        },
        {
            'orig': "によつて",
            'kana': "ニヨツテ",
            'hira': "によつて",
            'hepburn': "niyotsute",
            'kunrei': "niyotute",
            'passport': "niyotsute"
        },
        {
            'orig': "再び",
            'kana': "フタタビ",
            'hira': "ふたたび",
            'hepburn': "futatabi",
            'kunrei': "futatabi",
            'passport': "futatabi"
        },
        {
            'orig': "戦争",
            'kana': "センソウ",
            'hira': "せんそう",
            'hepburn': "sensou",
            'kunrei': "sensou",
            'passport': "senso"
        },
        {
            'orig': "の",
            'kana': "ノ",
            'hira': "の",
            'hepburn': "no",
            'kunrei': "no",
            'passport': "no"
        },
    ]

    kakasi = pykakasi.kakasi()
    result = kakasi.convert(original_text)
    for i, e in enumerate(expected):
        assert result[i]['orig'] == e['orig']
        assert result[i]['hira'] == e['hira']
        assert result[i]['kana'] == e['kana']
        assert result[i]['hepburn'] == e['hepburn']
        assert result[i]['kunrei'] == e['kunrei']
        assert result[i]['passport'] == e['passport']
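The structured convert() API exercised by the test above returns one dict per token; as a usage sketch, the Hepburn readings can be joined back into a romanized string (the printed output is indicative, matching the expected values in the test):

import pykakasi

kks = pykakasi.kakasi()
tokens = kks.convert("日本国民は、正当に選挙された")
romaji = " ".join(item['hepburn'] for item in tokens)
print(romaji)  # e.g. "nihonkokumin ha, seitou ni senkyo sareta"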
# coding=UTF-8

import pykakasi, json, argparse, codecs, traceback
from pathlib import Path

kks = pykakasi.kakasi()

parser = argparse.ArgumentParser(
    description=
    'Takes an LLN-exported Japanese Json file and outputs an anki CSV file containing word, pronunciation, meaning, and subtitle.'
)

parser.add_argument('files',
                    metavar='N',
                    type=str,
                    nargs='+',
                    help='Filepaths to plaintext UTF-8 JSON files.')

parser.add_argument(
    '--outdir',
    type=str,
    nargs=1,
    help='File output directory. Default are the same as input paths.')

args = parser.parse_args()

for arg in vars(args):
    if arg != 'files':
        continue

    for file in getattr(args, arg):
Example #46
0
 def test_kakasi_invalid_flag_value(self):
     with self.assertRaises(InvalidFlagValueException):
         kakasi = pykakasi.kakasi()
         kakasi.setMode("H", "a")
         kakasi.setMode("K", "a")
         kakasi.setMode("s", "yes")
Example #47
0
#!/usr/local/bin/python3

import os
import sys
from argparse import ArgumentParser
import numpy as np
import json
from tqdm import tqdm
from station import station_generator
from match_score import term_match_score
from speech_sound import consonants, vowels
from preprocess import preprocess
from pykakasi import kakasi

kakasi_h2a = kakasi()
kakasi_h2a.setMode('H', 'a')
conv_h2a = kakasi_h2a.getConverter()
kakasi_j2h = kakasi()
kakasi_j2h.setMode('J', 'H')
conv_j2h = kakasi_j2h.getConverter()


def initial2initials(initial, initials):
    # from the lyric's initial, pick the initials of search-target words that share the same vowels
    return [
        i for i in initials
        if vowels(conv_h2a.do(initial)) == vowels(conv_h2a.do(i))
    ]


def initial2stations(stations):
Example #48
0
from pykakasi import kakasi
from janome.tokenizer import Tokenizer

# pykakasi
kks = kakasi()
convert = kks.convert
# janome
t = Tokenizer()
# process Japanese text
'''special symbols'''
symbols = ('、', '。', '’', '”', '{', '}', '「', '」', 'ー', '=', '_', '+', '/',
           '*', '-', '(', ')')


# Japanese text processing function
def dealwith(jp):
    result_roma = ''
    for token in t.tokenize(jp):
        string = str(token)
        origin = string.split('\t')[0]

        if string.split(',')[-1] != '*':
            roma = convert(string.split(',')[-1])[0]['hepburn']
        else:
            roma = convert(origin)[0]['hepburn']

        result_roma += roma + ' '  # accumulate each token's romaji instead of overwriting it
    return result_roma


result = dealwith(input())
Example #49
0
import json
import pykakasi
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import lxml.html
import os

kakasi = pykakasi.kakasi()
kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
kakasi.setMode("s", True)  # add space, default: no separator
conv = kakasi.getConverter()

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1280x1696')
chrome_options.add_argument('--user-data-dir=/tmp/user-data')
chrome_options.add_argument('--hide-scrollbars')
chrome_options.add_argument('--enable-logging')
chrome_options.add_argument('--log-level=0')
chrome_options.add_argument('--v=99')
chrome_options.add_argument('--single-process')
chrome_options.add_argument('--data-path=/tmp/data-path')
chrome_options.add_argument('--ignore-certificate-errors')
import requests
from bs4 import BeautifulSoup
import re
import os
import tinysegmenter
from pykakasi import kakasi

localFile = './testLyrics.txt'
lyricRoot = "../../distribution/llsif-waifu-lyrics"
rootURL = 'http://love-live.wikia.com'

transDict = {u'君':u'きみ',
	u'見':u'み',
	u'色':u'いろ',
	u'来':u'き'}

segmenter = tinysegmenter.TinySegmenter()
kakasi = kakasi()

enCount = 0
jpCount = 0

siteTabList = ['rōmaji','kanji','english']

def iterateSongList(urlRead='http://love-live.wikia.com/wiki/Category:Aqours_Songs'):
	r = requests.get(urlRead).content
	soup = BeautifulSoup(r,'lxml')
	contDiv = soup.find("div",{"class":"mw-content-ltr"})
	for ulTag in contDiv.findAll("ul"):
		for liTag in ulTag.findAll("li"):
			title = None
			try:
			    title = liTag.find("a").find("img")["alt"]
Example #51
0
 def test_kakasi_unknown_mode(self):
     with self.assertRaises(InvalidModeValueException):
         kakasi = pykakasi.kakasi()
         kakasi.setMode("H", "a")
         kakasi.setMode("K", "a")
         kakasi.setMode("J", "X")