コード例 #1
0
def dabiaoqian(path,guanjianzi_1,guanjianzi_2):
    """Prepend a correct/incorrect label to every feature row of every utterance.

    For each sub-directory ``d`` of ``path`` the function:

    1. reads the alignment file ``d/align1`` (EUC-JP).  A record is an
       ``id:`` row followed by the ``REF:``, ``HYP:`` and ``EVAL:`` rows;
    2. walks the EVAL tags (``C``/``D``/``I``/``S``) to build a hypothesis
       list aligned with the tags (``''`` for a deletion).  For ``S`` tags
       both words are romanised with pykakasi and the tag is promoted to
       ``C`` when the readings are equal;
    3. reads ``d/keka/<id>.out`` through ``pi.read_out`` and the feature rows
       from ``d/<guanjianzi_1>/<id>.wav.csv`` (utterances without a feature
       file are skipped);
    4. prepends ``'9'`` to rows ``0..end`` of the first ``.out`` entry (the
       leading silence according to the original notes), ``'0'`` to the rows
       of words tagged ``C`` and ``'1'`` to the rows of all other words
       returned by ``cz.chongzao``;
    5. writes the rows to ``d/<guanjianzi_2>/<id>.csv`` and finally calls
       ``shanchu.shanchuhang`` on that directory (per the original note this
       removes the rows labelled 9).

    Args:
        path: Batch directory whose entries are the per-recording directories.
        guanjianzi_1: Name of the sub-directory holding the input features.
        guanjianzi_2: Name of the sub-directory receiving the labelled output.
    """
    from pykakasi import kakasi
    import csv
    import os

    name_tezheng = guanjianzi_1  # directory that holds the feature files
    xinde = guanjianzi_2  # directory that receives the labelled features
    # Alignment file name.  The original notes mention 'symbol.txt' (UTF-8)
    # as a fallback when align1 is unusable; the reader below would have to
    # be adapted accordingly.
    name1 = 'align1'

    romanizer = kakasi()
    romanizer.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    romanizer.setMode("K", "a")  # Katakana to ascii, default: no conversion
    romanizer.setMode("J", "a")  # Japanese to ascii, default: no conversion
    romanizer.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    romanizer.setMode("s", True)  # add space, default: no separator
    conv = romanizer.getConverter()

    def _reading(word):
        """Return the romanised reading used to compare REF and HYP words."""
        if conv.do(word) == word and word != '、':
            # Conversion left the word unchanged, so it is treated as Latin
            # letters: go through the kana convertor first, preferring the
            # full-width -> half-width normalised form.
            try:
                return conv.do(make_kana_convertor._make_kana_convertor(strQ2B.strQ2B(word)))
            except Exception:
                return conv.do(make_kana_convertor._make_kana_convertor(word))
        return conv.do(word)

    for speaker in os.listdir(path):

        path_1 = os.path.join(path, speaker)
        path_out = os.path.join(path_1, 'keka')
        path_tezheng = os.path.join(path_1, name_tezheng)

        # Each csv row is a one-element list, e.g.
        # [['id: l_8840_9810_T1_F_01'], ['REF:  ...'], ['HYP:  ...'],
        #  ['EVAL: C    C    C  D  C    C  '], [], ['id: ...']]
        with open(os.path.join(path_1, name1), 'r', encoding='EUC-JP') as align_file:
            records = list(csv.reader(align_file))

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        for row_no, row in enumerate(records):  # one utterance per 'id:' row

            if not row or 'id:' not in row[0]:
                continue

            ID = row[0].replace('id: ', '')

            # Drop the leading 'REF:' / 'HYP:' / 'EVAL:' token of each row.
            l_zhengjie = records[row_no + 1][0].split()[1:]
            l_jieguo = records[row_no + 2][0].split()[1:]
            l_biaozhi = records[row_no + 3][0].split()[1:]

            # Hypothesis words aligned one-to-one with the tags.
            l_jieguo_1 = []
            ref_pos = 0
            hyp_pos = 0

            for tag_pos, tag in enumerate(l_biaozhi):

                if tag == "D":  # deletion: nothing was recognised
                    l_jieguo_1.append('')
                    ref_pos += 1

                elif tag == "C":  # correct
                    l_jieguo_1.append(l_jieguo[hyp_pos])
                    ref_pos += 1
                    hyp_pos += 1

                elif tag == "I":  # insertion: no reference word consumed
                    l_jieguo_1.append(l_jieguo[hyp_pos])
                    hyp_pos += 1

                elif tag == "S":  # substitution: equal readings count as C
                    zhengjie_hanzi = l_zhengjie[ref_pos]
                    jieguo_hanzi = l_jieguo[hyp_pos]
                    l_jieguo_1.append(jieguo_hanzi)

                    if _reading(jieguo_hanzi) == _reading(zhengjie_hanzi):
                        l_biaozhi[tag_pos] = 'C'

                    ref_pos += 1
                    hyp_pos += 1

            path_out_1 = os.path.join(path_out, ID + '.out')
            # Shape per the original notes:
            # [['', [0, 18]], ['お', [19, 24]], ..., ['。', [83, 86]]]
            dianout = pi.read_out(path_out_1)
            # The first entry is removed; its end frame bounds the '9' region.
            start = dianout.pop(0)[1][1]

            path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
            if not os.path.exists(path_tezheng_1):
                continue

            with open(path_tezheng_1, 'r', encoding='utf-8') as feature_file:
                t_file_list = list(csv.reader(feature_file))

            # The last feature row is used as the upper bound instead of the
            # last .out frame (the original notes say the final frame may be
            # dropped during feature extraction).
            end_1 = len(t_file_list) - 1

            if start >= len(t_file_list):
                # The leading region alone exceeds the feature rows: skip.
                continue

            for frame in range(start + 1):
                t_file_list[frame].insert(0, '9')

            # e.g. [['災害', [3, 40], 'C'], ['で', [41, 48], 'C'], ...]
            dianout_chongzao = cz.chongzao(l_biaozhi, l_jieguo_1, dianout, ID)

            for entry in dianout_chongzao:
                first, last = entry[1]
                if first > end_1:
                    continue
                last = min(last, end_1)
                label = '0' if entry[2] == 'C' else '1'
                for frame in range(first, last + 1):
                    t_file_list[frame].insert(0, label)

            path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')

            with open(path_xinde_tezhengzhi, 'w', encoding='utf-8') as mergen_file:
                for feature_row in t_file_list:
                    mergen_file.write('%s\n' % ','.join(feature_row))

        shanchu.shanchuhang(path_xinde)  # per original note: drops rows labelled 9
コード例 #2
0
import os, os.path,shutil
import sys
import muluzai as mulu

# Suffix of the files to collect.  (Previously bound to the name ``str``,
# which shadowed the builtin for the rest of the module.)
out_suffix = ".out"

path = r"C:\Users\a7825\Desktop\shiyan\symbol"  # batch directory


# For every recording directory, move all *.out files from its 'wav'
# sub-directory into a 'keka_yinsu' sub-directory.
for filename in os.listdir(path):
    path_1 = os.path.join(path, filename, 'wav')  # adjust to the actual layout
    path_out = os.path.join(path, filename, 'keka_yinsu')
    mulu.mkdir(path_out)

    for filename_1 in os.listdir(path_1):
        if filename_1.endswith(out_suffix):
            wenjian = os.path.join(path_1, filename_1)
            shutil.move(wenjian, path_out)

コード例 #3
0
if __name__ == '__main__':

    # Build a pykakasi converter that romanises kana and kanji (Hepburn).
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()

    for per_dirs in os.listdir(BASE_DIRS):  # per_dirs = C001L,C001R...

        # NOTE(review): d_9 and d are built from the same components.
        d_9 = os.path.join(BASE_DIRS, per_dirs, xinde)
        d = os.path.join(BASE_DIRS, per_dirs, xinde)
        mulu.mkdir(d)

        zhengjie, symbolcidian = zidian(per_dirs)
        # Per the original note: loads the tags from the tag file into symbolcidian

        for id in os.listdir(os.path.join(
                BASE_DIRS, per_dirs, name_tezheng)):  # names of the files under C001L, C001R, ...

            banyun_1 = []  # indices of the 'C' tags
            banyun_2 = []  # correctly recognised words

            banyun_3 = []  # indices of the non-'C' tags
            banyun_4 = []  # readings of the words that are (so far) incorrect
            dianout = []

            id = id.replace(houzhui, '')  # strip the suffix (houzhui) so only the id remains
コード例 #4
0
            for n in range(q, block):  #之前括号里面的值是block,因为要扔掉一半所以改了
                zhongzhuan = []
                zhongzhuan.append(
                    ma.sqrt(
                        ma.pow(tezheng_2[n].imag, 2) +
                        ma.pow(tezheng_2[n].real, 2)))
                tezheng_3[m].extend(zhongzhuan)
                # tezheng_3 = ma.sqrt(ma.pow(tezheng_2[n].imag,2)+ma.pow(tezheng_2[n].real,2))
            # print(tezheng_3)
            # print("新的特征值")
            # print(tezheng_3[m])
            # print('下一波')
            start = start + 1
            end = end + 1

    newtezheng = np.array(tezheng_3, dtype=np.float)
    # newtezheng = np.transpose(newtezheng)
    np.savetxt(final_path, newtezheng, delimiter=',')


# Run mizhichuli on every file of every recording directory under basedir.
# NOTE(review): mizhichuli(name) presumably reads the module-level lujing /
# filepath set here -- confirm before renaming or moving these assignments.
for mulu_1 in os.listdir(basedir):

    lujing = os.path.join(basedir, mulu_1, mizhichuli_log)
    # Where the processed feature files are to be stored
    filepath = os.path.join(basedir, mulu_1, bulin)
    # The feature files that need to be processed

    mulu.mkdir(lujing)
    for name in os.listdir(filepath):
        print("正在处理文件%s" % name)
        mizhichuli(name)
コード例 #5
0
ファイル: zhengli.py プロジェクト: shuyuqing/-
def _split_and_copy(src_dir, all_dir, opentest_dir, closetest_dir, xuexi_dir):
    """Copy the files of ``src_dir`` into the all/open/close/training folders.

    Every file is copied to ``all_dir``.  The file list is then shuffled with
    ``np.random.permutation``: the first ``len(files) / fenshu`` files form
    the open test set, the remainder the training set, and the first
    ``len(files) / fenshu`` training files are additionally copied as the
    closed test set.  ``fenshu`` is read from module scope.
    """
    names = os.listdir(src_dir)

    for name in names:
        copyfile(os.path.join(src_dir, name), os.path.join(all_dir, name))

    n_test = int(len(names) / fenshu)
    order = np.random.permutation(len(names))

    open_names = [names[k] for k in order[:n_test]]
    xuexi_names = [names[k] for k in order[n_test:]]
    close_names = xuexi_names[:n_test]  # closed test data is a subset of training

    for target_dir, group in ((closetest_dir, close_names),
                              (xuexi_dir, xuexi_names),
                              (opentest_dir, open_names)):
        for name in group:
            copyfile(os.path.join(src_dir, name), os.path.join(target_dir, name))


def zhengli(path):
    """Collect labelled features of all recordings into train/test folders.

    Creates ``path/data/fbank`` and ``path/data/muzhichuli``, each with the
    sub-directories ``all``, ``opentest``, ``closetest`` and ``xuexi``.  For
    every entry of ``path`` other than ``data`` the files in its
    ``xinde_log`` directory are distributed over the fbank folders and the
    files in ``xinde_mizhichuli`` over the muzhichuli folders, using an
    independent random split for each of the two.

    Args:
        path: Batch directory containing one directory per recording.
    """
    subsets = ('all', 'opentest', 'closetest', 'xuexi')

    path_data = os.path.join(path, 'data')
    muluzai.mkdir(path_data)

    path_fbank = os.path.join(path_data, 'fbank')
    # NOTE(review): spelled 'muzhichuli' (not 'mizhichuli') in the original;
    # kept byte-for-byte because other code may rely on this directory name.
    path_mizhichuli = os.path.join(path_data, 'muzhichuli')
    muluzai.mkdir(path_fbank)
    muluzai.mkdir(path_mizhichuli)

    fbank_dirs = {name: os.path.join(path_fbank, name) for name in subsets}
    mizhichuli_dirs = {name: os.path.join(path_mizhichuli, name) for name in subsets}

    for name in subsets:
        muluzai.mkdir(fbank_dirs[name])
    for name in subsets:
        muluzai.mkdir(mizhichuli_dirs[name])

    for wenjian in os.listdir(path):

        if wenjian == 'data':
            continue

        path_1 = os.path.join(path, wenjian)

        # fbank features first, then mizhichuli: the order fixes the sequence
        # in which random permutations are drawn.
        for source_name, targets in (('xinde_log', fbank_dirs),
                                     ('xinde_mizhichuli', mizhichuli_dirs)):
            _split_and_copy(os.path.join(path_1, source_name),
                            targets['all'],
                            targets['opentest'],
                            targets['closetest'],
                            targets['xuexi'])
コード例 #6
0
# Pipeline stage: label the s2 features, prune, then cut into balanced pieces.
# s1/s2 and dataname_1/dataname_2 accumulate a suffix per completed stage, so
# the statement order below matters.
s1 = s1 + '_' + 'pingheng'
dataname_1 = dataname_1 + '_' + 'pingheng'

pzs.dabiaoqian(path, guanjianzi_1=s2, guanjianzi_2=s2 + '_' + 'biaoqian')
zx.zuixiao(path, guanjianzi=s2 + '_' + 'biaoqian', xiaxian=10)
bl.kongwenjian(path, guanjianzi=s2 + '_' + 'biaoqian')  # per original note: deletes the zero-size files
bl.pingheng_2(path, guanjianzi=s2 + '_' + 'biaoqian')  # per original note: moves files whose labels are all 0 to the desktop
bl.pingheng_3(path, guanjianzi=s2 + '_' + 'biaoqian')  # per original note: moves files whose labels are all 1 to the desktop
s2 = s2 + '_' + 'biaoqian'
dataname_2 = dataname_2 + '_' + 'biaoqian'

for wenjian in os.listdir(path):  # per original note: too many 0 labels, so each file is cut into smaller pieces
    path_1 = os.path.join(path, wenjian, s2)
    path_new = os.path.join(path, wenjian, s2 + '_' + 'pingheng')
    mu.mkdir(path_new)
    for wenjian_1 in os.listdir(path_1):
        path_2 = os.path.join(path_1, wenjian_1)
        qie.qiexiao(path_2, wenjian_1, path_new)
bl.pingheng_2(path, guanjianzi=s2 + '_' +
              'pingheng')  # per original note: cutting leaves many all-0 files; move them to the desktop too
zx.zuixiao(path, guanjianzi=s2 + '_' + 'pingheng', xiaxian=10)

s2 = s2 + '_' + 'pingheng'
dataname_2 = dataname_2 + '_' + 'pingheng'

# zhengli.zhengli(path,guanjianzi_1 = s1,guanjianzi_2 = s2,dataname_1 = dataname_1,dataname_2 =dataname_2)#把opentest,closetest,整理出来
# zhengli_mizhichuli.zhengli(path,guanjianzi_2 = s2,dataname_1 = dataname_1,dataname_2=dataname_2)
zhengli_1.zhengli(path,
                  guanjianzi_2=s2,
                  dataname_1=dataname_1,
コード例 #7
0
def dabiaoqian(path, guanjianzi_1, guanjianzi_2):
    """Prepend a label to every feature row, using the ``keka_yinsu`` tables.

    For each sub-directory ``d`` of ``path`` the function:

    1. reads the alignment file ``d/align1`` (EUC-JP); a record is an ``id:``
       row followed by ``REF:``, ``HYP:`` and ``EVAL:`` rows, and only the
       EVAL tags are used here;
    2. reads ``d/keka_yinsu/<id>.out`` through ``pi.read_out`` and the feature
       rows from ``d/<guanjianzi_1>/<id>.wav.csv``;
    3. prepends ``'9'`` to the rows before the first ``.out`` entry and to the
       rows after the last one;
    4. passes the ``.out`` table through ``changpoyin`` (long-vowel handling
       per the original note) and ``cz.chongzao``, then prepends ``'0'`` to
       the rows of entries tagged ``C`` and ``'1'`` to the rows of the others;
    5. prepends ``'0'`` to every row whose length still differs from row 0
       (per the original note these are comma regions, which were removed
       before the ``.out`` file was produced);
    6. writes the rows to ``d/<guanjianzi_2>/<id>.csv`` and finally calls
       ``shanchuhang`` on that directory (per the original note this removes
       the rows labelled 9).

    Args:
        path: Batch directory whose entries are the per-recording directories.
        guanjianzi_1: Name of the sub-directory holding the input features.
        guanjianzi_2: Name of the sub-directory receiving the labelled output.
    """
    name_tezheng = guanjianzi_1  # directory that holds the feature files
    xinde = guanjianzi_2  # directory that receives the labelled features
    # Alignment file name.  The original notes mention 'symbol.txt' (UTF-8)
    # as a fallback when align1 is unusable.
    name1 = 'align1'

    for speaker in os.listdir(path):

        path_1 = os.path.join(path, speaker)
        path_out = os.path.join(path_1, 'keka_yinsu')
        path_tezheng = os.path.join(path_1, name_tezheng)

        align_path = os.path.join(path_1, name1)
        print(align_path)

        # Each csv row is a one-element list, e.g.
        # [['id: l_8840_9810_T1_F_01'], ['REF:  ...'], ['HYP:  ...'],
        #  ['EVAL: C    C    C  D  C    C  '], [], ['id: ...']]
        with open(align_path, 'r', encoding='EUC-JP') as align_file:
            records = list(csv.reader(align_file))

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        for row_no, row in enumerate(records):  # one utterance per 'id:' row

            if not row or 'id:' not in row[0]:
                continue

            ID = row[0].replace('id: ', '')

            # EVAL row, three rows below the id row; drop the 'EVAL:' token.
            l_biaozhi = records[row_no + 3][0].split()[1:]

            path_out_1 = os.path.join(path_out, ID + '.out')
            dianout = pi.read_out(path_out_1)
            start = dianout[0][1][0] - 1  # last row before the first entry
            start_1 = dianout[-1][1][1] + 1  # first row after the last entry

            path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
            with open(path_tezheng_1, 'r', encoding='utf-8') as feature_file:
                t_file_list = list(csv.reader(feature_file))

            # The last feature row is the upper bound (the original notes say
            # the final frame may be dropped during feature extraction).
            end_1 = len(t_file_list) - 1

            if start >= len(t_file_list):
                # The leading region alone exceeds the feature rows: skip.
                continue

            for frame in range(start + 1):
                t_file_list[frame].insert(0, '9')

            for frame in range(start_1, end_1 + 1):
                t_file_list[frame].insert(0, '9')

            print("ID")
            print(ID)

            _dianout = changpoyin(dianout, path_out_1, ID + '.out')

            # e.g. [['災害', [3, 40], 'C'], ['で', [41, 48], 'C'], ...]
            dianout_chongzao = cz.chongzao(l_biaozhi, _dianout, ID)

            for entry in dianout_chongzao:
                first, last = entry[1]
                if first > end_1:
                    continue
                last = min(last, end_1)
                label = '0' if entry[2] == 'C' else '1'
                for frame in range(first, last + 1):
                    t_file_list[frame].insert(0, label)

            # Rows not covered by any entry are still one column short of
            # row 0; give them label '0'.
            changdu = len(t_file_list[0])
            for feature_row in t_file_list:
                if len(feature_row) != changdu:
                    feature_row.insert(0, '0')

            path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')

            with open(path_xinde_tezhengzhi, 'w', encoding='utf-8') as mergen_file:
                for feature_row in t_file_list:
                    mergen_file.write('%s\n' % ','.join(feature_row))

        shanchuhang(path_xinde)  # per original note: drops rows labelled 9
コード例 #8
0
def dabiaoqian(path):
    """Label the ``mizhichuli_log`` feature rows of every recording in ``path``.

    For each sub-directory ``d`` of ``path``:

    1. ``zidian.zidian(d, path)`` supplies two dicts keyed by utterance id:
       the reference words and the alignment tags (e.g. ``['S','D','C']``);
    2. for every file ``d/mizhichuli_log/<id>.wav.csv`` the reference words
       at ``C`` positions are collected as "correct words" and the romanised
       readings of the reference words at ``S`` positions as "correct
       readings";
    3. ``d/keka/<id>.out`` is read through ``pi.read_out``.  Rows ``0..end``
       of its first entry get label ``'9'``; so do the rows of every entry
       whose text equals the text of the last entry (the final full stop per
       the original notes);
    4. every other non-empty entry gets ``'0'`` when its word is among the
       correct words or its reading is among the correct readings (each match
       is consumed once), otherwise ``'1'``;
    5. the rows are written to ``d/xinde_mizhichuli/<id>.csv`` and finally
       ``shanchu.shanchuhang`` is called on that directory (per the original
       note this removes the rows labelled 9).

    An entry that ends beyond the last feature row is clamped to the last
    row.  The previous implementation handled that case in a separate,
    diverging branch that indexed one row past the end for Latin-letter words.

    Args:
        path: Batch directory whose entries are the per-recording directories.
    """
    from pykakasi import kakasi

    BASE_DIRS = path  # batch directory
    name_tezheng = 'mizhichuli_log'  # directory that holds the feature files
    xinde = 'xinde_mizhichuli'  # directory that receives the labelled features
    houzhui = '.wav.csv'  # feature file name minus the utterance id

    romanizer = kakasi()
    romanizer.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    romanizer.setMode("K", "a")  # Katakana to ascii, default: no conversion
    romanizer.setMode("J", "a")  # Japanese to ascii, default: no conversion
    romanizer.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    romanizer.setMode("s", True)  # add space, default: no separator
    conv = romanizer.getConverter()

    for per_dirs in os.listdir(BASE_DIRS):  # per_dirs = C001L, C001R, ...

        d = os.path.join(BASE_DIRS, per_dirs, xinde)
        mulu.mkdir(d)

        zhengjie, symbolcidian = zidian.zidian(per_dirs, BASE_DIRS)

        for feature_name in os.listdir(
                os.path.join(BASE_DIRS, per_dirs, name_tezheng)):

            utt_id = feature_name.replace(houzhui, '')
            tags = symbolcidian[utt_id]
            ref_words = zhengjie[utt_id]

            t_file = os.path.join(BASE_DIRS, per_dirs, name_tezheng,
                                  utt_id + houzhui)
            with open(t_file, 'r', encoding='utf-8') as feature_file:
                t_file_list = list(csv.reader(feature_file))

            # Reference words at the positions tagged 'C'.
            banyun_2 = []
            for pos, tag in enumerate(tags):
                if tag != 'C':
                    continue
                if pos < len(ref_words):
                    banyun_2.append(ref_words[pos])
                else:
                    # More tags than reference words: needs manual repair.
                    print("手动调一下这个文件吧%s" % utt_id)
                    print("它的正确单词是")
                    print(banyun_2)
                    os.system("pause")

            # Romanised reference words at the positions tagged 'S'.
            banyun_4 = []
            for pos, tag in enumerate(tags):
                if tag != 'S':
                    continue
                if pos < len(ref_words):
                    banyun_4.append(conv.do(ref_words[pos]))
                else:
                    print("手动调一下这个文件吧%s" % utt_id)
                    print("它的认识出现错误的单词是")
                    print(banyun_4)
                    os.system("pause")

            dir_out = os.path.join(BASE_DIRS, per_dirs, 'keka', utt_id + '.out')
            out_name = os.path.split(dir_out)[1]
            # Shape per the original notes:
            # [['', [0, 18]], ['お', [19, 24]], ..., ['。', [83, 86]]]
            dianout = pi.read_out(dir_out)

            # The first entry is removed; rows up to its end frame get '9'.
            start = dianout.pop(0)[1][1]
            for frame in range(start + 1):
                t_file_list[frame].insert(0, '9')

            last_row = len(t_file_list) - 1
            final_word = dianout[-1][0] if dianout else None

            for entry in dianout:

                word = entry[0]
                if word == '':  # skip entries without text
                    continue

                first, last = entry[1]
                # Clamp to the available feature rows.
                last = min(last, last_row)

                if word == final_word:
                    for frame in range(first, last + 1):
                        t_file_list[frame].insert(0, '9')
                    continue

                if word in banyun_2:
                    label = '0'
                    banyun_2.remove(word)  # each reference word matches once
                else:
                    romaji = conv.do(word)

                    if romaji == word and word != '、':
                        # Conversion left the word unchanged, so it is treated
                        # as Latin letters and compared through its kana form.
                        print("发现识别结果中的字母%s" % word)
                        print("它在文件%s" % dir_out)

                        try:
                            zhuanhuazhi = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    strQ2B.strQ2B(word)))
                        except Exception:
                            zhuanhuazhi = conv.do(
                                make_kana_convertor._make_kana_convertor(word))

                        if zhuanhuazhi in banyun_4:
                            print("转化之后的字母为%s" % zhuanhuazhi)
                            label = '0'
                            banyun_4.remove(zhuanhuazhi)
                        else:
                            label = '1'

                    elif romaji in banyun_4:
                        label = '0'
                        banyun_4.remove(romaji)

                    else:
                        label = '1'

                print("正在为文件 %s 的单词 %s 打标签" % (out_name, word))
                for frame in range(first, last + 1):
                    t_file_list[frame].insert(0, label)

            with open(os.path.join(BASE_DIRS, per_dirs, xinde, utt_id + '.csv'),
                      'w',
                      encoding='utf-8') as mergen_file:
                for feature_row in t_file_list:
                    mergen_file.write('%s\n' % ','.join(feature_row))

        shanchu.shanchuhang(d)  # per original note: drops rows labelled 9
コード例 #9
0
ファイル: fuliye_gai_yinsu.py プロジェクト: shuyuqing/-
def mizhichuli(basedir, chuangkou, padding, lintianchong):
    """Convert feature files into sliding-window FFT magnitude features.

    For every entry ``d`` of ``basedir``, each file found in
    ``basedir/d/log_qian5`` is read as a comma-separated numeric matrix.
    A window of ``chuangkou`` consecutive rows slides down every column,
    one row at a time.  Each window is:

    1. passed through ``jc.jiachuangzi`` (the windowing step, according to
       the original comment -- the helper itself is not visible here),
    2. optionally extended with ``padding`` trailing zeros,
    3. transformed with ``nf.fft``,
    4. reduced to the magnitudes of the first half of the FFT bins,
    5. passed through ``zg.zhenggui`` (the normalisation step, according
       to the original comment).

    Output row ``m`` is the concatenation, column after column, of the
    results for the window that starts at input row ``m``.  The resulting
    matrix is written as CSV to ``basedir/d/mizhichuli/<name>``, where
    ``<name>`` is the input file name with every ``'.log'`` removed.

    Args:
        basedir: Directory whose entries each contain a ``log_qian5``
            sub-directory with the feature files to process.
        chuangkou: Window length, in rows.
        padding: Number of zeros appended to each window before the FFT.
            Only used when ``lintianchong`` is truthy.
        lintianchong: Truthy to zero-pad each window with ``padding`` zeros.
    """
    jiachuang = True
    bulin = 'log_qian5'
    mizhichuli_log = 'mizhichuli'

    block = chuangkou

    # The number of FFT bins depends on whether zeros are really appended.
    # The original always used block + padding, which over-reads (or indexes
    # past the end of) the spectrum when padding is given but not applied.
    if lintianchong:
        zero_tail = np.zeros(padding)
        fft_len = block + padding
    else:
        zero_tail = None
        fft_len = block
    half = fft_len // 2  # keep only the first half of the spectrum

    for mulu_1 in os.listdir(basedir):
        # Where the processed feature files are written.
        lujing = os.path.join(basedir, mulu_1, mizhichuli_log)
        # Where the feature files to be processed live.
        filepath = os.path.join(basedir, mulu_1, bulin)

        mulu.mkdir(lujing)

        for name in os.listdir(filepath):
            file_path = os.path.join(filepath, name)
            final_path = os.path.join(lujing, name.replace('.log', ''))

            # ndmin=2 keeps single-row / single-column files two-dimensional
            # so the shape unpacking below cannot fail on them.
            tezheng = np.loadtxt(file_path, delimiter=',', ndmin=2)
            hang, lie = tezheng.shape

            # Number of window positions per column (<= 0 yields no rows).
            huishu = hang - block + 1
            tezheng_3 = [[] for _ in range(huishu)]

            for i in range(lie):
                # Slice the already-loaded matrix instead of re-parsing the
                # whole file once per column.
                column = tezheng[:, i]

                for m in range(huishu):
                    # Copy so the helpers never see a view into ``tezheng``.
                    window = np.array(column[m:m + block])

                    if jiachuang:
                        window = jc.jiachuangzi(window)

                    if lintianchong:
                        window = np.hstack((window, zero_tail))

                    spectrum = nf.fft(window)

                    # Magnitude of each retained complex bin.
                    zhongzhuan = np.abs(spectrum[:half])
                    zhongzhuan = zg.zhenggui(zhongzhuan,
                                             int(zhongzhuan.shape[0]))

                    tezheng_3[m].extend(zhongzhuan)

            # ``np.float`` was removed from NumPy; the builtin ``float`` is
            # the dtype it used to alias.
            newtezheng = np.array(tezheng_3, dtype=float)
            np.savetxt(final_path, newtezheng, delimiter=',')
コード例 #10
0
ファイル: fuliye_gai.py プロジェクト: shuyuqing/-
def mizhichuli(basedir, chuangkou=32):
    """Convert feature files into sliding-window FFT magnitude features.

    For every entry ``d`` of ``basedir``, each file found in
    ``basedir/d/bulin`` is read as a comma-separated numeric matrix.
    A window of ``chuangkou`` consecutive rows slides down every column,
    one row at a time; each window is transformed with ``nf.fft`` and only
    the magnitudes of bins ``chuangkou // 2 .. chuangkou - 1`` are kept
    (the other half is discarded, as in the original code).

    Output row ``m`` is the concatenation, column after column, of the
    magnitudes for the window that starts at input row ``m``.  The matrix
    is written as CSV to ``basedir/d/mizhichuli_log/<name>``, where
    ``<name>`` is the input file name with every ``'.log'`` removed.

    Args:
        basedir: Directory whose entries each contain a ``bulin``
            sub-directory with the feature files to process.
        chuangkou: Window length, in rows.  Defaults to 32, the value that
            used to be hard-coded.
    """
    bulin = 'bulin'
    mizhichuli_log = 'mizhichuli_log'

    block = chuangkou
    q = block // 2  # first retained FFT bin

    for mulu_1 in os.listdir(basedir):
        # Where the processed feature files are written.
        lujing = os.path.join(basedir, mulu_1, mizhichuli_log)
        # Where the feature files to be processed live.
        filepath = os.path.join(basedir, mulu_1, bulin)

        mulu.mkdir(lujing)

        for name in os.listdir(filepath):
            file_path = os.path.join(filepath, name)
            final_path = os.path.join(lujing, name.replace('.log', ''))

            # ndmin=2 keeps single-row / single-column files two-dimensional
            # so the shape unpacking below cannot fail on them.
            tezheng = np.loadtxt(file_path, delimiter=',', ndmin=2)
            hang, lie = tezheng.shape

            # Number of window positions per column (<= 0 yields no rows).
            huishu = hang - block + 1
            tezheng_3 = [[] for _ in range(huishu)]

            for i in range(lie):
                # Slice the already-loaded matrix instead of re-parsing the
                # whole file once per column.
                column = tezheng[:, i]

                for m in range(huishu):
                    spectrum = nf.fft(column[m:m + block])
                    # Magnitude of each retained complex bin.
                    tezheng_3[m].extend(np.abs(spectrum[q:block]))

            # ``np.float`` was removed from NumPy; the builtin ``float`` is
            # the dtype it used to alias.
            newtezheng = np.array(tezheng_3, dtype=float)
            np.savetxt(final_path, newtezheng, delimiter=',')
コード例 #11
0
import os
import muluzai as mu

# Directory whose entries supply the list of names.
path = r"C:\Users\a7825\Desktop\新建文件夹 (3)"
# Directory in which the new per-name folders are created.
path_1 = r"C:\Users\a7825\Desktop\新建文件夹 (5)"

mingdan = os.listdir(path)

for i in mingdan:

    # Strip only the trailing extension.  The previous
    # ``i.replace(ext, '')`` also deleted the extension text wherever else
    # it happened to occur inside the name.
    stem = os.path.splitext(i)[0]

    path_2 = os.path.join(path_1, stem)
    mu.mkdir(path_2)

    # Any sub-folders wanted inside the generated folder can be added here.
    path_wav = os.path.join(path_2, 'wav')
    path_wav_all = os.path.join(path_2, 'wav_all')
    path_keka = os.path.join(path_2, 'keka')

    mu.mkdir(path_wav_all)
    mu.mkdir(path_wav)
    mu.mkdir(path_keka)

    # Create (or truncate) the two placeholder text files and close them
    # immediately instead of leaking the open handles.
    for empty_name in (stem + '.txt', 'chasen.txt'):
        with open(os.path.join(path_2, empty_name), 'w'):
            pass
コード例 #12
0
def zhengli(path, guanjianzi_2, dataname_1, dataname_2):
    """Collect feature files into all / opentest / closetest / xuexi folders.

    The longer of ``dataname_1`` and ``dataname_2`` (``dataname_2`` on a tie)
    is used as the dataset folder name.  Under ``path/<dataname>/mizhichuli``
    the sub-folders ``all``, ``opentest``, ``closetest`` and ``xuexi`` are
    created through ``muluzai.mkdir``.

    For every entry of ``path`` other than the dataset folder, the files in
    ``<entry>/<guanjianzi_2>`` are copied into ``all`` and then split using a
    random permutation: the first ``int(n / fenshu)`` permuted files go to
    ``opentest``, the remaining ones go to ``xuexi``, and the first
    ``int(n / fenshu)`` files of that ``xuexi`` list are additionally copied
    into ``closetest`` (``n`` is the number of files in the entry).

    NOTE(review): ``fenshu`` is not defined in this function; it has to exist
    at module level -- confirm where it is set.  ``copyfile``, ``np``, ``os``
    and ``muluzai`` also come from the enclosing module.

    Args:
        path: Directory containing one sub-directory per data source.
        guanjianzi_2: Name of the sub-directory, inside each data source,
            that holds the files to distribute.
        dataname_1: Candidate name for the dataset folder.
        dataname_2: Candidate name for the dataset folder.
    """

    # Use whichever candidate name is longer (dataname_2 wins ties).
    if len(dataname_1) > len(dataname_2):

        dataname = dataname_1

    else:

        dataname = dataname_2

    path_data = os.path.join(path, dataname)
    muluzai.mkdir(path_data)

    path_mizhichuli = os.path.join(path_data, 'mizhichuli')
    muluzai.mkdir(path_mizhichuli)

    path_mizhichuli_all = os.path.join(path_mizhichuli, 'all')
    path_mizhichuli_opentest = os.path.join(path_mizhichuli, 'opentest')
    path_mizhichuli_closetest = os.path.join(path_mizhichuli, 'closetest')
    path_mizhichuli_xuexi = os.path.join(path_mizhichuli, 'xuexi')

    muluzai.mkdir(path_mizhichuli_all)
    muluzai.mkdir(path_mizhichuli_opentest)
    muluzai.mkdir(path_mizhichuli_closetest)
    muluzai.mkdir(path_mizhichuli_xuexi)

    for wenjian in os.listdir(path):

        # Skip the dataset folder created above; it is the destination.
        if wenjian != dataname:
            path_1 = os.path.join(path, wenjian)
            path_3 = os.path.join(path_1, guanjianzi_2)

            # This part handles the "mizhichuli" files.
            for i in os.listdir(path_3):
                copyfile(os.path.join(path_3, i),
                         os.path.join(path_mizhichuli_all, i))

            wenjian_mizhichuli_all = os.listdir(path_3)

            wenjian_mizhichuli_open = []
            wenjian_mizhichuli_xuexi = []

            perm = np.random.permutation(
                len(wenjian_mizhichuli_all))  # random ordering of indices, one per file

            perm_1 = perm[:int(len(wenjian_mizhichuli_all) /
                               fenshu)]  # one fenshu-th of the files becomes test data (the original comment assumed 5, i.e. one fifth)
            for a in perm_1:
                wenjian_mizhichuli_open.append(wenjian_mizhichuli_all[a])

            perm_2 = perm[int(len(wenjian_mizhichuli_all) /
                              fenshu):]  # the remaining files are the training data
            for i in perm_2:
                wenjian_mizhichuli_xuexi.append(wenjian_mizhichuli_all[i])

            wenjian_mizhichuli_close = wenjian_mizhichuli_xuexi[:int(
                len(wenjian_mizhichuli_all) / fenshu)]  # part of the training data doubles as the closed-test data

            # Earlier, non-random split kept for reference:
            # wenjian_mizhichuli_open = wenjian_mizhichuli_all[:int(len(wenjian_mizhichuli_all)/5)]  # one fifth of all data as test data
            # wenjian_mizhichuli_xuexi = wenjian_mizhichuli_all[int(len(wenjian_mizhichuli_all)/5):]
            # wenjian_mizhichuli_close = wenjian_mizhichuli_xuexi[:int(len(wenjian_mizhichuli_all)/5)]

            for u in wenjian_mizhichuli_close:
                copyfile(os.path.join(path_3, u),
                         os.path.join(path_mizhichuli_closetest, u))
            for u in wenjian_mizhichuli_xuexi:
                copyfile(os.path.join(path_3, u),
                         os.path.join(path_mizhichuli_xuexi, u))
            for u in wenjian_mizhichuli_open:
                copyfile(os.path.join(path_3, u),
                         os.path.join(path_mizhichuli_opentest, u))