def createFileLst(dataDirs, dataExts, dataDim, dataListDirs, trnList, valList):
    """ create data lists
        output *.scp will be in dataListDirs
    """
    dataDirs = dataDirs.split(',')
    dataExts = dataExts.split(',')
    dataDims = [int(x) for x in dataDim.split('_')]
    assert len(dataDirs) == len(dataExts), \
        'Error: sub_1_prepare_list.py dataDirs and dataExts wrong'

    # get the cross-set of file lists
    dataList = lstdirNoExt(dataDirs[0], dataExts[0])
    for dataDir, dataExt in zip(dataDirs[1:], dataExts[1:]):
        listTmp = lstdirNoExt(dataDir, dataExt)
        dataList = crossSet(dataList, listTmp)

    # check if files exist
    if len(dataList) < 1:
        display.self_print("Error: failed to find data. Please check:", 'error')
        display.self_print(
            "path_acous_feats, ext_acous_feats, path_waveform in config.py;",
            'error')
        display.self_print("Please also check the names of input data files.",
                           'error')
        raise Exception("Error: failed to generate file list.")

    # check if the pre-defined training/validation data exist
    pre_defined_trn_list = readwrite.read_txt_list(trnList)
    pre_defined_val_list = readwrite.read_txt_list(valList)
    diff_trn_list = diff_list(pre_defined_trn_list, dataList)
    diff_val_list = diff_list(pre_defined_val_list, dataList)

    if len(diff_trn_list):
        display.self_print("Error: training data missing. Please check:",
                           'error')
        print(diff_trn_list)
        raise Exception("Error: failed to prepare file list.")

    if len(diff_val_list):
        display.self_print("Error: validation data missing. Please check:",
                           'error')
        print(diff_val_list)
        raise Exception("Error: failed to prepare file list.")

    # before starting, run a simple check on the configured feature dimensions
    frameNum = None
    for inputDir, featDim, featName in zip(dataDirs[0:-1], dataDims[0:-1],
                                           dataExts[0:-1]):
        inputFile = os.path.join(inputDir, dataList[0]) + '.' + featName.lstrip('.')
        if os.path.isfile(inputFile):
            tmpframeNum = readwrite.read_raw_mat(inputFile, featDim).shape[0]
            if frameNum is None or frameNum < tmpframeNum:
                frameNum = tmpframeNum

    for inputDir, featDim, featName in zip(dataDirs[0:-1], dataDims[0:-1],
                                           dataExts[0:-1]):
        inputFile = os.path.join(inputDir, dataList[0]) + '.' + featName.lstrip('.')
        if os.path.isfile(inputFile):
            tmpframeNum = readwrite.read_raw_mat(inputFile, featDim).shape[0]
            if np.abs(frameNum - tmpframeNum) * 1.0 / frameNum > 0.1:
                if featDim == readwrite.read_raw_mat(inputFile, 1).shape[0]:
                    # utterance-level feature (one value per file); skip the check
                    pass
                else:
                    display.self_print("Large mismatch of frame numbers %s" %
                                       (inputFile))
                    display.self_print(
                        "Please check whether inputDim are correct", 'error')
                    display.self_print("Or check input features are corrupted",
                                       'error')
                    raise Exception("Error: mismatch of frame numbers")

    display.self_print('Generating data lists into %s' % (dataListDirs),
                       'highlight')

    trainSet = pre_defined_trn_list
    valSet = pre_defined_val_list

    if len(valSet) > len(trainSet):
        display.self_print(
            "Warning: validation set is larger than training set", 'warning')
        display.self_print("It's better to change train_utts in config.py",
                           'warning')

    trainFileOut = dataListDirs + os.path.sep + 'train.lst'
    trainFilePtr = open(trainFileOut, 'w')
    for fileName in trainSet:
        trainFilePtr.write('%s\n' % (fileName))
    trainFilePtr.close()

    if len(valSet):
        valFileOut = dataListDirs + os.path.sep + 'val.lst'
        valFilePtr = open(valFileOut, 'w')
        for fileName in valSet:
            valFilePtr.write('%s\n' % (fileName))
        valFilePtr.close()

    display.self_print(
        '\ttrain/val sizes: %d, %d' % (len(trainSet), len(valSet)), 'warning')
    # done
    return
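A minimal usage sketch of createFileLst, assuming it is called from a preparation script; the paths below are placeholders, while the argument formats (comma-separated directories/extensions, '_'-separated dimensions) follow the parsing at the top of the function.

# Hypothetical example call; directory and list-file paths are placeholders.
createFileLst(
    dataDirs='/path/to/mgc,/path/to/lf0,/path/to/wav',
    dataExts='.mgc,.lf0,.wav',
    dataDim='60_1_1',
    dataListDirs='/path/to/lists',
    trnList='/path/to/train_utts.lst',
    valList='/path/to/val_utts.lst')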
#!/usr/bin/python
import os
import sys

from ioTools import readwrite


def common_list(list1, list2):
    return list(set(list1).intersection(list2))


if __name__ == "__main__":
    len_arg = len(sys.argv) - 1
    for idx in range(len_arg):
        if idx == 0:
            file_common_list = readwrite.read_txt_list(sys.argv[idx + 1])
        else:
            file_temp_list = readwrite.read_txt_list(sys.argv[idx + 1])
            file_common_list = common_list(file_common_list, file_temp_list)

    for file_name in file_common_list:
        print(file_name)
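A small sketch of what common_list computes, with hypothetical utterance names; it returns the set intersection of the two lists, so the output order is not guaranteed.

# Hypothetical input lists
file_list_a = ['utt_0001', 'utt_0002', 'utt_0003']
file_list_b = ['utt_0002', 'utt_0003', 'utt_0004']
# prints the names present in both lists, e.g. ['utt_0002', 'utt_0003'] in some order
print(common_list(file_list_a, file_list_b))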
try:
    os.mkdir(tmp_nc_dir)
except OSError:
    pass

tmp_data_nc_config = cfg.tmp_test_data_nc_config

cmd = 'sh %s' % (cfg.path_scripts) + os.path.sep + 'sub_05_package_datanc.sh'
cmd = cmd + ' %s %s' % (tmp_sub_nc_dir, tmp_idx_dir)
cmd = cmd + ' testset %s %s %s' % (tmp_test_lst, tmp_data_nc_config,
                                   cfg.path_pyTools_scripts)
exe_cmd(cmd, cfg.debug)

if os.path.isfile(tmp_test_nc_scp):
    tmp_test_data_nc_list = readwrite.read_txt_list(tmp_test_nc_scp)
    if len(tmp_test_data_nc_list) < 1:
        display.self_print(
            'Error: not found test data.nc in %s' % (tmp_sub_nc_dir), 'error')
        quit()
    tmp_test_data_nc_args = ','.join(tmp_test_data_nc_list)
else:
    display.self_print('Error: not found %s' % (tmp_test_nc_scp), 'error')
    quit()

# No need to get F0 mean and std
if False:
    # get F0 mean and std
    dimCnt = 0
    f0Dim = -1
labdir = "/home/smg/takaki/FEAT/F009/data/ver01/full" labout = "/home/smg/wang/DATA/speech/F009A/nndata/labels/full_align/test_set" prefix = "ATR_Ximera_F009A_" resolu = 50000 ncData = io.netcdf_file(ncFile, 'r') sentNm = ncData.dimensions['numSeqs'] sentNa = ncData.variables['seqTags'][:].copy() sentTi = ncData.variables['seqLengths'][:].copy() start = 0 for id, sentId in enumerate(sentNa): sentId = ''.join(sentId) labinpfile = labdir + os.path.sep + sentId + '.lab' laboutfile = labout + os.path.sep + sentId + '.lab' labentrys = py_rw.read_txt_list(labinpfile) stime, etime = start, start + sentTi[id] data = ncData.variables['inputs'][stime:etime, 0:-3].copy() data = (data * data).sum(axis=1) difd = np.diff(data) indx = np.concatenate( (np.array([0]), np.argwhere(difd).flatten(), np.array([etime]))) if len(indx) == len(labentrys) + 1: temp = '' for x in range(len(labentrys)): st = indx[x] * resolu et = indx[x + 1] * resolu lab = labentrys[x].split() temp += "%d %d %s\n" % (st, et, lab[2]) fil = open(laboutfile, 'w') fil.write(temp[0:-1])
tmp_data_dir = os.getcwd() + os.path.sep + cfg.tmp_data_dir
tmp_nc_dir = tmp_data_dir + os.path.sep + cfg.tmp_nc_dir

tmp_trn_nc_dir = tmp_nc_dir + os.path.sep + cfg.tmp_nc_dir_train
tmp_trn_nc_scp = tmp_trn_nc_dir + os.path.sep + 'data.scp'
tmp_val_nc_dir = tmp_nc_dir + os.path.sep + cfg.tmp_nc_dir_val
tmp_val_nc_scp = tmp_val_nc_dir + os.path.sep + 'data.scp'

tmp_mv_data = os.getcwd() + os.path.sep + cfg.tmp_name_mean_file

tmp_mdn_config = os.getcwd() + os.path.sep + cfg.tmp_data_dir
tmp_mdn_config = tmp_mdn_config + os.path.sep + cfg.tmp_mdn_config_name

# Get the string of training data.nc files
if os.path.isfile(tmp_trn_nc_scp):
    tmp_trn_data_nc_list = readwrite.read_txt_list(tmp_trn_nc_scp)
    if len(tmp_trn_data_nc_list) < 1:
        display.self_print(
            'Error: not found train data.nc in %s' % (tmp_trn_nc_dir), 'error')
        quit()
    tmp_trn_data_nc_args = ','.join(tmp_trn_data_nc_list)
else:
    display.self_print('Error: not found %s' % (tmp_trn_nc_scp), 'error')
    quit()

if os.path.isfile(tmp_val_nc_scp):
    tmp_val_data_nc_list = readwrite.read_txt_list(tmp_val_nc_scp)
    if len(tmp_val_data_nc_list) < 1:
        display.self_print('Warning: val data.nc is not used', 'warning')
        tmp_val_data_nc_args = ''
def prepareData():
    """ prepareData:
        1. create the file list
        2. create the symbolic link to the feature data
        3. create the index file (used by CURRENNT)
        4. create package data of index file (data.nc)
        5. calculate the mean and std for a specific data set
    """
    # create directories
    dataDir = cfg.nnDataDirName
    try:
        os.mkdir(dataDir)
    except OSError:
        pass

    dataListPath = dataDir + os.path.sep + 'lists'
    try:
        os.mkdir(dataListPath)
    except OSError:
        pass

    dataRawDir = dataDir + os.path.sep + cfg.idxDirName
    try:
        os.mkdir(dataRawDir)
    except OSError:
        pass

    # decide whether to create the symbolic link to each file
    if len(cfg.inputDirs) == 1 and len(cfg.outputDirs) == 1:
        # no validation set
        flagFileUseSymbolLink = False
    elif listSameContent(cfg.inputDirs) and listSameContent(
            cfg.outputDirs) and listSameContent(
                cfg.inputDirs[0]) and listSameContent(cfg.outputDirs[0]):
        # all data are already in the same directory
        flagFileUseSymbolLink = False
    else:
        flagFileUseSymbolLink = True

    #dataLinkDir = dataDir + os.path.sep + cfg.linkDirname
    dataLinkDirInput = dataDir + os.path.sep + cfg.linkDirname_input
    dataLinkDirOutput = dataDir + os.path.sep + cfg.linkDirname_output

    # prepare for data link
    if flagFileUseSymbolLink:
        try:
            os.mkdir(dataLinkDirInput)
            os.mkdir(dataLinkDirOutput)
        except OSError:
            pass
    else:
        if os.path.islink(dataLinkDirInput):
            os.system("rm %s" % (dataLinkDirInput))
        if os.path.islink(dataLinkDirOutput):
            os.system("rm %s" % (dataLinkDirOutput))
        os.system("ln -s %s %s" % (cfg.inputDirs[0][0], dataLinkDirInput))
        os.system("ln -s %s %s" % (cfg.outputDirs[0][0], dataLinkDirOutput))

    # create file list
    for dataList, inputDirSet, outputDirSet, dataPart in zip(
            cfg.dataLists, cfg.inputDirs, cfg.outputDirs, cfg.dataDivision):

        display.self_print('Processing ' + dataPart + ' data', 'highlight')

        if dataList is None:
            # get the cross-set of file lists
            listInput = readwrite.list_file_name_in_dir(inputDirSet[0])
            listOutput = readwrite.list_file_name_in_dir(outputDirSet[0])
            fileList = listInput

            if inputDirSet:
                for inputDir in inputDirSet:
                    listInput2 = readwrite.list_file_name_in_dir(inputDir)
                    fileList, diffSet = crossSet(fileList, listInput2)
                    tmpName = os.path.join(
                        dataListPath,
                        dataPart + os.path.basename(inputDir) + '.dif.lst')
                    readwrite.write_txt_list(diffSet, tmpName)

            if outputDirSet:
                for outputDir in outputDirSet:
                    listOutput2 = readwrite.list_file_name_in_dir(outputDir)
                    fileList, diffSet = crossSet(fileList, listOutput2)
                    tmpName = os.path.join(
                        dataListPath,
                        dataPart + os.path.basename(outputDir) + '.dif.lst')
                    readwrite.write_txt_list(diffSet, tmpName)

            # write the list of file names
            random.shuffle(fileList)
            fileListFilePath = dataListPath + os.path.sep + dataPart + '.lst'
            readwrite.write_txt_list(fileList, fileListFilePath)
        else:
            fileListFilePath = dataListPath + os.path.sep + dataPart + '.lst'
            os.system("cp %s %s" % (dataList, fileListFilePath))
            fileList = readwrite.read_txt_list(fileListFilePath)

        # before starting, run a simple check on the configured feature dimensions
        frameNum = None
        for inputDir, featDim, featName in zip(inputDirSet, cfg.inputDim,
                                               cfg.inputExt):
            inputFile = os.path.join(inputDir, fileList[0]) + '.' + featName
            if os.path.isfile(inputFile):
                tmpframeNum = readwrite.read_raw_mat(inputFile, featDim).shape[0]
                if frameNum is None:
                    frameNum = tmpframeNum
                elif np.abs(frameNum - tmpframeNum) * 1.0 / frameNum > 0.1:
                    display.self_print("Large mismatch of frame numbers %s" %
                                       (fileList[0]))
                    display.self_print(
                        "Please check whether inputDim are correct", 'error')
                    display.self_print("Or check input features are corrupted",
                                       'error')
                    raise Exception("Error: mismatch of frame numbers")

        for outputDir, featDim, featName in zip(outputDirSet, cfg.outputDim,
                                                cfg.outputExt):
            outputFile = os.path.join(outputDir, fileList[0]) + '.' + featName
            if os.path.isfile(outputFile):
                tmpframeNum = readwrite.read_raw_mat(outputFile, featDim).shape[0]
                if np.abs(frameNum - tmpframeNum) * 1.0 / frameNum > 0.1:
                    display.self_print("Large mismatch of frame numbers %s" %
                                       (fileList[0]))
                    display.self_print(
                        "Please check whether inputDim are correct", 'error')
                    display.self_print("Or check input features are corrupted",
                                       'error')
                    raise Exception("Error: mismatch of frame numbers")

        # create file directories
        dataSaveDir = dataDir + os.path.sep + dataPart
        try:
            os.mkdir(dataSaveDir)
        except OSError:
            pass

        inputScpList = []
        outputScpList = []

        # create the fileName + fileExt lists
        # create symbolic link
        for inputDir, featDim, featName in zip(inputDirSet, cfg.inputDim,
                                               cfg.inputExt):
            tmpFileScp = dataSaveDir + os.path.sep + featName + '.scp'
            inputScpList.append(tmpFileScp)
            filePtr = open(tmpFileScp, 'w')
            for fileName in fileList:
                # write full path to the feature
                filePtr.write('%s%s%s.%s\n' % (inputDir, os.path.sep, fileName,
                                               featName))
                if cfg.step01Prepare_LINK is True and flagFileUseSymbolLink:
                    os.system("ln -f -s %s%s%s.%s %s%s%s.%s" % \
                              (inputDir, os.path.sep, fileName, featName,
                               dataLinkDirInput, os.path.sep, fileName, featName))
            filePtr.close()

        for outputDir, featDim, featName in zip(outputDirSet, cfg.outputDim,
                                                cfg.outputExt):
            tmpFileScp = dataSaveDir + os.path.sep + featName + '.scp'
            outputScpList.append(tmpFileScp)
            filePtr = open(tmpFileScp, 'w')
            for fileName in fileList:
                filePtr.write('%s%s%s.%s\n' % (outputDir, os.path.sep, fileName,
                                               featName))
                if cfg.step01Prepare_LINK is True and flagFileUseSymbolLink:
                    os.system("ln -f -s %s%s%s.%s %s%s%s.%s" % \
                              (outputDir, os.path.sep, fileName, featName,
                               dataLinkDirOutput, os.path.sep, fileName, featName))
            filePtr.close()

        # create index file list
        filePtr = open(dataSaveDir + os.path.sep + cfg.idxFileName + '.scp', 'w')
        for fileName in fileList:
            filePtr.write('%s%s%s.%s\n' % (dataRawDir, os.path.sep, fileName,
                                           cfg.idxFileName))
        filePtr.close()

        # create index files
        if cfg.step01Prepare_IDX is True or cfg.step01Prepare_PACK is True:
            # create the lab index lists
            cmd = 'python %s/dataPrepare/getLabIdx5ms.py' % (
                cfg.path_pyTools_scripts)
            cmd = '%s %s %s %s %s %s %s' % (
                cmd, inputDirSet[0], cfg.inputExt[0], cfg.inputDim[0],
                dataRawDir, cfg.idxFileName, fileListFilePath)
            display.self_print('Creating time index files', 'highlight')
            exe_cmd(cmd, cfg.debug)
        else:
            display.self_print('skip creating time index', 'highlight')

        # package the data
        if cfg.step01Prepare_IDX is True or cfg.step01Prepare_PACK is True:
            # write data_config.py
            writeDataConfig(dataSaveDir + os.path.sep + 'data_config.py',
                            cfg.idxFileName + '.scp', cfg.fileNumInEachNCPack)
            # pack data
            packDataCmd = 'sh %s/sub_05_package_datanc.sh %s %s' % (
                cfg.path_scripts, dataSaveDir, cfg.path_pyTools_scripts)
            display.self_print('Packing data', 'highlight')
            exe_cmd(packDataCmd, cfg.debug)
        else:
            display.self_print('skip packing data', 'highlight')
    # collect the *.scp lists and calculate the data mean/std
    for inputDirSet, outputDirSet, dataPart in zip(cfg.inputDirs,
                                                   cfg.outputDirs,
                                                   cfg.dataDivision):
        dataSaveDir = dataDir + os.path.sep + dataPart

        inputScpList = []
        outputScpList = []

        for inputDir, featDim, featName in zip(inputDirSet, cfg.inputDim,
                                               cfg.inputExt):
            inputScpList.append(dataSaveDir + os.path.sep + featName + '.scp')

        for outputDir, featDim, featName in zip(outputDirSet, cfg.outputDim,
                                                cfg.outputExt):
            outputScpList.append(dataSaveDir + os.path.sep + featName + '.scp')

        # calculate mean and std
        if dataPart == cfg.computMeanStdOn and cfg.step01Prepare_MV is True:
            display.self_print('Calculating mean and std', 'highlight')

            meanStdTool.meanStdNormMask(
                inputScpList, cfg.inputDim, cfg.inputNormMask,
                dataSaveDir + os.path.sep + cfg.nnDataInputMV)
            display.self_print(
                "\nSave input mean-std as %s" %
                (os.path.join(dataSaveDir, cfg.nnDataInputMV)), 'highlight')

            meanStdTool.meanStdNormMask(
                outputScpList, cfg.outputDim, cfg.outputNormMask,
                dataSaveDir + os.path.sep + cfg.nnDataOutputMV)
            display.self_print(
                "\nSave output mean-std as %s" %
                (os.path.join(dataSaveDir, cfg.nnDataOutputMV)), 'highlight')
        else:
            display.self_print('skip calculating mean and std', 'highlight')
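prepareData() is driven entirely by the cfg module; the sketch below lists only the attributes referenced above, and every value is a placeholder rather than a toolkit default.

# Hypothetical config.py sketch for prepareData(); names come from the code above,
# all values are illustrative placeholders.
nnDataDirName = 'DATATEMP'
idxDirName = 'idxFiles'
linkDirname_input = 'link_input'
linkDirname_output = 'link_output'
dataDivision = ['train', 'val']
dataLists = [None, None]                       # None -> build the list by scanning the directories
inputDirs = [['/path/to/lab'], ['/path/to/lab']]
outputDirs = [['/path/to/mgc'], ['/path/to/mgc']]
inputExt = ['lab']
inputDim = [382]
inputNormMask = [[]]                           # normalization masks (illustrative)
outputExt = ['mgc']
outputDim = [60]
outputNormMask = [[]]
idxFileName = 'idx'
fileNumInEachNCPack = 2000
computMeanStdOn = 'train'
nnDataInputMV = 'input_mean_std.bin'
nnDataOutputMV = 'output_mean_std.bin'
step01Prepare_LINK = True
step01Prepare_IDX = True
step01Prepare_PACK = True
step01Prepare_MV = True
path_scripts = '/path/to/SCRIPTS'
path_pyTools_scripts = '/path/to/pyTools/scripts'
debug = False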
#!/usr/bin/python
import os
import sys

import numpy as np
from scipy.io import wavfile

from speechTools import wavTool
from ioTools import readwrite as py_rw

dirPath = sys.argv[1]
quantiBitNum = int(sys.argv[2])
samplingRate = int(sys.argv[3])

fileList = py_rw.read_txt_list(dirPath + '/gen.scp')
for fileName in fileList:
    fileName = fileName.rstrip('\n')
    # derive output names from the base name without its extension
    nameBase = os.path.splitext(os.path.basename(fileName))[0]
    nameHtk = dirPath + os.path.sep + nameBase + '.htk'
    nameRaw = dirPath + os.path.sep + nameBase + '.raw'
    nameWav = dirPath + os.path.sep + nameBase + '.wav'
    print(nameRaw, nameWav)

    data = py_rw.read_htk(nameHtk, 'f4', 'b')
    if quantiBitNum > 0:
        quantiLevel = np.power(2, quantiBitNum) - 1
        py_rw.write_raw_mat(data, nameRaw)
        wavTool.raw2wav(nameRaw, nameWav, quantiLevel,
                        samplingRate=samplingRate)
    else:
        wavfile.write(nameWav, samplingRate, data)
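# Hypothetical invocation (the script and directory names are placeholders):
#   python convert_htk_to_wav.py /path/to/gen_dir 16 16000
# The script reads <dir>/gen.scp, converts each listed *.htk file to *.raw and *.wav
# via wavTool when a quantization bit number is given, and writes the data directly
# with scipy's wavfile otherwise.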
#!/usr/bin/python
import os
import sys

from ioTools import readwrite


def common_list(list1, list2):
    return list(set(list1).intersection(list2))


if __name__ == "__main__":
    list1 = readwrite.read_txt_list(sys.argv[1])
    list2 = readwrite.read_txt_list(sys.argv[2])
    file_common_list = common_list(list1, list2)

    if len(sys.argv) > 3 and sys.argv[3] == 'part':
        # only print the first few common entries and the total count
        idx = 0
        for file_name in file_common_list:
            print(file_name)
            idx = idx + 1
            if idx > 10:
                break
        print("%d lines" % (len(file_common_list)))
    else:
        file_common_list.sort()
        for file_name in file_common_list:
            print(file_name)
labdir = "/home/smg/takaki/FEAT/F009/data/ver01/full" labout = "/home/smg/wang/DATA/speech/F009A/nndata/labels/full_align/test_set" prefix = "ATR_Ximera_F009A_" resolu = 50000 ncData = io.netcdf_file(ncFile, 'r') sentNm = ncData.dimensions['numSeqs'] sentNa = ncData.variables['seqTags'][:].copy() sentTi = ncData.variables['seqLengths'][:].copy() start = 0 for id, sentId in enumerate(sentNa): sentId = ''.join(sentId) labinpfile = labdir+os.path.sep+sentId+'.lab' laboutfile = labout+os.path.sep+sentId+'.lab' labentrys = py_rw.read_txt_list(labinpfile) stime, etime = start, start+sentTi[id] data = ncData.variables['inputs'][stime:etime, 0:-3].copy() data = (data*data).sum(axis=1) difd = np.diff(data) indx = np.concatenate((np.array([0]), np.argwhere(difd).flatten(), np.array([etime]))) if len(indx)==len(labentrys)+1: temp = '' for x in xrange(len(labentrys)): st = indx[x]*resolu et = indx[x+1]*resolu lab = labentrys[x].split() temp += "%d %d %s\n" % (st, et, lab[2]) fil = open(laboutfile, 'w') fil.write(temp[0:-1]) fil.close()
        if syllabel != preSyl:
            dataMat[frameStart] = np.bitwise_or(dataMat[frameStart],
                                                bitInfo['syl'])
        if worlabel != preWor:
            dataMat[frameStart] = np.bitwise_or(dataMat[frameStart],
                                                bitInfo['wor'])
        if len(preWor) == 0 or preWor == phraseSym or worlabel == phraseSym:
            dataMat[frameStart] = np.bitwise_or(dataMat[frameStart],
                                                bitInfo['phr'])
        preSyl = syllabel
        preWor = worlabel

        if CheckBinary:
            pholabel = phodata[2][idx1]
            for t in range(int(frameStart), int(frameEnd)):
                print("%d, %s [%s %s %s]" %
                      (t, np.binary_repr(dataMat[t], len(bitInfo)),
                       pholabel[0:6], syllabel[0:6], worlabel[0:6]))

    py_rw.write_raw_mat(dataMat, DataDir + os.path.sep + DataFile + '.bin',
                        'u1')
    return DataTime


DataFiles = py_rw.read_txt_list(DataList)
frameNum = 0
for idx1, DataFile in enumerate(DataFiles):
    print("Process %s (%d / %d)" % (DataFile, idx1 + 1, len(DataFiles)))
    frameNum = frameNum + CreateTimeMatrix(DataFile)
print("Total %d frames" % (frameNum))