def PrepareGoogleSpeechCmd(version=2, forceDownload=False, task='20cmd'):
    """
    Build file lists and label maps for the Google Speech Commands dataset.

    Args:
        version: dataset release to prepare, 1 or 2.
        forceDownload: passed through to the version-specific download helper.
        task: label scheme, one of '12cmd', 'leftright', '35word' or '20cmd'.

    Returns:
        A tuple ``(gscInfo, numGSCmdV2Categs)``. ``gscInfo`` has the keys
        'train', 'val', 'test' and 'testREAL'; each value is a dict with
        'files' (list of .npy paths) and 'labels' (path -> category index).
        ``numGSCmdV2Categs`` is the number of categories of the chosen task.

    Raises:
        Exception: if ``task`` or ``version`` is not one of the accepted values.
    """
    allowedTasks = ['12cmd', 'leftright', '35word', '20cmd']
    if task not in allowedTasks:
        raise Exception('Task must be one of: {}'.format(allowedTasks))

    if version == 2:
        _DownloadGoogleSpeechCmdV2(forceDownload)
        basePath = 'sd_GSCmdV2'
    elif version == 1:
        _DownloadGoogleSpeechCmdV1(forceDownload)
        basePath = 'sd_GSCmdV1'
    else:
        raise Exception('Version must be 1 or 2')

    # Labels of the '20cmd' task. Unknown and silence share index 0 here and
    # index 1 is assigned to 'nine'.
    twentyCmdLabels = {
        'unknown': 0,
        'silence': 0,
        '_unknown_': 0,
        '_silence_': 0,
        '_background_noise_': 0,
        'yes': 2,
        'no': 3,
        'up': 4,
        'down': 5,
        'left': 6,
        'right': 7,
        'on': 8,
        'off': 9,
        'stop': 10,
        'go': 11,
        'zero': 12,
        'one': 13,
        'two': 14,
        'three': 15,
        'four': 16,
        'five': 17,
        'six': 18,
        'seven': 19,
        'eight': 20,
        'nine': 1,
    }
    # task name -> (label map, number of categories)
    taskTable = {
        '12cmd': ({
            'unknown': 0,
            'silence': 1,
            '_unknown_': 0,
            '_silence_': 1,
            '_background_noise_': 1,
            'yes': 2,
            'no': 3,
            'up': 4,
            'down': 5,
            'left': 6,
            'right': 7,
            'on': 8,
            'off': 9,
            'stop': 10,
            'go': 11,
        }, 12),
        'leftright': ({
            'unknown': 0,
            'silence': 0,
            '_unknown_': 0,
            '_silence_': 0,
            '_background_noise_': 0,
            'left': 1,
            'right': 2,
        }, 3),
        # '35word' is the '20cmd' map followed by fifteen extra words.
        '35word': ({
            **twentyCmdLabels,
            'backward': 21,
            'bed': 22,
            'bird': 23,
            'cat': 24,
            'dog': 25,
            'follow': 26,
            'forward': 27,
            'happy': 28,
            'house': 29,
            'learn': 30,
            'marvin': 31,
            'sheila': 32,
            'tree': 33,
            'visual': 34,
            'wow': 35,
        }, 36),
        '20cmd': (twentyCmdLabels, 21),
    }
    GSCmdV2Categs, numGSCmdV2Categs = taskTable[task]

    testDir = basePath + '/test/'
    trainDir = basePath + '/train/'

    print('Converting test set WAVs to numpy files')
    audioUtils.WAV2Numpy(testDir)
    print('Converting training set WAVs to numpy files')
    audioUtils.WAV2Numpy(trainDir)

    def _asNpyPaths(names):
        # Entries of the split lists are relative .wav names; map them to the
        # converted .npy files under the train folder.
        return [
            os.path.join(trainDir, name + '.npy') for name in names
            if name.endswith('.wav')
        ]

    def _walkNpy(top):
        # Every converted file (*.wav.npy) found anywhere below ``top``.
        found = []
        for root, _dirs, files in os.walk(top):
            found += [
                root + '/' + name for name in files
                if name.endswith('.wav.npy')
            ]
        return found

    def _labelsFor(paths):
        return [_getFileCategory(path, GSCmdV2Categs) for path in paths]

    # The official test/validation splits are listed in text files that sit
    # inside the train folder.
    testNames = pd.read_csv(trainDir + 'testing_list.txt',
                            sep=" ",
                            header=None)[0].tolist()
    valNames = pd.read_csv(trainDir + 'validation_list.txt',
                           sep=" ",
                           header=None)[0].tolist()
    testWAVs = _asNpyPaths(testNames)
    valWAVs = _asNpyPaths(valNames)

    # Training files are whatever is left once both splits are removed.
    allWAVs = _walkNpy(trainDir)
    trainWAVs = list(set(allWAVs) - set(valWAVs) - set(testWAVs))

    # Files from the separate test folder.
    testWAVsREAL = _walkNpy(testDir)

    testWAVlabels = _labelsFor(testWAVs)
    valWAVlabels = _labelsFor(valWAVs)
    trainWAVlabels = _labelsFor(trainWAVs)
    testWAVREALlabels = _labelsFor(testWAVsREAL)

    # Training files labelled as silence are reused for validation, but only
    # when the task has 12 categories.
    silenceLabel = GSCmdV2Categs['silence']
    backNoiseFiles = [
        path for path, label in zip(trainWAVs, trainWAVlabels)
        if label == silenceLabel
    ]
    backNoiseCats = [silenceLabel] * len(backNoiseFiles)
    if numGSCmdV2Categs == 12:
        valWAVs += backNoiseFiles
        valWAVlabels += backNoiseCats

    # Silence samples are few. A tweak that is currently disabled appended
    # backNoiseFiles to trainWAVs 200 times so they would be reused; the label
    # dict would not need any change for that.
    gscInfo = {
        'train': {
            'files': trainWAVs,
            'labels': dict(zip(trainWAVs, trainWAVlabels)),
        },
        'test': {
            'files': testWAVs,
            'labels': dict(zip(testWAVs, testWAVlabels)),
        },
        'val': {
            'files': valWAVs,
            'labels': dict(zip(valWAVs, valWAVlabels)),
        },
        'testREAL': {
            'files': testWAVsREAL,
            'labels': dict(zip(testWAVsREAL, testWAVREALlabels)),
        },
    }

    print('Done preparing Google Speech commands dataset version {}'.format(
        version))

    return gscInfo, numGSCmdV2Categs
def _listNpyFiles(folder):
    """Return the paths of the ``*.wav.npy`` files directly inside ``folder``."""
    return [
        os.path.join(folder, f) for f in os.listdir(folder)
        if f.endswith('.wav.npy')
    ]


def _collectNpyFiles(splitDir, skipFolders=()):
    """Gather the converted audio files of every sub-folder of ``splitDir``.

    A sub-folder that already holds ``.wav.npy`` files is taken as converted
    and used as is; otherwise its WAVs are converted first and the folder is
    listed again. Entries that are not directories, or whose name is in
    ``skipFolders``, are ignored.
    """
    collected = []
    for folder in os.listdir(splitDir):
        if folder in skipFolders:
            continue
        root = splitDir + '/' + folder
        if not os.path.isdir(root):
            continue
        npyFiles = _listNpyFiles(root)
        if not npyFiles:
            # Nothing converted yet in this folder: convert, then list again.
            audioUtils.WAV2Numpy(root)
            npyFiles = _listNpyFiles(root)
        collected.extend(npyFiles)
    return collected


def _segLabelFiles(npyFiles):
    """Map each audio path to the ``seg.txt`` label file beside it.

    The label file name is the audio file name up to the first occurrence of
    "utterance", followed by "seg.txt".
    """
    return [
        os.path.join(os.path.dirname(f),
                     os.path.basename(f).split("utterance")[0] + "seg.txt")
        for f in npyFiles
    ]


def prepareKeyword(basePath):
    """Prepare the train/dev/test file lists and labels of the keyword data set.

    Args:
        basePath: folder that contains the 'train', 'test' and 'dev'
            sub-folders, each holding one folder of recordings per entry.

    Returns:
        A tuple ``(gscInfo, numCategs, keywords79)``. ``gscInfo`` has the keys
        'train', 'test' and 'val' (built from the 'dev' folder); each value is
        a dict with 'files' (list of .wav.npy paths) and 'labels'
        (path -> encoded label). ``numCategs`` is the count returned by
        ``getCategs()`` plus one, and ``keywords79`` is passed through from
        ``getCategs()``.
    """
    numKeyWordCategs, keywords79 = getCategs()

    # NOTE(review): bad_audio_fn is defined elsewhere; presumably it drops
    # unusable recordings from the list it receives - confirm. It is handed a
    # deep copy, so the scanned list itself cannot be modified by the call.
    print('Converting train set WAVs to numpy files')
    trainWAVs = bad_audio_fn(deepcopy(_collectNpyFiles(basePath + '/train')))

    print('Converting test set WAVs to numpy files')
    # The test folder named "0800" is deliberately left out.
    testWAVs = bad_audio_fn(
        deepcopy(_collectNpyFiles(basePath + '/test', skipFolders=("0800", ))))

    print('Converting dev set WAVs to numpy files')
    devWAVs = bad_audio_fn(deepcopy(_collectNpyFiles(basePath + '/dev')))

    # Labels are encoded from the seg.txt file that accompanies each recording.
    trainLabels = encodeLabel(_segLabelFiles(trainWAVs), keywords79)
    testLabels = encodeLabel(_segLabelFiles(testWAVs), keywords79)
    devLabels = encodeLabel(_segLabelFiles(devWAVs), keywords79)

    gscInfo = {
        'train': {
            'files': trainWAVs,
            'labels': dict(zip(trainWAVs, trainLabels)),
        },
        'test': {
            'files': testWAVs,
            'labels': dict(zip(testWAVs, testLabels)),
        },
        'val': {
            'files': devWAVs,
            'labels': dict(zip(devWAVs, devLabels)),
        },
    }

    print("Done Keywords data set prepare.")

    # NOTE(review): the "+ 1" presumably reserves one extra class on top of
    # the keyword categories - confirm against getCategs/encodeLabel.
    return gscInfo, numKeyWordCategs + 1, keywords79
def convertWAV2Numpy():
    """Convert the WAV files of the test and train folders to numpy files.

    NOTE(review): ``basePath`` is neither a parameter nor a local of this
    function, so it has to be provided at module level - verify that it is
    defined before this function is called.
    """
    conversions = (
        ('Converting test set WAVs to numpy files', '/test/'),
        ('Converting training set WAVs to numpy files', '/train/'),
    )
    for message, subFolder in conversions:
        print(message)
        audioUtils.WAV2Numpy(basePath + subFolder)
def PrepareGoogleSpeechCmd(version, forceDownload=False, task='35word'):
    """
    Prepare the Google Speech Commands V2 data for the '35word' task.

    NOTE(review): the file shown also contains an earlier function with this
    same name; being defined later, this one is the binding that remains after
    import - confirm that this is intended.

    Args:
        version: only used in the final log message; the V2 data set is
            always the one downloaded and read.
        forceDownload: passed through to the V2 download helper.
        task: label scheme; only '35word' is supported.

    Returns:
        A tuple ``(gscInfo, numGSCmdV2Categs)``. ``gscInfo`` has the keys
        'train', 'val', 'test' and 'testREAL'; each value is a dict with
        'files' (list of .npy paths) and 'labels' (path -> category index).
        ``numGSCmdV2Categs`` is 36.

    Raises:
        ValueError: if ``task`` is not '35word'. The check runs before any
            download or conversion work is started.
    """
    supportedTasks = ['35word']
    if task not in supportedTasks:
        raise ValueError('Task must be one of: {}'.format(supportedTasks))

    _DownloadGoogleSpeechCmdV2(forceDownload)
    basePath = 'sd_GSCmdV2'
    trainDir = basePath + '/train/'
    testDir = basePath + '/test/'

    # Categories of task '35word'. Unknown and silence share index 0 and
    # index 1 is assigned to 'nine'.
    GSCmdV2Categs = {
        'unknown': 0,
        'silence': 0,
        '_unknown_': 0,
        '_silence_': 0,
        '_background_noise_': 0,
        'yes': 2,
        'no': 3,
        'up': 4,
        'down': 5,
        'left': 6,
        'right': 7,
        'on': 8,
        'off': 9,
        'stop': 10,
        'go': 11,
        'zero': 12,
        'one': 13,
        'two': 14,
        'three': 15,
        'four': 16,
        'five': 17,
        'six': 18,
        'seven': 19,
        'eight': 20,
        'nine': 1,
        'backward': 21,
        'bed': 22,
        'bird': 23,
        'cat': 24,
        'dog': 25,
        'follow': 26,
        'forward': 27,
        'happy': 28,
        'house': 29,
        'learn': 30,
        'marvin': 31,
        'sheila': 32,
        'tree': 33,
        'visual': 34,
        'wow': 35,
    }
    numGSCmdV2Categs = 36

    # NOTE(review): WAV2Numpy's return value is treated as either None or an
    # iterable of extra (augmented) file names - confirm in audioUtils.
    print(
        'Converting test WAVs to numpy files, Data augmentation for test set')
    test_aug = audioUtils.WAV2Numpy(testDir)
    print(
        'Converting training set WAVs to numpy files, Data augmentation for train set'
    )
    train_aug = audioUtils.WAV2Numpy(trainDir)

    # The official test/validation splits are listed in text files that sit
    # inside the train folder.
    testing = pd.read_csv(trainDir + 'testing_list.txt',
                          sep=" ",
                          header=None)[0].tolist()
    validation = pd.read_csv(trainDir + 'validation_list.txt',
                             sep=" ",
                             header=None)[0].tolist()
    testing = [
        os.path.join(trainDir, f + '.npy') for f in testing
        if f.endswith('.wav')
    ]
    validation = [
        os.path.join(trainDir, f + '.npy') for f in validation
        if f.endswith('.wav')
    ]
    if test_aug is not None:
        testing.extend(test_aug)  # add augmented test file names

    allWAVs = []
    for root, _dirs, files in os.walk(trainDir):
        allWAVs += [root + '/' + f for f in files if f.endswith('.wav.npy')]
    if train_aug is not None:
        allWAVs.extend(train_aug)  # add augmented train file names

    # Training files are whatever is left once both splits are removed.
    # Sorted so the list order is the same on every run (set iteration order
    # is not); key=str keeps the sort valid whatever path type the entries
    # have.
    training = sorted(set(allWAVs) - set(validation) - set(testing), key=str)

    testWAVsREAL = []
    for root, _dirs, files in os.walk(testDir):
        testWAVsREAL += [
            root + '/' + f for f in files if f.endswith('.wav.npy')
        ]

    # get categories
    testing_label = [_getFileCategory(f, GSCmdV2Categs) for f in testing]
    validation_label = [_getFileCategory(f, GSCmdV2Categs) for f in validation]
    training_label = [_getFileCategory(f, GSCmdV2Categs) for f in training]
    testWAVREALlabels = [
        _getFileCategory(f, GSCmdV2Categs) for f in testWAVsREAL
    ]

    # No background-noise files are appended to the validation list here:
    # that step belongs to a 12-category task, and this function always
    # works with 36 categories.
    gscInfo = {
        'train': {
            'files': training,
            'labels': dict(zip(training, training_label)),
        },
        'test': {
            'files': testing,
            'labels': dict(zip(testing, testing_label)),
        },
        'val': {
            'files': validation,
            'labels': dict(zip(validation, validation_label)),
        },
        'testREAL': {
            'files': testWAVsREAL,
            'labels': dict(zip(testWAVsREAL, testWAVREALlabels)),
        },
    }

    print('Done preparing Google Speech commands dataset version {}'.format(
        version))

    return gscInfo, numGSCmdV2Categs