def writeOutForRegression(self):
    dataset = DataReader(self.currentDataPath, onlineFeat=self.onlineFeat,
                         resampleTarget=self.resampleTarget)
    dataset.setDatasetFeatOnly(self.experiment.testOn, self.experiment.data["feature"])
    if self.experiment.data["featModelPath"] != "" and self.onlineFeat:
        dataset.getModelFeat(self.experiment.data["featModelPath"],
                             normalised=self.experiment.data["featModelNorm"],
                             maxDur=self.experiment.data["featModelMaxDur"],
                             cuda=self.cuda)
        print("fairseq model for writing model outputs loaded")
    if self.testRun:
        dataset.keepOneOnly()
    dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

    wrapper = self.getWrapper(getBest=True)
    modelOutPath = os.path.join(wrapper.savePath, "outputs")
    if not os.path.exists(modelOutPath):
        os.makedirs(modelOutPath)

    # Write one CSV of model outputs per file ID
    for idx, (ID, feat) in enumerate(dataloader):
        output = wrapper.forwardModel(feat)
        output = output.detach().cpu().numpy()
        # print(ID, feat.shape, output.shape)
        savePath = os.path.join(modelOutPath, ID[0] + ".csv")
        headers = ["output_" + str(i) for i in range(output.shape[2])]
        df = pd.DataFrame(output[0], columns=headers)
        df.to_csv(savePath, index=False)
        printProgressBar(idx + 1, len(dataloader),
                         prefix='Writing outputs:', suffix='', length="fit")

def main(mainPath, partitions):
    """
    Make the json file given only the wav files of a dataset.

    Example
    ----------
    python makeJson.py -m "/mnt/HD-Storage/Datasets/ESTER1" -p "train/*.wav" "dev/*.wav" "test/*.wav"
    python makeJson.py -m "/mnt/HD-Storage/Datasets/RAVDESS"
    python makeJson.py -m "/mnt/HD-Storage/Datasets/SEMAINE"
    python makeJson.py -m "/mnt/HD-Storage/Datasets/IEMOCAP"
    python makeJson.py -m "/mnt/HD-Storage/Datasets/MaSS_Fr"
    """
    jsonPath = os.path.join(mainPath, "data.json")
    wavsPath = os.path.join(mainPath, "Wavs")

    if partitions == []:
        trainPath = os.path.join(wavsPath, "**", "*.wav")
    else:
        trainPath = os.path.join(wavsPath, partitions[0])
    print("trainPath", trainPath)
    trainFiles = glob.glob(trainPath, recursive=True)
    print("trainFiles", trainFiles)

    if partitions == []:
        devFiles, testFiles = [], []
    else:
        devPath = os.path.join(wavsPath, partitions[1])
        devFiles = glob.glob(devPath, recursive=True)
        testPath = os.path.join(wavsPath, partitions[2])
        testFiles = glob.glob(testPath, recursive=True)

    allDics = {}
    for f, filesPaths in enumerate([trainFiles, devFiles, testFiles]):
        partition = ""
        if f == 0: partition = "train"
        if f == 1: partition = "dev"
        if f == 2: partition = "test"
        for i, filePath in enumerate(filesPaths):
            baseName = os.path.basename(filePath)[:-4]
            fileDict = AudioSample()
            fileDict.setParams(baseName, filePath, partition)
            # fileDict.features = getFeatures(main_path, baseName)
            dic = classToDic(fileDict.__dict__)
            # dic = changePaths(dic, main_path, ".")
            dic = localizePaths(dic, mainPath)
            allDics[baseName] = dic
            printProgressBar(i + 1, len(filesPaths),
                             prefix='Processing Files for ' + partition + ":",
                             suffix='Complete')

    with open(jsonPath, 'w') as fp:
        json.dump(allDics, fp, indent=4, ensure_ascii=False)

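# Hypothetical CLI wiring for the main() above, shown only as a minimal sketch.
# The repository's actual argument parser (flag names, help strings, defaults)
# may differ; the docstring examples only suggest -m for the dataset root and
# an optional -p list of glob patterns.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Build data.json from the wav files of a dataset.")
    parser.add_argument("-m", "--mainPath", required=True,
                        help="dataset root containing a 'Wavs' folder")
    parser.add_argument("-p", "--partitions", nargs="*", default=[],
                        help="optional glob patterns for train/dev/test wavs")
    args = parser.parse_args()
    main(args.mainPath, args.partitions)
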
def testClassification(self):
    dataset = DataReader(self.currentDataPath, onlineFeat=self.onlineFeat,
                         resampleTarget=self.resampleTarget)
    dataset.setDatasetClassic(self.experiment.testOn,
                              self.experiment.data["feature"],
                              self.experiment.data["annotation"])
    if self.experiment.data["featModelPath"] != "" and self.onlineFeat:
        dataset.getModelFeat(self.experiment.data["featModelPath"],
                             normalised=self.experiment.data["featModelNorm"],
                             maxDur=self.experiment.data["featModelMaxDur"],
                             cuda=self.cuda)
        print("fairseq model for testing model outputs loaded")

    inp, tar = dataset[0]
    self.experiment.inputDim = inp.shape[1]

    firstID1 = list(dataset.dataPart.keys())[0]
    firstID2 = list(dataset.dataPart[firstID1]["annotations"])[0]
    headers = dataset.dataPart[firstID1]["annotations"][firstID2]["headers"]
    # print(headers)

    wrapper = getWrapper(self.experiment, seed=self.seed, getBest=True)
    modelOutPath = os.path.join(wrapper.savePath, "outputs")
    savePath = os.path.join(modelOutPath, "outputs.csv")
    outputsCSV = pd.read_csv(savePath)

    if self.testRun:
        dataset.keepOneOnly()
    IDs = dataset.dataPart.keys()

    AllOuts = []
    AllTars = []
    for idx, ID in enumerate(IDs):
        outputs = outputsCSV[ID].to_numpy()
        targets = dataset.targetReader(ID)
        AllOuts.append(np.argmax(outputs))
        AllTars.append(targets[0, 0])
        # print(np.argmax(outputs), targets[0, 0])
        printProgressBar(idx + 1, len(IDs), prefix='Testing model :',
                         suffix='', length="fit")
        # if idx > 50: break

    target = np.array(AllTars)
    output = np.array(AllOuts)
    evaluation = {}
    for key in self.experiment.metrics:
        evaluation[key] = getMetric(target, output, metric=key)
    self.experiment.evaluation = evaluation

    confMat = confMatrix(target, output, numTars=self.experiment.outputDim)
    # print(confMatrix(target, output, numTars=experiment.outputDim))
    savePath = os.path.join(wrapper.savePath, "confMat.csv")
    np.savetxt(savePath, confMat, delimiter=",")
    return evaluation

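# Illustrative only: a confusion matrix equivalent to the one written above by
# the repository's confMatrix() helper can be computed with scikit-learn,
# assuming integer class labels 0..numTars-1. This is a sketch for reference,
# not the helper actually used here.
def confMatrixSklearnSketch(target, output, numTars):
    from sklearn.metrics import confusion_matrix
    # rows = true classes, columns = predicted classes
    return confusion_matrix(target, output, labels=list(range(numTars)))
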
def trainModel(self, datasetTrain, datasetDev, batchSize=1, maxEpoch=200,
               loadBefore=True, tolerance=15, minForTolerance=15,
               limitTrainData=False, limitDevData=False):
    if loadBefore:
        self.loadCheckpoint()
    trainDataloader = DataLoader(dataset=datasetTrain, batch_size=batchSize, shuffle=True)
    devDataloader = DataLoader(dataset=datasetDev, batch_size=batchSize, shuffle=False)

    while self.currentEpoch <= maxEpoch:
        if self.noMoreTrain:
            if self.printLvl > 0:
                print("Early stopping has been reached!")
            break
        if limitTrainData:
            datasetTrain.limitData(limitTrainData)
            trainDataloader = DataLoader(dataset=datasetTrain, batch_size=batchSize, shuffle=True)
        if limitDevData:
            datasetDev.limitData(limitDevData)
            devDataloader = DataLoader(dataset=datasetDev, batch_size=batchSize, shuffle=False)

        self.trainEpoch(trainDataloader)
        devLoss = self.evaluateModel(devDataloader)
        self.modelStates[self.currentEpoch] = copy.deepcopy(self.model.state_dict())
        self.epochDevLosses.append(devLoss)
        if self.printLvl > 1:
            printProgressBar(self.currentEpoch, maxEpoch, prefix='Training model:',
                             suffix='| epoch loss: ' + str(devLoss), length="fit")
        # print("loss", self.currentEpoch, devLoss)
        self.currentEpoch += 1

        # --- Early Stopping ---
        if (self.currentEpoch - self.getBestEpochIdx() >= tolerance) and self.currentEpoch > minForTolerance:
            self.noMoreTrain = True

        self.saveCheckpoint()
        self.saveLogToCSV()

    if self.printLvl > 0:
        print("Training the model has finished!")

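# Usage sketch (hypothetical): how a wrapper exposing trainModel() might be
# driven. The names wrapper/trainSet/devSet are illustrative and not the
# repository's actual training entry point; the defaults mirror the signature
# above (per-sample batches, early stopping after 15 epochs without a better
# dev loss).
def runTrainingSketch(wrapper, trainSet, devSet):
    wrapper.trainModel(trainSet, devSet, batchSize=1, maxEpoch=200,
                       loadBefore=True, tolerance=15, minForTolerance=15)
    return wrapper.epochDevLosses  # one dev loss per completed epoch
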
def main(inputPath, newPath, ignorePath, ext):
    """
    Transform all audio files under a folder into wav PCM 16 kHz, 16-bit, signed-integer.

    Example
    ----------
    python Preprocess.py --input "../../Data/Wavs" --output "../../Data/WavsProcessed"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/AlloSat_corpus/audio" --output "/mnt/HD-Storage/Datasets/AlloSat/Wavs"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/ESTER1/ESTER1_TRAIN/wav" --output "/mnt/HD-Storage/Datasets/ESTER1/Wavs/train"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/ESTER1/ESTER1_DEV/wav" --output "/mnt/HD-Storage/Datasets/ESTER1/Wavs/dev"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/ESTER1/ESTER1_TEST/wav" --output "/mnt/HD-Storage/Datasets/ESTER1/Wavs/test"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/RAVDESS_Audio_Speech_Actors_01-24" --output "/mnt/HD-Storage/Datasets/RAVDESS/Wavs"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/Noises4" --output "/mnt/HD-Storage/Datasets/NoiseFiles"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/SEMAINE/wav_data_original" --output "/mnt/HD-Storage/Datasets/SEMAINE/Wavs"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/IEMOCAP/IEMOCAP_full_release" --output "/mnt/HD-Storage/Datasets/IEMOCAP/Wavs" -e "avi"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/MaSS/output_waves" --output "/mnt/HD-Storage/Datasets/MaSS_Fr/Wavs"
    """
    path = os.path.join(inputPath, "**")  # e.g. "../PMDOM2FR/**/"
    theFiles = get_files_in_path(path, ext=ext)

    for i, filePath in enumerate(theFiles):
        # Build the output path, mirroring the input tree unless ignorePath is set
        fileNewPath = filePath.replace(inputPath, newPath)
        if ignorePath:
            fileNewPath = os.path.join(newPath, os.path.split(filePath)[-1])
        makeDirFor(fileNewPath)

        if ext == "avi":
            # Extract the audio track first, then convert it with sox
            os.system('ffmpeg -i ' + filePath + ' -y ' + "temp.wav")
            os.system('sox ' + "temp.wav" + ' -r 16000 -c 1 -b 16 -e signed-integer ' + fileNewPath[:-4] + ".wav")
            os.remove("temp.wav")
        else:
            os.system('sox ' + filePath + ' -r 16000 -c 1 -b 16 -e signed-integer ' + fileNewPath)
        printProgressBar(i + 1, len(theFiles), prefix='Transforming Files:', suffix='Complete')

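# Note on a design choice: os.system() above passes unquoted paths through the
# shell, which breaks on file names containing spaces. A safer variant (sketch
# only, assuming sox is on PATH; not what the script currently does) is to call
# subprocess with an argument list, reusing the same sox flags as main():
import subprocess

def convertWithSoxSketch(srcPath, dstPath):
    # resample to 16 kHz, mono, 16-bit signed-integer PCM
    subprocess.run(["sox", srcPath, "-r", "16000", "-c", "1",
                    "-b", "16", "-e", "signed-integer", dstPath], check=True)
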
def main(featsFolder, jsonPath):
    """
    Add mel-frequency filterbank features to a given dataset and write their reference to a json file.

    Example
    ----------
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/AlloSat/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/ESTER1/data.json"
    python MelFilterBank.py -f "MFB" -j "/home/getalp/alisamis/Datasets/ESTER1/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/RAVDESS/data.json"
    """
    samples = loadFromJson(jsonPath)
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID]
        wavePath = sample["path"]
        wavsFolder = wavePath.split(os.sep)[0]
        waveFullPath = os.path.join(os.path.split(jsonPath)[0], wavePath)

        featsLocalPath = wavePath.replace(wavsFolder, featsFolder).replace(".wav", ".csv")
        featsLocalPath = os.path.join("Feats", featsLocalPath)
        featsFullPath = os.path.join(os.path.split(jsonPath)[0], featsLocalPath)
        # print(featsLocalPath, featsFullPath)

        dim = makeFeatsCsv(waveFullPath, featsFullPath)
        if dim == 0:
            continue
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[ID]["features"][featsDict["ID"]] = featsDict
        # saveToJson(jsonPath, sample)
        printProgressBar(i + 1, len(samples),
                         prefix='Adding mel-frequency filterbank features:',
                         suffix='Complete', length="fit")

    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)

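# Illustrative only: makeFeatsCsv() is the repository's extractor and its exact
# parameters are not shown here. A minimal log-mel filterbank extraction with
# the same input/output contract (wav in, one CSV row per frame, returns the
# feature dimension) could look like the sketch below, assuming librosa and a
# 10 ms hop; the number of bands is an arbitrary example value.
import librosa
import numpy as np
import pandas as pd

def melFilterBankSketch(wavPath, csvPath, nMels=40):
    y, sr = librosa.load(wavPath, sr=None)  # keep the file's sample rate
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=nMels,
                                         hop_length=int(sr * 0.01))
    logMel = np.log(mel + 1e-6).T  # frames x nMels
    pd.DataFrame(logMel, columns=["mfb_" + str(i) for i in range(nMels)]).to_csv(csvPath, index=False)
    return nMels
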
def main(annotsList, headers, genres, jsonPath):
    """
    Add annotations to a given dataset and write their reference to a json file.

    Example
    ----------
    python addAnnots.py -a "gs_arousal_0.01_std" "gs_valence_0.01_std" "gen_gs_arousal_0.01_std" "gen_gs_valence_0.01_std" -d "GoldStandard" -g arousal valence arousal valence -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json"
    python addAnnots.py -a "VAD_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/ESTER1/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/SEMAINE/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/IEMOCAP/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data.json"
    """
    samples = loadFromJson(jsonPath)
    for t, annotName in enumerate(annotsList):
        trainFilePaths = []
        print("annot:", annotName)
        for i, ID in enumerate(samples.keys()):
            sample = samples[ID]
            wavePath = sample["path"]
            wavsFolder = wavePath.split(os.sep)[0]
            waveFullPath = os.path.join(os.path.split(jsonPath)[0], wavePath)

            featsLocalPath = wavePath.replace(wavsFolder, annotName).replace(".wav", ".csv")
            featsLocalPath = os.path.join("Annots", featsLocalPath)
            featsFullPath = os.path.join(os.path.split(jsonPath)[0], featsLocalPath)
            try:
                df = pd.read_csv(featsFullPath, delimiter=',')
                out = df[headers].to_numpy().astype('float64')
                dim = list(out.shape)
                annotsDict = getAnnotsDict(annotName, genres[t], dim, featsLocalPath, headers)
                samples[ID]["annotations"][annotsDict["ID"]] = annotsDict
            except Exception:
                print("Warning: could not read", featsFullPath)
            printProgressBar(i + 1, len(samples),
                             prefix='Adding ' + annotName + ' annotation',
                             suffix='Complete', length="fit")

    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)

def testRegression(self):
    dataset = DataReader(self.currentDataPath, onlineFeat=self.onlineFeat,
                         resampleTarget=self.resampleTarget)
    dataset.setDatasetAnnotOnly(self.experiment.testOn, self.experiment.data["annotation"])
    # if self.experiment.data["featModelPath"] != "" and self.onlineFeat:
    #     dataset.getModelFeat(self.experiment.data["featModelPath"], normalised=self.experiment.data["featModelNorm"], maxDur=self.experiment.data["featModelMaxDur"])
    #     print("fairseq model for testing model outputs loaded")

    inp, tar = dataset[0]
    # experiment.inputDim = inp.shape[1]
    # if not "classification" in experiment.genre:
    self.experiment.outputDim = tar.shape[1]

    firstID1 = list(dataset.dataPart.keys())[0]
    firstID2 = list(dataset.dataPart[firstID1]["annotations"])[0]
    headers = dataset.dataPart[firstID1]["annotations"][firstID2]["headers"]
    # print(headers)

    wrapper = self.getWrapper(getBest=True)
    modelOutPath = os.path.join(wrapper.savePath, "outputs")

    if self.testRun:
        dataset.keepOneOnly()
    IDs = dataset.dataPart.keys()

    # for key in self.experiment.metrics:
    evaluations = {}
    evaluation = {}
    allTars = []
    allOuts = []
    results = np.zeros((len(self.experiment.metrics), len(headers), len(IDs)))

    for idx, ID in enumerate(IDs):
        savePath = os.path.join(modelOutPath, ID + ".csv")
        outputs = pd.read_csv(savePath).to_numpy()
        targets = dataset.targetReader(ID)

        # Resample outputs to the target length for testing
        if self.resampleTarget:
            from Utils.Funcs import reshapeMatrix
            outputs = reshapeMatrix(outputs, len(targets))
        # print(targets.shape, outputs.shape)
        # if idx == 0:
        #     bestresult = 0; bestID = "0"
        #     for dim in range(targets.shape[1]): evaluation[headers[dim]] = {}

        for dim in range(targets.shape[1]):
            output = outputs[:, dim]
            target = targets[:, dim]
            # while target.shape[0] > output.shape[0]: output = np.append(output, outputs[-1])
            # while target.shape[0] < output.shape[0]: output = outputs[:target.shape[0]].reshape(target.shape[0])
            while target.shape[0] != output.shape[0]:
                output = outputs.reshape(target.shape[0])
            if self.testConcated:
                allTars += list(target)
                allOuts += list(output)
            for k, key in enumerate(self.experiment.metrics):
                result = getMetric(target, output, metric=key)
                results[k, dim, idx] = result
                # if result > bestresult: bestresult = result; bestID = ID
                # print(ID, result, len(output))
        printProgressBar(idx + 1, len(IDs), prefix='Testing model:',
                         suffix='', length="fit")

    for k, key in enumerate(self.experiment.metrics):
        for dim in range(targets.shape[1]):
            if self.testConcated:
                evaluation[headers[dim]] = getMetric(np.array(allTars), np.array(allOuts), metric=key)
                if key == "AUC":
                    # write fpr & tpr to plot ROCs
                    from sklearn import metrics
                    fpr, tpr, thresholds = metrics.roc_curve(np.array(allTars), np.array(allOuts))
                    fpr = reshapeMatrix(np.expand_dims(fpr, axis=1), 100)
                    tpr = reshapeMatrix(np.expand_dims(tpr, axis=1), 100)
                    savePath = os.path.join(wrapper.savePath,
                                            "ROC_resampled_" + str(dim) + "_" +
                                            os.path.split(self.currentDataPath)[-1] + ".csv")
                    np.savetxt(savePath, [np.squeeze(fpr), np.squeeze(tpr)], delimiter=",")
            else:
                evaluation[headers[dim]] = {}
                evaluation[headers[dim]]['mean'] = np.mean(results[k, dim])
                evaluation[headers[dim]]['std'] = np.std(results[k, dim])
        evaluations[key] = evaluation.copy()
        # print("evaluation", evaluation, key)
        # print("self.experiment.evaluation[key]", key, self.experiment.evaluation[key])
    return evaluations

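# One metric commonly used for continuous emotion regression of this kind is
# the concordance correlation coefficient (CCC). Whether CCC is among
# self.experiment.metrics depends on the experiment configuration; the sketch
# below is only a reference implementation of the standard formula, not the
# repository's getMetric().
import numpy as np

def cccSketch(target, output):
    tMean, oMean = np.mean(target), np.mean(output)
    cov = np.mean((target - tMean) * (output - oMean))
    return 2 * cov / (np.var(target) + np.var(output) + (tMean - oMean) ** 2)
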
def main(featsFolder, jsonPath, modelPath, maxDur, normalised, cuda):
    """
    Add wav2vec2 features to a given dataset and write their reference to a json file.

    Example
    ----------
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30_noNorm" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"
    python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_large.pt"
    python wav2vec2.py -f "FlowBERT_2952h_large_noNorm_cut30" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_large.pt"
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.98 -n True -j "/mnt/HD-Storage/Datasets/AlloSat/data.json" -m "/mnt/HD-Storage/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.99 -n False -j /home/getalp/alisamis/Datasets/AlloSat/data.json -m /home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_base/checkpoint_best.pt
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30_noNorm" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"

    not working:
    python wav2vec2.py -f "wav2vec2-large-xlsr-53-french" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/wav2vec2-large-xlsr-53-french.zip"

    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_base/checkpoint_best.pt"
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.98 -n True -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/alisamis/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_base/checkpoint_best.pt"
    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/mls_french_base/checkpoint_best.pt"
    python wav2vec2.py -f "libri960_big_cut30" -d 29.98 -n True -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/dinarelm/work/data/Exchance/wav2vec/models/libri960_big.pt"
    python wav2vec2.py -f "libri960_big_cut30" -d 29.98 -n True -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/dinarelm/work/data/Exchance/wav2vec/models/libri960_big.pt"
    python wav2vec2.py -f "mls_french_large_cut30" -d 29.98 -n True -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_large/checkpoint_best.pt" && \
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.98 -n False -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_base/checkpoint_best.pt" && \
    python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.98 -n True -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_large/checkpoint_best.pt" && \
    python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.98 -n True -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_large/checkpoint_best.pt" && \
    python wav2vec2.py -f "mls_french_large_cut30" -d 29.98 -n True -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_large/checkpoint_best.pt"
    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_base/checkpoint_best.pt"
    """
    # cp = torch.load(modelPath, map_location=torch.device('cpu'))
    # model = Wav2VecModel.build_model(cp['args'], task=None)
    # model.load_state_dict(cp['model'])
    # model.eval()
    model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([modelPath])
    model = model[0]
    if cuda:
        model = model.cuda()
    model.eval()

    samples = loadFromJson(jsonPath)
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID]
        wavePath = sample["path"]
        wavsFolder = wavePath.split(os.sep)[0]
        waveFullPath = os.path.join(os.path.split(jsonPath)[0], wavePath)

        featsLocalPath = wavePath.replace(wavsFolder, featsFolder).replace(".wav", ".csv")
        featsLocalPath = os.path.join("Feats", featsLocalPath)
        featsFullPath = os.path.join(os.path.split(jsonPath)[0], featsLocalPath)
        # print(featsLocalPath, featsFullPath)

        dim = makeFeatsCsv(waveFullPath, featsFullPath, model, maxDur, normalised, cuda=cuda)
        if dim == 0:
            continue
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[ID]["features"][featsDict["ID"]] = featsDict
        # saveToJson(jsonPath, sample)
        printProgressBar(i + 1, len(samples), prefix='Adding wav2vec features:',
                         suffix='Complete', length="fit")

    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)

def main(jsonPath, outJson, noiseFilesPaths, addWhite, SNRs, ignoreExisting):
    """
    Add noise to a given dataset and make a separate json file referencing the noisy files.

    Example
    ----------
    python addNoise.py -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    python addNoise.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    python addNoise.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    python addNoise.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    """
    datasetPath = os.path.split(jsonPath)[0]
    noisyFolder = "Wavs_Noisy"
    noisyWavsPath = os.path.join(datasetPath, noisyFolder)
    if not os.path.exists(noisyWavsPath):
        os.makedirs(noisyWavsPath)

    trainPath = os.path.join(noiseFilesPaths[0], "**", "*.wav")
    trainNoises = glob.glob(trainPath, recursive=True)
    devPath = os.path.join(noiseFilesPaths[1], "**", "*.wav")
    devNoises = glob.glob(devPath, recursive=True)
    testPath = os.path.join(noiseFilesPaths[2], "**", "*.wav")
    testNoises = glob.glob(testPath, recursive=True)

    samples = loadFromJson(jsonPath)
    newSamples = samples.copy()
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID].copy()
        wavePath = sample["path"]
        wavFullPath = os.path.join(datasetPath, wavePath)
        sample["features"] = {}  # avoid reading features extracted from the clean speech
        wavsFolder = wavePath.split(os.sep)[0]
        splits = wavePath.split(os.sep)
        fileName = splits[-1].replace(".wav", "")

        # Make noisy files and add them to the samples under a new ID (the new file name)
        noiseFiles = trainNoises
        if sample["partition"] == "dev":
            noiseFiles = devNoises
        if sample["partition"] == "test":
            noiseFiles = testNoises

        for snr in SNRs:
            for noiseFile in noiseFiles:
                outWavPath = noisyFolder
                for split in splits[1:-1]:
                    outWavPath = os.path.join(outWavPath, split)
                outWavName = fileName + '_snr' + str(snr) + '_' + noiseFile.split(os.sep)[-1]
                outWavPath = os.path.join(outWavPath, outWavName)
                outWavFullPath = os.path.join(datasetPath, outWavPath)
                if not (ignoreExisting and os.path.exists(outWavFullPath)):
                    addNoiseFile(wavFullPath, noiseFile, outWavFullPath, snr=snr)
                ID = outWavName.replace(".wav", "")
                newSample = sample.copy()
                newSample["path"] = outWavPath
                newSample["ID"] = ID
                newSamples[ID] = newSample

            if addWhite:
                outWavPath = noisyFolder
                for split in splits[1:-1]:
                    outWavPath = os.path.join(outWavPath, split)
                outWavName = fileName + '_snr' + str(snr) + '_whiteNoise.wav'
                outWavPath = os.path.join(outWavPath, outWavName)
                outWavFullPath = os.path.join(datasetPath, outWavPath)
                if not (ignoreExisting and os.path.exists(outWavFullPath)):
                    addWhiteNoise(wavFullPath, outWavFullPath, snr=snr)
                ID = outWavName.replace(".wav", "")
                newSample = sample.copy()
                newSample["path"] = outWavPath
                newSample["ID"] = ID
                newSamples[ID] = newSample

        printProgressBar(i + 1, len(samples), prefix='Making wav files noisy:',
                         suffix='Complete', length="fit")

    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile, indent=4, ensure_ascii=False)

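# The helpers addNoiseFile()/addWhiteNoise() used above are defined elsewhere
# in the repository; the sketch below only illustrates the standard way of
# mixing a noise signal at a requested SNR (in dB): scale the noise so that
# 10*log10(P_speech / P_noise) equals the target SNR. Assumes numpy arrays of
# equal length.
import numpy as np

def mixAtSnrSketch(speech, noise, snr):
    pSpeech = np.mean(speech ** 2)
    pNoise = np.mean(noise ** 2) + 1e-12  # avoid division by zero
    scale = np.sqrt(pSpeech / (pNoise * 10 ** (snr / 10.0)))
    return speech + scale * noise
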
def main(jsonPath, outJson, percentageReduction, keepTest, blackList, whiteList):
    """
    Reduce the data in a json file to a smaller subset of the original dataset.

    Example
    ----------
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_short_10.json" -p 10

    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts

    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts

    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts

    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts
    """
    reductionAmount = percentageReduction / 100
    dirname = os.path.dirname(outJson)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    samples = loadFromJson(jsonPath)
    trainSamples = {}
    devSamples = {}
    testSamples = {}
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID]
        if sample["partition"] == "train":
            trainSamples[sample["ID"]] = sample.copy()
        if sample["partition"] == "dev":
            devSamples[sample["ID"]] = sample.copy()
        if sample["partition"] == "test":
            testSamples[sample["ID"]] = sample.copy()

    if reductionAmount > 0:
        print("Performing data reduction based on percentage")
        # list() so random.sample accepts the dict keys on recent Python versions
        trainKeys = random.sample(list(trainSamples.keys()), int(reductionAmount * len(trainSamples.keys())))
        devKeys = random.sample(list(devSamples.keys()), int(reductionAmount * len(devSamples.keys())))
        testKeys = random.sample(list(testSamples.keys()), int(reductionAmount * len(testSamples.keys())))
        if keepTest:
            testKeys = testSamples.keys()
        newSamples = {}
        for keys in [trainKeys, devKeys, testKeys]:
            for ID in keys:
                sample = samples[ID]
                newSamples[ID] = sample.copy()
        print("Data reduction completed!")
    else:
        newSamples = samples.copy()

    for filterString in blackList:
        blackKeys = []
        for i, key in enumerate(newSamples.keys()):
            if filterString in key:
                blackKeys.append(key)
            printProgressBar(i + 1, len(newSamples.keys()),
                             prefix='Removing black-listed IDs :', suffix='', length="fit")
        [newSamples.pop(key) for key in blackKeys]

    if len(whiteList) > 0:
        blackKeys = []
        for i, key in enumerate(newSamples.keys()):
            flag = True
            for whiteString in whiteList:
                if whiteString in key:
                    flag = False
            if flag:
                blackKeys.append(key)
            printProgressBar(i + 1, len(newSamples.keys()),
                             prefix='Keeping only white-listed IDs :', suffix='', length="fit")
        [newSamples.pop(key) for key in blackKeys]

    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile, indent=4, ensure_ascii=False)

def main(partitions, jsonPath, basedOnFolder, trainIdsForced, devIdsForced, testIdsForced, outJson):
    """
    Repartition the files in a json file.

    Example
    ----------
    python repartitionJson.py -p 60 20 20 -f True -j "/mnt/HD-Storage/Datasets/RAVDESS/data.json"
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_parted.json" -tr user_2 user_3 user_4 user_5 user_7 user_8 user_9 user_10 user_11 -de user_12 user_13 user_14 user_15 -te user_16 user_17 user_18 user_19
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -tr user_2 user_3 user_4 user_5 user_7 user_8 user_9 user_10 user_11 -de user_12 user_13 user_14 user_15 -te user_16 user_17 user_18 user_19
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted.json" -tr Ses01 Ses02 Ses03 -de Ses04 -te Ses05
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted.json" -tr B01 B02 B03 B04 B05 B06 B07 B08 B09 B10 B11 B12 B13 -de B14 B15 B16 B17 B18 B19 B20 -te B21 B22 B23 B24 B25 B26 B27
    """
    samples = loadFromJson(jsonPath)
    ids = list(samples.keys())

    if basedOnFolder:
        # Group IDs by the folder their wav file lives in, and partition whole folders
        folders = []
        foldersIds = {}
        for i, ID in enumerate(samples.keys()):
            sample = samples[ID]
            wavePath = sample["path"]
            folder = os.path.split(wavePath)[0]
            if not folder in folders:
                folders.append(folder)
                foldersIds[folder] = [ID]
            else:
                foldersIds[folder].append(ID)
        ids = folders
        # print(folders)

    if len(partitions) != 0:
        # Random split according to the requested percentages
        total = len(ids)
        random.shuffle(ids)
        trainCut = int(total * partitions[0] / 100)
        devCut = int(total * (partitions[0] + partitions[1]) / 100)
        trainIds = ids[:trainCut]
        devIds = ids[trainCut:devCut]
        testIds = ids[devCut:]

    if trainIdsForced != []:
        # Explicit split: assign each ID by substring match against the forced lists
        trainIds = []
        devIds = []
        testIds = []
        for ID in ids:
            trainFlag = False
            devFlag = False
            testFlag = False
            for trainId in trainIdsForced:
                if trainId in ID:
                    trainFlag = True
                    break
            for devId in devIdsForced:
                if devId in ID:
                    devFlag = True
                    break
            for testId in testIdsForced:
                if testId in ID:
                    testFlag = True
                    break
            if trainFlag:
                trainIds.append(ID)
            if devFlag:
                devIds.append(ID)
            if testFlag:
                testIds.append(ID)
        # print(trainIds)

    for i, idx in enumerate(ids):
        partition = "train"
        if idx in devIds:
            partition = "dev"
        if idx in testIds:
            partition = "test"
        if basedOnFolder:
            for eachIdx in foldersIds[idx]:
                samples[eachIdx]["partition"] = partition
        else:
            samples[idx]["partition"] = partition
        printProgressBar(i + 1, len(ids), prefix='Repartitioning ',
                         suffix='Complete', length="fit")

    if outJson == "":
        outJson = jsonPath
    directory = os.path.dirname(outJson)
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(outJson, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)

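# Note on a design choice: the percentage-based split above relies on
# random.shuffle(), so repeated runs give different partitions. Seeding the
# module-level RNG before calling main() makes the split reproducible. The
# call below is a hypothetical usage sketch mirroring the first docstring
# example (60/20/20 split of RAVDESS, grouped by folder); the seed value is
# purely illustrative.
if __name__ == "__main__":
    import random
    random.seed(42)  # fix the shuffle order used by the percentage split
    main([60, 20, 20], "/mnt/HD-Storage/Datasets/RAVDESS/data.json",
         True, [], [], [], "")
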