def main(jsonPaths, outJson):
    """Combine several data.json files into one json file.

    Parameters
    ----------
    jsonPaths : list of str
        Paths of the input ``data.json`` files to merge.
    outJson : str
        Path of the combined output json file.

    Example
    ----------
    python dataCombiner.py -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" "/mnt/HD-Storage/Datasets/MaSS_Fr/data.json" -o "/mnt/HD-Storage/Datasets/Mixed/data_RecolaMaSS.json"
    """
    dirname = os.path.dirname(outJson)
    # os.makedirs("") raises FileNotFoundError when outJson has no directory
    # part, so only create the directory when one is actually present.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    newSamples = {}
    for jsonPath in jsonPaths:
        # Rewrite each sample's relative paths so they stay valid when
        # resolved from the location of the combined json file.
        addPath = os.path.dirname(os.path.relpath(jsonPath, outJson))
        samples = loadFromJson(jsonPath)
        for sample in samples:
            addToPaths(samples[sample], addPath=addPath)
            newSamples[sample] = samples[sample]
    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile, indent=4, ensure_ascii=False)
def main(featsFolder, jsonPath):
    """Add mel-frequency filterbank features to a dataset and record them in its json file.

    Parameters
    ----------
    featsFolder : str
        Name of the folder (under ``Feats``) where feature csv files are written.
    jsonPath : str
        Path of the dataset's ``data.json`` file; it is updated in place.

    Example
    ----------
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    """
    samples = loadFromJson(jsonPath)
    datasetRoot = os.path.split(jsonPath)[0]
    total = len(samples)
    for index, sampleID in enumerate(samples.keys()):
        wavePath = samples[sampleID]["path"]
        waveFullPath = os.path.join(datasetRoot, wavePath)
        # Mirror the wav tree under Feats/<featsFolder>, swapping the extension.
        topFolder = wavePath.split(os.sep)[0]
        featsLocalPath = os.path.join(
            "Feats", wavePath.replace(topFolder, featsFolder).replace(".wav", ".csv"))
        featsFullPath = os.path.join(datasetRoot, featsLocalPath)
        dim = makeFeatsCsv(waveFullPath, featsFullPath)
        if dim == 0:
            # Feature extraction failed for this wav; skip it entirely.
            continue
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[sampleID]["features"][featsDict["ID"]] = featsDict
        printProgressBar(index + 1, total,
                         prefix='Adding mel-frequency filterbank features:',
                         suffix='Complete', length="fit")
    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
def main(annotsList, headers, genres, jsonPath):
    """Add annotations to a dataset and record them in its json file.

    Parameters
    ----------
    annotsList : list of str
        Annotation folder names (under ``Annots``) to attach.
    headers : list of str
        Column names to read from each annotation csv.
    genres : list of str
        Genre label for each annotation in ``annotsList`` (parallel lists).
    jsonPath : str
        Path of the dataset's ``data.json`` file; it is updated in place.

    Example
    ----------
    python addAnnots.py -a "VAD_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/ESTER1/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    """
    samples = loadFromJson(jsonPath)
    datasetRoot = os.path.split(jsonPath)[0]
    for t, annotName in enumerate(annotsList):
        print("annot:", annotName)
        for i, ID in enumerate(samples.keys()):
            sample = samples[ID]
            wavePath = sample["path"]
            wavsFolder = wavePath.split(os.sep)[0]
            # Annotation csv mirrors the wav tree under Annots/<annotName>.
            featsLocalPath = wavePath.replace(wavsFolder, annotName).replace(".wav", ".csv")
            featsLocalPath = os.path.join("Annots", featsLocalPath)
            featsFullPath = os.path.join(datasetRoot, featsLocalPath)
            try:
                df = pd.read_csv(featsFullPath, delimiter=',')
                out = df[headers].to_numpy().astype('float64')
                dim = list(out.shape)
                annotsDict = getAnnotsDict(annotName, genres[t], dim, featsLocalPath, headers)
                samples[ID]["annotations"][annotsDict["ID"]] = annotsDict
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # still abort the run; missing/unreadable csvs are best-effort.
                print("Warning: could not read", featsFullPath)
            printProgressBar(i + 1, len(samples),
                             prefix='Adding ' + annotName + ' annotation',
                             suffix='Complete', length="fit")
    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
def main(featsFolder, jsonPath, modelPath, maxDur, normalised, cuda):
    """Add wav2vec2 features to a dataset and record them in its json file.

    Parameters
    ----------
    featsFolder : str
        Name of the folder (under ``Feats``) where feature csv files are written.
    jsonPath : str
        Path of the dataset's ``data.json`` file; it is updated in place.
    modelPath : str
        Path of the fairseq wav2vec2 checkpoint to load.
    maxDur : float
        Maximum wav duration (seconds) handled by the extractor.
    normalised : bool
        Whether the extractor normalises the audio before inference.
    cuda : bool
        Run the model on GPU when True.

    Example
    ----------
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"
    """
    # Load the checkpoint through fairseq; the ensemble loader returns a list
    # of models even for a single checkpoint path.
    models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([modelPath])
    model = models[0]
    if cuda:
        model = model.cuda()
    model.eval()

    samples = loadFromJson(jsonPath)
    datasetRoot = os.path.split(jsonPath)[0]
    total = len(samples)
    for index, sampleID in enumerate(samples.keys()):
        wavePath = samples[sampleID]["path"]
        waveFullPath = os.path.join(datasetRoot, wavePath)
        # Mirror the wav tree under Feats/<featsFolder>, swapping the extension.
        topFolder = wavePath.split(os.sep)[0]
        featsLocalPath = os.path.join(
            "Feats", wavePath.replace(topFolder, featsFolder).replace(".wav", ".csv"))
        featsFullPath = os.path.join(datasetRoot, featsLocalPath)
        dim = makeFeatsCsv(waveFullPath, featsFullPath, model, maxDur, normalised, cuda=cuda)
        if dim == 0:
            # Feature extraction failed for this wav; skip it entirely.
            continue
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[sampleID]["features"][featsDict["ID"]] = featsDict
        printProgressBar(index + 1, total,
                         prefix='Adding wav2vec features:',
                         suffix='Complete', length="fit")
    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
def main(jsonPath, outJson, noiseFilesPaths, addWhite, SNRs, ignoreExisting):
    """Create noisy copies of a dataset's wavs and write a new json referencing them.

    Parameters
    ----------
    jsonPath : str
        Path of the source dataset's ``data.json`` file.
    outJson : str
        Path of the json file describing the clean + noisy samples.
    noiseFilesPaths : list of str
        Three folders of noise wavs, in order: train, dev, test.
    addWhite : bool
        Also generate a white-noise version of each wav when True.
    SNRs : list
        Signal-to-noise ratios (dB) at which to mix each noise file.
    ignoreExisting : bool
        Skip regenerating a noisy wav that already exists on disk.

    Example
    ----------
    python addNoise.py -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    """
    datasetPath = os.path.split(jsonPath)[0]
    noisyFolder = "Wavs_Noisy"
    noisyWavsPath = os.path.join(datasetPath, noisyFolder)
    if not os.path.exists(noisyWavsPath):
        os.makedirs(noisyWavsPath)

    # One noise pool per partition so train/dev/test never share noise files.
    trainNoises = glob.glob(os.path.join(noiseFilesPaths[0], "**", "*.wav"), recursive=True)
    devNoises = glob.glob(os.path.join(noiseFilesPaths[1], "**", "*.wav"), recursive=True)
    testNoises = glob.glob(os.path.join(noiseFilesPaths[2], "**", "*.wav"), recursive=True)

    samples = loadFromJson(jsonPath)
    newSamples = samples.copy()
    for i, origID in enumerate(samples.keys()):
        sample = samples[origID].copy()
        wavePath = sample["path"]
        wavFullPath = os.path.join(datasetPath, wavePath)
        # Clear features: they were extracted from the clean signal and would
        # be wrong for the noisy copies.
        sample["features"] = {}
        splits = wavePath.split(os.sep)
        fileName = splits[-1].replace(".wav", "")

        if sample["partition"] == "test":
            noiseFiles = testNoises
        elif sample["partition"] == "dev":
            noiseFiles = devNoises
        else:
            noiseFiles = trainNoises

        # Reproduce the wav's sub-folder structure under the noisy folder.
        subFolder = os.path.join(noisyFolder, *splits[1:-1])
        for snr in SNRs:
            for noiseFile in noiseFiles:
                outWavName = fileName + '_snr' + str(snr) + '_' + noiseFile.split(os.sep)[-1]
                outWavPath = os.path.join(subFolder, outWavName)
                outWavFullPath = os.path.join(datasetPath, outWavPath)
                if not (ignoreExisting and os.path.exists(outWavFullPath)):
                    addNoiseFile(wavFullPath, noiseFile, outWavFullPath, snr=snr)
                newID = outWavName.replace(".wav", "")
                noisySample = sample.copy()
                noisySample["path"] = outWavPath
                noisySample["ID"] = newID
                newSamples[newID] = noisySample
            if addWhite:
                outWavName = fileName + '_snr' + str(snr) + '_whiteNoise.wav'
                outWavPath = os.path.join(subFolder, outWavName)
                outWavFullPath = os.path.join(datasetPath, outWavPath)
                if not (ignoreExisting and os.path.exists(outWavFullPath)):
                    addWhiteNoise(wavFullPath, outWavFullPath, snr=snr)
                newID = outWavName.replace(".wav", "")
                noisySample = sample.copy()
                noisySample["path"] = outWavPath
                noisySample["ID"] = newID
                newSamples[newID] = noisySample
        printProgressBar(i + 1, len(samples),
                         prefix='Making wav files noisy:',
                         suffix='Complete', length="fit")
    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile, indent=4, ensure_ascii=False)
def main(jsonPath, outJson, percentageReduction, keepTest, blackList, whiteList):
    """Reduce the samples in a json file to a smaller subset of the dataset.

    Parameters
    ----------
    jsonPath : str
        Path of the source dataset's ``data.json`` file.
    outJson : str
        Path of the reduced output json file.
    percentageReduction : float
        Percentage of each partition to KEEP (0 disables random reduction).
    keepTest : bool
        Keep the full test partition regardless of the reduction percentage.
    blackList : list of str
        Substrings; any sample ID containing one of them is removed.
    whiteList : list of str
        Substrings; when non-empty, only IDs containing one of them are kept.

    Example
    ----------
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_short_10.json" -p 10
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise
    """
    reductionAmount = percentageReduction / 100
    dirname = os.path.dirname(outJson)
    # Guard against outJson having no directory part: makedirs("") raises.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    samples = loadFromJson(jsonPath)

    # Split samples by partition so each partition is reduced independently.
    trainSamples = {}
    devSamples = {}
    testSamples = {}
    for sample in samples.values():
        if sample["partition"] == "train":
            trainSamples[sample["ID"]] = sample.copy()
        elif sample["partition"] == "dev":
            devSamples[sample["ID"]] = sample.copy()
        elif sample["partition"] == "test":
            testSamples[sample["ID"]] = sample.copy()

    if reductionAmount > 0:
        print("Performing data reduction based on percentage")
        # random.sample requires a sequence: passing dict keys directly raises
        # TypeError on Python 3.11+, so materialize the keys first.
        trainKeys = random.sample(list(trainSamples),
                                  int(reductionAmount * len(trainSamples)))
        devKeys = random.sample(list(devSamples),
                                int(reductionAmount * len(devSamples)))
        testKeys = random.sample(list(testSamples),
                                 int(reductionAmount * len(testSamples)))
        if keepTest:
            testKeys = list(testSamples)
        newSamples = {}
        for keys in (trainKeys, devKeys, testKeys):
            for ID in keys:
                newSamples[ID] = samples[ID].copy()
        print("Data reduction completed!")
    else:
        newSamples = samples.copy()

    # Drop every sample whose ID contains a black-listed substring.
    for filterString in blackList:
        blackKeys = []
        for i, key in enumerate(newSamples.keys()):
            if filterString in key:
                blackKeys.append(key)
            printProgressBar(i + 1, len(newSamples.keys()),
                             prefix='removing black-listed IDs :',
                             suffix='', length="fit")
        for key in blackKeys:
            newSamples.pop(key)

    # When a white list is given, keep only IDs matching at least one entry.
    if len(whiteList) > 0:
        blackKeys = []
        for i, key in enumerate(newSamples.keys()):
            if not any(whiteString in key for whiteString in whiteList):
                blackKeys.append(key)
            printProgressBar(i + 1, len(newSamples.keys()),
                             prefix='Keeping only white-listed IDs :',
                             suffix='', length="fit")
        for key in blackKeys:
            newSamples.pop(key)

    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile, indent=4, ensure_ascii=False)
def main(partitions, jsonPath, basedOnFolder, trainIdsForced, devIdsForced, testIdsForced, outJson):
    """Repartition the samples of a json file into train/dev/test.

    Parameters
    ----------
    partitions : list
        Percentages [train, dev, test] for random splitting; empty disables it.
    jsonPath : str
        Path of the dataset's ``data.json`` file.
    basedOnFolder : bool
        Partition whole wav folders instead of individual samples.
    trainIdsForced, devIdsForced, testIdsForced : list of str
        Substrings forcing an ID into a given partition (overrides percentages).
    outJson : str
        Output json path; empty string rewrites ``jsonPath`` in place.

    Example
    ----------
    python repartitionJson.py -p 60 20 20 -f True -j "/mnt/HD-Storage/Datasets/RAVDESS/data.json"
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted.json" -tr Ses01 Ses02 Ses03 -de Ses04 -te Ses05
    """
    samples = loadFromJson(jsonPath)
    ids = list(samples.keys())
    foldersIds = {}
    if basedOnFolder:
        # Group sample IDs by the folder of their wav so that a whole folder
        # always lands in a single partition.
        for ID in samples:
            folder = os.path.split(samples[ID]["path"])[0]
            foldersIds.setdefault(folder, []).append(ID)
        ids = list(foldersIds.keys())

    # Initialized up front so the assignment loop below cannot hit a NameError
    # when both `partitions` and the forced-ID lists are empty (everything then
    # defaults to "train").
    trainIds, devIds, testIds = [], [], []
    if len(partitions) != 0:
        total = len(ids)
        random.shuffle(ids)
        trainCut = int(total * partitions[0] / 100)
        devCut = int(total * (partitions[0] + partitions[1]) / 100)
        trainIds = ids[:trainCut]
        devIds = ids[trainCut:devCut]
        testIds = ids[devCut:]
    if trainIdsForced != []:
        # Forced substring lists override any percentage-based split. An ID may
        # match several lists; later checks (dev, then test) win below.
        trainIds, devIds, testIds = [], [], []
        for ID in ids:
            if any(sub in ID for sub in trainIdsForced):
                trainIds.append(ID)
            if any(sub in ID for sub in devIdsForced):
                devIds.append(ID)
            if any(sub in ID for sub in testIdsForced):
                testIds.append(ID)

    # Set lookups instead of list membership: O(1) per ID instead of O(n).
    devSet = set(devIds)
    testSet = set(testIds)
    for i, idx in enumerate(ids):
        partition = "train"
        if idx in devSet:
            partition = "dev"
        if idx in testSet:
            partition = "test"
        if basedOnFolder:
            for eachIdx in foldersIds[idx]:
                samples[eachIdx]["partition"] = partition
        else:
            samples[idx]["partition"] = partition
        printProgressBar(i + 1, len(ids),
                         prefix='Repartitioning ',
                         suffix='Complete', length="fit")

    if outJson == "":
        outJson = jsonPath
    directory = os.path.dirname(outJson)
    # Guard against outJson being a bare filename: makedirs("") raises.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(outJson, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)