def writeOutForRegression(self):
     dataset = DataReader(self.currentDataPath,
                          onlineFeat=self.onlineFeat,
                          resampleTarget=self.resampleTarget)
     dataset.setDatasetFeatOnly(self.experiment.testOn,
                                self.experiment.data["feature"])
     if self.experiment.data["featModelPath"] != "" and self.onlineFeat:
         dataset.getModelFeat(
             self.experiment.data["featModelPath"],
             normalised=self.experiment.data["featModelNorm"],
             maxDur=self.experiment.data["featModelMaxDur"],
             cuda=self.cuda)
         print("fairseq model for writting model outputs loaded")
     if self.testRun: dataset.keepOneOnly()
     dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)
     wrapper = self.getWrapper(getBest=True)
     modelOutPath = os.path.join(wrapper.savePath, "outputs")
     if not os.path.exists(modelOutPath): os.makedirs(modelOutPath)
     for idx, (ID, feat) in enumerate(dataloader):
         output = wrapper.forwardModel(feat)
         output = output.detach().cpu().numpy()
         # print(ID, feat.shape, output.shape)
         savePath = os.path.join(modelOutPath, ID[0] + ".csv")
         headers = ["output_" + str(i) for i in range(output.shape[2])]
         df = pd.DataFrame(output[0], columns=headers)
         df.to_csv(savePath, index=False)
         printProgressBar(idx + 1,
                          len(dataloader),
                          prefix='Writing outputs:',
                          suffix='',
                          length="fit")
def main(mainPath, partitions):
    """
        Make the json file given only the wav files of a dataset.

        example: 
            python makeJson.py -m "/mnt/HD-Storage/Datasets/ESTER1" -p "train/*.wav" "dev/*.wav" "test/*.wav"
            python makeJson.py -m "/mnt/HD-Storage/Datasets/RAVDESS"

            python makeJson.py -m "/mnt/HD-Storage/Datasets/SEMAINE"
            python makeJson.py -m "/mnt/HD-Storage/Datasets/IEMOCAP"

            python makeJson.py -m "/mnt/HD-Storage/Datasets/MaSS_Fr"

    """

    jsonPath = os.path.join(mainPath, "data.json")
    wavsPath = os.path.join(mainPath, "Wavs")
    if partitions == []:
        trainPath = os.path.join(wavsPath, "**", "*.wav")
    else:
        trainPath = os.path.join(wavsPath, partitions[0])
        print("trainPath", trainPath)
    trainFiles = glob.glob(trainPath, recursive=True)
    print("trainFiles", trainFiles)
    if partitions == []:
        devFiles, testFiles = [], []
    else:
        devPath = os.path.join(wavsPath, partitions[1])
        devFiles = glob.glob(devPath, recursive=True)
        testPath = os.path.join(wavsPath, partitions[2])
        testFiles = glob.glob(testPath, recursive=True)

    allDics = {}

    for f, filesPaths in enumerate([trainFiles, devFiles, testFiles]):
        partition = ""
        if f == 0: partition = "train"
        if f == 1: partition = "dev"
        if f == 2: partition = "test"
        for i, filePath in enumerate(filesPaths):
            baseName = os.path.basename(filePath)[:-4]
            fileDict = AudioSample()

            fileDict.setParams(baseName, filePath, partition)
            # fileDict.features = getFeatures(main_path, baseName)

            dic = classToDic(fileDict.__dict__)
            # dic = changePaths(dic, main_path, ".")
            dic = localizePaths(dic, mainPath)
            allDics[baseName] = dic
            printProgressBar(i + 1,
                             len(filesPaths),
                             prefix='Processing Files for ' + partition + ":",
                             suffix='Complete')

    with open(jsonPath, 'w') as fp:
        json.dump(allDics, fp, indent=4, ensure_ascii=False)
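
# A small sketch (assumed dataset path) for inspecting the json written above; field names such as
# "path" and "partition" are taken from how the samples are consumed by the other scripts here,
# while the exact set of fields depends on AudioSample.
import json

with open("/mnt/HD-Storage/Datasets/RAVDESS/data.json") as fp:  # assumed example path
    samples = json.load(fp)
someID = next(iter(samples))
print(someID, samples[someID]["path"], samples[someID]["partition"])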
 def testClassification(self):
     dataset = DataReader(self.currentDataPath,
                          onlineFeat=self.onlineFeat,
                          resampleTarget=self.resampleTarget)
     dataset.setDatasetClassic(self.experiment.testOn,
                               self.experiment.data["feature"],
                               self.experiment.data["annotation"])
     if self.experiment.data["featModelPath"] != "" and self.onlineFeat:
         dataset.getModelFeat(
             self.experiment.data["featModelPath"],
             normalised=self.experiment.data["featModelNorm"],
             maxDur=self.experiment.data["featModelMaxDur"],
             cuda=self.cuda)
         print("fairseq model for testing model outputs loaded")
     inp, tar = dataset[0]
     self.experiment.inputDim = inp.shape[1]
     firstID1 = list(dataset.dataPart.keys())[0]
     firstID2 = list(dataset.dataPart[firstID1]["annotations"])[0]
     headers = dataset.dataPart[firstID1]["annotations"][firstID2][
         "headers"]
     # print(headers)
     wrapper = getWrapper(self.experiment, seed=self.seed, getBest=True)
     modelOutPath = os.path.join(wrapper.savePath, "outputs")
     savePath = os.path.join(modelOutPath, "outputs.csv")
     outputsCSV = pd.read_csv(savePath)
     if self.testRun: dataset.keepOneOnly()
     IDs = dataset.dataPart.keys()
     AllOuts = []
     AllTars = []
     for idx, ID in enumerate(IDs):
         outputs = outputsCSV[ID].to_numpy()
         targets = dataset.targetReader(ID)
         AllOuts.append(np.argmax(outputs))
         AllTars.append(targets[0, 0])
         # print(np.argmax(outputs), targets[0,0])
         printProgressBar(idx + 1,
                          len(IDs),
                          prefix='Testing model :',
                          suffix='',
                          length="fit")
         # if idx > 50: break
     target = np.array(AllTars)
     output = np.array(AllOuts)
     evaluation = {}
     for key in self.experiment.metrics:
         evaluation[key] = getMetric(target, output, metric=key)
     self.experiment.evaluation = evaluation
     confMat = confMatrix(target, output, numTars=self.experiment.outputDim)
     # print(confMatrix(target, output, numTars=experiment.outputDim))
     savePath = os.path.join(wrapper.savePath, "confMat.csv")
     np.savetxt(savePath, confMat, delimiter=",")
     return evaluation
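
# confMatrix is a project helper; a hypothetical numpy equivalent (a sketch, not the actual
# implementation) where rows index the true class and columns the predicted class:
import numpy as np

def confusion_matrix(targets, outputs, num_classes):
    mat = np.zeros((num_classes, num_classes), dtype=int)
    for t, o in zip(targets.astype(int), outputs.astype(int)):
        mat[t, o] += 1
    return mat

# e.g. confusion_matrix(np.array([0, 1, 1]), np.array([0, 1, 0]), num_classes=2)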
 def trainModel(self,
                datasetTrain,
                datasetDev,
                batchSize=1,
                maxEpoch=200,
                loadBefore=True,
                tolerance=15,
                minForTolerance=15,
                limitTrainData=False,
                limitDevData=False):
     if loadBefore: self.loadCheckpoint()
     trainDataloader = DataLoader(dataset=datasetTrain,
                                  batch_size=batchSize,
                                  shuffle=True)
     devDataloader = DataLoader(dataset=datasetDev,
                                batch_size=batchSize,
                                shuffle=False)
     while self.currentEpoch <= maxEpoch:
         if self.noMoreTrain:
             if self.printLvl > 0:
                 print("Early stopping has been achieved!")
             break
         if limitTrainData:
             datasetTrain.limitData(limitTrainData)
             trainDataloader = DataLoader(dataset=datasetTrain,
                                          batch_size=batchSize,
                                          shuffle=True)
         if limitDevData:
              datasetDev.limitData(limitDevData)
             devDataloader = DataLoader(dataset=datasetDev,
                                        batch_size=batchSize,
                                        shuffle=False)
         self.trainEpoch(trainDataloader)
         devLoss = self.evaluateModel(devDataloader)
         self.modelStates[self.currentEpoch] = copy.deepcopy(
             self.model.state_dict())
         self.epochDevLosses.append(devLoss)
         if self.printLvl > 1:
             printProgressBar(self.currentEpoch,
                              maxEpoch,
                              prefix='Training model:',
                              suffix='| epoch loss: ' + str(devLoss),
                              length="fit")
             # print("loss", self.currentEpoch, devLoss)
         self.currentEpoch += 1
         # --- Early Stopping ---
         if (self.currentEpoch - self.getBestEpochIdx() >=
                 tolerance) and self.currentEpoch > minForTolerance:
             self.noMoreTrain = True
         self.saveCheckpoint()
     self.saveLogToCSV()
     if self.printLvl > 0: print("Training the model has been finished!")
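
# A standalone restatement of the early-stopping rule used above (illustrative only): training stops
# once the best epoch is at least `tolerance` epochs old, but never before `minForTolerance` epochs.
def shouldStop(currentEpoch, bestEpochIdx, tolerance=15, minForTolerance=15):
    return (currentEpoch - bestEpochIdx >= tolerance) and currentEpoch > minForTolerance

# e.g. best epoch at 10: shouldStop(24, 10) -> False, shouldStop(25, 10) -> True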
def main(inputPath, newPath, ignorePath, ext):
    """
        Transform all audio files under a folder into wav PCM 16 kHz 16-bit signed-integer.

        example: 
            python Preprocess.py --input "../../Data/Wavs" --output "../../Data/WavsProcessed"
            python Preprocess.py --input "/mnt/HD-Storage/Databases/AlloSat_corpus/audio" --output "/mnt/HD-Storage/Datasets/AlloSat/Wavs"

            python Preprocess.py --input "/mnt/HD-Storage/Databases/ESTER1/ESTER1_TRAIN/wav" --output "/mnt/HD-Storage/Datasets/ESTER1/Wavs/train"
            python Preprocess.py --input "/mnt/HD-Storage/Databases/ESTER1/ESTER1_DEV/wav" --output "/mnt/HD-Storage/Datasets/ESTER1/Wavs/dev"
            python Preprocess.py --input "/mnt/HD-Storage/Databases/ESTER1/ESTER1_TEST/wav" --output "/mnt/HD-Storage/Datasets/ESTER1/Wavs/test"

            python Preprocess.py --input "/mnt/HD-Storage/Databases/RAVDESS_Audio_Speech_Actors_01-24" --output "/mnt/HD-Storage/Datasets/RAVDESS/Wavs"
            python Preprocess.py --input "/mnt/HD-Storage/Databases/Noises4" --output "/mnt/HD-Storage/Datasets/NoiseFiles"

            python Preprocess.py --input "/mnt/HD-Storage/Databases/SEMAINE/wav_data_original" --output "/mnt/HD-Storage/Datasets/SEMAINE/Wavs"
            python Preprocess.py --input "/mnt/HD-Storage/Databases/IEMOCAP/IEMOCAP_full_release" --output "/mnt/HD-Storage/Datasets/IEMOCAP/Wavs" -e "avi"

            python Preprocess.py --input "/mnt/HD-Storage/Databases/MaSS/output_waves" --output "/mnt/HD-Storage/Datasets/MaSS_Fr/Wavs"
    """
    path = os.path.join(inputPath, "**")  #"../PMDOM2FR/**/"
    theFiles = get_files_in_path(path, ext=ext)

    for i, filePath in enumerate(theFiles):
        # Making wav files
        fileNewPath = filePath.replace(inputPath, newPath)
        if ignorePath:
            fileNewPath = os.path.join(newPath, os.path.split(filePath)[-1])
        makeDirFor(fileNewPath)
        if ext == "avi":
            os.system('ffmpeg -i ' + filePath + ' -y ' + "temp.wav")
            os.system('sox ' + "temp.wav" +
                      ' -r 16000 -c 1 -b 16 -e signed-integer ' +
                      fileNewPath[:-4] + ".wav")
            os.remove("temp.wav")
        else:
            os.system('sox ' + filePath +
                      ' -r 16000 -c 1 -b 16 -e signed-integer ' + fileNewPath)
        printProgressBar(i + 1,
                         len(theFiles),
                         prefix='Transforming Files:',
                         suffix='Complete')
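
# Note: the os.system calls above break on paths containing spaces. A hedged alternative using the
# same sox flags, but passed as an argument list, could look like this:
import subprocess

def toPcm16k(srcPath, dstPath):
    """Convert srcPath to a 16 kHz, mono, 16-bit signed-integer PCM wav at dstPath."""
    subprocess.run(["sox", srcPath, "-r", "16000", "-c", "1",
                    "-b", "16", "-e", "signed-integer", dstPath], check=True)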
def main(featsFolder, jsonPath):
    """
    Adding mel-frequency filterbank features to a given dataset and writing its reference to a json file


    Example
    ----------
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/AlloSat/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json"
    
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/ESTER1/data.json"
    python MelFilterBank.py -f "MFB" -j "/home/getalp/alisamis/Datasets/ESTER1/data.json"

    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/RAVDESS/data.json"
    """
    samples = loadFromJson(jsonPath)
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID]
        wavePath = sample["path"]
        wavsFolder = wavePath.split(os.sep)[0]
        waveFullPath = os.path.join(os.path.split(jsonPath)[0], wavePath)
        featsLocalPath = wavePath.replace(wavsFolder,
                                          featsFolder).replace(".wav", ".csv")
        featsLocalPath = os.path.join("Feats", featsLocalPath)
        featsFullPath = os.path.join(
            os.path.split(jsonPath)[0], featsLocalPath)

        # print(featsLocalPath, featsFullPath)
        dim = makeFeatsCsv(waveFullPath, featsFullPath)
        if dim == 0: continue
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[ID]["features"][featsDict["ID"]] = featsDict
        # saveToJson(jsonPath, sample)
        printProgressBar(i + 1,
                         len(samples),
                         prefix='Adding mel-frequency filterbank features:',
                         suffix='Complete',
                         length="fit")
    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
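
# makeFeatsCsv is project-specific; a hypothetical sketch of what a log mel-filterbank extractor
# could look like (librosa-based, with assumed frame parameters, not the project's implementation):
import os
import librosa
import pandas as pd

def makeMfbCsv(wavPath, csvPath, nMels=40, hopLength=160):
    y, sr = librosa.load(wavPath, sr=16000)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=nMels, hop_length=hopLength)
    logMel = librosa.power_to_db(mel).T  # (frames, nMels)
    if os.path.dirname(csvPath):
        os.makedirs(os.path.dirname(csvPath), exist_ok=True)
    headers = ["mfb_" + str(i) for i in range(nMels)]
    pd.DataFrame(logMel, columns=headers).to_csv(csvPath, index=False)
    return list(logMel.shape)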
def main(annotsList, headers, genres, jsonPath):
    """
    Adding annotations to a given dataset and writing its reference to a json file


    Example
    ----------
    python addAnnots.py -a "gs_arousal_0.01_std" "gs_valence_0.01_std" "gen_gs_arousal_0.01_std" "gen_gs_valence_0.01_std" -d "GoldStandard" -g arousal valence arousal valence -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json"
    python addAnnots.py -a "VAD_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/ESTER1/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/SEMAINE/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/IEMOCAP/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data.json"
    """
    samples = loadFromJson(jsonPath)
    for t, annotName in enumerate(annotsList):
        trainFilePaths = []
        print("annot:", annotName)

        for i, ID in enumerate(samples.keys()):
            sample = samples[ID]
            wavePath = sample["path"]
            wavsFolder = wavePath.split(os.sep)[0]
            waveFullPath = os.path.join(os.path.split(jsonPath)[0], wavePath)
            featsLocalPath = wavePath.replace(wavsFolder, annotName).replace(".wav", ".csv")
            featsLocalPath = os.path.join("Annots", featsLocalPath)
            featsFullPath  = os.path.join(os.path.split(jsonPath)[0], featsLocalPath)

            try:
                df = pd.read_csv(featsFullPath, delimiter=',')
                out = df[headers].to_numpy().astype('float64')
                dim = list(out.shape)
                annotsDict = getAnnotsDict(annotName, genres[t], dim, featsLocalPath, headers)
                samples[ID]["annotations"][annotsDict["ID"]] = annotsDict
            except Exception as err:
                print("Warning: could not read", featsFullPath, "-", err)

            printProgressBar(i + 1, len(samples), prefix = 'Adding '+ annotName +' annotation', suffix = 'Complete', length = "fit")

        with open(jsonPath, 'w') as jsonFile:
            json.dump(samples, jsonFile,  indent=4, ensure_ascii=False)
    def testRegression(self):
        dataset = DataReader(self.currentDataPath,
                             onlineFeat=self.onlineFeat,
                             resampleTarget=self.resampleTarget)
        dataset.setDatasetAnnotOnly(self.experiment.testOn,
                                    self.experiment.data["annotation"])
        # if self.experiment.data["featModelPath"] != "" and self.onlineFeat:
        # 	dataset.getModelFeat(self.experiment.data["featModelPath"], normalised=self.experiment.data["featModelNorm"], maxDur=self.experiment.data["featModelMaxDur"])
        # 	print("fairseq model for testing model outputs loaded")
        _, tar = dataset[0]
        # experiment.inputDim = inp.shape[1]
        # if not "classification" in experiment.genre:
        self.experiment.outputDim = tar.shape[1]

        firstID1 = list(dataset.dataPart.keys())[0]
        firstID2 = list(dataset.dataPart[firstID1]["annotations"])[0]
        headers = dataset.dataPart[firstID1]["annotations"][firstID2][
            "headers"]
        # print(headers)
        wrapper = self.getWrapper(getBest=True)
        modelOutPath = os.path.join(wrapper.savePath, "outputs")
        if self.testRun: dataset.keepOneOnly()
        IDs = dataset.dataPart.keys()
        # for key in self.experiment.metrics:
        evaluations = {}
        evaluation = {}
        allTars = []
        allOuts = []
        results = np.zeros(
            (len(self.experiment.metrics), len(headers), len(IDs)))
        for idx, ID in enumerate(IDs):
            savePath = os.path.join(modelOutPath, ID + ".csv")
            outputs = pd.read_csv(savePath).to_numpy()
            targets = dataset.targetReader(ID)
            # RESAMPLE OUTPUT TO TARGETS FOR TESTING!
            if self.resampleTarget:
                from Utils.Funcs import reshapeMatrix
                outputs = reshapeMatrix(outputs, len(targets))
            # print(targets.shape, outputs.shape)
            # if idx == 0:
            #[[] for _ in range(targets.shape[1])]
            # bestresult = 0; bestID = "0"
            # for dim in range(targets.shape[1]): evaluation[headers[dim]] = {}
            for dim in range(targets.shape[1]):
                output = outputs[:, dim]
                target = targets[:, dim]
                # while target.shape[0] > output.shape[0]: output = np.append(output, outputs[-1])
                # while target.shape[0] < output.shape[0]: output = outputs[:target.shape[0]].reshape(target.shape[0])
                while target.shape[0] != output.shape[0]:
                    output = outputs.reshape(target.shape[0])
                if self.testConcated:
                    allTars += list(target)
                    allOuts += list(output)
                for k, key in enumerate(self.experiment.metrics):
                    result = getMetric(target, output, metric=key)
                    results[k, dim, idx] = result
                # if result > bestresult: bestresult=result; bestID = ID
                # print(ID, result, len(output))

            printProgressBar(idx + 1,
                             len(IDs),
                             prefix='Testing model:',
                             suffix='',
                             length="fit")
        for k, key in enumerate(self.experiment.metrics):
            for dim in range(targets.shape[1]):
                if self.testConcated:
                    evaluation[headers[dim]] = getMetric(np.array(allTars),
                                                         np.array(allOuts),
                                                         metric=key)
                    if key == "AUC":  # write fpr & tpr to plot ROCs!
                        from sklearn import metrics
                        fpr, tpr, thresholds = metrics.roc_curve(
                            np.array(allTars), np.array(allOuts))
                        fpr = reshapeMatrix(np.expand_dims(fpr, axis=1), 100)
                        tpr = reshapeMatrix(np.expand_dims(tpr, axis=1), 100)
                        savePath = os.path.join(
                            wrapper.savePath,
                            "ROC_resampled_" + str(dim) + "_" +
                            os.path.split(self.currentDataPath)[-1] + ".csv")
                        np.savetxt(
                            savePath,
                            [np.squeeze(fpr), np.squeeze(tpr)],
                            delimiter=",")
                else:
                    evaluation[headers[dim]] = {}
                    evaluation[headers[dim]]['mean'] = np.mean(results[k, dim])
                    evaluation[headers[dim]]['std'] = np.std(results[k, dim])
            evaluations[key] = evaluation.copy()
            # print("evaluation",evaluation,key)
            # print("self.experiment.evaluation[key]", key, self.experiment.evaluation[key])
        return evaluations
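
# reshapeMatrix comes from Utils.Funcs; a hypothetical stand-in that resamples a (frames x dims)
# matrix to a new number of frames by linear interpolation could be written as (a sketch only):
import numpy as np

def resampleMatrix(mat, newLen):
    mat = np.asarray(mat, dtype=float)
    oldIdx = np.linspace(0.0, 1.0, num=mat.shape[0])
    newIdx = np.linspace(0.0, 1.0, num=newLen)
    return np.stack([np.interp(newIdx, oldIdx, mat[:, d]) for d in range(mat.shape[1])], axis=1)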
def main(featsFolder, jsonPath, modelPath, maxDur, normalised, cuda):
    """
    Adding wav2vec2 features to a given dataset and writing its reference to a json file


    Example
    ----------
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.99 -n True  -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30_noNorm" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"
    python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_large.pt"
    python wav2vec2.py -f "FlowBERT_2952h_large_noNorm_cut30" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_large.pt"
    
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.98 -n True  -j "/mnt/HD-Storage/Datasets/AlloSat/data.json" -m "/mnt/HD-Storage/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.99 -n False -j /home/getalp/alisamis/Datasets/AlloSat/data.json -m /home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_base/checkpoint_best.pt
    
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30_noNorm" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"

    not working: python wav2vec2.py -f "wav2vec2-large-xlsr-53-french" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/wav2vec2-large-xlsr-53-french.zip"

    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_base/checkpoint_best.pt"
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/alisamis/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_base/checkpoint_best.pt"
    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False  -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/mls_french_base/checkpoint_best.pt"
    
	python wav2vec2.py -f "libri960_big_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/dinarelm/work/data/Exchance/wav2vec/models/libri960_big.pt"
	python wav2vec2.py -f "libri960_big_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/dinarelm/work/data/Exchance/wav2vec/models/libri960_big.pt"
    

python wav2vec2.py -f "mls_french_large_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_large/checkpoint_best.pt" && \
python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.98 -n False  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_base/checkpoint_best.pt" && \
python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_large/checkpoint_best.pt" && \
python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_large/checkpoint_best.pt" && \
python wav2vec2.py -f "mls_french_large_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_large/checkpoint_best.pt"
python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_base/checkpoint_best.pt"

    """

    # cp = torch.load(modelPath, map_location=torch.device('cpu'))
    # model = Wav2VecModel.build_model(cp['args'], task=None)
    # model.load_state_dict(cp['model'])
    # model.eval()
    # cp = torch.load(modelPath, map_location=torch.device('cpu'))
    model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [modelPath])
    model = model[0]
    if cuda: model = model.cuda()
    model.eval()

    samples = loadFromJson(jsonPath)
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID]
        wavePath = sample["path"]
        wavsFolder = wavePath.split(os.sep)[0]
        waveFullPath = os.path.join(os.path.split(jsonPath)[0], wavePath)
        featsLocalPath = wavePath.replace(wavsFolder,
                                          featsFolder).replace(".wav", ".csv")
        featsLocalPath = os.path.join("Feats", featsLocalPath)
        featsFullPath = os.path.join(
            os.path.split(jsonPath)[0], featsLocalPath)

        # print(featsLocalPath, featsFullPath)
        dim = makeFeatsCsv(waveFullPath,
                           featsFullPath,
                           model,
                           maxDur,
                           normalised,
                           cuda=cuda)
        if dim == 0: continue
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[ID]["features"][featsDict["ID"]] = featsDict
        # saveToJson(jsonPath, sample)
        printProgressBar(i + 1,
                         len(samples),
                         prefix='Adding wav2vec features:',
                         suffix='Complete',
                         length="fit")
    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
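
# makeFeatsCsv here wraps a fairseq wav2vec 2.0 forward pass; a rough sketch of the core step is
# shown below. It assumes the model's forward accepts features_only=True and returns a dict with
# key "x" (true for fairseq's Wav2Vec2Model, but version-dependent) and uses soundfile for reading.
import soundfile as sf
import torch

def extractW2v2Features(model, wavPath, normalised=False, cuda=False):
    wav, sr = sf.read(wavPath, dtype="float32")
    source = torch.from_numpy(wav).unsqueeze(0)  # (1, samples)
    if normalised:
        source = torch.nn.functional.layer_norm(source, source.shape)
    if cuda:
        source = source.cuda()
    with torch.no_grad():
        out = model(source, mask=False, features_only=True)
    return out["x"].squeeze(0).cpu().numpy()  # (frames, featureDim)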
def main(jsonPath, outJson, noiseFilesPaths, addWhite, SNRs, ignoreExisting):
    """
    Adding noise to a given dataset and writing a separate json file that references the noisy files


    Example
    ----------
    python addNoise.py -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    python addNoise.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    
    python addNoise.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    python addNoise.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    """
    datasetPath = os.path.split(jsonPath)[0]
    noisyFolder = "Wavs_Noisy"
    noisyWavsPath = os.path.join(datasetPath, noisyFolder)
    if not os.path.exists(noisyWavsPath): os.makedirs(noisyWavsPath)

    trainPath = os.path.join(noiseFilesPaths[0], "**", "*.wav")
    trainNoises = glob.glob(trainPath, recursive=True)
    devPath = os.path.join(noiseFilesPaths[1], "**", "*.wav")
    devNoises = glob.glob(devPath, recursive=True)
    testPath = os.path.join(noiseFilesPaths[2], "**", "*.wav")
    testNoises = glob.glob(testPath, recursive=True)

    samples = loadFromJson(jsonPath)
    newSamples = samples.copy()
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID].copy()
        wavePath = sample["path"]
        wavFullPath = os.path.join(datasetPath, wavePath)

        sample["features"] = {} # to avoid reading the wrong feature extracted from clean speech

        wavsFolder = wavePath.split(os.sep)[0]
        splits = wavePath.split(os.sep)
        fileName = splits[-1].replace(".wav", "")

        ## MAKE NOISY FILES AND ADD TO SAMPLES, GIVE A NEW ID (which would be name of file)
        noiseFiles = trainNoises
        if sample["partition"] == "dev" : noiseFiles = devNoises
        if sample["partition"] == "test": noiseFiles = testNoises
        for snr in SNRs:
            for noiseFile in noiseFiles:
                outWavPath = noisyFolder
                for split in splits[1:-1]:
                    outWavPath = os.path.join(outWavPath, split) 
                outWavName = fileName +'_snr' + str(snr) + '_' + noiseFile.split(os.sep)[-1]
                outWavPath = os.path.join(outWavPath, outWavName)
                outWavFullPath = os.path.join(datasetPath, outWavPath)
                if not (ignoreExisting and os.path.exists(outWavFullPath)):
                    addNoiseFile(wavFullPath, noiseFile, outWavFullPath, snr=snr)
                ID = outWavName.replace(".wav", "")
                newSample = sample.copy()
                newSample["path"] = outWavPath
                newSample["ID"]   = ID
                newSamples[ID] = newSample

            if addWhite: 
                outWavPath = noisyFolder
                for split in splits[1:-1]:
                    outWavPath = os.path.join(outWavPath, split) 
                outWavName = fileName +'_snr' + str(snr) + '_whiteNoise.wav'
                outWavPath = os.path.join(outWavPath, outWavName)
                outWavFullPath = os.path.join(datasetPath, outWavPath)
                if not (ignoreExisting and os.path.exists(outWavFullPath)):
                    addWhiteNoise(wavFullPath, outWavFullPath, snr=snr)
                ID = outWavName.replace(".wav", "")
                newSample = sample.copy()
                newSample["path"] = outWavPath
                newSample["ID"]   = ID
                newSamples[ID] = newSample

        printProgressBar(i + 1, len(samples), prefix = 'Making wav files noisy:', suffix = 'Complete', length = "fit")


    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile,  indent=4, ensure_ascii=False)
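
# addNoiseFile and addWhiteNoise are project helpers; the underlying SNR scaling is standard:
# scale the noise so that 10*log10(P_signal / P_noise) equals the requested SNR. A sketch assuming
# equal-length float arrays (padding/tiling of the noise is left out):
import numpy as np

def mixAtSnr(speech, noise, snrDb):
    pSpeech = np.mean(speech ** 2)
    pNoise = np.mean(noise ** 2) + 1e-12
    scale = np.sqrt(pSpeech / (pNoise * 10 ** (snrDb / 10.0)))
    return speech + scale * noise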
def main(jsonPath, outJson, percentageReduction, keepTest, blackList,
         whiteList):
    """
    Reduce the data in a json file to a smaller subset of the original dataset


    Example
    ----------
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_short_10.json" -p 10

    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts
    
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts

    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts
    
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts
    
    """
    reductionAmount = percentageReduction / 100
    dirname = os.path.dirname(outJson)
    if not os.path.exists(dirname): os.makedirs(dirname)

    samples = loadFromJson(jsonPath)
    trainSamples = {}
    devSamples = {}
    testSamples = {}
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID]
        if sample["partition"] == "train":
            trainSamples[sample["ID"]] = sample.copy()
        if sample["partition"] == "dev":
            devSamples[sample["ID"]] = sample.copy()
        if sample["partition"] == "test":
            testSamples[sample["ID"]] = sample.copy()

    if reductionAmount > 0:
        print("Performing data reduction based on percentage")
        # random.sample needs a sequence, so the dict keys are converted to lists first
        trainKeys = random.sample(list(trainSamples.keys()),
                                  int(reductionAmount * len(trainSamples)))
        devKeys = random.sample(list(devSamples.keys()),
                                int(reductionAmount * len(devSamples)))
        testKeys = random.sample(list(testSamples.keys()),
                                 int(reductionAmount * len(testSamples)))
        if keepTest: testKeys = testSamples.keys()
        newSamples = {}
        for keys in [trainKeys, devKeys, testKeys]:
            for ID in keys:
                sample = samples[ID]
                newSamples[ID] = sample.copy()
        print("Data reduction completed!")
    else:
        newSamples = samples.copy()

    for filterString in blackList:
        blackKeys = []
        for i, key in enumerate(newSamples.keys()):
            if filterString in key: blackKeys.append(key)
            printProgressBar(i + 1,
                             len(newSamples.keys()),
                             prefix='removing black-listed IDs :',
                             suffix='',
                             length="fit")
        [newSamples.pop(key) for key in blackKeys]

    if len(whiteList) > 0:
        blackKeys = []
        for i, key in enumerate(newSamples.keys()):
            flag = True
            for whiteString in whiteList:
                if whiteString in key: flag = False
            if flag: blackKeys.append(key)
            printProgressBar(i + 1,
                             len(newSamples.keys()),
                             prefix='Keeping only white-listed IDs :',
                             suffix='',
                             length="fit")
        [newSamples.pop(key) for key in blackKeys]

    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile, indent=4, ensure_ascii=False)
def main(partitions, jsonPath, basedOnFolder, trainIdsForced, devIdsForced,
         testIdsForced, outJson):
    """
    Repartitioning files in a json file


    Example
    ----------
    python repartitionJson.py -p 60 20 20 -f True -j "/mnt/HD-Storage/Datasets/RAVDESS/data.json"
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_parted.json" -tr user_2 user_3 user_4 user_5 user_7 user_8 user_9 user_10 user_11 -de user_12 user_13 user_14 user_15 -te user_16 user_17 user_18 user_19
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -tr user_2 user_3 user_4 user_5 user_7 user_8 user_9 user_10 user_11 -de user_12 user_13 user_14 user_15 -te user_16 user_17 user_18 user_19
    
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted.json" -tr Ses01 Ses02 Ses03 -de Ses04 -te Ses05

    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted.json" -tr B01 B02 B03 B04 B05 B06 B07 B08 B09 B10 B11 B12 B13 -de B14 B15 B16 B17 B18 B19 B20 -te B21 B22 B23 B24 B25 B26 B27
    """
    samples = loadFromJson(jsonPath)
    ids = list(samples.keys())
    if basedOnFolder:
        folders = []
        foldersIds = {}
        for i, ID in enumerate(samples.keys()):
            sample = samples[ID]
            wavePath = sample["path"]
            folder = os.path.split(wavePath)[0]
            if not folder in folders:
                folders.append(folder)
                foldersIds[folder] = [ID]
            else:
                foldersIds[folder].append(ID)
        ids = folders
    # print(folders)
    if len(partitions) != 0:
        total = len(ids)
        random.shuffle(ids)
        trainCut = int(total * partitions[0] / 100)
        devCut = int(total * (partitions[0] + partitions[1]) / 100)
        trainIds = ids[:trainCut]
        devIds = ids[trainCut:devCut]
        testIds = ids[devCut:]

    if trainIdsForced != []:
        trainIds = []
        devIds = []
        testIds = []
        for ID in ids:
            trainFlag = False
            devFlag = False
            testFlag = False
            for trainId in trainIdsForced:
                if trainId in ID:
                    trainFlag = True
                    break
            for devId in devIdsForced:
                if devId in ID:
                    devFlag = True
                    break
            for testId in testIdsForced:
                if testId in ID:
                    testFlag = True
                    break
            if trainFlag: trainIds.append(ID)
            if devFlag: devIds.append(ID)
            if testFlag: testIds.append(ID)
    # print(trainIds)

    for i, idx in enumerate(ids):
        partition = "train"
        if idx in devIds: partition = "dev"
        if idx in testIds: partition = "test"
        if basedOnFolder:
            for eachIdx in foldersIds[idx]:
                samples[eachIdx]["partition"] = partition
        else:
            samples[idx]["partition"] = partition
        printProgressBar(i + 1,
                         len(ids),
                         prefix='Repartitioning ',
                         suffix='Complete',
                         length="fit")

    if outJson == "": outJson = jsonPath
    directory = os.path.dirname(outJson)
    if not os.path.exists(directory): os.makedirs(directory)
    with open(outJson, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
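
# A quick worked example of the percentage split above (illustrative): with 100 ids and "-p 60 20 20"
total, partitions = 100, [60, 20, 20]
trainCut = int(total * partitions[0] / 100)                   # 60
devCut = int(total * (partitions[0] + partitions[1]) / 100)   # 80
# so after shuffling, ids[:60] go to train, ids[60:80] to dev and ids[80:] to test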