Code example #1
def main(jsonPaths, outJson):
    """
    Given a list of data.json files, combine all their samples into one json file


    Example
    ----------
    python dataCombiner.py -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" "/mnt/HD-Storage/Datasets/MaSS_Fr/data.json" -o "/mnt/HD-Storage/Datasets/Mixed/data_RecolaMaSS.json"

    """
    dirname = os.path.dirname(outJson)
    if not os.path.exists(dirname): os.makedirs(dirname)

    newSamples = {}
    for jsonPath in jsonPaths:
        # prefix, relative to the combined json's directory, under which the
        # source dataset's files still resolve
        addPath = os.path.relpath(os.path.dirname(jsonPath),
                                  os.path.dirname(outJson))
        samples = loadFromJson(jsonPath)
        for sample in samples:
            # print("before",samples[sample])
            addToPaths(samples[sample], addPath=addPath)
            # print("after",samples[sample])
            newSamples[sample] = samples[sample]
    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile, indent=4, ensure_ascii=False)
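The function above relies on two helpers that are not shown, loadFromJson and addToPaths. A minimal sketch of what they could look like, inferred from the call sites (the project's real implementations may differ):

import json
import os

def loadFromJson(jsonPath):
    # Load a dataset dictionary (sample ID -> sample record) from a json file.
    with open(jsonPath, 'r') as jsonFile:
        return json.load(jsonFile)

def addToPaths(sample, addPath=""):
    # Prefix addPath onto the relative paths stored in a sample so they still
    # resolve from the combined json's directory.
    sample["path"] = os.path.join(addPath, sample["path"])
    for group in ("features", "annotations"):
        for entry in sample.get(group, {}).values():
            if "path" in entry:
                entry["path"] = os.path.join(addPath, entry["path"])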
Code example #2
def main(featsFolder, jsonPath):
    """
    Adding mel-frequency filterbank features to a given dataset and writing their references to the json file


    Example
    ----------
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/AlloSat/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json"
    
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/ESTER1/data.json"
    python MelFilterBank.py -f "MFB" -j "/home/getalp/alisamis/Datasets/ESTER1/data.json"

    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/RAVDESS/data.json"
    """
    samples = loadFromJson(jsonPath)
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID]
        wavePath = sample["path"]
        wavsFolder = wavePath.split(os.sep)[0]
        waveFullPath = os.path.join(os.path.split(jsonPath)[0], wavePath)
        featsLocalPath = wavePath.replace(wavsFolder,
                                          featsFolder).replace(".wav", ".csv")
        featsLocalPath = os.path.join("Feats", featsLocalPath)
        featsFullPath = os.path.join(
            os.path.split(jsonPath)[0], featsLocalPath)

        dim = makeFeatsCsv(waveFullPath, featsFullPath)
        if dim == 0: continue
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[ID]["features"][featsDict["ID"]] = featsDict
        printProgressBar(i + 1,
                         len(samples),
                         prefix='Adding mel-frequency filterbank features:',
                         suffix='Complete',
                         length="fit")
    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
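The helpers makeFeatsCsv and getFeatsDict are external to this snippet. Below is a minimal sketch of plausible implementations; librosa is an assumption (the original may compute the filterbanks with another toolkit), and getFeatsDict's fields are guesses based on how the returned dictionary is used:

import os
import librosa
import pandas as pd

def makeFeatsCsv(waveFullPath, featsFullPath, nMels=40):
    # Compute log-mel filterbank features and write them to a CSV file.
    # Returns the feature shape, or 0 if the wav file could not be read.
    try:
        signal, sr = librosa.load(waveFullPath, sr=None)
    except Exception:
        return 0
    mel = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=nMels)
    logMel = librosa.power_to_db(mel).T  # (frames, nMels)
    os.makedirs(os.path.dirname(featsFullPath), exist_ok=True)
    pd.DataFrame(logMel).to_csv(featsFullPath, index=False)
    return list(logMel.shape)

def getFeatsDict(dim, featsFolder, featsLocalPath):
    # Reference entry stored under sample["features"].
    return {"ID": featsFolder, "dim": dim, "path": featsLocalPath}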
Code example #3
def main(annotsList, headers, genres, jsonPath):
    """
    Adding annotations to a given dataset and writing their references to the json file


    Example
    ----------
    python addAnnots.py -a "gs_arousal_0.01_std" "gs_valence_0.01_std" "gen_gs_arousal_0.01_std" "gen_gs_valence_0.01_std" -d "GoldStandard" -g arousal valence arousal valence -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json"
    python addAnnots.py -a "VAD_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/ESTER1/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/SEMAINE/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/IEMOCAP/data.json"
    python addAnnots.py -a "turns_0.01" -d "label" -g "VAD" -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data.json"
    """
    samples = loadFromJson(jsonPath)
    for t, annotName in enumerate(annotsList):
        print("annot:", annotName)

        for i, ID in enumerate(samples.keys()):
            sample = samples[ID]
            wavePath = sample["path"]
            wavsFolder = wavePath.split(os.sep)[0]
            waveFullPath = os.path.join(os.path.split(jsonPath)[0], wavePath)
            featsLocalPath = wavePath.replace(wavsFolder, annotName).replace(".wav", ".csv")
            featsLocalPath = os.path.join("Annots", featsLocalPath)
            featsFullPath  = os.path.join(os.path.split(jsonPath)[0], featsLocalPath)

            try:
                df = pd.read_csv(featsFullPath, delimiter=',')
                out = df[headers].to_numpy().astype('float64')
                dim = list(out.shape)
                annotsDict = getAnnotsDict(annotName, genres[t], dim, featsLocalPath, headers)
                samples[ID]["annotations"][annotsDict["ID"]] = annotsDict
            except Exception as e:
                print("Warning: could not read", featsFullPath, "-", e)

            printProgressBar(i + 1, len(samples), prefix = 'Adding '+ annotName +' annotation', suffix = 'Complete', length = "fit")

        with open(jsonPath, 'w') as jsonFile:
            json.dump(samples, jsonFile,  indent=4, ensure_ascii=False)
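getAnnotsDict is also defined elsewhere; given the call above it plausibly just packages the annotation metadata. A hypothetical version (field names are guesses mirroring the feature dictionaries):

def getAnnotsDict(annotName, genre, dim, featsLocalPath, headers):
    # Reference entry stored under sample["annotations"].
    return {
        "ID": annotName,        # key under which the annotation is stored
        "genre": genre,         # e.g. "arousal", "valence", "VAD"
        "dim": dim,             # [frames, len(headers)]
        "path": featsLocalPath, # CSV path relative to the dataset root
        "headers": headers,     # CSV columns holding the annotation values
    }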
Code example #4
def main(featsFolder, jsonPath, modelPath, maxDur, normalised, cuda):
    """
    Adding wav2vec2 features to a given dataset and writing their references to the json file


    Example
    ----------
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.99 -n True  -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30_noNorm" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"
    python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_large.pt"
    python wav2vec2.py -f "FlowBERT_2952h_large_noNorm_cut30" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_large.pt"
    
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.98 -n True  -j "/mnt/HD-Storage/Datasets/AlloSat/data.json" -m "/mnt/HD-Storage/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.99 -n False -j /home/getalp/alisamis/Datasets/AlloSat/data.json -m /home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_base/checkpoint_best.pt
    
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30_noNorm" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"

    not working: python wav2vec2.py -f "wav2vec2-large-xlsr-53-french" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/wav2vec2-large-xlsr-53-french.zip"

    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_base/checkpoint_best.pt"
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/alisamis/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_base/checkpoint_best.pt"
    python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False  -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/mls_french_base/checkpoint_best.pt"
    
	python wav2vec2.py -f "libri960_big_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/dinarelm/work/data/Exchance/wav2vec/models/libri960_big.pt"
	python wav2vec2.py -f "libri960_big_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/dinarelm/work/data/Exchance/wav2vec/models/libri960_big.pt"
    

python wav2vec2.py -f "mls_french_large_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_large/checkpoint_best.pt" && \
python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.98 -n False  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_base/checkpoint_best.pt" && \
python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_large/checkpoint_best.pt" && \
python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/2952h_large/checkpoint_best.pt" && \
python wav2vec2.py -f "mls_french_large_cut30" -d 29.98 -n True  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_large/checkpoint_best.pt"
python wav2vec2.py -f "mls_french_base_cut30" -d 29.98 -n False  -j "/home/getalp/alisamis/Datasets/AlloSat/data.json" -m "/home/getalp/nguyen35/flowbert_ssl_resources/wav2vec2.0_models/mls_french_base/checkpoint_best.pt"

    """

    models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [modelPath])
    model = models[0]
    if cuda: model = model.cuda()
    model.eval()

    samples = loadFromJson(jsonPath)
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID]
        wavePath = sample["path"]
        wavsFolder = wavePath.split(os.sep)[0]
        waveFullPath = os.path.join(os.path.split(jsonPath)[0], wavePath)
        featsLocalPath = wavePath.replace(wavsFolder,
                                          featsFolder).replace(".wav", ".csv")
        featsLocalPath = os.path.join("Feats", featsLocalPath)
        featsFullPath = os.path.join(
            os.path.split(jsonPath)[0], featsLocalPath)

        dim = makeFeatsCsv(waveFullPath,
                           featsFullPath,
                           model,
                           maxDur,
                           normalised,
                           cuda=cuda)
        if dim == 0: continue
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[ID]["features"][featsDict["ID"]] = featsDict
        printProgressBar(i + 1,
                         len(samples),
                         prefix='Adding wav2vec features:',
                         suffix='Complete',
                         length="fit")
    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
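Here makeFeatsCsv runs the audio through the wav2vec2 encoder. A minimal sketch of how it could work, assuming a fairseq Wav2Vec2Model; exact forward signatures differ between fairseq versions, so treat this as illustrative rather than the project's actual code:

import os
import pandas as pd
import soundfile as sf
import torch
import torch.nn.functional as F

def makeFeatsCsv(waveFullPath, featsFullPath, model, maxDur, normalised, cuda=False):
    # Encode a (mono) wav file with wav2vec2 and save frame-level features as CSV.
    # Returns the feature shape, or 0 on failure.
    try:
        signal, sr = sf.read(waveFullPath)
    except Exception:
        return 0
    signal = signal[: int(maxDur * sr)]                 # cut to maxDur seconds
    source = torch.from_numpy(signal).float().unsqueeze(0)
    if normalised:                                      # wav2vec2-style input norm
        source = F.layer_norm(source, source.shape)
    if cuda:
        source = source.cuda()
    with torch.no_grad():
        feats = model(source, mask=False, features_only=True)["x"]
    feats = feats.squeeze(0).cpu().numpy()              # (frames, dim)
    os.makedirs(os.path.dirname(featsFullPath), exist_ok=True)
    pd.DataFrame(feats).to_csv(featsFullPath, index=False)
    return list(feats.shape)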
Code example #5
def main(jsonPath, outJson, noiseFilesPaths, addWhite, SNRs, ignoreExisting):
    """
    Adding noise to a given dataset and writing a separate json file that references the noisy files


    Example
    ----------
    python addNoise.py -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    python addNoise.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    
    python addNoise.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    python addNoise.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -aw True -n "/mnt/HD-Storage/Datasets/NoiseFiles/train" "/mnt/HD-Storage/Datasets/NoiseFiles/dev" "/mnt/HD-Storage/Datasets/NoiseFiles/test"
    """
    datasetPath = os.path.split(jsonPath)[0]
    noisyFolder = "Wavs_Noisy"
    noisyWavsPath = os.path.join(datasetPath, noisyFolder)
    if not os.path.exists(noisyWavsPath): os.makedirs(noisyWavsPath)

    trainPath = os.path.join(noiseFilesPaths[0], "**", "*.wav")
    trainNoises = glob.glob(trainPath, recursive=True)
    devPath = os.path.join(noiseFilesPaths[1], "**", "*.wav")
    devNoises = glob.glob(devPath, recursive=True)
    testPath = os.path.join(noiseFilesPaths[2], "**", "*.wav")
    testNoises = glob.glob(testPath, recursive=True)

    samples = loadFromJson(jsonPath)
    newSamples = samples.copy()
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID].copy()
        wavePath = sample["path"]
        wavFullPath = os.path.join(datasetPath, wavePath)

        sample["features"] = {} # to avoid reading the wrong feature extracted from clean speech

        splits = wavePath.split(os.sep)
        fileName = splits[-1].replace(".wav", "")

        ## MAKE NOISY FILES AND ADD TO SAMPLES, GIVE A NEW ID (which would be name of file)
        noiseFiles = trainNoises
        if sample["partition"] == "dev" : noiseFiles = devNoises
        if sample["partition"] == "test": noiseFiles = testNoises
        for snr in SNRs:
            for noiseFile in noiseFiles:
                outWavPath = noisyFolder
                for split in splits[1:-1]:
                    outWavPath = os.path.join(outWavPath, split) 
                outWavName = fileName +'_snr' + str(snr) + '_' + noiseFile.split(os.sep)[-1]
                outWavPath = os.path.join(outWavPath, outWavName)
                outWavFullPath = os.path.join(datasetPath, outWavPath)
                if not (ignoreExisting and os.path.exists(outWavFullPath)):
                    addNoiseFile(wavFullPath, noiseFile, outWavFullPath, snr=snr)
                ID = outWavName.replace(".wav", "")
                newSample = sample.copy()
                newSample["path"] = outWavPath
                newSample["ID"]   = ID
                newSamples[ID] = newSample

            if addWhite: 
                outWavPath = noisyFolder
                for split in splits[1:-1]:
                    outWavPath = os.path.join(outWavPath, split) 
                outWavName = fileName +'_snr' + str(snr) + '_whiteNoise.wav'
                outWavPath = os.path.join(outWavPath, outWavName)
                outWavFullPath = os.path.join(datasetPath, outWavPath)
                if not (ignoreExisting and os.path.exists(outWavFullPath)):
                    addWhiteNoise(wavFullPath, outWavFullPath, snr=snr)
                ID = outWavName.replace(".wav", "")
                newSample = sample.copy()
                newSample["path"] = outWavPath
                newSample["ID"]   = ID
                newSamples[ID] = newSample

        printProgressBar(i + 1, len(samples), prefix = 'Making wav files noisy:', suffix = 'Complete', length = "fit")


    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile,  indent=4, ensure_ascii=False)
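addNoiseFile and addWhiteNoise are defined elsewhere. A sketch of the standard SNR mixing they presumably perform (mono audio assumed; the real code may differ in resampling, clipping, or noise-segment selection):

import os
import numpy as np
import soundfile as sf

def _mixAtSnr(speech, noise, snr):
    # Scale the noise so that 10*log10(P_speech / P_noise) == snr, then add it.
    if len(noise) < len(speech):  # loop the noise if it is too short
        noise = np.tile(noise, int(np.ceil(len(speech) / len(noise))))
    noise = noise[: len(speech)]
    ps = np.mean(speech ** 2)
    pn = np.mean(noise ** 2) + 1e-12
    return speech + noise * np.sqrt(ps / (pn * 10 ** (snr / 10)))

def addNoiseFile(wavPath, noisePath, outPath, snr=5):
    speech, sr = sf.read(wavPath)
    noise, _ = sf.read(noisePath)
    os.makedirs(os.path.dirname(outPath), exist_ok=True)
    sf.write(outPath, _mixAtSnr(speech, noise, snr), sr)

def addWhiteNoise(wavPath, outPath, snr=5):
    speech, sr = sf.read(wavPath)
    noise = np.random.randn(len(speech))
    os.makedirs(os.path.dirname(outPath), exist_ok=True)
    sf.write(outPath, _mixAtSnr(speech, noise, snr), sr)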
Code example #6
def main(jsonPath, outJson, percentageReduction, keepTest, blackList,
         whiteList):
    """
    Reduce the data in a json file to a smaller subset of the original dataset


    Example
    ----------
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_short_10.json" -p 10

    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/Recola_46/data_noisy.json" -o "/mnt/HD-Storage/Datasets/Recola_46/data_noisy_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts
    
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts

    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted_noisy_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts
    
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_whiteNoise.json" -w snr5_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_Ads_2018_2020.json" -w snr5_trainNoise_Ads_2018_2020 snr5_devNoise_Ads_2018_2020 snr5_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_News.json" -w snr5_trainNoise_News snr5_devNoise_News snr5_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_TalkShows.json" -w snr5_trainNoise_TalkShows snr5_devNoise_TalkShows snr5_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr5_Ambient_Music_top_charts.json" -w snr5_trainNoise_Ambient_Music_top_charts snr5_devNoise_Ambient_Music_top_charts snr5_testNoise_Ambient_Music_top_charts && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_whiteNoise.json" -w snr15_whiteNoise && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_Ads_2018_2020.json" -w snr15_trainNoise_Ads_2018_2020 snr15_devNoise_Ads_2018_2020 snr15_testNoise_Ads_2018_2020 && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_News.json" -w snr15_trainNoise_News snr15_devNoise_News snr15_testNoise_News && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_TalkShows.json" -w snr15_trainNoise_TalkShows snr15_devNoise_TalkShows snr15_testNoise_TalkShows && \
    python dataReducer.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted_noisy_snr15_Ambient_Music_top_charts.json" -w snr15_trainNoise_Ambient_Music_top_charts snr15_devNoise_Ambient_Music_top_charts snr15_testNoise_Ambient_Music_top_charts
    
    """
    reductionAmount = percentageReduction / 100  # fraction of samples to keep
    dirname = os.path.dirname(outJson)
    if not os.path.exists(dirname): os.makedirs(dirname)

    samples = loadFromJson(jsonPath)
    trainSamples = {}
    devSamples = {}
    testSamples = {}
    for i, ID in enumerate(samples.keys()):
        sample = samples[ID]
        if sample["partition"] == "train":
            trainSamples[sample["ID"]] = sample.copy()
        if sample["partition"] == "dev":
            devSamples[sample["ID"]] = sample.copy()
        if sample["partition"] == "test":
            testSamples[sample["ID"]] = sample.copy()

    if reductionAmount > 0:
        print("Performing data reduction based on percentage")
        # random.sample needs a sequence, so materialise the key views as lists
        trainKeys = random.sample(list(trainSamples.keys()),
                                  int(reductionAmount * len(trainSamples)))
        devKeys = random.sample(list(devSamples.keys()),
                                int(reductionAmount * len(devSamples)))
        testKeys = random.sample(list(testSamples.keys()),
                                 int(reductionAmount * len(testSamples)))
        if keepTest: testKeys = testSamples.keys()
        newSamples = {}
        for keys in [trainKeys, devKeys, testKeys]:
            for ID in keys:
                sample = samples[ID]
                newSamples[ID] = sample.copy()
        print("Data reduction completed!")
    else:
        newSamples = samples.copy()

    for filterString in blackList:
        blackKeys = []
        for i, key in enumerate(newSamples.keys()):
            if filterString in key: blackKeys.append(key)
            printProgressBar(i + 1,
                             len(newSamples.keys()),
                             prefix='Removing black-listed IDs:',
                             suffix='',
                             length="fit")
        for key in blackKeys:
            newSamples.pop(key)

    if len(whiteList) > 0:
        blackKeys = []
        for i, key in enumerate(newSamples.keys()):
            keep = any(whiteString in key for whiteString in whiteList)
            if not keep: blackKeys.append(key)
            printProgressBar(i + 1,
                             len(newSamples.keys()),
                             prefix='Keeping only white-listed IDs:',
                             suffix='',
                             length="fit")
        for key in blackKeys:
            newSamples.pop(key)

    with open(outJson, 'w') as jsonFile:
        json.dump(newSamples, jsonFile, indent=4, ensure_ascii=False)
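For reference, the two filter passes above amount to substring filters over the sample IDs; a compact restatement (minus the progress bars):

# Drop IDs containing any black-listed substring, then keep only IDs
# containing at least one white-listed substring.
newSamples = {k: v for k, v in newSamples.items()
              if not any(b in k for b in blackList)}
if whiteList:
    newSamples = {k: v for k, v in newSamples.items()
                  if any(w in k for w in whiteList)}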
Code example #7
def main(partitions, jsonPath, basedOnFolder, trainIdsForced, devIdsForced,
         testIdsForced, outJson):
    """
    Repartition the files referenced in a json file into train/dev/test sets


    Example
    ----------
    python repartitionJson.py -p 60 20 20 -f True -j "/mnt/HD-Storage/Datasets/RAVDESS/data.json"
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_parted.json" -tr user_2 user_3 user_4 user_5 user_7 user_8 user_9 user_10 user_11 -de user_12 user_13 user_14 user_15 -te user_16 user_17 user_18 user_19
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy.json" -o "/mnt/HD-Storage/Datasets/SEMAINE/data_noisy_parted.json" -tr user_2 user_3 user_4 user_5 user_7 user_8 user_9 user_10 user_11 -de user_12 user_13 user_14 user_15 -te user_16 user_17 user_18 user_19
    
    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/IEMOCAP/data.json" -o "/mnt/HD-Storage/Datasets/IEMOCAP/data_parted.json" -tr Ses01 Ses02 Ses03 -de Ses04 -te Ses05

    python repartitionJson.py -j "/mnt/HD-Storage/Datasets/MaSS_Fr/data.json" -o "/mnt/HD-Storage/Datasets/MaSS_Fr/data_parted.json" -tr B01 B02 B03 B04 B05 B06 B07 B08 B09 B10 B11 B12 B13 -de B14 B15 B16 B17 B18 B19 B20 -te B21 B22 B23 B24 B25 B26 B27
    """
    samples = loadFromJson(jsonPath)
    ids = list(samples.keys())
    if basedOnFolder:
        folders = []
        foldersIds = {}
        for i, ID in enumerate(samples.keys()):
            sample = samples[ID]
            wavePath = sample["path"]
            folder = os.path.split(wavePath)[0]
            if not folder in folders:
                folders.append(folder)
                foldersIds[folder] = [ID]
            else:
                foldersIds[folder].append(ID)
        ids = folders
    trainIds, devIds, testIds = [], [], []  # avoid NameError if no option is given
    if len(partitions) != 0:
        total = len(ids)
        random.shuffle(ids)
        trainCut = int(total * partitions[0] / 100)
        devCut = int(total * (partitions[0] + partitions[1]) / 100)
        trainIds = ids[:trainCut]
        devIds = ids[trainCut:devCut]
        testIds = ids[devCut:]

    if trainIdsForced != []:
        trainIds = []
        devIds = []
        testIds = []
        for ID in ids:
            trainFlag = False
            devFlag = False
            testFlag = False
            for trainId in trainIdsForced:
                if trainId in ID:
                    trainFlag = True
                    break
            for devId in devIdsForced:
                if devId in ID:
                    devFlag = True
                    break
            for testId in testIdsForced:
                if testId in ID:
                    testFlag = True
                    break
            if trainFlag: trainIds.append(ID)
            if devFlag: devIds.append(ID)
            if testFlag: testIds.append(ID)

    for i, idx in enumerate(ids):
        partition = "train"
        if idx in devIds: partition = "dev"
        if idx in testIds: partition = "test"
        if basedOnFolder:
            for eachIdx in foldersIds[idx]:
                samples[eachIdx]["partition"] = partition
        else:
            samples[idx]["partition"] = partition
        printProgressBar(i + 1,
                         len(ids),
                         prefix='Repartitioning ',
                         suffix='Complete',
                         length="fit")

    if outJson == "": outJson = jsonPath
    directory = os.path.dirname(outJson)
    if not os.path.exists(directory): os.makedirs(directory)
    with open(outJson, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
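The docstring examples imply a command-line wrapper. A hypothetical argparse stub consistent with the flags shown (-p, -f, -j, -tr, -de, -te, -o); defaults and types are guesses, not the project's actual CLI code:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Repartition files in a data json")
    parser.add_argument("-p", "--partitions", nargs="*", type=int, default=[],
                        help="train/dev/test percentages, e.g. -p 60 20 20")
    parser.add_argument("-f", "--basedOnFolder",
                        type=lambda s: s.lower() == "true", default=False)
    parser.add_argument("-j", "--jsonPath", required=True)
    parser.add_argument("-tr", "--trainIdsForced", nargs="*", default=[])
    parser.add_argument("-de", "--devIdsForced", nargs="*", default=[])
    parser.add_argument("-te", "--testIdsForced", nargs="*", default=[])
    parser.add_argument("-o", "--outJson", default="")
    args = parser.parse_args()
    main(args.partitions, args.jsonPath, args.basedOnFolder,
         args.trainIdsForced, args.devIdsForced, args.testIdsForced,
         args.outJson)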