def testModel(experiment, testRun, setTarg):
    """Evaluate the best saved regression model on the test partition.

    Reads the per-sample output csv files previously written by
    writeOutForRegression, compares them dimension-by-dimension against
    the targets, and fills experiment.evaluation[metric] with per-header
    mean/std of the metric over all test samples.

    Parameters
    ----------
    experiment : experiment object holding data config and evaluation dict
    testRun : bool — if True, evaluate only one sample (smoke test)
    setTarg : str — "MeanStd" switches targets/headers to mean-std mode

    Returns
    -------
    The same experiment object with experiment.evaluation filled in.
    """
    dataset = DataReader(experiment.data["path"])
    dataset.setDatasetClassic("test", experiment.data["feature"], experiment.data["annotation"])
    if setTarg == "MeanStd":
        dataset.setTargetMeanStd()
    inp, tar = dataset[0]
    experiment.inputDim = inp.shape[1]
    experiment.outputDim = tar.shape[1]
    firstID1 = list(dataset.dataPart.keys())[0]
    firstID2 = list(dataset.dataPart[firstID1]["annotations"])[0]
    headers = dataset.dataPart[firstID1]["annotations"][firstID2]["headers"]
    if setTarg == "MeanStd":
        headers = ["mean", "std"]
    wrapper = getWrapper(experiment, getBest=True)
    # BUGFIX: was "ouputs" — the writer functions save under "outputs",
    # so the typo made this function look in a non-existent folder.
    modelOutPath = os.path.join(wrapper.savePath, "outputs")
    if testRun:
        dataset = keepOne(dataset)
    IDs = dataset.dataPart.keys()
    for key in experiment.evaluation.keys():
        metrics = {}
        for idx, ID in enumerate(IDs):
            savePath = os.path.join(modelOutPath, ID + ".csv")
            outputs = pd.read_csv(savePath).to_numpy()
            targets = dataset.targetReader(ID)
            if idx == 0:
                # one result list per target dimension, reused across IDs
                results = [[] for _ in range(targets.shape[1])]
            for dim in range(targets.shape[1]):
                metrics[headers[dim]] = {}
            for dim in range(targets.shape[1]):
                output = outputs[:, dim]
                target = targets[:, dim]
                # Length-align the model output with the target sequence:
                # pad by repeating the last value, or truncate the excess.
                # BUGFIX: pad/truncate using the current 1-D column
                # (`output`), not the full 2-D `outputs` array — the old
                # code appended a whole row per missing frame and its
                # reshape failed for multi-dimensional targets.
                while target.shape[0] > output.shape[0]:
                    output = np.append(output, output[-1])
                if target.shape[0] < output.shape[0]:
                    output = output[:target.shape[0]]
                result = getMetric(target, output, metric=key)
                results[dim].append(result)
            printProgressBar(idx + 1, len(IDs), prefix='Testing model with ' + key + ':', suffix='', length="fit")
        for dim in range(targets.shape[1]):
            metrics[headers[dim]]['mean'] = np.mean(np.array(results[dim]))
            metrics[headers[dim]]['std'] = np.std(np.array(results[dim]))
        experiment.evaluation[key] = metrics
    return experiment
def main(featsList, jsonPath):
    """Standardize feature sets of a dataset and write their references
    to its json file.

    For each feature name, mean/std are computed on the train partition
    only, every feature file is standardized with those statistics, and a
    "<featName>_standardized" entry is registered in the json.

    Example
    ----------
    python Standardize.py -f MFCC opensmile_ComParE_2016 MFB -j "/mnt/HD-Storage/Databases/RECOLA_46_P_S/data.json"
    python Standardize.py -f MFB -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    python Standardize.py -f MFB -j "/mnt/HD-Storage/Datasets/AlloSat/data.json"
    python Standardize.py -f MFB -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json"
    """
    rootDir = os.path.split(jsonPath)[0]
    for featName in featsList:
        samples = loadFromJson(jsonPath)
        print("feature:", featName)
        # Statistics come from the training partition only.
        trainFilePaths = [
            os.path.join(rootDir, samples[ID]["features"][featName]["path"])
            for ID in samples.keys()
            if samples[ID]["partition"] == "train"
        ]
        mean, std = getMeanStd(trainFilePaths)
        newfeatName = featName + "_standardized"
        for i, ID in enumerate(samples.keys()):
            featRef = samples[ID]["features"][featName]
            relPath = featRef["path"]
            newRelPath = relPath.replace(featName, newfeatName)
            standardize(os.path.join(rootDir, relPath),
                        os.path.join(rootDir, newRelPath), mean, std)
            featsDict = getFeatsDict(newfeatName, featRef["genre"],
                                     featRef["dimension"], newRelPath)
            samples[ID]["features"][featsDict["ID"]] = featsDict
            printProgressBar(i + 1, len(samples), prefix='Standardizing ' + featName + ' features', suffix='Complete', length="fit")
        # Persist after each feature: samples is reloaded per feature name.
        with open(jsonPath, 'w') as jsonFile:
            json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
def main(inputPath, newPath):
    """Transform all audio files under a folder into wav PCM 16kHz 16bits signed-integer.

    example:
    python Preprocess.py --input "../../Data/Wavs" --output "../../Data/WavsProcessed"
    python Preprocess.py --input "/mnt/HD-Storage/Databases/AlloSat_corpus/audio" --output "/mnt/HD-Storage/Datasets/AlloSat/Wavs"
    """
    import subprocess  # local import keeps the module header untouched
    path = os.path.join(inputPath, "**")  # e.g. "../PMDOM2FR/**/"
    theFiles = get_files_in_path(path)
    for i, filePath in enumerate(theFiles):
        # Mirror the source tree under the output folder.
        fileNewPath = filePath.replace(inputPath, newPath)
        makeDirFor(fileNewPath)
        # BUGFIX: os.system with string concatenation broke on paths
        # containing spaces or quotes and allowed shell injection;
        # invoke sox with an argument list instead (shell=False).
        subprocess.run(['sox', filePath, '-r', '16000', '-c', '1',
                        '-b', '16', '-e', 'signed-integer', fileNewPath])
        printProgressBar(i + 1, len(theFiles), prefix='Transforming Files:', suffix='Complete')
def writeOutForRegression(experiment, testRun, seed=0):
    """Run the best saved model over the test set and write one csv of
    raw outputs per sample under <wrapper.savePath>/outputs."""
    dataset = DataReader(experiment.data["path"])
    dataset.setDatasetFeatOnly("test", experiment.data["feature"])
    if testRun:
        dataset = keepOne(dataset)
    loader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)
    wrapper = getWrapper(experiment, seed=seed, getBest=True)
    outDir = os.path.join(wrapper.savePath, "outputs")
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    for idx, (ID, feat) in enumerate(loader):
        prediction = wrapper.forwardModel(feat).detach().cpu().numpy()
        columns = ["output_" + str(d) for d in range(prediction.shape[2])]
        frame = pd.DataFrame(prediction[0], columns=columns)
        frame.to_csv(os.path.join(outDir, ID[0] + ".csv"), index=False)
        printProgressBar(idx + 1, len(loader), prefix='Writing outputs:', suffix='', length="fit")
def writeOutForClassification(experiment, testRun, seed=0):
    """Run the best saved model over the test set and write a single
    outputs.csv whose columns are the sample IDs and whose rows are the
    class scores."""
    dataset = DataReader(experiment.data["path"])
    dataset.setDatasetFeatOnly("test", experiment.data["feature"])
    if testRun:
        dataset = keepOne(dataset)
    loader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)
    wrapper = getWrapper(experiment, seed=seed, getBest=True)
    outDir = os.path.join(wrapper.savePath, "outputs")
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    columnIDs = []
    stacked = []
    for idx, (ID, feat) in enumerate(loader):
        printProgressBar(idx + 1, len(loader), prefix='Writing outputs:', suffix='', length="fit")
        pred = wrapper.forwardModel(feat).detach().cpu().numpy()
        # Grow a (num_samples, num_classes) matrix one prediction at a time.
        stacked = pred if len(stacked) == 0 else np.concatenate((stacked, pred))
        columnIDs.append(ID[0])
    frame = pd.DataFrame(np.transpose(stacked), columns=columnIDs)
    frame.to_csv(os.path.join(outDir, "outputs.csv"), index=False)
def trainModel(self, datasetTrain, datasetDev, batchSize=1, maxEpoch=200, loadBefore=True, tolerance=15, minForTolerance=15):
    """Train self.model epoch by epoch with early stopping on dev loss.

    Parameters
    ----------
    datasetTrain, datasetDev : datasets wrapped in DataLoaders below
    batchSize : int — batch size for both loaders
    maxEpoch : int — hard upper bound on epochs (inclusive)
    loadBefore : bool — resume from the last checkpoint if True
    tolerance : int — epochs without a new best dev loss before stopping
    minForTolerance : int — no early stop before this many epochs
    """
    if loadBefore:
        self.loadCheckpoint()
    trainDataloader = DataLoader(dataset=datasetTrain, batch_size=batchSize, shuffle=True)
    devDataloader = DataLoader(dataset=datasetDev, batch_size=batchSize, shuffle=False)
    while self.currentEpoch <= maxEpoch:
        # noMoreTrain may be set below or restored from a checkpoint;
        # checked at the top so a resumed run stops immediately.
        if self.noMoreTrain:
            if self.printLvl > 0:
                print("Early stopping has been achieved!")
            break
        self.trainEpoch(trainDataloader)
        devLoss = self.evaluateModel(devDataloader)
        # Snapshot weights every epoch so the best epoch stays recoverable.
        self.modelStates[self.currentEpoch] = copy.deepcopy(
            self.model.state_dict())
        self.epochDevLosses.append(devLoss)
        if self.printLvl > 1:
            printProgressBar(self.currentEpoch, maxEpoch, prefix='Training model:', suffix='| epoch loss: ' + str(devLoss), length="fit")
        self.currentEpoch += 1
        # --- Early Stopping ---
        # Flag (rather than break) so the checkpoint below records it.
        if (self.currentEpoch - self.getBestEpochIdx() >= tolerance) and self.currentEpoch > minForTolerance:
            self.noMoreTrain = True
        self.saveCheckpoint()
        self.saveLogToCSV()
    if self.printLvl > 0:
        print("Training the model has been finished!")
def getMeanStd(filePaths):
    """Compute per-dimension mean and std over the "feat_*" columns of
    the given csv files, pooling the values of all files together.

    Returns
    -------
    (mean, std) : two numpy arrays of length = number of feat_ columns.
    """
    # The first file defines how many feature dimensions exist.
    dims = sum(1 for col in pd.read_csv(filePaths[0]).keys() if "feat_" in col)
    pooled = [[] for _ in range(dims)]
    for f, filePath in enumerate(filePaths):
        frame = pd.read_csv(filePath)
        for dim in range(dims):
            pooled[dim].extend(list(frame["feat_" + str(dim)]))
        printProgressBar(f + 1, len(filePaths), prefix='Calculating Mean and Std:', suffix='Complete', length="fit")
    stacked = np.array(pooled)
    return np.mean(stacked, axis=1), np.std(stacked, axis=1)
def main():
    """Build the AlloSat data.json: one entry per wav file carrying its
    partition, annotations and speaker info, with paths localized."""
    Datasets_Path = "/mnt/HD-Storage/Datasets"
    main_path = os.path.join(Datasets_Path, "AlloSat")
    wavs_path = os.path.join(main_path, "Wavs")
    jsonPath = os.path.join(main_path, "data.json")
    filesPaths = glob.glob(os.path.join(wavs_path, "**", "*.wav"), recursive=True)
    allDics = {}
    trainList, devList, testList = getParts(os.path.join(main_path, "Parts"))
    for i, filePath in enumerate(filesPaths):
        baseName = os.path.basename(filePath)[:-4]  # strip ".wav"
        # Last matching partition wins, mirroring the original checks.
        partition = ""
        for partName, members in (("train", trainList), ("dev", devList), ("test", testList)):
            if baseName in members:
                partition = partName
        fileDict = AudioSample()
        fileDict.setParams(baseName, filePath, partition)
        fileDict.annotations = getAnnots(main_path, baseName)
        fileDict.speaker_info.language = "French"
        entry = localizePaths(classToDic(fileDict.__dict__), main_path)
        allDics[baseName] = entry
        printProgressBar(i + 1, len(filesPaths), prefix='Processing Files:', suffix='Complete')
    with open(jsonPath, 'w') as fp:
        json.dump(allDics, fp, indent=4, ensure_ascii=False)
def main(featsFolder, jsonPath):
    """Add mel-frequency filterbank features to a dataset and write
    their references to its json file.

    Example
    ----------
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/Recola_46/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/AlloSat/data.json"
    python MelFilterBank.py -f "MFB" -j "/mnt/HD-Storage/Datasets/Recola_46_S/data.json"
    """
    samples = loadFromJson(jsonPath)
    rootDir = os.path.split(jsonPath)[0]
    for i, ID in enumerate(samples.keys()):
        wavePath = samples[ID]["path"]
        # Swap the top-level wavs folder for the feature folder and the
        # extension for csv, then root everything under "Feats".
        wavsFolder = wavePath.split(os.sep)[0]
        waveFullPath = os.path.join(rootDir, wavePath)
        featsLocalPath = os.path.join(
            "Feats", wavePath.replace(wavsFolder, featsFolder).replace(".wav", ".csv"))
        featsFullPath = os.path.join(rootDir, featsLocalPath)
        dim = makeFeatsCsv(waveFullPath, featsFullPath)
        if dim == 0:
            continue  # extraction failed; leave this sample untouched
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[ID]["features"][featsDict["ID"]] = featsDict
        printProgressBar(i + 1, len(samples), prefix='Adding mel-frequency filterbank features:', suffix='Complete', length="fit")
    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)
def testClassification(experiment, testRun, setTarg, seed=0):
    """Evaluate the best saved classification model on the test set.

    Compares the argmax of the stored per-sample outputs against the
    first target label of each sample, fills experiment.evaluation with
    the requested metrics and saves a confusion matrix csv.
    """
    dataset = DataReader(experiment.data["path"])
    dataset.setDatasetClassic("test", experiment.data["feature"], experiment.data["annotation"])
    inp, tar = dataset[0]
    experiment.inputDim = inp.shape[1]
    firstID1 = list(dataset.dataPart.keys())[0]
    firstID2 = list(dataset.dataPart[firstID1]["annotations"])[0]
    headers = dataset.dataPart[firstID1]["annotations"][firstID2]["headers"]
    wrapper = getWrapper(experiment, seed=seed, getBest=True)
    outputsCSV = pd.read_csv(os.path.join(wrapper.savePath, "outputs", "outputs.csv"))
    if testRun:
        dataset = keepOne(dataset)
    IDs = dataset.dataPart.keys()
    predictions = []
    groundTruth = []
    for idx, ID in enumerate(IDs):
        scores = outputsCSV[ID].to_numpy()
        targets = dataset.targetReader(ID)
        predictions.append(np.argmax(scores))
        groundTruth.append(targets[0, 0])
        printProgressBar(idx + 1, len(IDs), prefix='Testing model :', suffix='', length="fit")
    target = np.array(groundTruth)
    output = np.array(predictions)
    experiment.evaluation = {
        key: getMetric(target, output, metric=key)
        for key in experiment.evaluation.keys()
    }
    confMat = confMatrix(target, output, numTars=experiment.outputDim)
    np.savetxt(os.path.join(wrapper.savePath, "confMat.csv"), confMat, delimiter=",")
    return experiment
def main(featsFolder, jsonPath, modelPath, maxDur, normalised):
    """Add wav2vec2 features to a dataset and write their references to
    its json file.

    Example
    ----------
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30_noNorm" -d 29.99 -n False -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"
    python wav2vec2.py -f "FlowBERT_2952h_base_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_base.pt"
    python wav2vec2.py -f "FlowBERT_2952h_large_cut30" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/FlowBERT_2952h_large.pt"
    python wav2vec2.py -f "xlsr_53_56k_cut30" -d 29.98 -n True -j "/mnt/HD-Storage/Datasets/AlloSat/data.json" -m "/mnt/HD-Storage/Models/xlsr_53_56k.pt"
    python wav2vec2.py -f "libri960_big_cut30" -d 29.98 -n True -j "/home/getalp/alisamis/Datasets/Recola_46/data.json" -m "/home/getalp/dinarelm/work/data/Exchance/wav2vec/models/libri960_big.pt"

    not working:
    python wav2vec2.py -f "wav2vec2-large-xlsr-53-french" -d 29.99 -n True -j "/mnt/HD-Storage/Datasets/Recola_46/data.json" -m "/mnt/HD-Storage/Models/wav2vec2-large-xlsr-53-french.zip"
    """
    # Load the wav2vec2 checkpoint through fairseq and freeze it for inference.
    ensemble, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [modelPath])
    model = ensemble[0]
    model.eval()
    samples = loadFromJson(jsonPath)
    rootDir = os.path.split(jsonPath)[0]
    for i, ID in enumerate(samples.keys()):
        wavePath = samples[ID]["path"]
        # Swap the top-level wavs folder for the feature folder and the
        # extension for csv, then root everything under "Feats".
        wavsFolder = wavePath.split(os.sep)[0]
        waveFullPath = os.path.join(rootDir, wavePath)
        featsLocalPath = os.path.join(
            "Feats", wavePath.replace(wavsFolder, featsFolder).replace(".wav", ".csv"))
        featsFullPath = os.path.join(rootDir, featsLocalPath)
        dim = makeFeatsCsv(waveFullPath, featsFullPath, model, maxDur, normalised)
        if dim == 0:
            continue  # extraction failed; leave this sample untouched
        featsDict = getFeatsDict(dim, featsFolder, featsLocalPath)
        samples[ID]["features"][featsDict["ID"]] = featsDict
        printProgressBar(i + 1, len(samples), prefix='Adding wav2vec features:', suffix='Complete', length="fit")
    with open(jsonPath, 'w') as jsonFile:
        json.dump(samples, jsonFile, indent=4, ensure_ascii=False)