import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

# repo-local helpers assumed available: AudioProcessor, text_to_sequence,
# load_wav, pad_data, prepare_tensor


class TextSpeechDataset(Dataset):
    def __init__(self, root_dir, annotations_file, parameters):
        self.root_dir = root_dir
        self.annotations = pd.read_csv(os.path.join(root_dir, annotations_file),
                                       sep="|", header=None)
        self.cleaners = parameters["text_cleaner"]
        self.outputs_per_step = parameters["outputs_per_step"]
        self.sample_rate = parameters["sample_rate"]
        self.ap = AudioProcessor(sample_rate=self.sample_rate,
                                 preemphasis=parameters["preemphasis"],
                                 frequency=parameters["frequency"],
                                 frame_length=parameters["frame_length"],
                                 frame_shift=parameters["frame_shift"],
                                 min_dbs=parameters["min_dbs"],
                                 ref_dbs=parameters["ref_dbs"],
                                 mels_size=parameters["mels_size"],
                                 griff_lim_iters=parameters["griff_lim_iters"],
                                 power=parameters["spectro_power"])

    def __len__(self):
        return self.annotations.shape[0]

    def __getitem__(self, index):
        # column 0: file id, column 1: transcript
        text = self.annotations.iloc[index][1]
        text = np.asarray(text_to_sequence(text=text, cleaner_names=[self.cleaners]),
                          dtype=np.int32)
        wav_path = os.path.join(self.root_dir, "wavs",
                                f"{self.annotations.iloc[index][0]}.wav")
        wav = load_wav(filename=wav_path, sample_rate=self.sample_rate)
        return {"text": text, "wav": wav}

    def collate_fn(self, batch):
        wavs = [sample["wav"] for sample in batch]
        texts = [sample["text"] for sample in batch]

        # pad texts and waveforms to the longest item in the batch
        max_text_len = np.max([len(txt) for txt in texts])
        texts = np.stack([pad_data(txt, max_text_len) for txt in texts]).astype(np.int32)
        max_wav_len = np.max([len(waveform) for waveform in wavs])
        wavs = np.stack([pad_data(x=waveform, length=max_wav_len)
                         for waveform in wavs]).astype(np.int32)

        # compute linear and mel spectrograms from the padded waveforms
        linears = [self.ap.wav_to_linear_spectrogram(waveform=waveform).astype(np.float32)
                   for waveform in wavs]
        mels = [self.ap.wav_to_mel_spectrogram(waveform=waveform).astype(np.float32)
                for waveform in wavs]
        mel_lengths = [mel_.shape[1] + 1 for mel_ in mels]

        # pad spectrogram time axes to a multiple of the decoder's outputs_per_step
        linears = prepare_tensor(inputs=linears, out_steps=self.outputs_per_step)
        mels = prepare_tensor(inputs=mels, out_steps=self.outputs_per_step)

        # (batch, freq, time) -> (batch, time, freq)
        linears = linears.transpose(0, 2, 1)
        mels = mels.transpose(0, 2, 1)

        texts = torch.LongTensor(texts)
        linears = torch.FloatTensor(linears)
        mels = torch.FloatTensor(mels)
        mel_lengths = torch.LongTensor(mel_lengths)
        return texts, linears, mels, mel_lengths
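# Minimal usage sketch (not part of the original file): wiring TextSpeechDataset
# into a standard PyTorch DataLoader through its collate_fn, so every batch
# arrives already padded and tensorized. Paths and parameter values below are
# placeholder assumptions, not values taken from this repo.
from torch.utils.data import DataLoader

params = {"text_cleaner": "english_cleaners", "outputs_per_step": 5,
          "sample_rate": 22050, "preemphasis": 0.97, "frequency": 1024,
          "frame_length": 50, "frame_shift": 12.5, "min_dbs": -100,
          "ref_dbs": 20, "mels_size": 80, "griff_lim_iters": 60,
          "spectro_power": 1.5}  # hypothetical values
dataset = TextSpeechDataset(root_dir="data/LJSpeech-1.1",     # hypothetical path
                            annotations_file="metadata.csv",  # hypothetical name
                            parameters=params)
loader = DataLoader(dataset, batch_size=32, shuffle=True,
                    collate_fn=dataset.collate_fn, num_workers=4)
texts, linears, mels, mel_lengths = next(iter(loader))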
def run_train(c, args, model_params):
    c = copy_config_dict(c)
    ap = AudioProcessor(**c.audio)
    if args.seed is None:
        log_path = os.path.join(c.train_config['logs_path'], c.model_name)
    else:
        log_path = os.path.join(c.train_config['logs_path'], str(args.seed), c.model_name)
        c.train_config['seed'] = args.seed
    os.makedirs(log_path, exist_ok=True)
    tensorboard = TensorboardWriter(os.path.join(log_path, 'tensorboard'))
    trainloader = train_dataloader(copy_config_dict(c), ap,
                                   class_balancer_batch=c.dataset['class_balancer_batch'])
    max_seq_len = trainloader.dataset.get_max_seq_lenght()
    c.dataset['max_seq_len'] = max_seq_len
    model_params['config'] = copy_config_dict(c)
    # save the config in the train dir; it's needed for testing before training and for reproducibility
    save_config_file(c, os.path.join(log_path, 'config.json'))
    # 'one_window' uses overlapping windows during evaluation
    if c.dataset['temporal_control'] == 'one_window':
        c.dataset['temporal_control'] = 'overlapping'
    evaloader = eval_dataloader(c, ap, max_seq_len=max_seq_len)
    return train(args, log_path, args.checkpoint_path, trainloader, evaloader,
                 tensorboard, c, c.model_name, ap, cuda=True, model_params=model_params)
def run_train(c, args, model_params=None):
    ap = AudioProcessor(**c.audio)
    log_path = os.path.join(c.train_config['logs_path'], c.model_name)
    os.makedirs(log_path, exist_ok=True)
    tensorboard = TensorboardWriter(os.path.join(log_path, 'tensorboard'))
    print(c.dataset['train_csv'], c.dataset['eval_csv'])
    trainloader = train_dataloader(c, ap,
                                   class_balancer_batch=c.dataset['class_balancer_batch'])
    max_seq_len = trainloader.dataset.get_max_seq_lenght()
    c.dataset['max_seq_len'] = max_seq_len
    print(c.dataset['train_csv'], c.dataset['eval_csv'])
    # save the config in the train dir; it's needed for testing before training and for reproducibility
    save_config_file(c, os.path.join(log_path, 'config.json'))
    evaloader = eval_dataloader(c, ap, max_seq_len=max_seq_len)
    return train(args, log_path, args.checkpoint_path, trainloader, evaloader,
                 tensorboard, c, c.model_name, ap, cuda=True, model_params=model_params)
                    type=str, required=False, default=False,
                    help="Librispeech format; if true, load with the Librispeech format")
args = parser.parse_args()

os.makedirs(args.out_dir, exist_ok=True)
if args.train_data_csv:
    os.makedirs(os.path.join(args.out_dir, 'train'), exist_ok=True)
if args.test_data_csv:
    os.makedirs(os.path.join(args.out_dir, 'test'), exist_ok=True)

cpu_num = cpu_count()  # num threads = num cpu cores
config = load_config(args.config)
ap = AudioProcessor(config.audio)
sample_rate = config.audio[config.audio['backend']]['sample_rate']
audio_len = config.audio['audio_len']
form = config.dataset['format']
output_dir_train = os.path.join(args.out_dir, 'train')
output_dir_test = os.path.join(args.out_dir, 'test')
dataset_root_dir = args.dataset_root_dir
train_data_csv = None
test_data_csv = None
noise_files = open(args.noise_csv).readlines()
if args.train_data_csv:
    train_data_csv = pd.read_csv(args.train_data_csv, sep=',').values
    outpath = os.path.join(output_dir,
                           os.path.basename(mixed_path).replace(".wav", "_predict.wav"))
    soundfile.write(outpath, est_wav, 16000)
    return est_wav, target_wav, mixed_wav, emb_wav


print("... Load GE2E encoder model ...")
encoder.load_model(Path('encoder/saved_models/pretrained_en.pt'))

checkpoint_path = 'models/demo5.pt'
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model_c = load_config_from_str(checkpoint['config_str'])
ap = AudioProcessor(model_c.audio)  # create AudioProcessor for the model
model_name = model_c.model_name
cuda = False

if model_name == 'voicefilter':
    print('initialized with voicefilter')
    model = VoiceFilter(model_c)
elif model_name == 'voicesplit':
    model = VoiceSplit(model_c)
else:
    raise Exception("The model '" + model_name + "' is not supported")

if model_c.train_config['optimizer'] == 'adam':
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_c.train_config['learning_rate'])
else:
    raise Exception("The optimizer %s is not supported" % model_c.train_config['optimizer'])
help="Root directory of run.") parser.add_argument('-c', '--config_path', type=str, required=False, default=None, help="json file with configurations") parser.add_argument('--checkpoints_path', type=str, required=True, help="path of checkpoint pt file, for continue training") args = parser.parse_args() all_checkpoints = sorted(glob(os.path.join(args.checkpoints_path, '*.pt'))) #print(all_checkpoints, os.listdir(args.checkpoints_path)) if args.config_path: c = load_config(args.config_path) else: #load config in checkpoint checkpoint = torch.load(all_checkpoints[0], map_location='cpu') c = load_config_from_str(checkpoint['config_str']) ap = AudioProcessor(c.audio) log_path = os.path.join(c.train_config['logs_path'], c.model_name) audio_config = c.audio[c.audio['backend']] tensorboard = TensorboardWriter(log_path, audio_config) # set test dataset dir c.dataset['test_dir'] = args.dataset_dir # set batchsize = 32 c.test_config['batch_size'] = 5 test_dataloader = test_dataloader(c, ap) best_loss = 999999999 best_loss_checkpoint = '' sdrs_checkpoint = [] for i in tqdm.tqdm(range(len(all_checkpoints))): checkpoint = all_checkpoints[i] mean_loss= test(args, log_path, checkpoint, test_dataloader, tensorboard, c, c.model_name, ap, cuda=True)
help="path of checkpoint pt file, for continue training") parser.add_argument('--batch_size', type=int, default=20, help="Batch size for test") parser.add_argument('--num_workers', type=int, default=10, help="Number of Workers for test data load") parser.add_argument('--no_insert_noise', type=bool, default=False, help=" No insert noise in test ?") parser.add_argument('--num_noise_control', type=int, default=1, help="Number of Noise for insert in control") parser.add_argument('--num_noise_patient', type=int, default=0, help="Number of Noise for insert in patient") args = parser.parse_args() c = load_config(args.config_path) ap = AudioProcessor(**c.audio) if not args.no_insert_noise: c.data_aumentation['insert_noise'] = True else: c.data_aumentation['insert_noise'] = False # ste values for noisy insertion in test c.data_aumentation["num_noise_control"] = args.num_noise_control c.data_aumentation["num_noise_patient"] = args.num_noise_patient print("Insert noise ?", c.data_aumentation['insert_noise']) c.dataset['test_csv'] = args.test_csv c.dataset['test_data_root_path'] = args.test_root_dir
if __name__ == "__main__":
    # Get defaults so it can work with no Sacred
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', "--wavfile_path", required=True)
    parser.add_argument('-c', '--config', type=str, required=True,
                        help='JSON file for configuration')
    args = parser.parse_args()
    config = load_config(args.config)
    filepath = args.wavfile_path

    # extract spectrogram
    config.audio['feature'] = 'spectrogram'
    ap = AudioProcessor(**config.audio)
    spectrogram = ap.get_feature_from_audio_path(filepath)
    print("Spectrogram with shape:", spectrogram.shape)

    # extract melspectrogram
    config.audio['feature'] = 'melspectrogram'
    ap = AudioProcessor(**config.audio)
    melspectrogram = ap.get_feature_from_audio_path(filepath)
    print("MelSpectrogram with shape:", melspectrogram.shape)

    # extract MFCC
    config.audio['feature'] = 'mfcc'
    ap = AudioProcessor(**config.audio)
    mfcc = ap.get_feature_from_audio_path(filepath)
    print("MFCC with shape:", mfcc.shape)
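# A compact equivalent of the three extraction blocks above (a sketch using only
# the AudioProcessor calls already exercised in this script): loop over the
# feature names instead of repeating the code.
for feature in ('spectrogram', 'melspectrogram', 'mfcc'):
    config.audio['feature'] = feature
    ap = AudioProcessor(**config.audio)
    feat = ap.get_feature_from_audio_path(filepath)
    print(f"{feature} with shape:", feat.shape)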
def run_test_all_seeds(args, cuda=True, debug=False, return_potential=False):
    runs_list = os.listdir(args.experiment_dir)
    runs_list.sort()
    num_runs = len(runs_list)
    votes = []
    wav_files = []
    targets = []
    # define loss function
    criterion = nn.BCELoss(reduction='sum')
    for run in runs_list:
        blockPrint()
        run_dir = os.path.join(args.experiment_dir, run)
        if os.path.isfile(run_dir):
            continue
        model_name = os.listdir(run_dir)[0]
        checkpoint_path = os.path.join(run_dir, model_name, 'best_checkpoint.pt')
        config_path = os.path.join(run_dir, model_name, 'config.json')
        c = load_config(config_path)
        ap = AudioProcessor(**c.audio)
        c.dataset['test_csv'] = args.test_csv
        c.dataset['test_data_root_path'] = args.test_root_dir
        c.test_config['batch_size'] = args.batch_size
        c.test_config['num_workers'] = args.num_workers
        max_seq_len = c.dataset['max_seq_len']
        c.train_config['seed'] = 0
        testdataloader = test_dataloader(c, ap, max_seq_len=max_seq_len)
        # load model
        model = return_model(c)
        enablePrint()
        if checkpoint_path is not None:
            print("Loading checkpoint: %s" % checkpoint_path)
            try:
                checkpoint = torch.load(checkpoint_path, map_location='cpu')
                model.load_state_dict(checkpoint['model'])
                print("Model loaded successfully!")
            except Exception as e:
                raise ValueError(
                    "You need to pass a valid checkpoint; you may need to check your "
                    "config.json, because the config of this checkpoint caused the error: "
                    + str(e))
        blockPrint()
        # move model to cuda
        if cuda:
            model = model.cuda()
        model.train(False)
        vote, targets, wav_path = test(criterion, ap, model, c, testdataloader,
                                       cuda=cuda, confusion_matrix=True,
                                       debug=debug, simples_vote=args.simples_vote)
        wav_files.append(wav_path)
        votes.append(vote)
        if len(wav_files):
            if wav_files[-1] != wav_files[0]:
                raise ValueError(
                    "Different files or order for the test in different seeds or folds")

    # mean vote; rounding is necessary when using the composite vote
    preds = np.mean(np.array(votes), axis=0)
    if not return_potential:
        preds = preds.round()

    file_names = wav_files[0]

    if debug and not return_potential:
        enablePrint()
        targets = np.array(targets)
        preds = np.array(preds)
        names = np.array(file_names)
        idxs = np.nonzero(targets == c.dataset['control_class'])
        control_target = targets[idxs]
        control_preds = preds[idxs]
        names_control = names[idxs]
        idxs = np.nonzero(targets == c.dataset['patient_class'])
        patient_target = targets[idxs]
        patient_preds = preds[idxs]
        names_patient = names[idxs]
        if debug:
            print('+' * 40)
            print("Control files classified incorrectly:")
            incorrect_ids = np.nonzero(control_preds != c.dataset['control_class'])
            inc_names = names_control[incorrect_ids]
            print("Num. files:", len(inc_names))
            print(inc_names)
            print('+' * 40)
            print('-' * 40)
            print("Patient files classified incorrectly:")
            incorrect_ids = np.nonzero(patient_preds != c.dataset['patient_class'])
            inc_names = names_patient[incorrect_ids]
            print("Num. files:", len(inc_names))
            print(inc_names)
            print('-' * 40)
        acc_control = (control_preds == control_target).mean()
        acc_patient = (patient_preds == patient_target).mean()
        acc_balanced = (acc_control + acc_patient) / 2
        f1 = f1_score(targets.tolist(), preds.tolist())
        uar = recall_score(targets.tolist(), preds.tolist(), average='macro')
        print("======== Confusion Matrix ==========")
        y_target = pd.Series(targets, name='Target')
        y_pred = pd.Series(preds, name='Predicted')
        df_confusion = pd.crosstab(y_target, y_pred, rownames=['Target'],
                                   colnames=['Predicted'], margins=True)
        print(df_confusion)
        print("Test\n ", "Accuracy Control:", acc_control,
              "Accuracy Patient:", acc_patient,
              "Accuracy Balanced:", acc_balanced)
        print("F1:", f1, "UAR:", uar)

    if return_potential:
        return preds, file_names
    else:
        df = pd.DataFrame({'filename': file_names,
                           'prediction': preds.astype(int)})
        df['prediction'] = df['prediction'].replace(
            int(c.dataset['control_class']), 'negative', regex=True).replace(
            int(c.dataset['patient_class']), 'positive', regex=True)
        if args.output_csv:
            out_csv_path = args.output_csv
        else:
            out_csv_path = os.path.join(args.experiment_dir,
                                        os.path.basename(c.dataset['test_csv']))
        df.to_csv(out_csv_path, index=False)
"audio_len": 3, "voicefilter": { "n_fft": 1200, "num_mels": 40, "num_freq": 601, "sample_rate": 16000, "hop_length": 160, "win_length": 400, "min_level_db": -100.0, "ref_level_db": 20.0, "preemphasis": 0.97, "power": 1.5, "griffin_lim_iters": 60 } } ap = AudioProcessor(config) # In[9]: os.listdir(TEST_DATA) # In[12]: #Preprocess dataset train_files = sorted(glob(os.path.join(TRAIN_DATA, glob_re_wav_emb))) test_files = sorted(glob(os.path.join(TEST_DATA, glob_re_wav_emb))) if len(train_files) == 0 or len(test_files): print("check train and test path files not in directory") files = train_files + test_files
]

if __name__ == "__main__":
    # Get defaults so it can work with no Sacred
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True,
                        help='JSON file for configuration')
    args = parser.parse_args()
    config = load_config(args.config)

    # extract spectrogram
    config.audio['feature'] = 'spectrogram'
    ap = AudioProcessor(**config.audio)

    max_amp = 0
    min_amp = 99999999999999999
    for samples_file in samples:
        for sample in samples_file:
            loc_max_amp = 0
            wav = ap.load_wav(sample[0])
            inicio = sample[1]  # start time in seconds
            fim = sample[2]     # end time in seconds
            slice_start = int(inicio * ap.sample_rate)
            slice_end = int(fim * ap.sample_rate)
            wav = wav[:, slice_start:slice_end]
            wav_max_amp = wav.max().numpy()
            if wav_max_amp > loc_max_amp:
                loc_max_amp = wav_max_amp