def cut_wav(wav_filename, textgrid_filename, output_folder):
    wav, sr = librosa.load(wav_filename)
    cut_info_dict = parse_TextGrid.parse_textgrid(textgrid_filename)
    total_wav = np.array([])
    cnt = 0
    for ind, cut_info in enumerate(cut_info_dict):
        if cut_info != "None":
            # Save each labeled interval as its own numbered clip
            filename = str(cnt) + "_" + cut_info + ".wav"
            fn = os.path.join(output_folder, filename)
            wav_part, _ = utils.cut_wav_save(wav, sr,
                                             cut_info_dict[cut_info][0],
                                             cut_info_dict[cut_info][1],
                                             fn)
            cnt = cnt + 1
            total_wav = np.concatenate((total_wav, wav_part))
    # Also write the concatenation of all kept intervals
    total_filename = os.path.join(output_folder, "total.wav")
    utils.save_wav(total_wav, sr, total_filename)

def synthesize(mel_sp, save_path, weight_path):
    wavenet = WaveNet(hparams.num_mels, hparams.upsample_scales)
    wavenet.load_weights(weight_path)
    mel_sp = tf.expand_dims(mel_sp, axis=0)  # add a batch dimension
    outputs = wavenet.synthesis(mel_sp)
    outputs = np.squeeze(outputs)
    outputs = inv_mulaw_quantize(outputs)  # invert mu-law quantization
    save_wav(outputs, save_path, hparams.sampling_rate)

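# A minimal sketch of what inv_mulaw_quantize above is assumed to do: map
# quantized class indices back to a waveform in [-1, 1] via mu-law expansion.
# The helper name and the default mu=255 (8-bit) are assumptions, not
# necessarily this project's actual implementation.
import numpy as np

def inv_mulaw_quantize_sketch(y, mu=255):
    # Undo quantization: class index {0..mu} -> [-1, 1]
    x = 2.0 * y.astype(np.float64) / mu - 1.0
    # Undo mu-law companding: x -> sign(x) * ((1 + mu)^|x| - 1) / mu
    return np.sign(x) * ((1.0 + mu) ** np.abs(x) - 1.0) / mu
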
def cut_total_wav(wav_filename, textgrid_filename, output_filename):
    wav, sr = librosa.load(wav_filename)
    cut_info_dict = parse_TextGrid.parse_textgrid(textgrid_filename)
    total_wav = np.array([])
    for ind, cut_info in enumerate(cut_info_dict):
        if cut_info != "None":
            wav_part = utils.cut_wav(wav, sr,
                                     cut_info_dict[cut_info][0],
                                     cut_info_dict[cut_info][1])
            total_wav = np.concatenate((total_wav, wav_part))
    utils.save_wav(total_wav, sr, output_filename)

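# Hypothetical usage of the two cutters above; the paths are placeholders and
# the output folder is assumed to exist.
if __name__ == "__main__":
    # Per-label clips plus a concatenated total.wav
    cut_wav("recordings/session1.wav", "recordings/session1.TextGrid",
            "output/session1")
    # Only the concatenated non-"None" intervals, written to a single file
    cut_total_wav("recordings/session1.wav", "recordings/session1.TextGrid",
                  "output/session1_total.wav")
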
def evaluation(model, step, device, args):
    # Evaluation
    model.eval()
    with torch.no_grad():
        # Preprocess eval texts
        print('Start generating evaluation speeches...')
        n_eval = len(hps.eval_texts)
        for i in range(n_eval):
            sys.stdout.write('\rProgress: {}/{}'.format(i + 1, n_eval))
            sys.stdout.flush()
            text = hps.eval_texts[i]
            text = text_normalize(text)
            txt_id = sent2idx(text) + [hps.vocab.find('E')]
            txt_len = len(txt_id)
            GO_frame = torch.zeros(1, 1, hps.n_mels)
            # Shape: (1, seq_length)
            txt = torch.LongTensor([txt_id])
            txt_len = torch.LongTensor([txt_len])
            if args.cuda:
                GO_frame = GO_frame.cuda()
                txt = txt.cuda()
                txt_len = txt_len.cuda()  # fixed: .cuda() is not in-place, the result must be reassigned
            _batch = model(text=txt, frames=GO_frame, text_length=txt_len)
            mel = _batch['mel'][0]
            mag = _batch['mag'][0]
            attn = _batch['attn'][0]
            if args.cuda:
                mel = mel.cpu()
                mag = mag.cpu()
                attn = attn.cpu()
            mel = mel.numpy()
            mag = mag.numpy()
            attn = attn.numpy()
            wav = mag2wav(mag)
            save_alignment(attn, step, 'eval/plots/attn_{}.png'.format(text))
            save_spectrogram(mag, 'eval/plots/spectrogram_[{}].png'.format(text))
            save_wav(wav, 'eval/results/wav_{}.wav'.format(text))
        sys.stdout.write('\n')

def pad2drums(read_from_fname, save_to_fname):
    """
    Reads a .wav file in folder "raw_audio" recorded from a drum pad (with
    the mic about 10 cm away) and converts it to a .wav file with drum
    sounds in place of the pad sounds. The created file is placed in folder
    "results".
    """
    load_path = 'raw_audio/'
    fs, raw_audio = load_wav(load_path + read_from_fname)
    # Detect the pad hits in the raw audio
    hit_indices, hit_strengths = detect_sound(raw_audio, stereo=True)
    dg = DrumGenerator(fs=fs)
    drum_audio = dg.generate_drum_audio(hit_indices, hit_strengths, raw_audio.size)
    # Save drum_audio under the file name supplied by the user
    save_path = 'results/' + save_to_fname
    save_wav(save_path, drum_audio, fs)

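# Hypothetical usage; the file names are placeholders, and the raw_audio/ and
# results/ folders are assumed to exist.
if __name__ == "__main__":
    pad2drums("pad_take1.wav", "drums_take1.wav")
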
def demo():
    mir1k_sr = 16000
    n_fft = 1024
    hop_length = n_fft // 4
    num_rnn_layer = 3
    num_hidden_units = 256
    checkpoint = torch.load("final_model.pth")

    mir1k_dir = 'data/MIR1K/MIR-1K'
    test_path = os.path.join(mir1k_dir, 'MIR-1K_test.json')
    with open(test_path, 'r') as text_file:
        content = json.load(text_file)
    wav_filenames = ["{}/{}".format("data/MIR1K/MIR-1K/Wavfile", f) for f in content]
    # Override: use a single local sample file for the demo
    wav_filenames = ["../HW3/sample_music.wav"]

    wavs_mono, wavs_src1, wavs_src2 = load_wavs(filenames=wav_filenames, sr=mir1k_sr)
    stfts_mono, stfts_src1, stfts_src2 = wavs_to_specs(
        wavs_mono=wavs_mono, wavs_src1=wavs_src1, wavs_src2=wavs_src2,
        n_fft=n_fft, hop_length=hop_length)
    stfts_mono_full, stfts_src1_full, stfts_src2_full = prepare_data_full(
        stfts_mono=stfts_mono, stfts_src1=stfts_src1, stfts_src2=stfts_src2)

    model = Model(n_fft // 2 + 1, num_hidden_units).to(device)
    model.load_state_dict(checkpoint["model_state_dict"])

    wavs_src1_pred = list()
    wavs_src2_pred = list()
    model.eval()
    with torch.no_grad():
        for wav_filename, wav_mono, stft_mono_full in zip(wav_filenames, wavs_mono, stfts_mono_full):
            stft_mono_magnitude, stft_mono_phase = sperate_magnitude_phase(data=stft_mono_full)
            stft_mono_magnitude = np.array([stft_mono_magnitude])
            stft_mono_magnitude = torch.Tensor(stft_mono_magnitude).to(device)
            y1_pred, y2_pred = model(stft_mono_magnitude)

            # ISTFT with the phase taken from the mono mixture
            y1_pred = y1_pred.cpu().numpy()
            y2_pred = y2_pred.cpu().numpy()
            y1_stft_hat = combine_magnitude_phase(y1_pred[0], stft_mono_phase)
            y2_stft_hat = combine_magnitude_phase(y2_pred[0], stft_mono_phase)
            y1_stft_hat = y1_stft_hat.transpose()
            y2_stft_hat = y2_stft_hat.transpose()
            y1_hat = librosa.istft(y1_stft_hat, hop_length=hop_length)
            y2_hat = librosa.istft(y2_stft_hat, hop_length=hop_length)

            filename = "demo/" + wav_filename.split("/")[-1]
            save_wav(filename + "_mono.wav", wav_mono)
            save_wav(filename + "_src1.wav", y1_hat)  # fixed: missing .wav extension
            save_wav(filename + "_src2.wav", y2_hat)  # fixed: missing .wav extension
    print("done")

# Note: the beginning of this snippet was missing. The function name and the
# head of the Vocoder(...) call are reconstructed (as assumptions) from the
# identical constructor in train_fn below; everything from bits= onward is
# original.
def generate_fn(args, params):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Vocoder(mel_channels=params["preprocessing"]["num_mels"],
                    conditioning_channels=params["vocoder"]["conditioning_channels"],
                    embedding_dim=params["vocoder"]["embedding_dim"],
                    rnn_channels=params["vocoder"]["rnn_channels"],
                    fc_channels=params["vocoder"]["fc_channels"],
                    bits=params["preprocessing"]["bits"],
                    hop_length=params["preprocessing"]["hop_length"],
                    nc=args.nc,
                    device=device)
    model.to(device)

    print("Load checkpoint from: {}:".format(args.checkpoint))
    checkpoint = torch.load(args.checkpoint, map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["model"])
    model_step = checkpoint["step"]

    wav = load_wav(args.wav_path, params["preprocessing"]["sample_rate"])
    utterance_id = os.path.basename(args.wav_path).split(".")[0]
    wav = wav / np.abs(wav).max() * 0.999  # peak-normalize just below 1.0

    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])
    mel = torch.FloatTensor(mel).unsqueeze(0).to(device)  # add a batch dimension
    output = model.generate(mel)
    path = os.path.join(args.gen_dir,
                        "gen_{}_model_steps_{}.wav".format(utterance_id, model_step))
    save_wav(path, output, params["preprocessing"]["sample_rate"])

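# A hedged sketch of the argument parser this fragment appears to rely on.
# The flag names are taken from the attribute accesses above (args.checkpoint,
# args.wav_path, args.gen_dir, args.nc); types, defaults, and the --nc
# semantics are assumptions.
import argparse

def build_parser():
    parser = argparse.ArgumentParser("vocoder generation")
    parser.add_argument("--checkpoint", type=str, required=True,
                        help="model checkpoint path")
    parser.add_argument("--wav_path", type=str, required=True,
                        help="input wav to copy-synthesize")
    parser.add_argument("--gen_dir", type=str, default="gen",
                        help="output directory")
    parser.add_argument("--nc", action="store_true",
                        help="model-specific flag passed through to Vocoder")
    return parser
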
def train_fn(args, params):
    # Directory preparation
    exp_dir = makeExpDirs(args.results_dir, args.exp_name)

    # Automatic Mixed-Precision
    if args.optim != "no":
        import apex

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Vocoder(mel_channels=params["preprocessing"]["num_mels"],
                    conditioning_channels=params["vocoder"]["conditioning_channels"],
                    embedding_dim=params["vocoder"]["embedding_dim"],
                    rnn_channels=params["vocoder"]["rnn_channels"],
                    fc_channels=params["vocoder"]["fc_channels"],
                    bits=params["preprocessing"]["bits"],
                    hop_length=params["preprocessing"]["hop_length"],
                    nc=args.nc,
                    device=device)
    model.to(device)
    print(model)

    optimizer = optim.Adam(model.parameters(), lr=params["vocoder"]["learning_rate"])

    # Automatic Mixed-Precision
    if args.optim != "no":
        model, optimizer = apex.amp.initialize(model, optimizer, opt_level=args.optim)

    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          params["vocoder"]["schedule"]["step_size"],
                                          params["vocoder"]["schedule"]["gamma"])

    if args.resume is not None:
        print(f"Resume checkpoint from: {args.resume}:")
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        global_step = checkpoint["step"]
    else:
        global_step = 0

    train_dataset = VocoderDataset(meta_file=os.path.join(args.data_dir, "train.txt"),
                                   sample_frames=params["vocoder"]["sample_frames"],
                                   audio_slice_frames=params["vocoder"]["audio_slice_frames"],
                                   hop_length=params["preprocessing"]["hop_length"],
                                   bits=params["preprocessing"]["bits"])
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=params["vocoder"]["batch_size"],
                                  shuffle=True,
                                  num_workers=1,
                                  pin_memory=True)

    num_epochs = params["vocoder"]["num_steps"] // len(train_dataloader) + 1
    start_epoch = global_step // len(train_dataloader) + 1

    # Logger
    writer = SummaryWriter(exp_dir/"logs")

    # Add the original utterance to TensorBoard
    if args.resume is None:
        with open(os.path.join(args.data_dir, "test.txt"), encoding="utf-8") as f:
            test_wavnpy_paths = [line.strip().split("|")[1] for line in f]
        for index, wavnpy_path in enumerate(test_wavnpy_paths):
            muraw_npy = np.load(wavnpy_path)
            wav_npy = mulaw_decode(muraw_npy, 2**params["preprocessing"]["bits"])
            writer.add_audio("orig", torch.from_numpy(wav_npy),
                             global_step=global_step,
                             sample_rate=params["preprocessing"]["sample_rate"])
            break  # only the first test utterance

    for epoch in range(start_epoch, num_epochs + 1):
        running_loss = 0
        for i, (audio, mels) in enumerate(tqdm(train_dataloader, leave=False), 1):
            audio, mels = audio.to(device), mels.to(device)
            output = model(audio[:, :-1], mels)
            loss = F.cross_entropy(output.transpose(1, 2), audio[:, 1:])
            optimizer.zero_grad()

            # Automatic Mixed-Precision
            if args.optim != "no":
                with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            scheduler.step()
            running_loss += loss.item()
            average_loss = running_loss / i
            global_step += 1

            if global_step % args.save_step == 0:
                save_checkpoint(model, optimizer, scheduler, global_step,
                                exp_dir/"params", False)
            if global_step % params["vocoder"]["checkpoint_interval"] == 0:
                save_checkpoint(model, optimizer, scheduler, global_step,
                                exp_dir/"params", True)
            if global_step % params["vocoder"]["generation_interval"] == 0:
                with open(os.path.join(args.data_dir, "test.txt"), encoding="utf-8") as f:
                    test_mel_paths = [line.strip().split("|")[2] for line in f]
                for index, mel_path in enumerate(test_mel_paths):
                    utterance_id = os.path.basename(mel_path).split(".")[0]
                    # unsqueeze: wrap the mel in a batch of size 1
                    mel = torch.FloatTensor(np.load(mel_path)).unsqueeze(0).to(device)
                    output = model.generate(mel)
                    path = exp_dir/"samples"/f"gen_{utterance_id}_model_steps_{global_step}.wav"
                    save_wav(str(path), output, params["preprocessing"]["sample_rate"])
                    if index == 0:
                        writer.add_audio("cnvt", torch.from_numpy(output),
                                         global_step=global_step,
                                         sample_rate=params["preprocessing"]["sample_rate"])
        # finish an epoch
        writer.add_scalar("NLL", average_loss, global_step)

def run(args):
    # Check cuda device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Data
    if hps.bucket:
        dataset = LJSpeech_Dataset(meta_file=hps.meta_path,
                                   wav_dir=hps.wav_dir,
                                   batch_size=hps.batch_size,
                                   do_bucket=True,
                                   bucket_size=20)
        loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=4)
    else:
        dataset = LJSpeech_Dataset(meta_file=hps.meta_path, wav_dir=hps.wav_dir)
        loader = DataLoader(dataset,
                            batch_size=hps.batch_size,
                            shuffle=True,
                            num_workers=4,
                            drop_last=True,
                            collate_fn=collate_fn)

    # Network
    model = Tacotron()
    criterion = nn.L1Loss()
    if args.cuda:
        model = nn.DataParallel(model.to(device))
        criterion = criterion.to(device)

    # The learning-rate scheduling mechanism from "Attention Is All You Need"
    lr_lambda = lambda step: hps.warmup_step ** 0.5 * min(
        (step + 1) * (hps.warmup_step ** -1.5), (step + 1) ** -0.5)
    optimizer = optim.Adam(model.parameters(), lr=hps.lr)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    step = 1
    epoch = 1
    # Load model
    if args.ckpt:
        ckpt = load(args.ckpt)
        step = ckpt['step']
        epoch = ckpt['epoch']
        model.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optimizer'])
        scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=step)

    if args.eval:
        # Evaluation
        model.eval()
        with torch.no_grad():
            # Preprocess eval texts
            print('Start generating evaluation speeches...')
            n_eval = len(hps.eval_texts)
            for i in range(n_eval):
                sys.stdout.write('\rProgress: {}/{}'.format(i + 1, n_eval))
                sys.stdout.flush()
                text = hps.eval_texts[i]
                text = text_normalize(text)
                txt_id = sent2idx(text) + [hps.char_set.find('E')]
                GO_frame = torch.zeros(1, 1, hps.n_mels)
                # Shape: (1, seq_length)
                txt = torch.LongTensor(txt_id).unsqueeze(0)
                if args.cuda:
                    GO_frame = GO_frame.cuda()
                    txt = txt.cuda()
                _batch = model(text=txt, frames=GO_frame)
                mel = _batch['mel'][0]
                mag = _batch['mag'][0]
                attn = _batch['attn'][0]
                if args.cuda:
                    mel = mel.cpu()
                    mag = mag.cpu()
                    attn = attn.cpu()
                mel = mel.numpy()
                mag = mag.numpy()
                attn = attn.numpy()
                wav = mag2wav(mag)
                save_alignment(attn, step, 'eval/plots/attn_{}.png'.format(text))
                save_spectrogram(mag, 'eval/plots/spectrogram_[{}].png'.format(text))
                save_wav(wav, 'eval/results/wav_{}.wav'.format(text))
            sys.stdout.write('\n')

    if args.train:
        before_load = time.time()
        # Start training
        model.train()
        while True:
            for batch in loader:
                # torch.LongTensor, (batch_size, seq_length)
                txt = batch['text']
                # torch.Tensor, (batch_size, max_time, hps.n_mels)
                mel = batch['mel']
                # torch.Tensor, (batch_size, max_time, hps.n_fft)
                mag = batch['mag']
                if hps.bucket:
                    # If bucketing, the shape will be (1, batch_size, ...)
                    txt = txt.squeeze(0)
                    mel = mel.squeeze(0)
                    mag = mag.squeeze(0)
                # GO frame
                GO_frame = torch.zeros(mel[:, :1, :].size())
                if args.cuda:
                    txt = txt.to(device)
                    mel = mel.to(device)
                    mag = mag.to(device)
                    GO_frame = GO_frame.to(device)
                # Model prediction (teacher forcing with reduction factor)
                decoder_input = torch.cat(
                    [GO_frame, mel[:, hps.reduction_factor::hps.reduction_factor, :]],
                    dim=1)
                load_time = time.time() - before_load
                before_step = time.time()

                _batch = model(text=txt, frames=decoder_input)
                _mel = _batch['mel']
                _mag = _batch['mag']
                _attn = _batch['attn']

                # Optimization
                optimizer.zero_grad()
                loss_mel = criterion(_mel, mel)
                loss_mag = criterion(_mag, mag)
                loss = loss_mel + loss_mag
                loss.backward()
                # Gradient clipping
                total_norm = clip_grad_norm_(model.parameters(), max_norm=hps.clip_norm)
                # Apply gradient
                optimizer.step()
                # Adjust learning rate
                scheduler.step()
                process_time = time.time() - before_step

                if step % hps.log_every_step == 0:
                    lr_curr = optimizer.param_groups[0]['lr']
                    log = '[{}-{}] loss: {:.3f}, grad: {:.3f}, lr: {:.3e}, time: {:.2f} + {:.2f} sec'.format(
                        epoch, step, loss.item(), total_norm, lr_curr, load_time, process_time)
                    print(log)
                if step % hps.save_model_every_step == 0:
                    save(filepath='tmp/ckpt/ckpt_{}.pth.tar'.format(step),
                         model=model.state_dict(),
                         optimizer=optimizer.state_dict(),
                         step=step,
                         epoch=epoch)
                if step % hps.save_result_every_step == 0:
                    sample_idx = random.randint(0, hps.batch_size - 1)
                    attn_sample = _attn[sample_idx].detach().cpu().numpy()
                    mag_sample = _mag[sample_idx].detach().cpu().numpy()
                    wav_sample = mag2wav(mag_sample)
                    # Save results
                    save_alignment(attn_sample, step, 'tmp/plots/attn_{}.png'.format(step))
                    save_spectrogram(mag_sample, 'tmp/plots/spectrogram_{}.png'.format(step))
                    save_wav(wav_sample, 'tmp/results/wav_{}.wav'.format(step))

                before_load = time.time()
                step += 1
            epoch += 1

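# The lr_lambda above is the Noam schedule from "Attention Is All You Need":
# linear warmup for warmup_step steps, then inverse-square-root decay. A
# standalone sketch (warmup_step=4000 is an assumed value; hps is not shown):
def noam_factor(step, warmup_step=4000):
    return warmup_step ** 0.5 * min((step + 1) * warmup_step ** -1.5,
                                    (step + 1) ** -0.5)

# noam_factor(0)     == 2.5e-4  (start of warmup)
# noam_factor(3999)  == 1.0     (peak, reached after warmup_step steps)
# noam_factor(15999) == 0.5     (decays as 1/sqrt(step) afterwards)
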
def train(model, loader, optimizer, criterion, scheduler, step, epoch, device, args):
    before_load = time.time()
    # Start training
    model.train()
    while True:
        for batch in loader:
            # torch.LongTensor, (batch_size, seq_length)
            txt = batch['text']
            # torch.Tensor, (batch_size, max_time, hps.n_mels)
            mel = batch['mel']
            # torch.Tensor, (batch_size, max_time, hps.n_fft)
            mag = batch['mag']
            # torch.LongTensor, (batch_size, )
            txt_len = batch['text_length']
            frame_len = batch['frame_length']
            if hps.bucket:
                # If bucketing, the shape will be (1, batch_size, ...)
                txt = txt.squeeze(0)
                mel = mel.squeeze(0)
                mag = mag.squeeze(0)
                txt_len = txt_len.squeeze(0)
                frame_len = frame_len.squeeze(0)
            # GO frame
            GO_frame = torch.zeros(mel[:, :1, :].size())
            if args.cuda:
                txt = txt.to(device)
                mel = mel.to(device)
                mag = mag.to(device)
                GO_frame = GO_frame.to(device)
            # Model prediction (teacher forcing with reduction factor)
            decoder_input = torch.cat([
                GO_frame,
                mel[:, hps.reduction_factor::hps.reduction_factor, :]
            ], dim=1)
            load_time = time.time() - before_load
            before_step = time.time()

            _batch = model(text=txt, frames=decoder_input,
                           text_length=txt_len, frame_length=frame_len)
            _mel = _batch['mel']
            _mag = _batch['mag']
            _attn = _batch['attn']

            # Optimization
            optimizer.zero_grad()
            loss_mel = criterion(_mel, mel)
            loss_mag = criterion(_mag, mag)
            loss = loss_mel + loss_mag
            loss.backward()
            # Gradient clipping
            total_norm = clip_grad_norm_(model.parameters(), max_norm=hps.clip_norm)
            # Apply gradient
            optimizer.step()
            # Adjust learning rate
            scheduler.step()
            process_time = time.time() - before_step

            if step % hps.log_every_step == 0:
                lr_curr = optimizer.param_groups[0]['lr']
                log = ('[{}-{}] total_loss: {:.3f}, mel_loss: {:.3f}, mag_loss: {:.3f}, '
                       'grad: {:.3f}, lr: {:.3e}, time: {:.2f} + {:.2f} sec').format(
                    epoch, step, loss.item(), loss_mel.item(), loss_mag.item(),
                    total_norm, lr_curr, load_time, process_time)
                print(log)
            if step % hps.save_model_every_step == 0:
                save(filepath='tmp/ckpt/ckpt_{}.pth.tar'.format(step),
                     model=model.state_dict(),
                     optimizer=optimizer.state_dict(),
                     step=step,
                     epoch=epoch)
            if step % hps.save_result_every_step == 0:
                sample_idx = random.randint(0, hps.batch_size - 1)
                attn_sample = _attn[sample_idx].detach().cpu().numpy()
                mag_sample = _mag[sample_idx].detach().cpu().numpy()
                wav_sample = mag2wav(mag_sample)
                # Save results
                save_alignment(attn_sample, step, 'tmp/plots/attn_{}.png'.format(step))
                save_spectrogram(mag_sample, 'tmp/plots/spectrogram_{}.png'.format(step))
                save_wav(wav_sample, 'tmp/results/wav_{}.wav'.format(step))

            before_load = time.time()
            step += 1
        epoch += 1

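# Both training loops above build decoder_input by taking every r-th
# ground-truth mel frame (teacher forcing with reduction factor r) prefixed by
# a GO frame. A minimal shape check (r=5 is an assumed value for
# hps.reduction_factor):
import torch

def make_decoder_input(mel, r=5):
    go = torch.zeros(mel[:, :1, :].size())          # (B, 1, n_mels) GO frame
    return torch.cat([go, mel[:, r::r, :]], dim=1)  # (B, ~T//r, n_mels)

# e.g. mel of shape (2, 100, 80) -> decoder input of shape (2, 20, 80)
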
def main():
    hps = Hparams
    parser = argparse.ArgumentParser('VC inference')
    parser.add_argument('--src_wav', type=str, help='source wav file path')
    parser.add_argument('--ckpt', type=str, help='model ckpt path')
    parser.add_argument('--save_dir', type=str, help='synthesized wav save directory')
    args = parser.parse_args()

    # 0. Load and pre-emphasize the source wav
    src_wav_arr = load_wav(args.src_wav)
    pre_emphasized_wav = _preemphasize(src_wav_arr)

    # 1. Extract PPGs
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(out_dims=hps.Audio.ppg_dim,
                                       n_cnn=ppg_extractor_hps.n_cnn,
                                       cnn_hidden=ppg_extractor_hps.cnn_hidden,
                                       cnn_kernel=ppg_extractor_hps.cnn_kernel,
                                       n_blstm=ppg_extractor_hps.n_blstm,
                                       lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']
    # Set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # Load the saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)
    mfcc_feats = wav2unnormalized_mfcc(src_wav_arr)
    ppg = sess.run(predicted_ppgs,
                   feed_dict={mfcc_pl: np.expand_dims(mfcc_feats, axis=0)})
    sess.close()
    ppg = softmax(np.squeeze(ppg, axis=0))

    # 2. Extract lf0 and mel-spectrogram
    log_f0 = logf0(args.src_wav)
    log_f0 = lf0_normailze(log_f0)
    # The mel-spectrogram is extracted for comparison
    mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T

    # 3. Prepare inputs
    min_len = min(log_f0.shape[0], ppg.shape[0])
    vc_inputs = np.concatenate([ppg[:min_len, :], log_f0[:min_len, :]], axis=1)
    vc_inputs = np.expand_dims(vc_inputs, axis=1)  # [time, batch, dim]

    # 4. Set up the VC model and run inference
    model = BLSTMConversionModel(in_channels=hps.Audio.ppg_dim + 2,
                                 out_channels=hps.Audio.num_mels,
                                 lstm_hidden=hps.BLSTMConversionModel.lstm_hidden)
    device = torch.device('cpu')
    model.load_state_dict(torch.load(args.ckpt, map_location=device))
    model.eval()
    predicted_mels = model(torch.tensor(vc_inputs))
    predicted_mels = np.squeeze(predicted_mels.detach().numpy(), axis=1)

    # 5. Synthesize wav (plus a resynthesis of the source mel for comparison)
    synthesized_wav = inv_preemphasize(inv_mel_spectrogram(predicted_mels.T))
    resynthesized_wav = inv_preemphasize(inv_mel_spectrogram(mel_spec.T))
    ckpt_name = args.ckpt.split('/')[-1].split('.')[0]
    wav_name = args.src_wav.split('/')[-1].split('.')[0]
    save_wav(synthesized_wav,
             os.path.join(args.save_dir, '{}-{}-converted.wav'.format(wav_name, ckpt_name)))
    save_wav(resynthesized_wav,
             os.path.join(args.save_dir, '{}-{}-src-resyn.wav'.format(wav_name, ckpt_name)))

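# A numerically stable variant of the softmax applied to the raw PPG logits
# above; a generic sketch, not necessarily this project's own helper.
import numpy as np

def stable_softmax(x, axis=-1):
    z = x - np.max(x, axis=axis, keepdims=True)  # shift so exp() cannot overflow
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)
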
def train_fn(args, params):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Vocoder(mel_channels=params["preprocessing"]["num_mels"],
                    conditioning_channels=params["vocoder"]["conditioning_channels"],
                    embedding_dim=params["vocoder"]["embedding_dim"],
                    rnn_channels=params["vocoder"]["rnn_channels"],
                    fc_channels=params["vocoder"]["fc_channels"],
                    bits=params["preprocessing"]["bits"],
                    hop_length=params["preprocessing"]["hop_length"])
    model.to(device)
    print(model)

    optimizer = optim.Adam(model.parameters(), lr=params["vocoder"]["learning_rate"])
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          params["vocoder"]["schedule"]["step_size"],
                                          params["vocoder"]["schedule"]["gamma"])

    if args.resume is not None:
        print("Resume checkpoint from: {}:".format(args.resume))
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["model"])
        global_step = checkpoint["step"]
    else:
        global_step = 0

    train_dataset = VocoderDataset(meta_file=os.path.join(args.data_dir, "train.txt"),
                                   sample_frames=params["vocoder"]["sample_frames"],
                                   audio_slice_frames=params["vocoder"]["audio_slice_frames"],
                                   hop_length=params["preprocessing"]["hop_length"],
                                   bits=params["preprocessing"]["bits"])
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=params["vocoder"]["batch_size"],
                                  shuffle=True,
                                  num_workers=1,
                                  pin_memory=True)

    num_epochs = params["vocoder"]["num_steps"] // len(train_dataloader) + 1
    start_epoch = global_step // len(train_dataloader) + 1

    for epoch in range(start_epoch, num_epochs + 1):
        running_loss = 0
        for i, (audio, mels) in enumerate(tqdm(train_dataloader), 1):
            audio, mels = audio.to(device), mels.to(device)
            output = model(audio[:, :-1], mels)
            loss = F.cross_entropy(output.transpose(1, 2), audio[:, 1:])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            running_loss += loss.item()
            average_loss = running_loss / i
            global_step += 1

            if global_step % params["vocoder"]["checkpoint_interval"] == 0:
                save_checkpoint(model, global_step, args.checkpoint_dir)
                # Generate samples from the test mels at each checkpoint
                with open(os.path.join(args.data_dir, "test.txt"), encoding="utf-8") as f:
                    test_mel_paths = [line.strip().split("|")[2] for line in f]
                for mel_path in test_mel_paths:
                    utterance_id = os.path.basename(mel_path).split(".")[0]
                    mel = torch.FloatTensor(np.load(mel_path)).unsqueeze(0).to(device)
                    output = model.generate(mel,
                                            params["vocoder"]["generate"]["batched"],
                                            params["vocoder"]["generate"]["target"],
                                            params["vocoder"]["generate"]["overlap"])
                    path = os.path.join(args.gen_dir,
                                        "gen_{}_model_steps_{}.wav".format(utterance_id, global_step))
                    save_wav(path, output, params["preprocessing"]["sample_rate"])

        print("epoch:{}, loss:{:.3f}".format(epoch, average_loss))

def gen_from_mel(model, mel, output):
    assert mel.shape[1] == 80, 'Input mel shape is invalid.'
    assert output.endswith('.wav')
    mel = torch.FloatTensor(mel).unsqueeze(0).to(device)  # add a batch dimension
    waveform = model.generate(mel)
    save_wav(output, waveform, params["preprocessing"]["sample_rate"])

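# Hypothetical driver for gen_from_mel, assuming `model` was constructed and
# loaded as in train_fn above; the .npy path is a placeholder.
def demo_gen_from_mel(model):
    mel = np.load("mels/utt_0001.npy")  # expected shape: (frames, 80)
    gen_from_mel(model, mel, "gen/utt_0001.wav")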