def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
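# The tuples returned by _process_utterance() are typically collected by a calling
# preprocessing script. A minimal, hypothetical sketch of such a driver is below;
# the "train.txt" name and the pipe-separated field order are assumptions for
# illustration, not something this snippet defines.
import os

def write_metadata(metadata, out_dir):
    # metadata: iterable of (audio_filename, mel_filename, timesteps, text) tuples
    with open(os.path.join(out_dir, "train.txt"), "w", encoding="utf-8") as f:
        for audio_filename, mel_filename, timesteps, text in metadata:
            f.write("|".join([audio_filename, mel_filename, str(timesteps), text]) + "\n")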
def _process_utterance(out_dir, wav_path):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    return mel_spectrogram.astype(np.float32)
def _process_utterance(out_dir, index, wav_path, text, silence_threshold, fft_size): # Load the audio to a numpy array: wav = audio.load_wav(wav_path) if hparams.rescaling: wav = wav / np.abs(wav).max() * hparams.rescaling_max if hparams.input_type != "raw": # Mu-law quantize out = P.mulaw_quantize(wav) # Trim silences start, end = audio.start_and_end_indices(out, silence_threshold) out = out[start:end] wav = wav[start:end] constant_value = P.mulaw_quantize(0, 256) out_dtype = np.int16 else: out = wav constant_value = 0. out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_value) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) wav_id = os.path.basename(wav_path).split('.')[ 0] # wav_id = wav_path.split('/')[-1].split('.')[0] # Write the spectrograms to disk: audio_filename = '{}-audio.npy'.format(wav_id) mel_filename = '{}-mel.npy'.format(wav_id) np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return audio_filename, mel_filename, timesteps, text
def _extract_mel(wav_path):
    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    assert len(out) // N == audio.get_hop_size()
    timesteps = len(out)

    return out, mel_spectrogram, timesteps, out_dtype
def get_voice_file(idx, duration, quantize_type):
    """Gets one of the last VCTK voices."""
    BASE_PATH = "/projects/grail/audiovisual/datasets/VCTK-Corpus/wav48/test"
    assert idx in list(range(0, 100))
    if idx < 25:
        speaker_path = os.path.join(BASE_PATH, "p345")
    elif idx < 50:
        speaker_path = os.path.join(BASE_PATH, "p361")
    elif idx < 75:
        speaker_path = os.path.join(BASE_PATH, "p362")
    elif idx < 100:
        speaker_path = os.path.join(BASE_PATH, "p374")

    file_list = list(Path(speaker_path).rglob('*.wav'))
    curr_file = random.choice(file_list)
    y, sr = librosa.core.load(curr_file, sr=22050)
    y /= abs(y).max()

    start_idx = len(y) // 2
    y = y[int(start_idx - duration / 2):int(start_idx + duration / 2)]

    # Mulaw, linear or linear max audio
    if quantize_type == 0:
        quantized = P.mulaw_quantize(y, hparams.quantize_channels - 1)
    elif quantize_type == 1:
        quantized = linear_quantize(y, hparams.quantize_channels - 1)

    return quantized
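# get_voice_file() and get_piano_file() call a linear_quantize() helper that is not
# defined in this section. A minimal sketch of what such a helper could look like is
# below (uniform quantization of [-1, 1] into mu + 1 integer bins); this is an
# assumption for illustration, not the project's actual implementation.
import numpy as np

def linear_quantize(x, mu):
    # Hypothetical: map samples in [-1, 1] uniformly onto integers in [0, mu].
    x = np.clip(x, -1.0, 1.0)
    return np.floor((x + 1.0) / 2.0 * mu).astype(np.int64)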
def wavenet_data(wav):
    # `wav` is expected to be a float waveform in [-1, 1].
    # WAVENET TRANSFORMATIONS
    # Mu-law quantize
    out = P.mulaw_quantize(wav, hparams.quantize_channels)
    out8 = P.mulaw_quantize(wav, 256)

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]
    constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    import matplotlib.pyplot as plt
    from librosa.display import specshow
    plt.subplot(3, 1, 1)
    specshow(mel_spectrogram.T, sr=20000, hop_length=hparams.hop_size)
    plt.subplot(3, 1, 2)
    plt.plot(out)
    plt.xlim(0, len(out))
    plt.subplot(3, 1, 3)
    plt.plot(wav)
    plt.xlim(0, len(wav))
    plt.show()

    out = out / out.max()
def test_mulaw_real():
    fs, x = wavfile.read(example_audio_file())
    x = (x / 32768.0).astype(np.float32)
    mu = 256
    y = P.mulaw_quantize(x, mu)
    assert y.min() >= 0 and y.max() < mu
    assert y.dtype == np.int

    x = P.inv_mulaw_quantize(y, mu) * 32768
    assert x.dtype == np.float32
    x = x.astype(np.int16)
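# For reference, the tests above and below exercise standard mu-law companding. The
# NumPy-only sketch here is consistent with the corner cases they check (e.g.
# mulaw_quantize(0.0, mu) == mu // 2); it is an illustrative re-derivation, and the
# library's own P.mulaw* functions remain the authoritative implementations.
import numpy as np

def mulaw_ref(x, mu=256):
    # Compress [-1, 1] -> [-1, 1] with mu-law companding.
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def inv_mulaw_ref(y, mu=256):
    # Inverse companding: expand [-1, 1] back to the linear domain.
    return np.sign(y) * (1.0 / mu) * ((1.0 + mu) ** np.abs(y) - 1.0)

def mulaw_quantize_ref(x, mu=256):
    # Map companded values onto integer bins in [0, mu].
    return np.asarray((mulaw_ref(x, mu) + 1) / 2 * mu).astype(np.int64)

def inv_mulaw_quantize_ref(y, mu=256):
    return inv_mulaw_ref(2 * np.asarray(y, dtype=np.float64) / mu - 1, mu)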
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Mu-law quantize
    quantized = P.mulaw_quantize(wav)

    # Trim silences
    start, end = audio.start_and_end_indices(quantized, hparams.silence_threshold)
    quantized = quantized[start:end]
    wav = wav[start:end]

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    quantized = np.pad(quantized, (l, r), mode="constant",
                       constant_values=P.mulaw_quantize(0))
    N = mel_spectrogram.shape[0]
    assert len(quantized) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    quantized = quantized[:N * audio.get_hop_size()]
    assert len(quantized) % audio.get_hop_size() == 0
    timesteps = len(quantized)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            quantized.astype(np.int16), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
def eval_model(global_step, writer, model, y, c, g, input_lengths, eval_dir):
    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().numpy()[0]

    # (T,)
    y_target = y[idx].view(-1).data.cpu().long().numpy()[:length]

    if c is not None:
        c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    initial_value = P.mulaw_quantize(0)
    print("Initial value:", initial_value)

    # (C,)
    initial_input = np_utils.to_categorical(initial_value, num_classes=256).astype(np.float32)
    initial_input = Variable(torch.from_numpy(initial_input), volatile=True).view(1, 1, 256)
    initial_input = initial_input.cuda() if use_cuda else initial_input
    y_hat = model.incremental_forward(initial_input, c=c, g=g, T=length, tqdm=tqdm,
                                      softmax=True, quantize=True)
    y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
    y_hat = P.inv_mulaw_quantize(y_hat)
    y_target = P.inv_mulaw_quantize(y_target)

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
def _test_data(sr=4000, N=3000, returns_power=False, mulaw=True):
    x, _ = librosa.load(example_audio_file(), sr=sr)
    x, _ = librosa.effects.trim(x, top_db=15)

    # To save computational cost
    x = x[:N]

    # For power conditioning wavenet
    if returns_power:
        # (1 x N')
        p = librosa.feature.rmse(x, frame_length=256, hop_length=128)
        upsample_factor = x.size // p.size
        # (1 x N)
        p = np.repeat(p, upsample_factor, axis=-1)
        if p.size < x.size:
            # pad against time axis
            p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant",
                       constant_values=0)

        # shape adjust
        p = p.reshape(1, 1, -1)

    # (T,)
    if mulaw:
        x = P.mulaw_quantize(x)
        x_org = P.inv_mulaw_quantize(x)
        # (C, T)
        x = to_categorical(x, num_classes=256).T
        # (1, C, T)
        x = x.reshape(1, 256, -1).astype(np.float32)
    else:
        x_org = x
        x = x.reshape(1, 1, -1)

    if returns_power:
        return x, x_org, p

    return x, x_org
def get_piano_file(idx, duration, quantize_type):
    """Gets one of the test supra piano samples."""
    BASE_PATH = "/projects/grail/audiovisual/datasets/supra-rw-mp3/test"
    file_list = list(Path(BASE_PATH).rglob("*.mp3"))
    curr_file = random.choice(file_list)
    y, sr = librosa.core.load(curr_file, sr=22050)
    y /= abs(y).max()

    num_samples = y.shape[0]
    start_idx = random.randint(0, num_samples - duration)
    y = y[start_idx:start_idx + duration]

    # Mulaw, linear or linear max audio
    if quantize_type == 0:
        quantized = P.mulaw_quantize(y, hparams.quantize_channels - 1)
    elif quantize_type == 1:
        quantized = linear_quantize(y, hparams.quantize_channels - 1)

    return quantized
def _process_utterance(out_dir, index, speaker_id, wav_path, text): sr = hparams.sample_rate # Load the audio to a numpy array. Resampled if needed wav = audio.load_wav(wav_path) lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") # Trim silence from hts labels if available # TODO if exists(lab_path) and False: labels = hts.load(lab_path) b = int(start_at(labels) * 1e-7 * sr) e = int(end_at(labels) * 1e-7 * sr) wav = wav[b:e] wav, _ = librosa.effects.trim(wav, top_db=20) else: wav, _ = librosa.effects.trim(wav, top_db=20) if hparams.rescaling: wav = wav / np.abs(wav).max() * hparams.rescaling_max # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) # Write the spectrograms to disk: audio_filename = 'cmu_arctic-audio-%05d.npy' % index mel_filename = 'cmu_arctic-mel-%05d.npy' % index np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return (audio_filename, mel_filename, timesteps, text, speaker_id)
def main(args):
    model = ModelWrapper()
    model.eval()

    if args["--downsample_interval"] is None:
        raise ValueError("Must specify downsample fraction with --downsample_interval")
    downsample_interval = int(args["--downsample_interval"])

    receptive_field = model.receptive_field

    # Change the output dir if you want
    writing_dir = args["<output-dir>"]
    os.makedirs(writing_dir, exist_ok=True)
    print("writing dir: {}".format(writing_dir))

    # Load up a sample
    x_original = librosa.core.load(args["<input-file>"], sr=hparams.sample_rate, mono=True)[0]

    # Hacky way to allow processing some or all of the file
    global SAMPLE_SIZE
    if SAMPLE_SIZE == -1:
        SAMPLE_SIZE = x_original.shape[0]
    x_original = x_original[:SAMPLE_SIZE]

    # Normalize to reduce encoding artifacts
    x_original /= abs(x_original).max()
    sf.write(os.path.join(writing_dir, "x_original.wav"), x_original, hparams.sample_rate)

    # Cut the sampling rate
    x_modified = x_original[::downsample_interval]
    x_modified_out = librosa.core.resample(x_modified,
                                           int(hparams.sample_rate / downsample_interval),
                                           hparams.sample_rate)
    sf.write(join(writing_dir, "x_modified.wav"), x_modified_out, hparams.sample_rate)
    x_modified = P.mulaw_quantize(x_modified, hparams.quantize_channels - 1)

    # Update constraint mask for super resolution. Masked spots don't update
    mask = np.ones_like(x_original)
    mask[::downsample_interval] = 0
    mask = torch.Tensor(mask).unsqueeze(0).to(device)

    # Initialize with noise for the samples we need to fill in, or x_original for the
    # samples we are allowed to use
    noise = np.random.uniform(0, 256, size=x_original.shape)
    mask_np = mask[0].detach().cpu().numpy()
    x = P.mulaw_quantize(x_original, hparams.quantize_channels - 1) * (1 - mask_np) + noise * mask_np
    x = torch.FloatTensor(x).unsqueeze(0).to(device)
    x.requires_grad = True

    sigmas = [175.9, 110., 68.7, 54.3, 42.9, 34.0, 26.8, 21.2, 16.8, 13.3, 10.5, 8.29,
              6.55, 5.18, 4.1, 3.24, 2.56, 1.6, 1.0, 0.625, 0.39, 0.244, 0.15, 0.1]

    for idx, sigma in enumerate(sigmas):
        # Make sure each sample is updated on average N_STEPS times
        n_steps_sgld = int((SAMPLE_SIZE / (SGLD_WINDOW * BATCH_SIZE)) * N_STEPS)
        print("Number of SGLD steps {}".format(n_steps_sgld))

        # Bump down a model
        checkpoint_path = join(args["<checkpoint>"], CHECKPOINTS[sigma],
                               "checkpoint_latest_ema.pth")
        model.load_checkpoint(checkpoint_path)
        parmodel = torch.nn.DataParallel(model)
        parmodel.to(device)

        eta = .05 * (sigma ** 2)

        for i in range(n_steps_sgld):
            # need to get a good sampling of the beginning/end (boundary effects)
            # to understand this: think about how often we would update x[receptive_field]
            # (first point) if we only sampled
            # U(receptive_field, x0.shape - receptive_field - SGLD_WINDOW)
            j = np.random.randint(-SGLD_WINDOW, x.shape[1], BATCH_SIZE)
            j = np.maximum(j, 0)
            j = np.minimum(j, x.shape[1] - SGLD_WINDOW)
            patches = []
            for k in range(BATCH_SIZE):
                patches.append(x[:, j[k]:j[k] + SGLD_WINDOW])
            patches = torch.stack(patches, axis=0)

            # Forward pass
            log_prob, prediction = parmodel(patches, sigma=sigma)
            log_prob = torch.sum(log_prob)
            grad = torch.autograd.grad(log_prob, patches)[0]

            x_update = eta * grad

            # Langevin step
            epsilon = np.sqrt(2 * eta) * torch.normal(0, 1, size=x_update.shape, device=device)
            x_update += epsilon

            with torch.no_grad():
                for k in range(BATCH_SIZE):
                    x_update[k] *= mask[:, j[k]:j[k] + SGLD_WINDOW]
                    x[:, j[k]:j[k] + SGLD_WINDOW] += x_update[k]

            if (not i % 20) or (i == (n_steps_sgld - 1)):
                # debugging
                print("--------------")
                print('sigma = {}'.format(sigma))
                print('eta = {}'.format(eta))
                print("i {}".format(i))
                print("Max sample {}".format(abs(x).max()))
                print('Mean sample logpx: {}'.format(log_prob / (BATCH_SIZE * SGLD_WINDOW)))
                print("Max gradient update: {}".format(eta * abs(grad).max()))

        t0 = time.time()
        out = P.inv_mulaw_quantize(x[0].detach().cpu().numpy(), hparams.quantize_channels - 1)
        out = np.clip(out, -1, 1)
        sf.write(os.path.join(writing_dir, "out_{}.wav".format(sigma)), out, hparams.sample_rate)
def wavegen(model, length=None, c=None, g=None, initial_value=None, fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        if c.ndim != 2:
            raise RuntimeError(
                "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given."
                .format(hparams.cin_channels, c.shape))
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding it to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = torch.FloatTensor(c.T).unsqueeze(0)

    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)

    g = None if g is None else torch.LongTensor([g])

    # Transform data to GPU
    initial_input = initial_input.to(device)
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    if hparams.postprocess is not None and hparams.postprocess not in ["", "none"]:
        y_hat = getattr(audio, hparams.postprocess)(y_hat)

    if hparams.global_gain_scale > 0:
        y_hat /= hparams.global_gain_scale

    return y_hat
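# A hypothetical end-to-end use of wavegen(): load a preprocessed mel-spectrogram
# (as saved by the _process_utterance variants in this section) and synthesize audio
# from it. The checkpoint path, the "state_dict" key, and build_model() are
# placeholders/assumptions, not defined by the code shown here.
import numpy as np
import torch
import soundfile as sf

c = np.load("ljspeech-mel-00001.npy")                      # (T, D) conditional features
model = build_model().to(device)                           # assumed project helper
model.load_state_dict(torch.load("checkpoint.pth")["state_dict"])
waveform = wavegen(model, c=c, fast=True)                  # float waveform, roughly [-1, 1]
sf.write("generated.wav", waveform, hparams.sample_rate)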
def eval_model(global_step, writer, model, y, c, g, input_lengths, eval_dir, ema=None): if ema is not None: print("Using averaged model for evaluation") model = clone_as_averaged_model(model, ema) model.eval() idx = np.random.randint(0, len(y)) length = input_lengths[idx].data.cpu().numpy()[0] # (T,) y_target = y[idx].view(-1).data.cpu().numpy()[:length] if c is not None: c = c[idx, :, :length].unsqueeze(0) assert c.dim() == 3 print("Shape of local conditioning features: {}".format(c.size())) if g is not None: # TODO: test g = g[idx] print("Shape of global conditioning features: {}".format(g.size())) # Dummy silence if is_mulaw_quantize(hparams.input_type): initial_value = P.mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = P.mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 print("Intial value:", initial_value) # (C,) if is_mulaw_quantize(hparams.input_type): initial_input = np_utils.to_categorical( initial_value, num_classes=hparams.quantize_channels).astype(np.float32) initial_input = Variable(torch.from_numpy(initial_input)).view( 1, 1, hparams.quantize_channels) else: initial_input = Variable(torch.zeros(1, 1, 1).fill_(initial_value)) initial_input = initial_input.cuda() if use_cuda else initial_input # Run the model in fast eval mode y_hat = model.incremental_forward(initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm, log_scale_min=hparams.log_scale_min) if is_mulaw_quantize(hparams.input_type): y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = P.inv_mulaw( y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels) y_target = P.inv_mulaw(y_target, hparams.quantize_channels) else: y_hat = y_hat.view(-1).cpu().data.numpy() # Save audio os.makedirs(eval_dir, exist_ok=True) path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step)) librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate) path = join(eval_dir, "step{:09d}_target.wav".format(global_step)) librosa.output.write_wav(path, y_target, sr=hparams.sample_rate) # save figure path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step)) save_waveplot(path, y_hat, y_target)
def collate_fn(batch):
    """Create batch

    Args:
        batch(tuple): List of tuples
            - x[0] (ndarray,int) : list of (T,)
            - x[1] (ndarray,int) : list of (T, D)
            - x[2] (ndarray,int) : list of (1,), speaker id
    Returns:
        tuple: Tuple of batch
            - x (FloatTensor) : Network inputs (B, C, T)
            - y (LongTensor)  : Network targets (B, T, 1)
    """
    local_conditioning = len(batch[0]) >= 2 and hparams.cin_channels > 0
    global_conditioning = len(batch[0]) >= 3 and hparams.gin_channels > 0

    if hparams.max_time_sec is not None:
        max_time_steps = int(hparams.max_time_sec * hparams.sample_rate)
    elif hparams.max_time_steps is not None:
        max_time_steps = hparams.max_time_steps
    else:
        max_time_steps = None

    # Time resolution adjustment
    if local_conditioning:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            if hparams.upsample_conditional_features:
                assert_ready_for_upsampling(x, c)
                if max_time_steps is not None:
                    max_steps = ensure_divisible(max_time_steps, audio.get_hop_size(), True)
                    if len(x) > max_steps:
                        max_time_frames = max_steps // audio.get_hop_size()
                        s = np.random.randint(0, len(c) - max_time_frames)
                        ts = s * audio.get_hop_size()
                        x = x[ts:ts + audio.get_hop_size() * max_time_frames]
                        c = c[s:s + max_time_frames, :]
                        assert_ready_for_upsampling(x, c)
            else:
                x, c = audio.adjust_time_resolution(x, c)
                if max_time_steps is not None and len(x) > max_time_steps:
                    s = np.random.randint(0, len(x) - max_time_steps)
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                assert len(x) == len(c)
            new_batch.append((x, c, g))
        batch = new_batch
    else:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            x = audio.trim(x)
            if max_time_steps is not None and len(x) > max_time_steps:
                s = np.random.randint(0, len(x) - max_time_steps)
                if local_conditioning:
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                else:
                    x = x[s:s + max_time_steps]
            new_batch.append((x, c, g))
        batch = new_batch

    # Lengths
    input_lengths = [len(x[0]) for x in batch]
    max_input_len = max(input_lengths)

    # (B, T, C)
    # pad for time-axis
    if is_mulaw_quantize(hparams.input_type):
        padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
        x_batch = np.array([_pad_2d(np_utils.to_categorical(
            x[0], num_classes=hparams.quantize_channels),
            max_input_len, 0, padding_value) for x in batch], dtype=np.float32)
    else:
        x_batch = np.array([_pad_2d(x[0].reshape(-1, 1), max_input_len) for x in batch],
                           dtype=np.float32)
    assert len(x_batch.shape) == 3

    # (B, T)
    if is_mulaw_quantize(hparams.input_type):
        padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
        y_batch = np.array([_pad(x[0], max_input_len, constant_values=padding_value)
                            for x in batch], dtype=np.int)
    else:
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch], dtype=np.float32)
    assert len(y_batch.shape) == 2

    # (B, T, D)
    if local_conditioning:
        max_len = max([len(x[1]) for x in batch])
        c_batch = np.array([_pad_2d(x[1], max_len) for x in batch], dtype=np.float32)
        assert len(c_batch.shape) == 3
        # (B x C x T)
        c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous()
    else:
        c_batch = None

    if global_conditioning:
        g_batch = torch.LongTensor([x[2] for x in batch])
    else:
        g_batch = None

    # Convert to channel first i.e., (B, C, T)
    x_batch = torch.FloatTensor(x_batch).transpose(1, 2).contiguous()

    # Add extra axis
    if is_mulaw_quantize(hparams.input_type):
        y_batch = torch.LongTensor(y_batch).unsqueeze(-1).contiguous()
    else:
        y_batch = torch.FloatTensor(y_batch).unsqueeze(-1).contiguous()

    input_lengths = torch.LongTensor(input_lengths)

    return x_batch, y_batch, c_batch, g_batch, input_lengths
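# A minimal sketch of wiring collate_fn into a PyTorch DataLoader. The dataset object
# is assumed to yield (raw_audio, local_condition, speaker_id) tuples matching the
# layout documented in collate_fn's docstring; batch_size and num_workers are
# illustrative values only.
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=hparams.batch_size, shuffle=True,
                    num_workers=2, collate_fn=collate_fn)

for x, y, c, g, input_lengths in loader:
    # x: (B, C, T) network inputs, y: (B, T, 1) targets,
    # c: (B, C, T') local conditioning or None, g: (B,) speaker ids or None
    break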
def _process_utterance(out_dir, index, audio_filepath, text): # Load the audio to a numpy array: wav_whole = audio.load_wav(audio_filepath) if hparams.rescaling: wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max # This is a librivox source, so the audio files are going to be v. long # compared to a typical 'utterance' : So split the wav into chunks tup_results = [] n_samples = int(8.0 * hparams.sample_rate) # All 8 second utterances n_chunks = wav_whole.shape[0] // n_samples for chunk_idx in range(n_chunks): chunk_start, chunk_end = chunk_idx * \ n_samples, (chunk_idx + 1) * n_samples if chunk_idx == n_chunks - 1: # This is the last chunk - allow it # to extend to the end of the file chunk_end = None wav = wav_whole[chunk_start:chunk_end] # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution # between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) # Write the spectrograms to disk: audio_filename = 'librivox-audio-%04d-%05d.npy' % ( index, chunk_idx, ) mel_filename = 'librivox-mel-%04d-%05d.npy' % ( index, chunk_idx, ) text_idx = '%s - %05d' % ( text, chunk_idx, ) np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Add results tuple describing this training example: tup_results.append((audio_filename, mel_filename, timesteps, text_idx)) # Return all the audio results tuples (unpack in caller) return tup_results
def test_mulaw():
    # Check corner cases
    assert P.mulaw_quantize(-1.0, 2) == 0
    assert P.mulaw_quantize(-0.5, 2) == 0
    assert P.mulaw_quantize(-0.001, 2) == 0
    assert P.mulaw_quantize(0.0, 2) == 1
    assert P.mulaw_quantize(0.0001, 2) == 1
    assert P.mulaw_quantize(0.5, 2) == 1
    assert P.mulaw_quantize(0.99999, 2) == 1
    assert P.mulaw_quantize(1.0, 2) == 2

    np.random.seed(1234)

    # forward/backward correctness
    for mu in [128, 256, 512]:
        for x in np.random.rand(100):
            y = P.mulaw(x, mu)
            assert y >= 0 and y <= 1
            x_hat = P.inv_mulaw(y, mu)
            assert np.allclose(x, x_hat)

    # forward/backward correctness for quantize
    for mu in [128, 256, 512]:
        for x, y in [(-1.0, 0), (0.0, mu // 2), (0.99999, mu - 1)]:
            y_hat = P.mulaw_quantize(x, mu)
            err = np.abs(x - P.inv_mulaw_quantize(y_hat, mu))
            print(y, y_hat, err)
            assert np.allclose(y, y_hat)
            # have small quantize error
            assert err <= 0.1

    # ndarray input
    for mu in [128, 256, 512]:
        x = np.random.rand(10)
        y = P.mulaw(x, mu)
        x_hat = P.inv_mulaw(y, mu)
        assert np.allclose(x, x_hat)
        P.inv_mulaw_quantize(P.mulaw_quantize(x))

    # torch array input
    from warnings import warn
    import torch
    torch.manual_seed(1234)
    for mu in [128, 256, 512]:
        x = torch.rand(10)
        y = P.mulaw(x, mu)
        x_hat = P.inv_mulaw(y, mu)
        assert np.allclose(x, x_hat)
        P.inv_mulaw_quantize(P.mulaw_quantize(x))
def _process_utterance(out_dir, index, speaker_id, wav_path, text): sr = hparams.sample_rate # Load the audio to a numpy array. Resampled if needed wav = audio.load_wav(wav_path) if hparams.rescaling: wav = wav / np.abs(wav).max() * hparams.rescaling_max # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 #print("Wavepath is ", wav_path) filename = wav_path.split('/wav/')[-1].split('.wav')[0] fname = filename filename = ccoeffs_feats_path + '/' + filename + '.mcep' mel_spectrogram = np.loadtxt(filename) #print("Shape of mel scptrogram is ", mel_spectrogram.shape) # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) #mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram #l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal #out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] #out = ensure_divisible(out, N) #print("Length of out: ", len(out), "N ", N) #print("Out and N: ", len(out), N) #if len(out) < N * audio.get_hop_size(): #print("Out and N: ", filename, len(out), N, N * audio.get_hop_size()) # sys.exit() #assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample #out = out[:N * 80] #out = ensure_divisible(out, N) g = open('logfile','a') g.write("Processing " + fname + '\n') g.close() out,mel_spectrogram = ensure_frameperiod(out,mel_spectrogram) #out = ensure_divisible(out, audio.get_hop_size()) #assert len(out) % audio.get_hop_size() == 0 #assert len(out) % N == 0 timesteps = len(out) g = open('logfile','a') g.write(fname + ' ' + str(len(out)) + ' ' + str(N) + ' ' + str(len(out) % N) + '\n') g.write('\n') g.close() # Write the spectrograms to disk: audio_filename = fname + '-audio-%05d.npy' % index mel_filename = fname + '-mel-%05d.npy' % index np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return (audio_filename, mel_filename, timesteps, text, speaker_id)
def _process_utterance(wav_path, out_dir): fname = wav_path.split(os.sep)[-1].split(".")[0] audio_filename = '{}_resolved.npy'.format(fname) mel_filename = '{}_mel.npy'.format(fname) apth = os.path.join(out_dir, audio_filename) mpth = os.path.join(out_dir, mel_filename) if os.path.exists(apth) and os.path.exists(mpth): print("File {} already processed".format(wav_path)) return # Load the audio to a numpy array: wav = audio.load_wav(wav_path) if hparams.rescaling: wav = wav / np.abs(wav).max() * hparams.rescaling_max # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) # Write the spectrograms to disk: np.save(apth, out.astype(out_dtype), allow_pickle=False) np.save(mpth, mel_spectrogram.astype(np.float32), allow_pickle=False)
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams): # modified version of LJSpeech _process_utterance audio.set_hparams(hparams) # Load the audio to a numpy array: wav = audio.load_wav(wav_path) sr = hparams.sample_rate # Added from the multispeaker version lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab") if not exists(lab_path): lab_path = os.path.splitext(wav_path)[0]+'.lab' # Trim silence from hts labels if available if exists(lab_path): labels = hts.load(lab_path) wav = clean_by_phoneme(labels, wav, sr) wav, _ = librosa.effects.trim(wav, top_db=25) else: if hparams.process_only_htk_aligned: return None wav, _ = librosa.effects.trim(wav, top_db=15) # End added from the multispeaker version if hparams.rescaling: wav = wav / np.abs(wav).max() * hparams.rescaling_max if hparams.max_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length: return None if hparams.min_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length: return None # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) # Write the spectrograms to disk: # Get filename from wav_path wav_name = os.path.basename(wav_path) wav_name = os.path.splitext(wav_name)[0] out_filename = 'audio-{}.npy'.format(wav_name) mel_filename = 'mel-{}.npy'.format(wav_name) np.save(os.path.join(out_dir, out_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return (out_filename, mel_filename, timesteps, text)
def eval_model(global_step, writer, device, model, y, c, g, input_lengths, eval_dir, ema=None): if ema is not None: print("Using averaged model for evaluation") model = clone_as_averaged_model(device, model, ema) model.make_generation_fast_() model.eval() #pick one of the available waves to try to emulate idx = np.random.randint(0, len(y)) length = input_lengths[idx].data.cpu().item() # (T,) y_target = y[idx].view(-1).data.cpu().numpy()[:length] if c is not None: if hparams.upsample_conditional_features: c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0) else: c = c[idx, :, :length].unsqueeze(0) assert c.dim() == 3 print("Shape of local conditioning features: {}".format(c.size())) if g is not None: # TODO: test g = g[idx] print("Shape of global conditioning features: {}".format(g.size())) # Dummy silence if is_mulaw_quantize(hparams.input_type): initial_value = P.mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = P.mulaw(0.0, hparams.quantize_channels) else: #initial_value = 0.0 initial_value = float(y_target[0]) #TODO change initial value to first value of actual waveform instead of zero?? <MLK, 10/19> print("Intial value:", initial_value) # (C,) if is_mulaw_quantize(hparams.input_type): initial_input = np_utils.to_categorical( initial_value, num_classes=hparams.quantize_channels).astype(np.float32) initial_input = torch.from_numpy(initial_input).view( 1, 1, hparams.quantize_channels) else: initial_input = torch.zeros(1, 1, 1).fill_(initial_value) initial_input = initial_input.to(device) # Run the model in fast eval mode with torch.no_grad(): y_hat = model.incremental_forward( initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm, log_scale_min=hparams.log_scale_min) if is_mulaw_quantize(hparams.input_type): y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels) y_target = P.inv_mulaw(y_target, hparams.quantize_channels) else: y_hat = y_hat.view(-1).cpu().data.numpy() # Save audio os.makedirs(eval_dir, exist_ok=True) path = join(eval_dir, "step_noncausal_{:09d}_predicted.npy".format(global_step)) np.save(path, y_hat) path = join(eval_dir, "step_noncausal_{:09d}_target.npy".format(global_step)) np.save(path, y_target) # save figure path = join(eval_dir, "step_noncausal_{:09d}_waveplots.png".format(global_step)) save_waveplot(path, y_hat, y_target)
def _process_utterance( out_dir, index, speaker_id, wav_path, text, silence_threshold, fft_size, ): sr = hparams.sample_rate # Load the audio to a numpy array. Resampled if needed wav = audio.load_wav(wav_path) lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") # Trim silence from hts labels if available # TODO if exists(lab_path) and False: labels = hts.load(lab_path) b = int(start_at(labels) * 1e-7 * sr) e = int(end_at(labels) * 1e-7 * sr) wav = wav[b:e] wav, _ = librosa.effects.trim(wav, top_db=20) else: wav, _ = librosa.effects.trim(wav, top_db=20) # Mu-law quantize quantized = P.mulaw_quantize(wav) # Trim silences start, end = audio.start_and_end_indices(quantized, silence_threshold) quantized = quantized[start:end] wav = wav[start:end] # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjast time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size()) # zero pad for quantized signal quantized = np.pad(quantized, (l, r), mode="constant", constant_values=P.mulaw_quantize(0)) N = mel_spectrogram.shape[0] assert len(quantized) >= N * audio.get_hop_size() # time resolution adjastment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample quantized = quantized[:N * audio.get_hop_size()] assert len(quantized) % audio.get_hop_size() == 0 timesteps = len(quantized) wav_id = wav_path.split('/')[-1].split('.')[0] # Write the spectrograms to disk: audio_filename = '{}-audio.npy'.format(wav_id) mel_filename = '{}-mel.npy'.format(wav_id) np.save(os.path.join(out_dir, audio_filename), quantized.astype(np.int16), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return (audio_filename, mel_filename, timesteps, text, speaker_id)
def wavegen(model, length=None, c=None, g=None, initial_value=None, fast=False, tqdm=tqdm): """Generate waveform samples by WaveNet. Args: model (nn.Module) : WaveNet decoder length (int): Time steps to generate. If conditinlal features are given, then determined by the feature size. c (numpy.ndarray): Conditional features, of shape T x C g (scaler): Speaker ID initial_value (int) : initial_value for the WaveNet decoder. fast (Bool): Whether to remove weight normalization or not. tqdm (lambda): tqdm Returns: numpy.ndarray : Generated waveform samples """ c = _to_numpy(c) g = _to_numpy(g) if use_cuda: model = model.cuda() model.eval() if fast: model.make_generation_fast_() if c is None: assert length is not None else: # (N, D) assert c.ndim == 2 # (T, D) if not hparams.upsample_conditional_features: upsample_factor = audio.get_hop_size() c = np.repeat(c, upsample_factor, axis=0) length = c.shape[0] # B x C x T c = c.T.reshape(1, -1, length) c = Variable(torch.FloatTensor(c)) if initial_value is None: initial_value = P.mulaw_quantize(0) # dummy silence assert initial_value >= 0 and initial_value < 256 initial_input = np_utils.to_categorical(initial_value, num_classes=256).astype(np.float32) initial_input = Variable(torch.from_numpy(initial_input)).view(1, 1, 256) g = None if g is None else Variable(torch.LongTensor([g])) if use_cuda: initial_input = initial_input.cuda() g = None if g is None else g.cuda() c = None if c is None else c.cuda() y_hat = model.incremental_forward(initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True) y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() y_hat = P.inv_mulaw_quantize(y_hat) return y_hat
def eval_model(global_step, writer, device, model, y, c, g, input_lengths, eval_dir, ema=None): if ema is not None: print("Using averaged model for evaluation") model = clone_as_averaged_model(device, model, ema) model.make_generation_fast_() model.eval() idx = np.random.randint(0, len(y)) length = input_lengths[idx].data.cpu().item() # (T,) y_target = y[idx].view(-1).data.cpu().numpy()[:length] if c is not None: if hparams.upsample_conditional_features: c = c[idx, :, :length // audio.get_hop_size() + hparams.cin_pad * 2].unsqueeze(0) else: c = c[idx, :, :length].unsqueeze(0) assert c.dim() == 3 print("Shape of local conditioning features: {}".format(c.size())) if g is not None: # TODO: test g = g[idx] print("Shape of global conditioning features: {}".format(g.size())) # Dummy silence if is_mulaw_quantize(hparams.input_type): initial_value = P.mulaw_quantize(0, hparams.quantize_channels - 1) elif is_mulaw(hparams.input_type): initial_value = P.mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 # (C,) if is_mulaw_quantize(hparams.input_type): initial_input = to_categorical( initial_value, num_classes=hparams.quantize_channels).astype(np.float32) initial_input = torch.from_numpy(initial_input).view( 1, 1, hparams.quantize_channels) else: initial_input = torch.zeros(1, 1, 1).fill_(initial_value) initial_input = initial_input.to(device) # Run the model in fast eval mode with torch.no_grad(): y_hat = model.incremental_forward( initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm, log_scale_min=hparams.log_scale_min) if is_mulaw_quantize(hparams.input_type): y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1) y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1) elif is_mulaw(hparams.input_type): y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels) y_target = P.inv_mulaw(y_target, hparams.quantize_channels) else: y_hat = y_hat.view(-1).cpu().data.numpy() # Save audio os.makedirs(eval_dir, exist_ok=True) path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step)) librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate) path = join(eval_dir, "step{:09d}_target.wav".format(global_step)) librosa.output.write_wav(path, y_target, sr=hparams.sample_rate) # save figure path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step)) save_waveplot(path, y_hat, y_target) # add audio and figures to tensorboard writer.add_audio('target_audio', y_target, global_step, hparams.sample_rate) writer.add_audio('generated_audio', y_hat, global_step, hparams.sample_rate)
def wavegen(model, length=None, c=None, g=None, initial_value=None, fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Multiple waveforms can be generated in a single batch.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray or list): Conditional features, of shape T x C
        g (scalar or list): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray or list : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    model.eval()
    if fast:
        model.make_generation_fast_()

    # Prepare Local Condition
    batch_size = 1
    output_should_be_list = False
    if c is None:
        assert length is not None
    else:
        if type(c) == list:
            output_should_be_list = True
            c = [_to_numpy(x) for x in c]
            for x in c:
                if x.ndim != 2:
                    raise RuntimeError(
                        "Expected 2-dim shape (T, {}) for the conditional feature, "
                        "but {} was actually given.".format(hparams.cin_channels, x.shape))
                assert x.ndim == 2
            batch_size = len(c)
            batch = np.zeros([batch_size, max([x.shape[0] for x in c]), c[0].shape[1]])
            for i in range(batch_size):
                batch[i, :c[i].shape[0], :] = c[i][:, :]
            upsample_factor = audio.get_hop_size()
            # length_list : used to cut silence when batch_size > 1
            length_list = [x.shape[0] * upsample_factor for x in c]
            length = max(length_list)
            if not hparams.upsample_conditional_features:
                batch = np.repeat(batch, upsample_factor, axis=1)
            # B x C x T
            c = torch.FloatTensor(np.transpose(batch, [0, 2, 1]))
        else:
            c = _to_numpy(c)
            # (Tc, D)
            if c.ndim != 2:
                raise RuntimeError(
                    "Expected 2-dim shape (T, {}) for the conditional feature, "
                    "but {} was actually given.".format(hparams.cin_channels, c.shape))
            assert c.ndim == 2
            Tc = c.shape[0]
            upsample_factor = audio.get_hop_size()
            # Overwrite length according to feature size
            length = Tc * upsample_factor
            # (Tc, D) -> (Tc', D)
            # Repeat features before feeding it to the network
            if not hparams.upsample_conditional_features:
                c = np.repeat(c, upsample_factor, axis=0)
            # B x C x T
            c = torch.FloatTensor(c.T).unsqueeze(0)

    # Prepare initial_input
    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.repeat(batch_size, 1, 1)

    # Prepare Global Condition
    if type(g) == list:
        g = [_to_numpy(x) for x in g]
        g = torch.LongTensor(g)
    elif g is not None:
        g = _to_numpy(g)
        g = torch.LongTensor([g])

    # Transform data to GPU
    initial_input = initial_input.to(device)
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(batch_size, -1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(batch_size, -1).cpu().data.numpy(),
                            hparams.quantize_channels)
    else:
        y_hat = y_hat.view(batch_size, -1).cpu().data.numpy()

    if output_should_be_list:
        return [y_hat[i, :length_list[i]] for i in range(batch_size)]
    else:
        return y_hat[0, :]
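# A hypothetical batched call: passing a list of conditional feature arrays generates
# several utterances in one pass, and each returned waveform is trimmed back to its
# own length via length_list. The feature file names and build_model() are
# placeholders, not confirmed by this code.
import numpy as np

c_list = [np.load("utt1-feats.npy"), np.load("utt2-feats.npy")]   # each (T_i, D)
model = build_model().to(device)                                  # assumed project helper
waveforms = wavegen(model, c=c_list, fast=True)                   # list of 1-D arrays
for i, w in enumerate(waveforms):
    print("utterance {}: {} samples".format(i, len(w)))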
def _process_utterance(out_dir, index, wav_path, text, trim_silence=False): # Load the audio to a numpy array: wav = audio.load_wav(wav_path) # Trim begin/end silences # NOTE: the threshold was chosen for clean signals # TODO: Remove, get this out of here. if trim_silence: wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048, hop_length=512) if hparams.highpass_cutoff > 0.0: wav = audio.low_cut_filter(wav, hparams.sample_rate, hparams.highpass_cutoff) # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # Trim silences in mul-aw quantized domain silence_threshold = 0 if silence_threshold > 0: # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels - 1) start, end = audio.start_and_end_indices(out, silence_threshold) wav = wav[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] constant_values = P.mulaw(0.0, hparams.quantize_channels - 1) out_dtype = np.float32 else: # [-1, 1] constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.logmelspectrogram(wav).astype(np.float32).T if hparams.global_gain_scale > 0: wav *= hparams.global_gain_scale # Time domain preprocessing if hparams.preprocess is not None and hparams.preprocess not in [ "", "none" ]: f = getattr(audio, hparams.preprocess) wav = f(wav) # Clip if np.abs(wav).max() > 1.0: print("""Warning: abs max value exceeds 1.0: {}""".format( np.abs(wav).max())) # ignore this sample return ("dummy", "dummy", -1, "dummy") wav = np.clip(wav, -1.0, 1.0) # Set waveform target (out) if is_mulaw_quantize(hparams.input_type): out = P.mulaw_quantize(wav, hparams.quantize_channels - 1) elif is_mulaw(hparams.input_type): out = P.mulaw(wav, hparams.quantize_channels - 1) else: out = wav # zero pad # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size()) if l > 0 or r > 0: out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 assert_ready_for_upsampling(out, mel_spectrogram, cin_pad=0, debug=True) # Write the spectrograms to disk: name = splitext(basename(wav_path))[0] audio_filename = "%s-wave.npy" % (name) mel_filename = "%s-feats.npy" % (name) np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save( os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False, ) # Return a tuple describing this training example: return (audio_filename, mel_filename, N, text)
def _process_song(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048, hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate, hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    #### CLAIRE Work here
    # Write the trimmed wav to disk so the external chromagram extractor can read it
    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
    os.makedirs('./pwavs', exist_ok=True)
    pwav_path = './pwavs/{0}.wav'.format(wav_name)
    scipy.io.wavfile.write(pwav_path, 16000, wav)

    # make the chord directory if it does not exist
    chord_dir = "chord_dir"
    os.makedirs(chord_dir, exist_ok=True)

    # create a csv file with chromagram frames and timestamps
    # subprocess.check_call(['./extract_chord_notes.sh', wav_path, chord_dir], shell=True)
    # os.system('./extract_chord_notes.sh {0} {1}'.format(pwav_path, chord_dir))
    os.system('./extract_chromagram.sh {0} {1} > /dev/null 2>&1'.format(pwav_path, chord_dir))

    note_filename = '{0}/{1}.csv'.format(chord_dir, wav_name)

    # Instead of computing a mel spectrogram, build a time series of chromagram
    # frames (originally intended as one-hot encoded chords):
    # one 24-bin frame for every 2048 audio samples
    note_samples = int(len(wav) / 2048)
    # 12 notes per octave, 24 chroma bins per frame
    chords_time_series = np.zeros((24, note_samples))

    with open(note_filename, newline='\n') as csvfile:
        # chordreader = csv.reader(csvfile, delimiter=',')
        chordreader = csvfile.readlines()
        for idx, row in enumerate(chordreader):
            row = row.split(",")
            # first column is the timestamp; the rest are chroma bin energies
            chromagram_samples = np.array(row).astype(np.float64)[1:]
            chords_time_series[:, idx] = chromagram_samples
    chords_time_series = chords_time_series.T

    # if hparams.global_gain_scale > 0:
    #     wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and the chord features
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = chords_time_series.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the features to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = '%s-wave.npy' % (name)
    chords_filename = '%s-feats.npy' % (name)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, chords_filename),
            chords_time_series.astype(out_dtype), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, chords_filename, N, text)
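# Illustrative only: a quick sanity check that the arrays written by _process_song
# line up for upsampling. The '%s-wave.npy' / '%s-feats.npy' names follow the
# convention used above; the default hop size value is an assumption.
import os
import numpy as np


def check_song_pair(out_dir, name, hop_size=256):
    wave = np.load(os.path.join(out_dir, '%s-wave.npy' % name))
    feats = np.load(os.path.join(out_dir, '%s-feats.npy' % name))
    # feats is (N, 24): one 24-bin chromagram frame per hop_size audio samples
    assert len(wave) == feats.shape[0] * hop_size
    return wave, feats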
def wavegen(model, length=None, c=None, g=None, initial_value=None, fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module): WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
            then this is determined by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int): Initial value for the WaveNet decoder.
        fast (bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray: Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding them to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = Variable(torch.FloatTensor(c.T).unsqueeze(0))

    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Variable(torch.from_numpy(initial_input)).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value)

    g = None if g is None else Variable(torch.LongTensor([g]))
    if use_cuda:
        initial_input = initial_input.cuda()
        g = None if g is None else g.cuda()
        c = None if c is None else c.cuda()

    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
        log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    return y_hat
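# Illustrative only: a minimal sketch of driving wavegen() with a precomputed
# conditioning feature file. The feats_path naming, the save helper, and the
# default output name are assumptions; load and save however the project does it.
import numpy as np


def synthesize_one(model, feats_path, out_path="generated.wav"):
    # (T, D) conditional features, e.g. a '<name>-feats.npy' written by preprocessing
    c = np.load(feats_path)
    waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
    # waveform is a float numpy array roughly in [-1, 1]
    audio.save_wav(waveform, out_path)
    return waveform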
def _process_utterance(out_dir, wav_path, sp2ind_dir, text):
    # Map the speaker prefix of the wav filename to a speaker index
    with open(sp2ind_dir, 'r') as sp_f:
        sp2ind = json.load(sp_f)
    sp = wav_path.split('/')[-1].split('.')[0].split('_')[0]
    if sp in sp2ind:
        sp_ind = sp2ind[sp]
    else:
        sp_ind = -1

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim begin/end silences (skip test utterances)
    if 'test' not in wav_path:
        wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048, hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate, hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram and MFCCs from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.logmelspectrogram(wav).astype(np.float32).T
    mfcc = audio.mfcc(wav).astype(np.float32).T

    if hparams.global_gain_scale > 0:
        wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Clip
    if np.abs(wav).max() > 1.0:
        print("Warning: abs max value exceeds 1.0: {}".format(np.abs(wav).max()))
        # ignore this sample
        # return ("dummy", "dummy", "dummy", -1, -1, "dummy")
    wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the features to disk:
    # name = splitext(basename(wav_path))[0]
    # audio_filename = '%s-wave.npy' % (name)
    # mel_filename = '%s-feats.npy' % (name)
    audio_filename = f'{out_dir}wave.npy'
    mel_filename = f'{out_dir}mel.npy'
    mfcc_filename = f'{out_dir}mfcc.npy'
    assert mfcc.shape[0] == N
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.astype(np.float32), allow_pickle=False)
    np.save(mfcc_filename, mfcc.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (out_dir, N, sp_ind, text)
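# Illustrative only: the speaker-to-index JSON consumed above maps the filename
# prefix (the part before the first '_') to an integer ID. This helper and the
# example speaker names are assumptions, shown only to document the expected format.
import json


def make_sp2ind(wav_paths, sp2ind_path):
    # Collect the unique speaker prefixes, e.g. 'p225' from 'p225_001.wav'
    speakers = sorted({p.split('/')[-1].split('.')[0].split('_')[0] for p in wav_paths})
    sp2ind = {sp: i for i, sp in enumerate(speakers)}  # e.g. {"p225": 0, "p226": 1}
    with open(sp2ind_path, 'w') as f:
        json.dump(sp2ind, f)
    return sp2ind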