def assert_ready_for_upsampling(x, c):
    assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size()
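# --- Example (sketch): the invariant checked above, worked through with an
# assumed hop size of 256 (the real value comes from audio.get_hop_size()).
# A conditioning matrix with 100 mel frames must pair with exactly
# 100 * 256 = 25600 audio samples for transposed-convolution upsampling to
# line up. `hop_size` and the shapes here are illustrative, not from hparams.
import numpy as np

hop_size = 256                        # assumed value
c_demo = np.zeros((100, 80))          # 100 mel frames, 80 bins
x_demo = np.zeros(100 * hop_size)     # matching raw audio
assert len(x_demo) % len(c_demo) == 0
assert len(x_demo) // len(c_demo) == hop_size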
def eval_model(global_step, writer, device, model, y, c, g, input_lengths,
               eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    print("Initial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
            tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(),
                            hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # Save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
def wavegen(model, length=None, c=None, g=None, initial_value=None,
            fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module): WaveNet decoder.
        length (int): Time steps to generate. If conditional features are
          given, this is overwritten by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C.
        g (scalar): Speaker ID.
        initial_value (int): Initial value for the WaveNet decoder.
        fast (bool): Whether to remove weight normalization for fast generation.
        tqdm (lambda): tqdm wrapper.

    Returns:
        numpy.ndarray: Generated waveform samples.
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding them to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = Variable(torch.FloatTensor(c.T).unsqueeze(0))

    if initial_value is None:
        initial_value = P.mulaw_quantize(0)  # dummy silence

    assert 0 <= initial_value < 256
    initial_input = np_utils.to_categorical(
        initial_value, num_classes=256).astype(np.float32)
    initial_input = Variable(torch.from_numpy(initial_input)).view(1, 1, 256)
    g = None if g is None else Variable(torch.LongTensor([g]))
    if use_cuda:
        initial_input = initial_input.cuda()
        g = None if g is None else g.cuda()
        c = None if c is None else c.cuda()

    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True)
    y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
    y_hat = P.inv_mulaw_quantize(y_hat)

    return y_hat
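# --- Example (sketch): the decode step at the end of wavegen, shown on a
# dummy network output. A (B, C, T) tensor of per-class scores is argmaxed
# over the channel axis, then mapped back through inverse mu-law. The inverse
# mu-law is written out from the standard companding formula rather than
# calling P.inv_mulaw_quantize, so this snippet is self-contained; the
# tensor values are random stand-ins, not real model output.
import numpy as np
import torch

B, C, T = 1, 256, 16
scores = torch.randn(B, C, T)                    # stand-in for model output
ids = scores.max(1)[1].view(-1).long().numpy()   # class index per time step

mu = C - 1
y = 2.0 * ids.astype(np.float64) / mu - 1.0      # [0, mu] -> [-1, 1]
wav = np.sign(y) * (1.0 / mu) * ((1.0 + mu) ** np.abs(y) - 1.0)  # inverse mu-law
assert wav.shape == (T,) and np.all(np.abs(wav) <= 1.0)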
def eval_model(global_step, writer, device, student, teacher, y, c, g,
               input_lengths, eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        student = clone_as_averaged_model(device, student, ema)
        student.make_generation_fast_()

    student.eval()
    teacher.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Noise input
    dist = torch.distributions.normal.Normal(loc=0., scale=1.)
    z = dist.sample((1, 1, length)).to(device)

    # Run the student (IAF) and score its output with the teacher
    with torch.no_grad():
        student_hat, _, _, _ = student(x=z, c=c, g=g,
                                       log_scale_min=hparams.log_scale_min,
                                       device=device)
        teacher_output = teacher(student_hat, c=c, g=g, softmax=False)
        teacher_output = teacher_output.transpose(1, 2)
        teacher_hat = sample_from_gaussian(teacher_output,
                                           log_scale_min=hparams.log_scale_min)

    teacher_hat = teacher_hat.view(-1).cpu().data.numpy()
    student_hat = student_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_student.wav".format(global_step))
    librosa.output.write_wav(path, student_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_teacher.wav".format(global_step))
    librosa.output.write_wav(path, teacher_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # Save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, teacher_hat, y_target, student_hat)
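# --- Example (sketch): Gaussian sampling of the kind sample_from_gaussian
# performs above. The teacher emits a mean and a log-scale per time step; a
# sample is mu + exp(log_s) * eps with eps ~ N(0, 1). The (B, T, 2) layout
# with [..., 0] = mean and [..., 1] = log-scale is an assumption about this
# codebase, and the clamp mirrors the log_scale_min argument.
import torch

def sample_from_gaussian_sketch(params, log_scale_min=-7.0):
    # params: (B, T, 2); clamp the log-scale floor, then reparameterize
    mean, log_scale = params[..., 0], params[..., 1]
    log_scale = torch.clamp(log_scale, min=log_scale_min)
    eps = torch.randn_like(mean)
    return mean + torch.exp(log_scale) * eps

y_demo = sample_from_gaussian_sketch(torch.randn(1, 100, 2))
assert y_demo.shape == (1, 100)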
def collate_fn(batch):
    """Create batch.

    Args:
        batch (list of tuples):
            - x[0] (ndarray, int): list of (T,)
            - x[1] (ndarray, int): list of (T, D)
            - x[2] (ndarray, int): list of (1,), speaker id

    Returns:
        tuple: Tuple of batch
            - x (FloatTensor): Network inputs (B, C, T)
            - y (LongTensor): Network targets (B, T, 1)
    """
    local_conditioning = len(batch[0]) >= 2 and hparams.cin_channels > 0
    global_conditioning = len(batch[0]) >= 3 and hparams.gin_channels > 0

    if hparams.max_time_sec is not None:
        max_time_steps = int(hparams.max_time_sec * hparams.sample_rate)
    elif hparams.max_time_steps is not None:
        max_time_steps = hparams.max_time_steps
    else:
        max_time_steps = None

    # Time resolution adjustment
    if local_conditioning:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            if hparams.upsample_conditional_features:
                assert_ready_for_upsampling(x, c)
                if max_time_steps is not None:
                    max_steps = ensure_divisible(max_time_steps,
                                                 audio.get_hop_size(), True)
                    if len(x) > max_steps:
                        max_time_frames = max_steps // audio.get_hop_size()
                        s = np.random.randint(0, len(c) - max_time_frames)
                        ts = s * audio.get_hop_size()
                        x = x[ts:ts + audio.get_hop_size() * max_time_frames]
                        c = c[s:s + max_time_frames, :]
                        assert_ready_for_upsampling(x, c)
            else:
                x, c = audio.adjust_time_resolution(x, c)
                if max_time_steps is not None and len(x) > max_time_steps:
                    s = np.random.randint(0, len(x) - max_time_steps)
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                assert len(x) == len(c)
            new_batch.append((x, c, g))
        batch = new_batch
    else:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            x = audio.trim(x)
            if max_time_steps is not None and len(x) > max_time_steps:
                s = np.random.randint(0, len(x) - max_time_steps)
                if local_conditioning:
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                else:
                    x = x[s:s + max_time_steps]
            new_batch.append((x, c, g))
        batch = new_batch

    # Lengths
    input_lengths = [len(x[0]) for x in batch]
    max_input_len = max(input_lengths)

    # (B, T, C)
    # Pad for time axis
    if is_mulaw_quantize(hparams.input_type):
        padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
        x_batch = np.array([
            _pad_2d(np_utils.to_categorical(x[0],
                                            num_classes=hparams.quantize_channels),
                    max_input_len, 0, padding_value)
            for x in batch], dtype=np.float32)
    else:
        x_batch = np.array([_pad_2d(x[0].reshape(-1, 1), max_input_len)
                            for x in batch], dtype=np.float32)
    assert len(x_batch.shape) == 3

    # (B, T)
    if is_mulaw_quantize(hparams.input_type):
        padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
        y_batch = np.array([_pad(x[0], max_input_len, constant_values=padding_value)
                            for x in batch], dtype=np.int64)
    else:
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch],
                           dtype=np.float32)
    assert len(y_batch.shape) == 2

    # (B, T, D)
    if local_conditioning:
        max_len = max([len(x[1]) for x in batch])
        c_batch = np.array([_pad_2d(x[1], max_len) for x in batch],
                           dtype=np.float32)
        assert len(c_batch.shape) == 3
        # (B x C x T)
        c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous()
    else:
        c_batch = None

    if global_conditioning:
        g_batch = torch.LongTensor([x[2] for x in batch])
    else:
        g_batch = None

    # Convert to channel-first, i.e., (B, C, T)
    x_batch = torch.FloatTensor(x_batch).transpose(1, 2).contiguous()
    # Add extra axis
    if is_mulaw_quantize(hparams.input_type):
        y_batch = torch.LongTensor(y_batch).unsqueeze(-1).contiguous()
    else:
        y_batch = torch.FloatTensor(y_batch).unsqueeze(-1).contiguous()

    input_lengths = torch.LongTensor(input_lengths)

    return x_batch, y_batch, c_batch, g_batch, input_lengths
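# --- Example (sketch): the padding helpers _pad and _pad_2d used above are
# not shown in this file; the versions below are a minimal reading consistent
# with the call sites, not necessarily the project's exact definitions.
# _pad right-pads a 1-D array to max_len; _pad_2d right-pads the time axis of
# a (T, D) array, optionally with b_pad frames of leading padding.
import numpy as np

def _pad_sketch(x, max_len, constant_values=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant",
                  constant_values=constant_values)

def _pad_2d_sketch(x, max_len, b_pad=0, constant_values=0):
    return np.pad(x, ((b_pad, max_len - len(x) - b_pad), (0, 0)),
                  mode="constant", constant_values=constant_values)

assert _pad_sketch(np.ones(3), 5).shape == (5,)
assert _pad_2d_sketch(np.ones((3, 80)), 5).shape == (5, 80)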
def get_data_loaders(dump_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0

    if hparams.max_time_steps is not None:
        max_steps = ensure_divisible(hparams.max_time_steps,
                                     audio.get_hop_size(), True)
    else:
        max_steps = None

    for phase in ["train_no_dev", "dev"]:
        train = phase == "train_no_dev"
        X = FileSourceDataset(
            RawAudioDataSource(join(dump_root, phase),
                               speaker_id=speaker_id,
                               max_steps=max_steps,
                               cin_pad=hparams.cin_pad,
                               hop_size=audio.get_hop_size()))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(join(dump_root, phase),
                                  speaker_id=speaker_id,
                                  max_steps=max_steps,
                                  cin_pad=hparams.cin_pad,
                                  hop_size=audio.get_hop_size()))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
            # Make sure that there are no sorting bugs; see
            # https://github.com/r9y9/wavenet_vocoder/issues/130
            sampler_idx = np.asarray(sorted(list(map(lambda s: int(s), sampler))))
            assert (sampler_idx == np.arange(len(sampler_idx), dtype=np.int64)).all()
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = data_utils.DataLoader(
            dataset, batch_size=hparams.batch_size, drop_last=True,
            num_workers=hparams.num_workers, sampler=sampler, shuffle=shuffle,
            collate_fn=collate_fn, pin_memory=hparams.pin_memory)

        # Count utterances per speaker for multi-speaker datasets
        speaker_ids = {}
        if X.file_data_source.multi_speaker:
            for idx, (x, c, g) in enumerate(dataset):
                if g is not None:
                    try:
                        speaker_ids[g] += 1
                    except KeyError:
                        speaker_ids[g] = 1
            if len(speaker_ids) > 0:
                print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
def assert_ready_for_upsampling(x, c, cin_pad):
    assert len(x) == (len(c) - 2 * cin_pad) * audio.get_hop_size()
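# --- Example (sketch): the padded variant of the invariant. With an assumed
# hop size of 256 and cin_pad of 2, a conditioning matrix carrying 2 extra
# context frames on each side pairs with (len(c) - 4) * 256 audio samples.
# All values here are illustrative.
import numpy as np

hop_size_demo, cin_pad_demo = 256, 2
c_padded = np.zeros((100 + 2 * cin_pad_demo, 80))
x_demo2 = np.zeros(100 * hop_size_demo)
assert len(x_demo2) == (len(c_padded) - 2 * cin_pad_demo) * hop_size_demo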
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams, step_factor=1):
    """Preprocess a single utterance wav/text pair.

    This writes the mel-scale spectrogram to disk and returns a tuple to
    write to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (wav_path, audio_filename, mel_filename, linear_filename,
          time_steps, mel_frames, audio_time, text, len(text))
    """
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate * step_factor)
        if step_factor > 1:
            wav = wav[::step_factor]
        audio_time = len(wav) / hparams.sample_rate
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. '
              'skipping!'.format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel-scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear-scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                            audio.get_hop_size(hparams),
                                            hparams.wavenet_pad_sides)

        # Pad the audio signal with constant values (just like it's done in
        # librosa, to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad), mode='constant',
                     constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0

    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, audio_filename, mel_filename, linear_filename,
            time_steps, mel_frames, audio_time, text, len(text))
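# --- Example (sketch): the time-resolution adjustment above, with assumed
# sizes. After padding, the waveform is cropped to an exact multiple of the
# hop size so a transposed convolution can upsample mel frames to samples.
import numpy as np

hop, mel_frames_demo = 256, 40                # assumed values
out_demo = np.zeros(mel_frames_demo * hop + 173)  # padded waveform, slightly long
assert len(out_demo) >= mel_frames_demo * hop
out_demo = out_demo[:mel_frames_demo * hop]   # crop to an exact multiple
assert len(out_demo) % hop == 0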
def eval_model(global_step, writer, device, model, y, c, g, input_lengths,
               eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx][:length].data.cpu().numpy()

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    initial_value = 0.0
    print("Initial value:", initial_value)

    # (C,)
    initial_input = torch.zeros(1, 1, 80).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
            tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    # Save figure
    y_hat = y_hat.squeeze().cpu().data.numpy()
    y_target = np.squeeze(y_target)
    path = join(eval_dir, "step{:09d}_waveplots".format(global_step))
    save_waveplot(path, c, y_hat, y_target, writer, global_step)
def _process_utterance(out_dir, index, wav_path, text, mel_method):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    wav, _ = librosa.effects.trim(wav, top_db=60,
                                  frame_length=hparams.fft_size,
                                  hop_length=hparams.hop_size)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate,
                                   hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in the mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    wav = np.clip(wav, -1.0, 1.0)

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = MelSpectrogramCreator.mel_spectrogram(wav, mel_method)

    if hparams.global_gain_scale > 0:
        wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Clip
    if np.abs(wav).max() > 1.0:
        print("Warning: abs max value exceeds 1.0: {}".format(np.abs(wav).max()))
        # Ignore this sample
        return ("dummy", "dummy", -1, "dummy")

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # Zero pad:
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant",
                     constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the spectrograms to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = '%s-wave.npy' % (name)
    mel_filename = '%s-feats.npy' % (name)
    np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    speaker_id = _get_speaker_from_path(audio_filename)
    return (audio_filename, mel_filename, N, text, speaker_id)
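# --- Example (sketch): writing a dummy (wave, feats) pair in the
# '%s-wave.npy' / '%s-feats.npy' layout used above and reading it back to
# check the alignment invariant the data loader relies on. The directory
# name, utterance name, and hop size are assumed values for illustration.
import os
import numpy as np

demo_dir, demo_name, demo_hop = "dump_demo", "utt0001", 256
os.makedirs(demo_dir, exist_ok=True)
np.save(os.path.join(demo_dir, "%s-wave.npy" % demo_name),
        np.zeros(40 * demo_hop, dtype=np.float32))
np.save(os.path.join(demo_dir, "%s-feats.npy" % demo_name),
        np.zeros((40, 80), dtype=np.float32))

wave = np.load(os.path.join(demo_dir, "%s-wave.npy" % demo_name))
feats = np.load(os.path.join(demo_dir, "%s-feats.npy" % demo_name))
assert len(wave) == feats.shape[0] * demo_hop  # exactly hop_size samples per frame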
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be very long
    # compared to a typical 'utterance': split the wav into chunks
    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # all 8-second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:
            # This is the last chunk - allow it to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start:chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

        # lws pads zeros internally before performing stft;
        # this is needed to adjust time resolution between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # Zero pad for quantized signal
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # Time resolution adjustment:
        # ensure length of raw audio is a multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx)
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx)
        text_idx = '%s - %05d' % (text, chunk_idx)
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype),
                allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO: the label-based path is disabled for now (note the `and False`)
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        if hparams.rescaling:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def _process_song(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048,
                                  hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate,
                                   hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in the mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    #### CLAIRE work here
    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
    os.makedirs('./pwavs', exist_ok=True)
    pwav_path = './pwavs/{0}.wav'.format(wav_name)
    scipy.io.wavfile.write(pwav_path, 16000, wav)

    # Make the chord directory if it does not exist
    chord_dir = "chord_dir"
    os.makedirs(chord_dir, exist_ok=True)

    # Create a csv file with notes and timestamps
    os.system('./extract_chord_notes.sh {0} {1} > /dev/null 2>&1'.format(
        pwav_path, chord_dir))
    note_filename = '{0}/{1}.csv'.format(chord_dir, wav_name)

    # Instead of computing the mel spectrogram, return a time series of
    # one-hot encoded chords: a vector with a 1 in the row for each note
    # played, at 1000 samples per second.
    note_samples = int((len(wav) / hparams.sample_rate) * 1000)
    # 12 notes per octave
    chords_time_series = np.zeros((12, note_samples))

    with open(note_filename, newline='\n') as csvfile:
        chordreader = csvfile.readlines()
        for row in chordreader:
            row = row.split(",")
            start_time = float(row[0])
            end_time = float(row[1]) + start_time
            note = int(row[2]) % 12
            start_sample = min(note_samples - 1, int(start_time * 1000))
            end_sample = min(note_samples, int(end_time * 1000))
            try:
                chords_time_series[note][start_sample:end_sample] = 1
            except Exception:
                print(np.shape(chords_time_series))
    chords_time_series = chords_time_series.T

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # Zero pad:
    # this is needed to adjust time resolution between audio and the features
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant",
                     constant_values=constant_values)
    N = chords_time_series.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the features to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = '%s-wave.npy' % (name)
    chords_filename = '%s-feats.npy' % (name)
    np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, chords_filename),
            chords_time_series.astype(np.int16), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, chords_filename, N, text)
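# --- Example (sketch): building the one-hot chord roll used above from
# (start_sec, duration_sec, midi_note) rows at 1000 frames per second. The
# rows here are made-up dummy data; the real ones come from the csv written
# by extract_chord_notes.sh, whose column layout is assumed from the parser.
import numpy as np

rows = [(0.00, 0.50, 60), (0.50, 0.25, 64), (0.75, 0.25, 67)]  # dummy data
n_frames = 1000
roll = np.zeros((12, n_frames))
for start_time, duration, midi_note in rows:
    end_time = start_time + duration
    note = midi_note % 12                          # fold onto one octave
    start_sample = min(n_frames - 1, int(start_time * 1000))
    end_sample = min(n_frames, int(end_time * 1000))
    roll[note][start_sample:end_sample] = 1
roll = roll.T                                      # (T, 12), time-major like mels
assert roll.shape == (n_frames, 12)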
    os.makedirs(dst_dir, exist_ok=True)
    dst_dir_name = basename(os.path.normpath(dst_dir))

    generated_utterances = {}
    cin_pad = hparams.cin_pad
    file_idx = 0
    for idx, (x, y, c, g, input_lengths) in enumerate(test_data_loader):
        if cin_pad > 0:
            c = F.pad(c, pad=(cin_pad, cin_pad), mode="replicate")

        # B x 1 x T
        if x[0] is not None:
            B, _, T = x.shape
        else:
            B, _, Tn = c.shape
            T = Tn * audio.get_hop_size()

        if g is None and num_utterances > 0 and B * idx >= num_utterances:
            break

        ref_files = []
        ref_feats = []
        for i in range(B):
            # Yes, this is ugly...
            if hasattr(test_data_loader.dataset, "X"):
                ref_files.append(
                    test_data_loader.dataset.X.collected_files[file_idx][0])
            else:
                pass
            if hasattr(test_data_loader.dataset, "Mel"):
                ref_feats.append(
def _process_utterance(out_dir, index, speaker_id, wav_path, mgc_path,
                       lab_path, binary_dict, continuous_dict, text):
    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    # Determine sessionID and uttID
    wavbn = os.path.basename(wav_path)
    uttID = os.path.splitext(wavbn)[0]

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Time-aligned linguistic context
    if hparams.frame_shift_ms is None:
        frame_shift_in_micro_sec = (hparams.hop_size * 10000000) // hparams.sample_rate
    else:
        frame_shift_in_micro_sec = hparams.frame_shift_ms * 10000
    labels = hts.HTSLabelFile(frame_shift_in_micro_sec)
    labels.load(lab_path)
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=True,
        frame_shift_in_micro_sec=frame_shift_in_micro_sec)

    Nwav = len(out) // audio.get_hop_size()
    out = out[:Nwav * audio.get_hop_size()]
    timesteps = len(out)

    # Read the mel-generalized cepstra and extract the c0 (energy) track
    with open(mgc_path, "rb") as fp:
        mgc = np.fromfile(fp, np.float32, -1) - np.log(32768)
    N = len(mgc) // hparams.num_mels
    mgc = np.reshape(mgc, (N, hparams.num_mels))
    c0 = audio._normalize(audio._amp_to_db(np.exp(mgc[0:Nwav, 0:1])))

    # Combine linguistic features + c0
    context = np.hstack((linguistic_features, c0))

    # Write the features to disk:
    audio_filename = 'audio-' + uttID + '.npy'
    context_filename = 'context-' + uttID + '.npy'
    np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, context_filename),
            context.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, context_filename, timesteps, text, speaker_id)
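# --- Example (sketch): the frame-shift arithmetic above, in HTS label units
# of 100 ns (despite the "_micro_sec" variable name). With an assumed hop of
# 256 samples at 16 kHz, one frame is 16 ms, i.e. 160000 units; with an
# explicit frame_shift_ms of 5 it is 50000 units. Values are illustrative.
sample_rate_demo, hop_size_demo2 = 16000, 256
frame_shift_demo = (hop_size_demo2 * 10000000) // sample_rate_demo
assert frame_shift_demo == 160000   # 16 ms in 100 ns units

frame_shift_ms_demo = 5
assert frame_shift_ms_demo * 10000 == 50000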
    def thread_main(self, sess):
        stop = False
        while not stop:
            iterator = load_npy_data(self.metadata_filename, self.npy_dataroot,
                                     self.speaker_id)
            for wav, local_condition, global_condition in iterator:
                if self.coord.should_stop():
                    stop = True
                    break

                if hparams.upsample_conditional_features:
                    wav = wav.reshape(-1, 1)
                    assert_ready_for_upsampling(wav, local_condition)
                    if self.sample_size is not None:
                        # Frame-aligned random crop
                        sample_size = ensure_divisible(
                            self.sample_size, audio.get_hop_size(), True)
                        if wav.shape[0] > sample_size:
                            max_frames = sample_size // audio.get_hop_size()
                            s = np.random.randint(
                                0, len(local_condition) - max_frames)
                            ts = s * audio.get_hop_size()
                            wav = wav[ts:ts + audio.get_hop_size() * max_frames, :]
                            local_condition = local_condition[s:s + max_frames, :]
                    if self.gc_enable:
                        sess.run(self.enqueue, feed_dict=dict(zip(
                            self._placeholders,
                            (wav, local_condition, global_condition))))
                    else:
                        sess.run(self.enqueue, feed_dict=dict(zip(
                            self._placeholders, (wav, local_condition))))
                else:
                    wav, local_condition = audio.adjust_time_resolution(
                        wav, local_condition)
                    wav = wav.reshape(-1, 1)
                    if self.sample_size is not None:
                        while wav.shape[0] > self.sample_size:
                            wav_piece = wav[:(self.receptive_field +
                                              self.sample_size), :]
                            local_condition_piece = \
                                local_condition[:(self.receptive_field +
                                                  self.sample_size), :]
                            # Advance the window by sample_size
                            wav = wav[self.sample_size:, :]
                            local_condition = local_condition[self.sample_size:, :]
                            assert len(wav_piece) == len(local_condition_piece)

                            if self.gc_enable:
                                sess.run(self.enqueue, feed_dict=dict(zip(
                                    self._placeholders,
                                    (wav_piece, local_condition_piece,
                                     global_condition))))
                            else:
                                sess.run(self.enqueue, feed_dict=dict(zip(
                                    self._placeholders,
                                    (wav_piece, local_condition_piece))))
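# --- Example (sketch): the frame-aligned random crop used in the upsampling
# branch of thread_main, on dummy arrays. A window of max_frames conditioning
# frames is chosen at random and the matching hop_size * max_frames samples
# are cut from the waveform, so the pair stays aligned. hop_size, sample_size,
# and the array shapes are assumed values, not from hparams.
import numpy as np

hop, sample_size = 256, 4096
wav = np.zeros((100 * hop, 1))
cond = np.zeros((100, 80))

max_frames = sample_size // hop
s = np.random.randint(0, len(cond) - max_frames)
ts = s * hop
wav_piece = wav[ts:ts + hop * max_frames, :]
cond_piece = cond[s:s + max_frames, :]
assert len(wav_piece) == hop * len(cond_piece)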