def main(audio_files, model_filename, output_dir, batch_size, speaker_id,
         implementation):
    audio_files = utils.files_to_list(audio_files)
    model = torch.load(model_filename)['model']
    model.eval()
    # build the fast nv-wavenet inference engine from the decoder's weights
    wavenet = nv_wavenet.NVWaveNet(
        **(model.decoders[speaker_id].export_weights()))

    for files in chunker(audio_files, batch_size):
        audio_ = []
        for file_path in files:
            print(file_path)
            audio, sampling_rate = utils.load_wav_to_torch(file_path)
            if sampling_rate != 16000:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, 16000))
            audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE, 256)
            audio = utils.to_gpu(audio)
            audio_.append(torch.unsqueeze(audio, 0))

        # condition the decoder on the latent code of the whole batch
        latent = model.get_latent_input(torch.cat(audio_, 0))
        cond_input = model.decoders[speaker_id].get_cond_input(latent)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
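chunker is used above but not defined in this snippet. A minimal sketch of a batching helper with that behavior (an assumption, not necessarily the original):

def chunker(seq, size):
    """Yield successive size-sized slices of seq; the last one may be shorter."""
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))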
def __getitem__(self, index):
    # Read audio
    filename = self.audio_files[index]
    wav = deepaudio.load_wav(filename)
    # load in raw_audio via utils
    raw_audio, _ = utils.load_wav_to_torch(filename)
    # convert the numpy wav to a torch tensor
    audio = torch.from_numpy(wav)

    # take segment
    if audio.size(0) >= self.segment_length:
        max_audio_start = audio.size(0) - self.segment_length
        audio_start = random.randint(0, max_audio_start)
        audio = audio[audio_start:audio_start + self.segment_length]
        # take the same segment of the raw audio
        raw_audio = raw_audio[audio_start:audio_start + self.segment_length]
    else:
        audio = torch.nn.functional.pad(
            audio, (0, self.segment_length - audio.size(0)), 'constant').data
        # pad the raw audio as well
        raw_audio = torch.nn.functional.pad(
            raw_audio, (0, self.segment_length - raw_audio.size(0)),
            'constant').data

    # compute the mel spectrogram and convert it to torch
    mel = deepaudio.melspectrogram(audio.numpy())
    mel = torch.from_numpy(mel)
    audio = utils.mu_law_encode(raw_audio / utils.MAX_WAV_VALUE,
                                self.mu_quantization)
    return (mel, audio)
def main(audio_file_path, model_filename, output_path):
    model = torch.load(model_filename,
                       map_location=torch.device('cpu'))['model']

    # seed generation with the first 10000 samples of the reference clip
    first_audio_data, _ = utils.load_wav_to_torch(audio_file_path)
    first_audio_data = first_audio_data[:10000]
    first_audio_data = utils.mu_law_encode(
        first_audio_data / utils.MAX_WAV_VALUE, 256)
    print("first_audio_data.shape", first_audio_data.shape)
    print("first_audio_data.dtype", first_audio_data.dtype)

    audio_data = model.generate(first_samples=first_audio_data,
                                num_samples=1000,
                                receptive_field=6000)
    np.savetxt("audio_data.txt", audio_data.numpy().astype(int), fmt='%d')

    audio = utils.mu_law_decode_numpy(audio_data.cpu().numpy(),
                                      model.n_out_channels)
    audio = utils.MAX_WAV_VALUE * audio
    print("audio: ", audio)
    wavdata = audio.astype('int16')
    write(output_path, 16000, wavdata)
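utils.mu_law_decode_numpy is called above but not shown. A sketch of the standard inverse mu-law companding it is assumed to implement (integer codes in [0, mu] back to floats in [-1, 1]):

import numpy as np

def mu_law_decode_numpy(signal, mu_quantization=256):
    """Invert mu-law companding (a sketch, not necessarily the original utils code)."""
    mu = mu_quantization - 1.0
    signal = (signal / mu) * 2.0 - 1.0  # codes -> [-1, 1]
    magnitude = (np.exp(np.abs(signal) * np.log1p(mu)) - 1.0) / mu
    return np.sign(signal) * magnitude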
def __getitem__(self, index):
    audios = self.audio_buffer[index]
    rand_pos = np.random.randint(0, len(audios) - self.sample_size)

    if self.use_local_condition:
        local_condition = self.fbank_buffer[index]
        # upsample frame-level features to the sample rate
        local_condition = np.repeat(local_condition, self.upsample_factor,
                                    axis=0)
        local_condition = local_condition[rand_pos:rand_pos + self.sample_size]
    else:
        audios = np.pad(audios, [[self.receptive_field, 0], [0, 0]],
                        'constant')
        local_condition = None

    audios = audios[rand_pos:rand_pos + self.sample_size]
    target = mu_law_encode(audios, self.quantization_channels)

    if self.noise_injecting:
        noise = np.random.normal(0.0, 1.0 / self.quantization_channels,
                                 audios.shape)
        audios = audios + noise

    # left-pad the network input with a receptive field's worth of history;
    # guard the condition path so the unconditional case (None) doesn't crash
    audios = np.pad(audios, [[self.receptive_field, 0], [0, 0]], 'constant')
    if local_condition is not None:
        local_condition = np.pad(local_condition,
                                 [[self.receptive_field, 0], [0, 0]],
                                 'constant')
        local_condition = torch.FloatTensor(local_condition)

    return torch.FloatTensor(audios), torch.LongTensor(target), local_condition
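For illustration only (not from the source): why the input is left-padded by the receptive field. The padding gives the first predicted sample a full, if silent, history to condition on:

import numpy as np

receptive_field = 3
x = np.array([[0.1], [0.2], [0.3]])                 # (time, channels)
padded = np.pad(x, [[receptive_field, 0], [0, 0]], 'constant')
assert padded.shape == (6, 1)                       # 3 zeros of history + 3 samples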
def __getitem__(self, index):
    filename = self.audio_files[index]
    audio, sampling_rate = utils.load_wav_to_torch(filename)
    if sampling_rate != self.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, self.sampling_rate))

    if audio.size(0) >= self.segment_length:
        max_audio_start = audio.size(0) - self.segment_length
        audio_start = random.randint(0, max_audio_start)
        audio = audio[audio_start:audio_start + self.segment_length]
    else:
        audio = F.pad(audio, (0, self.segment_length - audio.size(0)),
                      'constant').data

    audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                self.mu_quantization)
    return audio
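A hedged usage sketch of how a __getitem__ like the one above is typically consumed; MuLawAudioDataset and its constructor arguments are placeholders, not names from the source:

from torch.utils.data import DataLoader

dataset = MuLawAudioDataset(file_list, segment_length=16000,
                            sampling_rate=16000, mu_quantization=256)
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=2)
for audio in loader:  # LongTensor of shape (batch_size, segment_length)
    pass              # feed the mu-law codes to the model here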
def __getitem__(self, index):
    # Read audio
    filename = self.audio_files[index]
    audio, sampling_rate = utils.load_wav_to_torch(filename)
    if sampling_rate != self.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, self.sampling_rate))

    # Take segment
    if audio.size(0) >= self.segment_length:
        max_audio_start = audio.size(0) - self.segment_length
        audio_start = random.randint(0, max_audio_start)
        audio = audio[audio_start:audio_start + self.segment_length]
    else:
        audio = torch.nn.functional.pad(
            audio, (0, self.segment_length - audio.size(0)), 'constant').data

    mel = self.get_mel(audio)
    audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                self.mu_quantization)
    return (mel, audio)
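get_mel is not shown in this snippet. A librosa-based stand-in under assumed defaults (n_fft, hop_length, and n_mels are guesses; the original may use a different STFT front end entirely):

import librosa
import numpy as np
import torch

def get_mel(self, audio, n_fft=1024, hop_length=256, n_mels=80):
    # audio is a FloatTensor scaled to [-MAX_WAV_VALUE, MAX_WAV_VALUE]
    y = (audio / utils.MAX_WAV_VALUE).numpy().astype(np.float32)
    mel = librosa.feature.melspectrogram(y=y, sr=self.sampling_rate,
                                         n_fft=n_fft, hop_length=hop_length,
                                         n_mels=n_mels)
    # log-compress with a floor to avoid log(0)
    return torch.from_numpy(np.log(np.clip(mel, a_min=1e-5, a_max=None)))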
def quantize(self, wave):
    """Convert the wave to a discrete integer format."""
    return mu_law_encode(torch.tensor(wave), self.mu_quantization)
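For reference, the standard forward companding that mu_law_encode is assumed to implement across these snippets (floats in [-1, 1] to integer codes in [0, mu]):

import math
import torch

def mu_law_encode(audio, mu_quantization=256):
    """Mu-law companding (a sketch of the assumed behavior, not the original source)."""
    mu = mu_quantization - 1.0
    encoding = torch.sign(audio) * torch.log1p(mu * torch.abs(audio)) / math.log1p(mu)
    # shift from [-1, 1] to [0, mu] and round to the nearest integer code
    return ((encoding + 1.0) / 2.0 * mu + 0.5).long()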
def __call__(self, data):
    data['audio_quantized'] = mu_law_encode(
        data['audio'], quantization_channels=self.quantization_channels)
    return data
def inference(self,
              cond_features,
              use_logistic_mix=False,
              teacher_audio=None,
              mu_quantization=256,
              randomize_input=False,
              rand_sample_chance=0.,
              length=0,
              batch_size=0,
              cond_channels=0,
              device="cuda"):
    """
    Generates audio samples equivalent to the length of the upsampled
    cond features.
    - Uses teacher audio as the forward input, if provided.
    - If teacher_audio is shorter than the feature length, switches the
      forward input to inference samples once the teacher samples are
      exhausted.
    - If cond_features is None, generates unconditional output; the last
      four params (length, batch_size, cond_channels, device) control
      unconditional output.
    """
    assert ((cond_features is not None) or (length > 0))

    # get metadata from condition features
    if cond_features is not None:
        assert (len(cond_features.size()) == 3)
        device = cond_features.device
        length = cond_features.size(-1) * self.upscale
        cond_channels = cond_features.size(1)
        batch_size = cond_features.size(0)
        if (self.upscale != 1):
            cond_features = self.upsample(cond_features)
    else:
        assert (batch_size > 0 and cond_channels > 0)
        cond_features = torch.zeros(
            size=[batch_size, cond_channels, length]).to(device)

    if self.use_cond_conv:
        # make condition features for every timestep and res layer
        cond_features = self.cond_layers(cond_features)
        if not self.same_cond_each_resblock:
            cond_features = cond_features.view(batch_size, self.n_layers,
                                               2 * self.n_residual_channels,
                                               length)

    # output buffers: one logit vector per batch entry and timestep
    logits = torch.zeros(batch_size, self.n_out_channels, length).to(device)
    output_audio = torch.zeros(size=[batch_size, length + 1]).to(device)
    # mu-law-encode the zeros so the first forward input is digital silence
    output_audio = utils.mu_law_encode(output_audio)

    if teacher_audio is not None:
        teacher_length = teacher_audio.size(1)
    else:
        teacher_length = 0

    if use_logistic_mix:
        sampler = SampleDiscretizedMixLogistics()
    else:
        sampler = utils.CategoricalSampler()

    #################
    # inference loop
    #################
    start_time = time.time()
    print("Inference progress:")
    for s in range(length - 1):
        # print progress every 100 samples
        if (s % 100 == 0):
            print(str(s / length), end='\r', flush=True)

        if self.use_cond_conv:
            cond_sample = cond_features[:, :, :, s]
        else:
            cond_sample = cond_features[:, :, s]

        # flip a biased coin to see if a random sample is used
        if randomize_input and (random.random() < rand_sample_chance):
            forward_sample = torch.randint(low=0, high=mu_quantization,
                                           size=(batch_size,), device=device)
        else:
            # draw from the teacher or from the previous sample?
            if (s < teacher_length):
                forward_sample = teacher_audio[:, s].clone()
            else:
                forward_sample = output_audio[:, s].clone()

        logits[:, :, s + 1] = self.infer_step(cond_sample, forward_sample)
        output_audio[:, s + 1] = sampler(logits[:, :, s + 1])
    end_time = time.time()
    ###############
    # end inference
    ###############
    print("Inference complete in {:.1f} s".format(end_time - start_time))

    return utils.mu_law_decode(output_audio, mu_quantization)
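utils.CategoricalSampler is assumed to draw one code per batch entry from the softmax of the logits; a minimal sketch:

import torch

class CategoricalSampler:
    """Sample one class index per row from unnormalized logits
    (a sketch of the assumed utils.CategoricalSampler behavior)."""
    def __call__(self, logits):
        # logits: (batch, n_out_channels) -> samples: (batch,)
        probs = torch.softmax(logits, dim=-1)
        return torch.multinomial(probs, num_samples=1).squeeze(-1)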
def collect_features(self, path):
    # 1. Load audio --> 2. pre-emphasis --> 3. 8-bit mu-law
    x, fs = librosa.load(path, sr=self.target_sr, mono=True,
                         dtype=np.float64)
    # fixed gain only; no pre-emphasis filter is actually applied here,
    # and samples with |x| > 1/1.3 will exceed the [-1, 1] range
    x = x * 1.3
    x_mulaw = mu_law_encode(x)
    return x_mulaw.astype(np.uint8)
def __getitem__(self, index):
    # Read audio
    audio_filename, mel_filename = self.audio_files[index]
    audio, sample_rate = utils.load_wav(audio_filename)

    pad_size = self.window_size - self.window_step
    left_pad = pad_size
    right_pad = pad_size + self.window_step - len(audio) % self.window_step
    audio = np.pad(audio, (left_pad, right_pad), mode="constant",
                   constant_values=0)
    audio /= np.abs(audio).max()
    if self.apply_preemphasis:
        audio = self.preemphasis(audio)
        audio /= np.abs(audio).max()

    if sample_rate != self.sample_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sample_rate, self.sample_rate))

    if self.no_chunks:
        if self.load_mel:
            mel = np.load(mel_filename).T
        else:
            # by default lws always pads from left and right
            mel = self.get_mel(audio[left_pad:-right_pad])
    else:
        if mel_filename != "" and self.load_mel:
            if self.segment_length % self.window_step != 0:
                raise ValueError(
                    "Hop length should be a divider of segment length")
            mel = np.load(mel_filename)
            mel = np.clip(mel, -self.audio_params["max_abs_value"],
                          self.audio_params["max_abs_value"])
            # Take segment
            if mel.shape[0] >= self.mel_segment_length:
                max_mel_start = mel.shape[0] - self.mel_segment_length
                mel_start = random.randint(0, max_mel_start)
                mel = mel[mel_start:mel_start + self.mel_segment_length]
                assert mel.shape[0] == self.mel_segment_length
                audio_start = mel_start * self.window_step
                audio = audio[audio_start:audio_start + self.segment_length]
                assert audio.shape[0] == self.segment_length
            else:
                audio = np.pad(audio,
                               (0, self.segment_length - audio.shape[0]),
                               'constant')
                mel = np.pad(
                    mel,
                    ((0, self.mel_segment_length - mel.shape[0]), (0, 0)),
                    'constant')
        else:
            if audio.shape[0] >= self.segment_length:
                max_audio_start = audio.shape[0] - self.segment_length
                audio_start = random.randint(0, max_audio_start)
                audio = audio[audio_start:audio_start + self.segment_length]
            else:
                audio = np.pad(audio,
                               (0, self.segment_length - audio.shape[0]),
                               'constant')
            mel = self.get_mel(audio)
            # as we want to apply a transposed convolution, the frame and
            # sample counts must line up exactly
            mel_length = min(mel.shape[1], len(audio) // self.window_step)
            mel = mel[:, :mel_length]
            audio = audio[:mel_length * self.window_step]
            assert len(audio) // self.window_step == mel.shape[1]

    mel = torch.FloatTensor(mel)
    audio = torch.FloatTensor(audio)
    audio = utils.mu_law_encode(audio, self.mu_quantization)
    return mel, audio
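self.preemphasis is not defined here; the usual first-order high-pass it presumably applies is sketched below (the coefficient 0.97 is a common default, assumed rather than taken from the source):

import numpy as np
from scipy.signal import lfilter

def preemphasis(audio, coeff=0.97):
    """y[n] = x[n] - coeff * x[n-1]: boosts high frequencies before quantization."""
    return lfilter([1.0, -coeff], [1.0], audio)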