def generate_performance_audio(self, spectrograms):
    batch_size = spectrograms.shape[0]  # number of voices
    spec_depth = spectrograms.shape[1]  # depth of each spectrogram, should be 80
    capacity = 80 * 80 * 9  # maximum volume of a tensor the wavenet can handle at a time
    spec_hop = floor(capacity / (batch_size * spec_depth))
    spec_start = 0
    audio_start = 0
    audio_hop = spec_hop * self.sample_conversion
    batch = 1
    num_batches = ceil(self.spec_length / spec_hop)
    spectrograms = utils.to_gpu(spectrograms)
    audio = np.zeros((batch_size, self.audio_length))
    print('Generating audio with WaveNet Vocoder...')
    while spec_start + spec_hop < self.spec_length:
        print(' - batch %d of %d' % (batch, num_batches))
        # get clip
        clip = spectrograms[:, :, spec_start:spec_start + spec_hop]
        # get audio from network
        cond_input = self.wavenet.get_cond_input(clip)
        audio_data = self.nv_wavenet.infer(cond_input, nv_wavenet.Impl.AUTO)
        torch.cuda.empty_cache()
        # pdb.set_trace()
        # decode and add into audio at audio_start:audio_start + audio_hop
        for i in range(batch_size):
            audio[i, audio_start:audio_start + audio_hop] = utils.mu_law_decode_numpy(
                audio_data[i, :].cpu().numpy(), self.nv_wavenet.A)
        spec_start += spec_hop
        audio_start += audio_hop
        batch += 1
        # need to update the wavenet embeddings so that the sound stream is continuous
        # here there be demons
        # self.nv_wavenet.embedding_prev = self.nv_wavenet.embedding_curr

    # add the last section if it didn't fit into a full hop
    print(' - batch %d of %d' % (batch, num_batches))
    # spec_remaining = self.spec_length - spec_start
    clip = spectrograms[:, :, spec_start:self.spec_length]
    # get audio from network
    cond_input = self.wavenet.get_cond_input(clip)
    audio_data = self.nv_wavenet.infer(cond_input, nv_wavenet.Impl.AUTO)
    torch.cuda.empty_cache()
    for i in range(batch_size):
        audio[i, audio_start:self.audio_length] = utils.mu_law_decode_numpy(
            audio_data[i, :].cpu().numpy(), self.nv_wavenet.A)
    return audio
def main(mel_files, model_filename, output_dir, batch_size, implementation):
    mel_files = utils.files_to_list(mel_files)
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))

    for files in chunker(mel_files, batch_size):
        mels = []
        for file_path in files:
            print(file_path)
            mel = torch.load(file_path)
            mel = utils.to_gpu(mel)
            mels.append(torch.unsqueeze(mel, 0))
        cond_input = model.get_cond_input(torch.cat(mels, 0))
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            # scale from [-1, 1] to 16-bit range exactly once
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
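# chunker() is used by the batched inference loops in this excerpt but is not
# defined here. A minimal sketch, assuming it simply yields consecutive
# fixed-size slices of a list (the last chunk may be shorter):
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))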
def main(audio_files, model_filename, output_dir, batch_size, speaker_id,
         implementation):
    audio_files = utils.files_to_list(audio_files)
    model = torch.load(model_filename)['model']
    model.eval()
    wavenet = nv_wavenet.NVWaveNet(
        **(model.decoders[speaker_id].export_weights()))

    for files in chunker(audio_files, batch_size):
        audio_ = []
        for file_path in files:
            print(file_path)
            audio, sampling_rate = utils.load_wav_to_torch(file_path)
            if sampling_rate != 16000:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, 16000))
            audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE, 256)
            audio = utils.to_gpu(audio)
            audio_.append(torch.unsqueeze(audio, 0))

        latent = model.get_latent_input(torch.cat(audio_, 0))
        cond_input = model.decoders[speaker_id].get_cond_input(latent)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
def main(audio_file_path, model_filename, output_path):
    model = torch.load(model_filename,
                       map_location=torch.device('cpu'))['model']

    # seed generation with the first 10000 samples of the input file
    first_audio_data, _ = utils.load_wav_to_torch(audio_file_path)
    first_audio_data = first_audio_data[:10000]
    first_audio_data = utils.mu_law_encode(
        first_audio_data / utils.MAX_WAV_VALUE, 256)
    print("first_audio_data.shape", first_audio_data.shape)
    print("first_audio_data.dtype", first_audio_data.dtype)

    audio_data = model.generate(first_samples=first_audio_data,
                                num_samples=1000,
                                receptive_field=6000)
    np.savetxt("audio_data.txt", audio_data.numpy().astype(int), fmt='%d')

    audio = utils.mu_law_decode_numpy(audio_data.cpu().numpy(),
                                      model.n_out_channels)
    audio = utils.MAX_WAV_VALUE * audio
    print("audio: ", audio)
    wavdata = audio.astype('int16')
    write(output_path, 16000, wavdata)
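# utils.mu_law_encode / utils.mu_law_decode_numpy implement standard mu-law
# companding between waveform samples in [-1, 1] and 256 integer levels.
# A minimal sketch of the decode direction, assuming inputs are integers in
# [0, mu_quantization - 1]; this is illustrative, not the project's exact code:
import numpy as np

def mu_law_decode_numpy(x, mu_quantization=256):
    """Expand quantized mu-law integers back to floating-point samples."""
    mu = mu_quantization - 1.0
    signal = 2.0 * (x / mu) - 1.0                    # [0, mu] -> [-1, 1]
    magnitude = (1.0 / mu) * ((1.0 + mu) ** np.abs(signal) - 1.0)
    return np.sign(signal) * magnitude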
def main(input_files, model_dir, output_dir, batch_size, implementation,
         data_config, audio_config, preload_mels=False):
    model_filename = get_latest_checkpoint(model_dir)
    print("Model path: {}".format(model_filename))
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    print("Wavenet num layers: {}, max_dilation: {}".format(
        wavenet.num_layers, wavenet.max_dilation))
    writer = SummaryWriter(output_dir)
    mel_extractor = Mel2SampOnehot(audio_config=audio_config, **data_config)
    input_files = utils.files_to_list(input_files)
    audio_processor = AudioProcessor(audio_config)

    for j, files in enumerate(chunker(input_files, batch_size)):
        mels = []
        for i, file_path in enumerate(files):
            if preload_mels:
                mel = np.load(file_path[0]).T
                mel = torch.from_numpy(mel)
                mel = utils.to_gpu(mel)
            else:
                audio, _ = utils.load_wav_to_torch(file_path)
                file_name = os.path.splitext(os.path.basename(file_path))[0]
                writer.add_audio("eval_true/{}/{}".format(i, file_name),
                                 audio / utils.MAX_WAV_VALUE, 0, 22050)
                mel = mel_extractor.get_mel(audio)
                mel = mel.t().cuda()
            mels.append(torch.unsqueeze(mel, 0))

        mels = torch.cat(mels, 0)
        cond_input = model.get_cond_input(mels)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path[0]))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(), 256)
            print("Range of {}.wav before deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            if mel_extractor.apply_preemphasis:
                audio = audio.astype("float32")
                audio = audio_processor.deemphasis(audio[None, :])
                audio = audio.numpy()[0]
            print("Range of {}.wav after deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            audio = np.tanh(audio)
            output_filepath = "{}.wav".format(file_name)
            output_filepath = os.path.join(output_dir, output_filepath)
            assert audio.dtype in [np.float64, np.float32]
            assert (np.abs(audio)).max() <= 1
            writer.add_audio(output_filepath, audio, 0, 22050)
            audio = (audio * 32767).astype("int16")
            scipy.io.wavfile.write(output_filepath, 22050, audio)
def SaveTestData(audioX, midiX, fileNum, output_dir, test_segment_length,
                 audio_hz, midi_hz, mu_law_encode=True):
    """
    Save torch tensors for inference.py.

    A random segment of the piece is chosen; its length is set by
    test_segment_length. Also plots a visualization of the MIDI roll and
    writes the ground-truth audio segment to a wav file.
    """
    fig, ax = plt.subplots()
    filename = output_dir + "/" + str(fileNum)

    # save midi tensor
    if midiX is not None:
        segment_samples = int(np.floor(midi_hz * test_segment_length))
        starting_pos = random.randint(0, midiX.shape[1] - segment_samples)
        midiX = midiX[:, starting_pos:(starting_pos + segment_samples)]
        midiX = midiX.todense()
        torch.save(torch.from_numpy(midiX), filename + ".midiX")

        # plot midi roll
        plt.cla()
        ax.spy(midiX[:89, :], markersize=3, aspect="auto", origin='lower')
        plt.savefig(filename + ".png")

    # save ground truth audio tensor
    if audioX is not None:
        segment_samples = int(audio_hz * test_segment_length)
        audio_start_pos = int(starting_pos * (audio_hz / midi_hz))
        audioX = audioX[audio_start_pos:(audio_start_pos + segment_samples)]
        torch.save(torch.from_numpy(audioX), filename + ".audioX")

        # save ground truth audio as a wav file
        if mu_law_encode:
            raw_audio = utils.mu_law_decode_numpy(audioX)
        else:
            raw_audio = audioX.numpy()
        raw_audio = utils.MAX_WAV_VALUE * raw_audio
        wavdata = raw_audio.astype('int16')
        write(filename + "_groundTruth.wav", 16000, wavdata)
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
"""
Tests that the NV-WaveNet class is producing audio
"""
import torch
from scipy.io.wavfile import write

import nv_wavenet
import utils

if __name__ == '__main__':
    model = torch.load("model.pt")
    wavenet = nv_wavenet.NVWaveNet(**model)
    cond_input = torch.load("cond_input.pt")
    samples = wavenet.infer(cond_input, nv_wavenet.Impl.PERSISTENT)[0]
    audio = utils.mu_law_decode_numpy(samples.cpu().numpy(), 256)
    audio = utils.MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    write('audio.wav', 16000, wavdata)
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, iters_per_eval, batch_size, seed,
          checkpoint_path, log_dir, ema_decay=0.9999):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    if train_data_config["no_chunks"]:
        criterion = MaskedCrossEntropyLoss()
    else:
        criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()
    ema = ExponentialMovingAverage(ema_decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=200000, gamma=0.5)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, scheduler, iteration, ema = load_checkpoint(
            checkpoint_path, model, optimizer, scheduler, ema)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True,
                              **train_data_config)
    validset = Mel2SampOnehot(audio_config=audio_config, verbose=False,
                              **valid_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    print(train_data_config)
    if train_data_config["no_chunks"]:
        collate_fn = utils.collate_fn
    else:
        collate_fn = torch.utils.data.dataloader.default_collate
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              collate_fn=collate_fn,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(validset, num_workers=1, shuffle=False,
                              sampler=valid_sampler,
                              batch_size=1,
                              pin_memory=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    writer = SummaryWriter(log_dir)
    print("Checkpoints writing to: {}".format(log_dir))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            if low_memory:
                torch.cuda.empty_cache()
            scheduler.step()
            model.zero_grad()

            if train_data_config["no_chunks"]:
                x, y, seq_lens = batch
                seq_lens = to_gpu(seq_lens)
            else:
                x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            if train_data_config["no_chunks"]:
                loss = criterion(y_pred, y, seq_lens)
            else:
                loss = criterion(y_pred, y)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus)[0]
            else:
                reduced_loss = loss.data[0]
            loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if rank == 0:
                writer.add_scalar('loss', reduced_loss, iteration)

            if (iteration % iters_per_checkpoint == 0 and iteration):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, scheduler, learning_rate,
                                    iteration, checkpoint_path, ema,
                                    wavenet_config)

            if (iteration % iters_per_eval == 0 and iteration > 0 and
                    not config["no_validation"]):
                if low_memory:
                    torch.cuda.empty_cache()
                if rank == 0:
                    model_eval = nv_wavenet.NVWaveNet(**(model.export_weights()))
                    for j, valid_batch in enumerate(valid_loader):
                        mel, audio = valid_batch
                        mel = to_gpu(mel).float()
                        cond_input = model.get_cond_input(mel)
                        predicted_audio = model_eval.infer(cond_input,
                                                           nv_wavenet.Impl.AUTO)
                        predicted_audio = utils.mu_law_decode_numpy(
                            predicted_audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid/predicted_audio_{}".format(j),
                                         predicted_audio, iteration, 22050)
                        audio = utils.mu_law_decode_numpy(
                            audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid_true/audio_{}".format(j),
                                         audio, iteration, 22050)
                if low_memory:
                    torch.cuda.empty_cache()

            iteration += 1
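# ExponentialMovingAverage is used by train() above but is not defined in this
# excerpt. A minimal sketch of such a helper, consistent with the register(),
# update(), and .shadow usages above; the exact update rule is an assumption:
class ExponentialMovingAverage:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}  # parameter name -> smoothed copy of its value

    def register(self, name, val):
        # Start the running average at the parameter's current value.
        self.shadow[name] = val.clone()

    def update(self, name, x):
        # shadow <- decay * shadow + (1 - decay) * x
        assert name in self.shadow
        new_average = (1.0 - self.decay) * x + self.decay * self.shadow[name]
        self.shadow[name] = new_average.clone()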
                         pin_memory=False,
                         drop_last=True)

for batch in test_loader:
    # conditions, true_audio = testset[0]
    x, y = batch
    true_audio = y.clone()
    y = torch.zeros_like(y)  # remove the waveform for pure inference
    x = utils.to_gpu(x).float()
    y = utils.to_gpu(y)
    x = (x, y)  # auto-regressive takes outputs as inputs
    y_pred = model(x)

    single = y_pred[0].detach().cpu()
    values, indices = single.max(0)
    indices = utils.mu_law_decode_numpy(indices.numpy(), 256)
    indices = utils.MAX_WAV_VALUE * indices
    indices = indices.astype('int16')

    true_audio = utils.mu_law_decode_numpy(true_audio[0].cpu().numpy(), 256)
    true_audio = utils.MAX_WAV_VALUE * true_audio
    true_audio = true_audio.astype('int16')

    play(indices, 16000)
    time.sleep(0.25)
    play(true_audio, 16000)
    time.sleep(1.0)

    del x, y, y_pred, single, values, indices, true_audio
    torch.cuda.empty_cache()
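# play() above is an audio playback helper that is not included in this
# excerpt. A minimal sketch, assuming the sounddevice package is used for
# playback (the real helper may use a different backend):
import sounddevice as sd

def play(wavdata, sample_rate):
    """Play an int16 numpy waveform and block until playback finishes."""
    sd.play(wavdata, samplerate=sample_rate)
    sd.wait()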
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cpu()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    print(f"receptive_field: {model.receptive_field()}")
    trainset = WavenetDataset(
        dataset_file='data/dataset.npz',
        item_length=model.receptive_field() + 1000 + model.output_length - 1,
        target_length=model.output_length,
        file_location='data/',
        test_stride=500,
    )
    print(trainset._length)
    print('the dataset has ' + str(len(trainset)) + ' items')
    train_loader = DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=False,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINING LOOP! ===================
    start = time.time()
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            y, target = batch
            y = to_gpu(y).float()
            target = to_gpu(target)
            y_pred = model((None, y))
            loss = criterion(y_pred[:, :, -model.output_length:], target)
            loss.backward()
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, loss))
            print_etr(start,
                      total_iterations=(epochs - epoch_offset) * len(train_loader),
                      current_iteration=epoch * len(train_loader) + i + 1)
            writer.add_scalar('Loss/train', loss, global_step=iteration)

            if iteration % iters_per_checkpoint == 0:
                y_choice = y_pred[0].detach().cpu().transpose(0, 1)
                y_prob = F.softmax(y_choice, dim=1)
                y_prob_collapsed = torch.multinomial(y_prob,
                                                     num_samples=1).squeeze(1)
                y_pred_audio = mu_law_decode_numpy(y_prob_collapsed.numpy(),
                                                   model.n_out_channels)

                import torchaudio
                y_audio = mu_law_decode_numpy(y.numpy(), model.n_out_channels)
                torchaudio.save("test_in.wav", torch.tensor(y_audio), 16000)
                torchaudio.save("test_out.wav", torch.tensor(y_pred_audio), 16000)
                writer.add_audio('Audio', y_pred_audio,
                                 global_step=iteration,
                                 sample_rate=data_config['sampling_rate'])

                checkpoint_path = "{}/wavenet_{}".format(output_directory,
                                                         iteration)
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                checkpoint_path)
            writer.flush()

            iteration += 1
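# load_checkpoint / save_checkpoint are called by the train() function directly
# above but are not included in this excerpt (the earlier train() variant also
# passes a scheduler and EMA object). A minimal sketch consistent with how they
# are called here; the fields stored in the checkpoint dict are an assumption:
import os
import torch

def load_checkpoint(checkpoint_path, model, optimizer):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    model.load_state_dict(checkpoint_dict['model'].state_dict())
    print("Loaded checkpoint '{}' (iteration {})".format(checkpoint_path,
                                                         iteration))
    return model, optimizer, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    torch.save({'model': model,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)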