ljspeech_train, train_batch_fn, batch_size, sampler=RandomSampler(ljspeech_train)) # only batch=1 for validation is enabled valid_cargo = DataCargo( ljspeech_valid, valid_batch_fn, batch_size=1, sampler=SequentialSampler(ljspeech_valid)) # conditioner(upsampling net) conditioner_config = config["conditioner"] upsampling_factors = conditioner_config["upsampling_factors"] upsample_net = UpsampleNet(upscale_factors=upsampling_factors) freeze(upsample_net) residual_channels = teacher_config["residual_channels"] loss_type = teacher_config["loss_type"] output_dim = teacher_config["output_dim"] log_scale_min = teacher_config["log_scale_min"] assert loss_type == "mog" and output_dim == 3, \ "the teacher wavenet should be a wavenet with single gaussian output" teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels, filter_size, loss_type, log_scale_min) # load & freeze upsample_net & teacher freeze(teacher) student_config = config["student"]
valid_batch_fn, batch_size=1, sampler=SequentialSampler(ljspeech_valid)) if not os.path.exists(args.output): os.makedirs(args.output) if args.device == -1: place = fluid.CPUPlace() else: place = fluid.CUDAPlace(args.device) with dg.guard(place): model_config = config["model"] upsampling_factors = model_config["upsampling_factors"] encoder = UpsampleNet(upsampling_factors) n_loop = model_config["n_loop"] n_layer = model_config["n_layer"] residual_channels = model_config["residual_channels"] output_dim = model_config["output_dim"] loss_type = model_config["loss_type"] log_scale_min = model_config["log_scale_min"] decoder = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels, filter_size, loss_type, log_scale_min) model = ConditionalWavenet(encoder, decoder) summary(model) # load model parameters checkpoint_dir = os.path.join(args.output, "checkpoints")
def synthesis_with_clarinet(config_path, checkpoint, mel_spectrogram, place):
    """Synthesize a waveform from a mel spectrogram with a ClariNet model.

    Builds the ClariNet (teacher WaveNet + student ParallelWaveNet +
    upsampling conditioner + STFT loss module) from the YAML config at
    ``config_path``, loads parameters from ``checkpoint_path``, rescales the
    input mel spectrogram to the model's expected normalized range, and runs
    synthesis.

    Args:
        config_path: Path to the YAML experiment config (keys: "data",
            "teacher", "conditioner", "student", "stft", "loss").
        checkpoint: Path to the trained model checkpoint to load.
        mel_spectrogram: Mel spectrogram in linear amplitude scale.
            NOTE(review): indexing/transpose below assumes shape
            (batch, n_mels, time) — confirm against callers.
        place: fluid place (CPUPlace/CUDAPlace) to run the model on.

    Returns:
        np.ndarray: the synthesized waveform for the first batch item.
    """
    with open(config_path, 'rt') as f:
        config = yaml.safe_load(f)

    data_config = config["data"]
    n_mels = data_config["n_mels"]

    teacher_config = config["teacher"]
    n_loop = teacher_config["n_loop"]
    n_layer = teacher_config["n_layer"]
    filter_size = teacher_config["filter_size"]

    # only batch=1 for validation is enabled
    with dg.guard(place):
        # conditioner (upsampling net)
        conditioner_config = config["conditioner"]
        upsampling_factors = conditioner_config["upsampling_factors"]
        upsample_net = UpsampleNet(upscale_factors=upsampling_factors)
        freeze(upsample_net)

        # teacher WaveNet — frozen; only the student is ever trained
        residual_channels = teacher_config["residual_channels"]
        loss_type = teacher_config["loss_type"]
        output_dim = teacher_config["output_dim"]
        log_scale_min = teacher_config["log_scale_min"]
        assert loss_type == "mog" and output_dim == 3, \
            "the teacher wavenet should be a wavenet with single gaussian output"
        teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim,
                          n_mels, filter_size, loss_type, log_scale_min)
        # load & freeze upsample_net & teacher
        freeze(teacher)

        # student ParallelWaveNet (the actual vocoder used at synthesis time)
        student_config = config["student"]
        n_loops = student_config["n_loops"]
        n_layers = student_config["n_layers"]
        student_residual_channels = student_config["residual_channels"]
        student_filter_size = student_config["filter_size"]
        student_log_scale_min = student_config["log_scale_min"]
        student = ParallelWaveNet(n_loops, n_layers, student_residual_channels,
                                  n_mels, student_filter_size)

        # STFT module used by the spectrogram loss term
        stft_config = config["stft"]
        stft = STFT(
            n_fft=stft_config["n_fft"],
            hop_length=stft_config["hop_length"],
            win_length=stft_config["win_length"])

        lmd = config["loss"]["lmd"]
        model = Clarinet(upsample_net, teacher, student, stft,
                         student_log_scale_min, lmd)
        io.load_parameters(model=model, checkpoint_path=checkpoint)

        # NOTE(review): `args` is a module-level global, not a parameter of
        # this function — consider passing the output dir explicitly.
        # exist_ok=True avoids the exists()/makedirs() race of the old check.
        os.makedirs(args.output, exist_ok=True)
        model.eval()

        # Rescale mel_spectrogram: linear amplitude -> dB (floored at
        # min_level), shift by ref_level, then clip into [0, 1] as the
        # model expects normalized log-mel input.
        min_level, ref_level = 1e-5, 20  # hard code it
        mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
        mel_spectrogram = mel_spectrogram - ref_level
        mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

        # (batch, n_mels, time) -> (batch, time, n_mels) for the model
        mel_spectrogram = dg.to_variable(mel_spectrogram)
        mel_spectrogram = fluid.layers.transpose(mel_spectrogram, [0, 2, 1])

        wav_var = model.synthesis(mel_spectrogram)
        wav_np = wav_var.numpy()[0]
        return wav_np