def from_pretrained(cls, config, checkpoint_path):
    """Construct a ConditionalWaveNet from a training config and a
    saved checkpoint.

    Parameters
    ----------
    config: yacs.config.CfgNode
        model configs

    checkpoint_path: Path or str
        the path of pretrained model checkpoint, without extension name

    Returns
    -------
    ConditionalWaveNet
        The model built from pretrained result.
    """
    # Hoist the model sub-config once; every hyperparameter below comes
    # from it except n_mels, which lives under the data section.
    model_cfg = config.model
    model = cls(
        upsample_factors=model_cfg.upsample_factors,
        n_stack=model_cfg.n_stack,
        n_loop=model_cfg.n_loop,
        residual_channels=model_cfg.residual_channels,
        output_dim=model_cfg.output_dim,
        n_mels=config.data.n_mels,
        filter_size=model_cfg.filter_size,
        loss_type=model_cfg.loss_type,
        log_scale_min=model_cfg.log_scale_min)
    # Print a parameter summary, then restore weights from the checkpoint.
    layer_tools.summary(model)
    checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
    return model
with dg.guard(place):
    # Build the conditioning encoder: upsamples mel features to the
    # audio sample rate using the configured per-stage factors.
    model_config = config["model"]
    upsampling_factors = model_config["upsampling_factors"]
    encoder = UpsampleNet(upsampling_factors)

    # WaveNet decoder hyperparameters, all taken from the model config.
    n_loop = model_config["n_loop"]
    n_layer = model_config["n_layer"]
    residual_channels = model_config["residual_channels"]
    output_dim = model_config["output_dim"]
    loss_type = model_config["loss_type"]
    log_scale_min = model_config["log_scale_min"]
    # NOTE(review): n_mels and filter_size come from an enclosing scope
    # not visible in this chunk — presumably read from the data/model
    # config earlier; confirm against the full script.
    decoder = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
                      filter_size, loss_type, log_scale_min)
    model = ConditionalWavenet(encoder, decoder)
    summary(model)

    # load model parameters
    checkpoint_dir = os.path.join(args.output, "checkpoints")
    if args.checkpoint:
        # An explicit checkpoint path takes precedence.
        iteration = io.load_parameters(model, checkpoint_path=args.checkpoint)
    else:
        # Otherwise resolve the checkpoint inside the run's output dir,
        # optionally pinned to a specific iteration.
        iteration = io.load_parameters(
            model, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
    # iteration == 0 would mean nothing was restored (fresh weights),
    # which is useless for synthesis/inference.
    assert iteration > 0, "A trained model is needed."

    # WARNING: don't forget to remove weight norm to re-compute each wrapped layer's weight
    # removing weight norm also speeds up computation
    # NOTE(review): loop body continues beyond this chunk boundary.
    for layer in model.sublayers():
# NOTE(review): this chunk starts mid-scope — model_config, sample_rate,
# linear_dim and the many make_model() arguments below are defined
# earlier in the full script; confirm there.
# Attention/decoder behavior knobs for the Deep Voice 3 model.
use_memory_mask = model_config["use_memory_mask"]
query_position_rate = model_config["query_position_rate"]
key_position_rate = model_config["key_position_rate"]
window_backward = model_config["window_backward"]
window_ahead = model_config["window_ahead"]
key_projection = model_config["key_projection"]
value_projection = model_config["value_projection"]

# Assemble the full Deep Voice 3 network from the collected
# hyperparameters (speaker embedding, text embedding, encoder/decoder
# channels, attention windowing, and the converter/postnet settings).
dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
                 padding_idx, embedding_std, max_positions, n_vocab,
                 freeze_embedding, filter_size, encoder_channels, n_mels,
                 decoder_channels, r, trainable_positional_encodings,
                 use_memory_mask, query_position_rate, key_position_rate,
                 window_backward, window_ahead, key_projection,
                 value_projection, downsample_factor, linear_dim,
                 use_decoder_states, converter_channels, dropout)
summary(dv3)

# =========================loss=========================
loss_config = config["loss"]
masked_weight = loss_config["masked_loss_weight"]
priority_freq = loss_config["priority_freq"]  # Hz
# Convert the priority frequency (Hz) into a linear-spectrogram bin
# index: fraction of Nyquist (0.5 * sample_rate) scaled by linear_dim.
priority_bin = int(priority_freq / (0.5 * sample_rate) * linear_dim)
priority_freq_weight = loss_config["priority_freq_weight"]
binary_divergence_weight = loss_config["binary_divergence_weight"]
guided_attention_sigma = loss_config["guided_attention_sigma"]
# NOTE(review): the TTSLoss(...) call continues past this chunk boundary.
criterion = TTSLoss(
    masked_weight=masked_weight,
    priority_bin=priority_bin,
    priority_weight=priority_freq_weight,
    binary_divergence_weight=binary_divergence_weight,
    guided_attention_sigma=guided_attention_sigma,