def load_tts(self, model_path, model_file, model_config, use_cuda):
    """Load the TTS config, audio processor and model weights onto this object.

    Sets ``model_file``, ``tts_config``, ``use_phonemes``, ``ap``,
    ``input_size``, ``input_adapter`` and ``tts_model`` on ``self``.

    Args:
        model_path: directory that contains the config and checkpoint files.
        model_file: checkpoint file name, joined onto ``model_path``.
        model_config: config file name, joined onto ``model_path``.
        use_cuda: when truthy the checkpoint is loaded with torch's default
            device mapping and the model is moved with ``.cuda()``; otherwise
            the storages are kept as deserialized.
    """
    config_path = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)

    print(" > Loading TTS model ...")
    print(" | > model config: ", config_path)
    # The bare file name is printed here, not the joined path.
    print(" | > model file: ", model_file)

    self.tts_config = load_config(config_path)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)

    # Both adapters read the config through ``self`` at call time, so they
    # follow any later change to ``self.tts_config``.
    if self.use_phonemes:
        self.input_size = len(phonemes)

        def _to_sequence(sen):
            return phoneme_to_sequence(sen,
                                       [self.tts_config.text_cleaner],
                                       self.tts_config.phoneme_language,
                                       self.tts_config.enable_eos_bos_chars)
    else:
        self.input_size = len(symbols)

        def _to_sequence(sen):
            return text_to_sequence(sen, [self.tts_config.text_cleaner])

    self.input_adapter = _to_sequence
    self.tts_model = setup_model(self.input_size, self.tts_config)

    # Load the checkpoint; without CUDA every storage is returned unchanged
    # by the mapping function instead of being moved to its saved device.
    load_kwargs = {}
    if not use_cuda:
        load_kwargs['map_location'] = lambda storage, loc: storage
    checkpoint = torch.load(self.model_file, **load_kwargs)

    # Restore the weights, then place the model and switch to eval mode.
    self.tts_model.load_state_dict(checkpoint['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
def load_tts(self, model_path, model_file, model_config, use_cuda):
    """Load the (optionally multi-speaker) TTS model and its config.

    Sets ``model_file``, ``tts_config``, ``use_phonemes``, ``ap``,
    ``input_size`` and ``tts_model`` on ``self``; ``tts_speakers`` is set
    only when ``self.config.tts_speakers`` is not ``None``.

    Args:
        model_path: directory that contains the config, checkpoint and
            (optional) speaker-mapping files.
        model_file: checkpoint file name, joined onto ``model_path``.
        model_config: config file name, joined onto ``model_path``.
        use_cuda: when truthy the checkpoint is loaded with torch's default
            device mapping and the model is moved with ``.cuda()``; otherwise
            the checkpoint storages are remapped so they stay on CPU.
    """
    tts_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > model file: ", model_file)
    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    # load speakers
    # NOTE(review): this reads ``self.config`` while everything else reads
    # ``self.tts_config`` - confirm that the two objects are intentional.
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(
            os.path.join(model_path, self.config.tts_speakers))
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0
    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)

    # load model state; without a GPU the storages have to be remapped,
    # otherwise a checkpoint saved from CUDA tensors cannot be deserialized
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)

    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    # Restore the reduction factor stored in the checkpoint, when present,
    # for the Tacotron-family models only.
    if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
        self.tts_model.decoder.set_r(cp['r'])
def main(args):
    """Build model, optimizers and losses, optionally restore, then train.

    Relies on module-level names defined outside this function: ``c``
    (config), ``ap`` (audio processor), ``num_gpus``, ``use_cuda`` and
    ``OUT_PATH``.

    Args:
        args: parsed command-line namespace. This function reads ``rank``,
            ``group_id``, ``use_half``, ``restore_path`` and ``reset_lr``,
            and writes ``restore_step``.
    """
    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)
    model = setup_model(num_chars, c, args.use_half)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)
    if args.use_half:
        print(' | > Use half mode')

    # A larger epsilon is used for Adam when running in half mode.
    optimizer_eps = 1e-08 if not args.use_half else 1e-04
    optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0,
                           eps=optimizer_eps)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = optim.Adam(model.decoder.stopnet.parameters(),
                                  lr=c.lr, weight_decay=0, eps=optimizer_eps)
    else:
        optimizer_st = None

    if c.loss_masking:
        criterion = L1LossMasked() if c.model == "Tacotron" else MSELossMasked()
    else:
        criterion = nn.L1Loss() if c.model == "Tacotron" else nn.MSELoss()
    criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None

    if args.restore_path:
        # Without CUDA the storages are remapped to CPU so that a checkpoint
        # saved from GPU tensors can still be deserialized.
        checkpoint = torch.load(args.restore_path,
                                map_location=None if use_cuda else 'cpu')
        try:
            # Requested layer re-initialization is routed through the
            # partial-initialization path below.
            if len(c.reinit_layers) > 0:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:
            # Deliberate best-effort fallback. ``Exception`` (rather than a
            # bare ``except``) keeps KeyboardInterrupt/SystemExit propagating.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    # use half mode; BatchNorm1d layers are converted back to float
    if args.use_half:
        model.half()
        for layer in model.modules():
            if isinstance(layer, torch.nn.BatchNorm1d):
                layer.float()

    if use_cuda:
        model = model.cuda()
        criterion.cuda()
        if criterion_st:
            criterion_st.cuda()

    # The optimizer state is restored only here because model.cuda() needs
    # to be called before the optimizer restore.
    # NOTE(review): nesting reconstructed from flattened source - confirm
    # this restore is meant to run regardless of ``use_cuda``.
    if args.restore_path:
        optimizer.load_state_dict(checkpoint['optimizer'])

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    # reset lr
    if args.reset_lr:
        for group in optimizer.param_groups:
            group['initial_lr'] = c.lr

    if c.lr_decay:
        scheduler = NoamLR(
            optimizer,
            warmup_steps=c.warmup_steps,
            last_epoch=args.restore_step - 1,
            use_half=args.use_half,
        )
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # Nothing assigns ``best_loss`` earlier in this function, so it always
    # starts at infinity.
    best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         optimizer, optimizer_st, scheduler,
                                         ap, epoch, args.use_half)
        if c.run_eval:
            val_loss = evaluate(model, criterion, criterion_st, ap,
                                current_step, epoch, args.use_half)
            print(
                " | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
                    train_loss, val_loss),
                flush=True)
            target_loss = val_loss
        else:
            print(" | > Training Loss: {:.5f}".format(train_loss),
                  flush=True)
            target_loss = train_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
def main(args):  #pylint: disable=redefined-outer-name
    """Set up a (multi-speaker, optional text-GST) model and run training.

    Relies on module-level names defined outside this function: ``c``
    (config), ``num_gpus``, ``use_cuda`` and ``OUT_PATH``.

    Args:
        args: parsed command-line namespace. This function reads ``rank``,
            ``group_id`` and ``restore_path``, and writes ``restore_step``.
    """
    # Audio processor
    ap = AudioProcessor(**c.audio)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    if c.use_speaker_embedding:
        speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
        if args.restore_path:
            # Reuse the mapping stored next to the restored checkpoint.
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all(speaker in speaker_mapping
                       for speaker in speakers), "As of now you, you cannot " \
                                                 "introduce new speakers to " \
                                                 "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = Ranger(model.parameters(), lr=c.lr, weight_decay=c.wd)
    optimizer_gst = Ranger(model.textgst.parameters(), lr=c.lr,
                           weight_decay=c.wd) if c.text_gst else None
    if c.stopnet and c.separate_stopnet:
        optimizer_st = Ranger(model.decoder.stopnet.parameters(), lr=c.lr)
    else:
        optimizer_st = None

    if c.loss_masking:
        criterion = L1LossMasked() if c.model in [
            "Tacotron", "TacotronGST"
        ] else MSELossMasked()
    else:
        criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"
                                               ] else nn.MSELoss()
    criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None
    criterion_gst = nn.L1Loss() if c.text_gst else None

    if args.restore_path:
        # Without CUDA the storages are remapped to CPU so that a checkpoint
        # saved from GPU tensors can still be deserialized.
        checkpoint = torch.load(args.restore_path,
                                map_location=None if use_cuda else 'cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore; the optimizer state is not restored here.
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:
            # Deliberate best-effort fallback. ``Exception`` (rather than a
            # bare ``except``) keeps KeyboardInterrupt/SystemExit propagating.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(
            " > Model restored from step %d" % checkpoint['step'], flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()
        if criterion_st:
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(
            optimizer,
            warmup_steps=c.warmup_steps,
            last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # Nothing assigns ``best_loss`` earlier in this function, so it always
    # starts at infinity.
    best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
        print(" > Number of outputs per iteration:", model.decoder.r)

        train_loss, global_step = train(model, criterion, criterion_st,
                                        optimizer, optimizer_st, scheduler,
                                        ap, global_step, epoch,
                                        criterion_gst=criterion_gst,
                                        optimizer_gst=optimizer_gst)
        # NOTE(review): nesting reconstructed from flattened source. Only the
        # evaluate() call is taken to sit under this condition, which means
        # the report and best-model check below reuse the most recent
        # ``val_loss`` on the other epochs - confirm against the original.
        if epoch % 5 == 0:
            val_loss = evaluate(model, criterion, criterion_st, criterion_gst,
                                ap, global_step, epoch)
        print(
            " | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
                train_loss, val_loss),
            flush=True)
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, optimizer_st,
                                    optimizer_gst, target_loss, best_loss,
                                    OUT_PATH, global_step, epoch)
try: path = os.path.realpath(os.path.dirname(__file__)) except NameError as e: path = './' C = load_config(os.path.join(path, 'pretrained_models/TTS/config.json')) C.forward_attn_mask = False C.windowing = True # load the audio processor ap = AudioProcessor(**C.audio) num_speakers = 0 # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) model = setup_model(num_chars, num_speakers, C) cp = torch.load(os.path.join(path, 'pretrained_models/TTS/best_model.pth.tar'), map_location='cpu') model.load_state_dict(cp['model'], strict=False) model.r = cp['r'] model.decoder.r = cp['r'] model.eval() if use_cuda: model.cuda() VC = load_config( os.path.join(path, 'pretrained_models/WaveRNN/config.json')) bits = 10 vocoder_model = VocoderModel( rnn_dims=512,
# load the audio processor ap = AudioProcessor(**C.audio) # load speakers if args.speakers_json != '': speakers = json.load(open(args.speakers_json, 'r')) num_speakers = len(speakers) else: num_speakers = 0 # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) #- remove num_speaker # model = setup_model(num_chars, num_speakers, C) model = setup_model(num_chars, C) cp = torch.load(args.model_path) model.load_state_dict(cp['model']) model.eval() if args.use_cuda: model.cuda() # load vocoder model if args.vocoder_path != "": VC = load_config(args.vocoder_config_path) bits = 10 vocoder_model = VocoderModel(rnn_dims=512, fc_dims=512, mode=VC.mode, mulaw=VC.mulaw, pad=VC.pad,
def _create_random_model(self):
    """Build a model from the dummy test config and save it as a checkpoint.

    The config is read from ``dummy_model_config.json`` inside the tests
    output directory, and the checkpoint is written to that same directory.
    """
    config_file = os.path.join(get_tests_output_path(),
                               'dummy_model_config.json')
    dummy_config = load_config(config_file)

    # The input vocabulary depends on whether the config uses phonemes.
    if dummy_config.use_phonemes:
        vocab_size = len(phonemes)
    else:
        vocab_size = len(symbols)

    # Second positional argument is 0, matching the other three-argument
    # setup_model calls in this file where that slot is ``num_speakers``.
    random_model = setup_model(vocab_size, 0, dummy_config)

    checkpoint_dir = os.path.join(get_tests_output_path())
    # NOTE(review): the three ``None`` slots and the two ``10`` values are
    # passed positionally; presumably optimizer slots and step/epoch -
    # confirm against save_checkpoint's signature.
    save_checkpoint(random_model, None, None, None, checkpoint_dir, 10, 10)