def setup(self):
    # load configs
    self.TTS_CONFIG = load_config(self.TTS_CONFIG)
    self.VOCODER_CONFIG = load_config(self.VOCODER_CONFIG)

    # load the audio processor
    self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

    # load the model
    num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
    self.model = setup_model(num_chars, len(self.speakers), self.TTS_CONFIG)
    self.model, _ = load_checkpoint(self.model, self.TTS_MODEL,
                                    use_cuda=self.use_cuda)
    self.model.eval()

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
    self.vocoder_model, _ = load_vocoder_checkpoint(
        self.vocoder_model, checkpoint_path=self.VOCODER_MODEL)
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0

    self.ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
    if self.use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()
def load_tacotron2(use_cuda):
    """Loads the Tacotron2 model.

    Parameters
    ----------
    use_cuda : bool
        whether to use the gpu

    Returns
    -------
    model, audio processor, model config
    """
    TTS_MODEL = model_path / 'model.pth.tar'
    TTS_CONFIG = model_path / 'config.json'

    TTS_CONFIG = load_config(TTS_CONFIG)
    TTS_CONFIG.audio['stats_path'] = str(model_path / 'scale_stats.npy')

    ap = AudioProcessor(**TTS_CONFIG.audio)

    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, TTS_CONFIG)

    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    if 'r' in cp:
        model.decoder.set_r(cp['r'])
    model.eval()
    return model, ap, TTS_CONFIG
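# Usage sketch (added for illustration, not part of the original snippet):
# assumes the module-level `model_path` above points at a directory holding
# model.pth.tar, config.json and scale_stats.npy.
model, ap, config = load_tacotron2(use_cuda=False)
print(type(model).__name__, "| sample rate:", config.audio['sample_rate'])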
def _create_random_model(self):
    # pylint: disable=global-statement
    global symbols, phonemes
    config = load_config(
        os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
    if 'characters' in config.keys():
        symbols, phonemes = make_symbols(**config.characters)

    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, config)
    output_path = os.path.join(get_tests_output_path())
    save_checkpoint(model, None, 10, 10, 1, output_path)
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data, symbols, phonemes, model_characters, speaker_mapping
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if "characters" in c.keys() and c["characters"]:
        symbols, phonemes = make_symbols(**c.characters)

    # set model characters
    model_characters = phonemes if c.use_phonemes else symbols
    num_chars = len(model_characters)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # use eval and training partitions
    meta_data = meta_data_train + meta_data_eval

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
        c, args, meta_data_train, None)

    # setup model
    model = setup_model(num_chars, num_speakers, c,
                        speaker_embedding_dim=speaker_embedding_dim)

    # restore model
    checkpoint = torch.load(args.checkpoint_path, map_location="cpu")
    model.load_state_dict(checkpoint["model"])

    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # set r
    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r

    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantized_wav=args.quantized,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
def test_Tacotron():
    # set paths
    config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
    checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
    output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")

    # load config
    c = load_config(config_path)

    # create model
    num_chars = len(phonemes if c.use_phonemes else symbols)
    model = setup_model(num_chars, 1, c, speaker_embedding_dim=None)

    # save model
    torch.save({"model": model.state_dict()}, checkpoint_path)

    # run test
    run_cli(
        f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"'
    )
    run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"')
def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
    """Load the TTS model.

    Args:
        tts_checkpoint (str): path to the model checkpoint.
        tts_config_path (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.
    """
    # pylint: disable=global-statement
    global symbols, phonemes

    self.tts_config = load_config(tts_config_path)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

    if "characters" in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    if self.tts_config.use_speaker_embedding is True:
        self.tts_speakers_file = (
            self.tts_speakers_file if self.tts_speakers_file
            else self.tts_config["external_speaker_embedding_file"])
        self._load_speakers(self.tts_speakers_file)

    self.tts_model = setup_model(
        self.input_size,
        num_speakers=self.num_speakers,
        c=self.tts_config,
        speaker_embedding_dim=self.speaker_embedding_dim,
    )
    self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
    if use_cuda:
        self.tts_model.cuda()
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    # pylint: disable=global-statement
    global symbols, phonemes

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)

    if 'characters' in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    self.tts_model = setup_model(self.input_size,
                                 num_speakers=self.num_speakers,
                                 c=self.tts_config)
    self.tts_model.load_checkpoint(tts_config, tts_checkpoint, eval=True)
    if use_cuda:
        self.tts_model.cuda()
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    # pylint: disable=global-statement
    global symbols, phonemes

    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > checkpoint file: ", tts_checkpoint)

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)

    if 'characters' in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    # TODO: fix this for multi-speaker model - load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0

    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)
    # load model state
    cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp:
        self.tts_model.decoder.set_r(cp['r'])
        print(f" > model reduction factor: {cp['r']}")
def __init__(self):
    # load the audio processor
    self.audio_processor = AudioProcessor(**TTS_CONFIG.audio)

    # LOAD TTS MODEL
    # multi speaker
    speakers = []
    speaker_id = None

    # load the model
    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    self.model = setup_model(num_chars, len(speakers), TTS_CONFIG)

    # load model state
    self.model, _ = load_checkpoint(self.model, TTS_MODEL, use_cuda=USE_CUDA)
    self.model.eval()
    self.model.store_inverse()

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(VOCODER_CONFIG)
    self.vocoder_model.load_state_dict(
        torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0

    # scale factor for sampling rate difference
    self.scale_factor = [
        1,
        VOCODER_CONFIG['audio']['sample_rate'] / self.audio_processor.sample_rate
    ]
    print(f"scale_factor: {self.scale_factor}")

    self.ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])

    if USE_CUDA:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()
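# Sketch (added; an assumption, not part of the original class): a
# [1, rate_ratio] scale factor like self.scale_factor above is typically
# applied with bilinear interpolation to stretch the mel's time axis to the
# vocoder's frame rate. `mel` is assumed to be a [B, n_mels, T] torch tensor.
def resample_mel(mel, scale_factor):
    import torch.nn.functional as F
    # interpolate over the last two dims of a 4D [B, 1, n_mels, T] tensor:
    # factor 1 keeps n_mels, rate_ratio stretches T
    return F.interpolate(mel.unsqueeze(1), scale_factor=scale_factor,
                         mode='bilinear', align_corners=False).squeeze(1)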
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping
    # Audio processor
    ap = AudioProcessor(**config.audio.to_dict())
    if config.has("characters") and config.characters:
        symbols, phonemes = make_symbols(**config.characters.to_dict())

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         config.distributed["backend"], config.distributed["url"])

    # set model characters
    model_characters = phonemes if config.use_phonemes else symbols
    num_chars = len(model_characters)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True)

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
        config, args, meta_data_train, OUT_PATH)

    # setup model
    model = setup_model(num_chars, num_speakers, config,
                        speaker_embedding_dim=speaker_embedding_dim)
    optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0,
                      betas=(0.9, 0.98), eps=1e-9)
    criterion = AlignTTSLoss(config)

    if args.restore_path:
        print(f" > Restoring from {os.path.basename(args.restore_path)} ...")
        checkpoint = torch.load(args.restore_path, map_location="cpu")
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            optimizer.load_state_dict(checkpoint["optimizer"])
            if config.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint["model"])
        except:  # pylint: disable=bare-except
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model"], config)
            model.load_state_dict(model_dict)
            del model_dict

        for group in optimizer.param_groups:
            group["initial_lr"] = config.lr
        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = DDP_th(model, device_ids=[args.rank])

    if config.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if args.restore_step == 0 or not args.best_path:
        best_loss = float("inf")
        print(" > Starting with inf best loss.")
    else:
        print(" > Restoring best loss from "
              f"{os.path.basename(args.best_path)} ...")
        best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"]
        print(f" > Starting with loaded last best loss {best_loss}.")
    keep_all_best = config.keep_all_best
    keep_after = config.keep_after  # void if keep_all_best False

    # define dataloaders
    train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
    eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)

    global_step = args.restore_step

    def set_phase():
        """Set AlignTTS training phase"""
        if isinstance(config.phase_start_steps, list):
            vals = [i < global_step for i in config.phase_start_steps]
            if True not in vals:
                phase = 0
            else:
                phase = (
                    len(config.phase_start_steps)
                    - [i < global_step for i in config.phase_start_steps][::-1].index(True)
                    - 1)
        else:
            phase = None
        return phase

    for epoch in range(0, config.epochs):
        cur_phase = set_phase()
        print(f"\n > Current AlignTTS phase: {cur_phase}")
        c_logger.print_epoch_start(epoch, config.epochs)
        train_avg_loss_dict, global_step = train(train_loader, model, criterion,
                                                 optimizer, scheduler, ap,
                                                 global_step, epoch, cur_phase)
        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
                                      global_step, epoch, cur_phase)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict["avg_loss"]
        if config.run_eval:
            target_loss = eval_avg_loss_dict["avg_loss"]
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model,
            optimizer,
            global_step,
            epoch,
            1,
            OUT_PATH,
            model_characters,
            keep_all_best=keep_all_best,
            keep_after=keep_after,
        )
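# Worked example (added): with config.phase_start_steps = [0, 10000, 20000]
# and global_step = 15000, vals evaluates to [True, True, False]; the last
# True sits at index 1 of the list, so set_phase() returns 3 - 1 - 1 = 1,
# i.e. the highest phase whose start step has already been passed.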
ap = AudioProcessor(**C.audio)

# if the vocabulary was passed, replace the default
if 'characters' in C.keys():
    symbols, phonemes = make_symbols(**C.characters)

# load speakers
if args.speakers_json != '':
    speakers = json.load(open(args.speakers_json, 'r'))
    num_speakers = len(speakers)
else:
    num_speakers = 0

# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C)
cp = torch.load(args.model_path, map_location=torch.device('cpu'))
model.load_state_dict(cp['model'])
model.eval()

if args.use_cuda:
    model.cuda()

model.decoder.set_r(cp['r'])

# load vocoder model
if args.vocoder_path != "":
    VC = load_config(args.vocoder_config_path)
    vocoder_model = setup_generator(VC)
    vocoder_model.load_state_dict(
        torch.load(args.vocoder_path, map_location="cpu")["model"])
    vocoder_model.remove_weight_norm()
    if args.use_cuda:
        vocoder_model.cuda()
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
        c, args, meta_data_train, OUT_PATH)

    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)

    # scalers for mixed precision training
    scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
    scaler_st = torch.cuda.amp.GradScaler() if c.mixed_precision and c.separate_stopnet else None

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr, weight_decay=0)
    else:
        optimizer_st = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            print(" > Restoring Model.")
            model.load_state_dict(checkpoint['model'])
            # optimizer restore
            print(" > Restoring Optimizer.")
            optimizer.load_state_dict(checkpoint['optimizer'])
            if "scaler" in checkpoint and c.mixed_precision:
                print(" > Restoring AMP Scaler...")
                scaler.load_state_dict(checkpoint["scaler"])
            if c.reinit_layers:
                raise RuntimeError
        except (KeyError, RuntimeError):
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt'))
            # print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt'))
            model.load_state_dict(model_dict)
            del model_dict

        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'], flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch, scaler,
                                                 scaler_st, speaker_mapping)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch,
                                      speaker_mapping)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model,
            optimizer,
            global_step,
            epoch,
            c.r,
            OUT_PATH,
            scaler=scaler.state_dict() if c.mixed_precision else None)
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            if c.use_external_speaker_embedding_file:
                # if restoring a checkpoint and using an external embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                if not speaker_mapping:
                    print("WARNING: speakers.json was not found in restore_path, "
                          "trying to use CONFIG.external_speaker_embedding_file")
                    speaker_mapping = load_speaker_mapping(
                        c.external_speaker_embedding_file)
                    if not speaker_mapping:
                        raise RuntimeError(
                            "You must copy the file speakers.json to restore_path, "
                            "or set a valid file in CONFIG.external_speaker_embedding_file")
                speaker_embedding_dim = len(speaker_mapping[list(
                    speaker_mapping.keys())[0]]['embedding'])
            elif not c.use_external_speaker_embedding_file:
                # if restoring a checkpoint without an external embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                speaker_embedding_dim = None
                assert all(speaker in speaker_mapping for speaker in speakers), \
                    "As of now, you cannot introduce new speakers to a previously trained model."
        elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file:
            # if starting a new training run with an external embedding file
            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
            speaker_embedding_dim = len(speaker_mapping[list(
                speaker_mapping.keys())[0]]['embedding'])
        elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file:
            # if starting a new training run without passing an external embedding file
            raise RuntimeError(
                "use_external_speaker_embedding_file is True, so you need to pass an "
                "external speaker embedding file; run the "
                "GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or "
                "AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb "
                "notebook in the notebooks/ folder")
        else:
            # if starting a new training run without an external embedding file
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
            speaker_embedding_dim = None
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers, ", ".join(speakers)))
    else:
        num_speakers = 0
        speaker_embedding_dim = None
        speaker_mapping = None

    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr, weight_decay=0)
    else:
        optimizer_st = None

    if c.apex_amp_level == "O1":
        # pylint: disable=import-outside-toplevel
        from apex import amp
        model.cuda()
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=c.apex_amp_level)
    else:
        amp = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except (KeyError, RuntimeError):
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt'))
            # print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt'))
            model.load_state_dict(model_dict)
            del model_dict

        if amp and 'amp' in checkpoint:
            amp.load_state_dict(checkpoint['amp'])

        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'], flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch, amp,
                                                 speaker_mapping)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch,
                                      speaker_mapping)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model,
            optimizer,
            global_step,
            epoch,
            c.r,
            OUT_PATH,
            amp_state_dict=amp.state_dict() if amp else None)
    default=16,
    type=int,
    help='Batch size for the model. Use batch_size=1 if you have no CUDA.')
args = parser.parse_args()

C = load_config(args.config_path)
ap = AudioProcessor(**C.audio)

# if the vocabulary was passed, replace the default
if 'characters' in C.keys():
    symbols, phonemes = make_symbols(**C.characters)

# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
# TODO: handle multi-speaker
model = setup_model(num_chars, num_speakers=0, c=C)
model, _ = load_checkpoint(model, args.model_path, None, args.use_cuda)
model.eval()

# data loader
preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')
preprocessor = getattr(preprocessor, args.dataset)
meta_data = preprocessor(args.data_path, args.dataset_metafile)
dataset = MyDataset(
    model.decoder.r,
    C.text_cleaner,
    compute_linear_spec=False,
    ap=ap,
    meta_data=meta_data,
    tp=C.characters if 'characters' in C.keys() else None,
    add_blank=C['add_blank'] if 'add_blank' in C.keys() else False,
def setup(USE_CUDA):
    TEXT = ''
    OUT_PATH = 'tests-audios/'
    # create output path
    os.makedirs(OUT_PATH, exist_ok=True)

    SPEAKER_FILEID = None  # if None use the first embedding from speakers.json

    # model vars
    MODEL_PATH = 'best_model.pth.tar'
    CONFIG_PATH = 'config.json'

    # vocoder vars
    VOCODER_PATH = ''
    VOCODER_CONFIG_PATH = ''

    # load the config
    C = load_config(CONFIG_PATH)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    speaker_embedding = None
    speaker_embedding_dim = None
    num_speakers = 0

    # load speakers
    if SPEAKER_JSON != '':
        speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))
        num_speakers = len(speaker_mapping)
        if C.use_external_speaker_embedding_file:
            if SPEAKER_FILEID is not None:
                speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']
            else:
                # if speaker_fileid is not specified, use the first sample in speakers.json
                chosen_speaker = list(speaker_mapping.keys())[0]
                print(" Speaker: ", chosen_speaker.split('_')[0],
                      'was chosen automatically',
                      "(this speaker was seen in training)")
                speaker_embedding = speaker_mapping[chosen_speaker]['embedding']
            speaker_embedding_dim = len(speaker_embedding)
            print(speaker_embedding_dim)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
    cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    model.eval()

    if USE_CUDA:
        model.cuda()

    model.decoder.set_r(cp['r'])

    # load vocoder model
    if VOCODER_PATH != "":
        VC = load_config(VOCODER_CONFIG_PATH)
        vocoder_model = setup_generator(VC)
        vocoder_model.load_state_dict(
            torch.load(VOCODER_PATH, map_location="cpu")["model"])
        vocoder_model.remove_weight_norm()
        if USE_CUDA:
            vocoder_model.cuda()
        vocoder_model.eval()
    else:
        vocoder_model = None
        VC = None

    # synthesize voice
    use_griffin_lim = VOCODER_PATH == ""

    if not C.use_external_speaker_embedding_file:
        if SPEAKER_FILEID is not None and SPEAKER_FILEID.isdigit():
            SPEAKER_FILEID = int(SPEAKER_FILEID)
        else:
            SPEAKER_FILEID = None
    else:
        SPEAKER_FILEID = None

    print("Using vocoder:", vocoder_model)
    return model, vocoder_model, C, ap, SPEAKER_FILEID, speaker_embedding
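# Driver sketch (added; hypothetical): setup() reads SPEAKER_JSON as a
# module-level global and hard-codes best_model.pth.tar / config.json in the
# working directory, so a minimal call looks like:
#
#     SPEAKER_JSON = ''
#     model, vocoder_model, C, ap, speaker_fileid, speaker_embedding = \
#         setup(USE_CUDA=False)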
def __init__(self, text, expected_output_audio_format, file_name):
    # set a pysbd segmenter to be used later to divide the input into segments
    self.seg = pysbd.Segmenter(language="en", clean=True)

    # runtime settings
    use_cuda = False

    # model paths - models and config files are taken from Mozilla TTS's github page
    TTS_MODEL = "/path/to/checkpoint_130000.pth.tar"
    TTS_CONFIG = "server/config/config.json"
    VOCODER_MODEL = "/path/to/checkpoint_1450000.pth.tar"
    VOCODER_CONFIG = "server/config/config_vocoder.json"

    # load configs
    TTS_CONFIG = load_config(TTS_CONFIG)
    self.TTS_CONFIG = TTS_CONFIG  # set it as a class variable to be later used by convert_audio_to()
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    # load the audio processor
    ap = AudioProcessor(**TTS_CONFIG.audio)

    # LOAD TTS MODEL
    # multi speaker
    self.speaker_id = None
    self.speakers = []

    # use the imported symbols and phonemes
    global symbols, phonemes

    use_phonemes = TTS_CONFIG.use_phonemes

    if 'characters' in TTS_CONFIG.keys():
        symbols, phonemes = make_symbols(**TTS_CONFIG.characters)

    if use_phonemes:
        num_chars = len(phonemes)
    else:
        num_chars = len(symbols)

    # load the model
    model = setup_model(num_chars, len(self.speakers), TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 3000

    # set model stepsize
    if 'r' in cp:
        model.decoder.set_r(cp['r'])

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(VOCODER_CONFIG)
    self.vocoder_model.load_state_dict(
        torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0

    # ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
    if use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()

    # TODO: need to train a model?
    wav = self.tts(model, text, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)
    print(len(wav.tobytes()))

    # save the generated .wav file as (file_name + "_audio.wav")
    wavfile.write(file_name + "_audio.wav", TTS_CONFIG.audio["sample_rate"], wav)

    # convert the generated audio file to the specified audio format
    self.convert_audio_to(expected_output_audio_format, file_name)
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, speaker_mapping, symbols, phonemes, model_characters
    # Audio processor
    ap = AudioProcessor(**c.audio)

    # setup custom characters if set in config file.
    if "characters" in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)
    model_characters = phonemes if c.use_phonemes else symbols

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if "train_portion" in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if "eval_portion" in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
        c, args, meta_data_train, OUT_PATH)

    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)

    # scalers for mixed precision training
    scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
    scaler_st = torch.cuda.amp.GradScaler() if c.mixed_precision and c.separate_stopnet else None

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr, weight_decay=0)
    else:
        optimizer_st = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4)

    if args.restore_path:
        print(f" > Restoring from {os.path.basename(args.restore_path)}...")
        checkpoint = torch.load(args.restore_path, map_location="cpu")
        try:
            print(" > Restoring Model...")
            model.load_state_dict(checkpoint["model"])
            # optimizer restore
            print(" > Restoring Optimizer...")
            optimizer.load_state_dict(checkpoint["optimizer"])
            if "scaler" in checkpoint and c.mixed_precision:
                print(" > Restoring AMP Scaler...")
                scaler.load_state_dict(checkpoint["scaler"])
            if c.reinit_layers:
                raise RuntimeError
        except (KeyError, RuntimeError):
            print(" > Partial model initialization...")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model"], c)
            # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt'))
            # print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt'))
            model.load_state_dict(model_dict)
            del model_dict

        for group in optimizer.param_groups:
            group["lr"] = c.lr
        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if args.restore_step == 0 or not args.best_path:
        best_loss = float("inf")
        print(" > Starting with inf best loss.")
    else:
        print(" > Restoring best loss from "
              f"{os.path.basename(args.best_path)} ...")
        best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"]
        print(f" > Starting with loaded last best loss {best_loss}.")
    keep_all_best = c.get("keep_all_best", False)
    keep_after = c.get("keep_after", 10000)  # void if keep_all_best False

    # define data loaders
    train_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=True)
    eval_loader = setup_loader(ap, model.decoder.r, is_val=True)

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)

        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            train_loader.dataset.outputs_per_step = r
            eval_loader.dataset.outputs_per_step = r
            train_loader = setup_loader(ap, model.decoder.r, is_val=False,
                                        dataset=train_loader.dataset)
            eval_loader = setup_loader(ap, model.decoder.r, is_val=True,
                                       dataset=eval_loader.dataset)
            print("\n > Number of output frames:", model.decoder.r)

        # train one epoch
        train_avg_loss_dict, global_step = train(
            train_loader,
            model,
            criterion,
            optimizer,
            optimizer_st,
            scheduler,
            ap,
            global_step,
            epoch,
            scaler,
            scaler_st,
        )

        # eval one epoch
        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
                                      global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict["avg_postnet_loss"]
        if c.run_eval:
            target_loss = eval_avg_loss_dict["avg_postnet_loss"]
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model,
            optimizer,
            global_step,
            epoch,
            c.r,
            OUT_PATH,
            model_characters,
            keep_all_best=keep_all_best,
            keep_after=keep_after,
            scaler=scaler.state_dict() if c.mixed_precision else None,
        )
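# Illustration (added; the exact values are hypothetical): each
# c.gradual_training entry is consumed by gradual_training_scheduler as
# [start_step, r, batch_size], e.g.
#
#     "gradual_training": [[0, 7, 64], [10000, 5, 64], [50000, 3, 32]]
#
# so the reduction factor r shrinks (and the batch size is re-set) as
# global_step crosses each start_step threshold.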
f_date = date(2020, 9, 16)
l_date = date.today()
delta = l_date - f_date
AGE = delta.days
print(AGE)

use_cuda = False

# LOAD TTS MODEL
# multi speaker
speaker_id = None
speakers = []

# load the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)

# load model state
cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

# load the model
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# set model stepsize
if 'r' in cp:
    model.decoder.set_r(cp['r'])

# LOAD VOCODER MODEL
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all(speaker in speaker_mapping for speaker in speakers), \
                "As of now, you cannot introduce new speakers to a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers, ", ".join(speakers)))
    else:
        num_speakers = 0

    # setup model
    model = setup_model(num_chars, num_speakers, c)
    optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0,
                      betas=(0.9, 0.98), eps=1e-9)
    criterion = GlowTTSLoss()

    if c.apex_amp_level:
        # pylint: disable=import-outside-toplevel
        from apex import amp
        from apex.parallel import DistributedDataParallel as DDP
        model.cuda()
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=c.apex_amp_level)
    else:
        amp = None

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except:  # pylint: disable=bare-except
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            model.load_state_dict(model_dict)
            del model_dict

        if amp and 'amp' in checkpoint:
            amp.load_state_dict(checkpoint['amp'])

        for group in optimizer.param_groups:
            group['initial_lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'], flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = DDP(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    model = data_depended_init(model, ap)
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 scheduler, ap, global_step,
                                                 epoch, amp)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_loss']
        best_loss = save_best_model(target_loss,
                                    best_loss,
                                    model,
                                    optimizer,
                                    global_step,
                                    epoch,
                                    c.r,
                                    OUT_PATH,
                                    amp_state_dict=amp.state_dict() if amp else None)
# load speakers
if args.speakers_json != '':
    speaker_mapping = json.load(open(args.speakers_json, 'r'))
    num_speakers = len(speaker_mapping)
    if C.use_external_speaker_embedding_file:
        if args.speaker_fileid is not None:
            speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding']
        else:
            # if speaker_fileid is not specified, use the first sample in speakers.json
            speaker_embedding = speaker_mapping[list(
                speaker_mapping.keys())[0]]['embedding']
        speaker_embedding_dim = len(speaker_embedding)

# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
cp = torch.load(args.model_path, map_location=torch.device('cpu'))
model.load_state_dict(cp['model'])
model.eval()

if args.use_cuda:
    model.cuda()

if is_tacotron(C):
    model.decoder.set_r(cp['r'])

# load vocoder model
if args.vocoder_path != "":
    VC = load_config(args.vocoder_config_path)
    vocoder_model = setup_generator(VC)
    vocoder_model.load_state_dict(
        torch.load(args.vocoder_path, map_location="cpu")["model"])
    vocoder_model.remove_weight_norm()
def load(self):
    # load the config
    C = load_config(self.config_path)
    self.config = C

    # Resolve scale_stats path
    stats_path = C.audio.get("stats_path")
    if stats_path and not os.path.isfile(stats_path):
        # Look for stats next to config
        model_stats_path = os.path.join(os.path.dirname(self.config_path),
                                        "scale_stats.npy")
        if os.path.isfile(model_stats_path):
            # Patch config
            C.audio["stats_path"] = model_stats_path
        else:
            _LOGGER.warning("No scale stats found at %s", C.audio["stats_path"])
            C.audio["stats_path"] = ""

    C.forward_attn_mask = True

    if "gst" not in C.keys():
        # Patch config
        gst = {
            "gst_use_speaker_embedding": False,
            "gst_style_input": None,
            "gst_embedding_dim": 512,
            "gst_num_heads": 4,
            "gst_style_tokens": 10,
        }
        C["gst"] = gst
        setattr(C, "gst", gst)

    if "use_external_speaker_embedding_file" not in C.keys():
        C["use_external_speaker_embedding_file"] = False
        setattr(C, "use_external_speaker_embedding_file", False)

    if "gst_use_speaker_embedding" not in C.gst:
        C.gst["gst_use_speaker_embedding"] = False

    # load the audio processor
    ap = AudioProcessor(**C.audio)
    self.ap = ap

    # if the vocabulary was passed, replace the default
    if "characters" in C.keys():
        symbols, phonemes = make_symbols(**C.characters)
    else:
        from TTS.tts.utils.text.symbols import phonemes, symbols

    speaker_embedding = None
    speaker_embedding_dim = None
    num_speakers = 0

    # load speakers
    if self.speakers_json != "":
        speaker_mapping = json.load(open(self.speakers_json, "r"))
        num_speakers = len(speaker_mapping)
        if C.use_external_speaker_embedding_file:
            if self.speaker_fileid is not None:
                speaker_embedding = speaker_mapping[self.speaker_fileid]["embedding"]
            else:
                # if speaker_fileid is not specified, use the first sample in speakers.json
                speaker_embedding = speaker_mapping[list(
                    speaker_mapping.keys())[0]]["embedding"]
            speaker_embedding_dim = len(speaker_embedding)

    self.speaker_embedding = speaker_embedding

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
    cp = torch.load(self.model_path, map_location=torch.device("cpu"))
    model.load_state_dict(cp["model"])
    model.eval()

    if self.use_cuda:
        model.cuda()

    if hasattr(model.decoder, "set_r"):
        model.decoder.set_r(cp["r"])

    self.model = model

    # load vocoder model
    if self.vocoder_path:
        VC = load_config(self.vocoder_config_path)

        # Resolve scale_stats path
        stats_path = VC.audio.get("stats_path")
        if stats_path and not os.path.isfile(stats_path):
            # Look for stats next to config
            vocoder_stats_path = os.path.join(
                os.path.dirname(self.vocoder_config_path), "scale_stats.npy")
            if os.path.isfile(vocoder_stats_path):
                # Patch config
                VC.audio["stats_path"] = vocoder_stats_path
            else:
                # Try next to TTS config
                vocoder_stats_path = os.path.join(
                    os.path.dirname(self.config_path), "scale_stats.npy")
                if os.path.isfile(vocoder_stats_path):
                    # Patch config
                    VC.audio["stats_path"] = vocoder_stats_path
                else:
                    _LOGGER.warning("No vocoder scale stats found at %s",
                                    VC.audio["stats_path"])
                    VC.audio["stats_path"] = ""

        self.ap_vocoder = AudioProcessor(**VC.audio)
        vocoder_model = setup_generator(VC)
        vocoder_model.load_state_dict(
            torch.load(self.vocoder_path, map_location="cpu")["model"])
        vocoder_model.remove_weight_norm()
        vocoder_model.inference_padding = 0

        if self.use_cuda:
            vocoder_model.cuda()
        vocoder_model.eval()

        if hasattr(vocoder_model, "compute_noise_level"):
            noise_schedule_path = os.path.join(
                os.path.dirname(self.vocoder_path), "noise_schedule.npy")
            if os.path.isfile(noise_schedule_path):
                _LOGGER.debug("Loading noise schedule from %s", noise_schedule_path)
                beta = np.load(noise_schedule_path,
                               allow_pickle=True).tolist()["beta"]
            else:
                # Use the default if a noise schedule was not computed with tune_wavegrad
                _LOGGER.debug("Using default noise schedule")
                beta = np.linspace(1e-6, 0.01, self.wavegrad_iters)

            vocoder_model.compute_noise_level(beta)
    else:
        vocoder_model = None
        VC = None
        self.ap_vocoder = None

    self.vocoder_model = vocoder_model
    self.vocoder_config = VC

    # synthesize voice
    self.use_griffin_lim = self.vocoder_model is None

    if not C.use_external_speaker_embedding_file:
        if self.speaker_fileid and self.speaker_fileid.isdigit():
            self.speaker_fileid = int(self.speaker_fileid)
        else:
            self.speaker_fileid = None
    else:
        self.speaker_fileid = None

    if (self.gst_style is None) and ("gst" in C.keys()):
        gst_style = C.gst.get("gst_style_input", None)
    else:
        # check if gst_style string is a dict; if it is a dict, convert, else use the string
        try:
            gst_style = json.loads(self.gst_style)
            if max(map(int, gst_style.keys())) >= C.gst["gst_style_tokens"]:
                raise RuntimeError(
                    "The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n"
                    " Highest dictionary key value: {} \n"
                    " Number of GST tokens: {}".format(
                        max(map(int, gst_style.keys())), C.gst["gst_style_tokens"]))
        except ValueError:
            gst_style = self.gst_style

    self.gst_style = gst_style

    # Pre-load language
    if C.get("phoneme_backend") == "gruut":
        load_gruut_language(C["phoneme_language"])

    # Compute scale factors in case TTS/vocoder sample rates differ
    # See: https://github.com/mozilla/TTS/issues/520
    self.scale_factors = None
    if self.ap_vocoder and (self.ap.sample_rate != self.ap_vocoder.sample_rate):
        self.scale_factors = (1, self.ap_vocoder.sample_rate / self.ap.sample_rate)
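# Usage sketch (added; the wrapper class name and constructor are
# hypothetical -- load() only requires that these attributes exist):
#
#     synth = Synthesizer(config_path="model/config.json",
#                         model_path="model/best_model.pth.tar",
#                         vocoder_path="",   # empty -> Griffin-Lim fallback
#                         speakers_json="", speaker_fileid=None,
#                         gst_style=None, use_cuda=False)
#     synth.load()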
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])

    # set model characters
    model_characters = phonemes if c.use_phonemes else symbols
    num_chars = len(model_characters)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
        c, args, meta_data_train, OUT_PATH)

    # setup model
    model = setup_model(num_chars, num_speakers, c,
                        speaker_embedding_dim=speaker_embedding_dim)
    optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0,
                      betas=(0.9, 0.98), eps=1e-9)
    criterion = GlowTTSLoss()

    if args.restore_path:
        print(f" > Restoring from {os.path.basename(args.restore_path)} ...")
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except:  # pylint: disable=bare-except
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            model.load_state_dict(model_dict)
            del model_dict

        for group in optimizer.param_groups:
            group['initial_lr'] = c.lr
        print(f" > Model restored from step {checkpoint['step']:d}", flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = DDP_th(model, device_ids=[args.rank])

    if c.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if args.restore_step == 0 or not args.best_path:
        best_loss = float('inf')
        print(" > Starting with inf best loss.")
    else:
        print(" > Restoring best loss from "
              f"{os.path.basename(args.best_path)} ...")
        best_loss = torch.load(args.best_path, map_location='cpu')['model_loss']
        print(f" > Starting with loaded last best loss {best_loss}.")
    keep_all_best = c.get('keep_all_best', False)
    keep_after = c.get('keep_after', 10000)  # void if keep_all_best False

    # define dataloaders
    train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
    eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)

    global_step = args.restore_step
    model = data_depended_init(train_loader, model)
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        train_avg_loss_dict, global_step = train(train_loader, model, criterion,
                                                 optimizer, scheduler, ap,
                                                 global_step, epoch)
        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
                                      global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_loss']
        best_loss = save_best_model(target_loss,
                                    best_loss,
                                    model,
                                    optimizer,
                                    global_step,
                                    epoch,
                                    c.r,
                                    OUT_PATH,
                                    model_characters,
                                    keep_all_best=keep_all_best,
                                    keep_after=keep_after)
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all(speaker in speaker_mapping for speaker in speakers), \
                "As of now, you cannot introduce new speakers to a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers, ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr, weight_decay=0)
    else:
        optimizer_st = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except:  # pylint: disable=bare-except
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            model.load_state_dict(model_dict)
            del model_dict

        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'], flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer,
                                    global_step, epoch, c.r, OUT_PATH)