def setup(self):
    # load configs
    self.TTS_CONFIG = load_config(self.TTS_CONFIG)
    self.VOCODER_CONFIG = load_config(self.VOCODER_CONFIG)

    # load the audio processor
    self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

    # load the model
    num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
    self.model = setup_model(num_chars, len(self.speakers), self.TTS_CONFIG)
    self.model, _ = load_checkpoint(self.model, self.TTS_MODEL, use_cuda=self.use_cuda)
    self.model.eval()

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
    self.vocoder_model, _ = load_vocoder_checkpoint(
        self.vocoder_model, checkpoint_path=self.VOCODER_MODEL)
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0
    self.ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
    if self.use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()
def load_vocoder(self, model_file, model_config, use_cuda):
    self.vocoder_config = load_config(model_config)
    self.vocoder_ap = AudioProcessor(**self.vocoder_config['audio'])
    self.vocoder_model = setup_generator(self.vocoder_config)
    self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
    if use_cuda:
        self.vocoder_model.cuda()
def load_tts(self, model_path, model_file, model_config, use_cuda):
    tts_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > model file: ", model_file)
    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)
    # load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(
            os.path.join(model_path, self.config.tts_speakers))
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0
    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)
    # load model state
    cp = torch.load(self.model_file)
    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
        self.tts_model.decoder.set_r(cp['r'])
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > checkpoint file: ", tts_checkpoint)
    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)
    # TODO: fix this for multi-speaker model - load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0
    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)
    # load model state
    cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp:
        self.tts_model.decoder.set_r(cp['r'])
def say(self, text, output):
    # load the model
    model = Tacotron(self.CONFIG.embedding_size, self.CONFIG.num_freq,
                     self.CONFIG.num_mels, self.CONFIG.r)
    # load the audio processor
    ap = AudioProcessor(self.CONFIG.sample_rate, self.CONFIG.num_mels,
                        self.CONFIG.min_level_db, self.CONFIG.frame_shift_ms,
                        self.CONFIG.frame_length_ms, self.CONFIG.ref_level_db,
                        self.CONFIG.num_freq, self.CONFIG.power,
                        self.CONFIG.preemphasis, 60)
    # load model state
    if self.use_cuda:
        cp = torch.load(self.MODEL_PATH)
    else:
        cp = torch.load(self.MODEL_PATH,
                        map_location=lambda storage, loc: storage)
    # load the model
    model.load_state_dict(cp['model'])
    if self.use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 400
    wavs = self.text2audio(text, model, self.CONFIG, self.use_cuda, ap)
    audio = np.concatenate(wavs)
    ap.save_wav(audio, output)
    return
def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
             text_cleaner, num_mels, min_level_db, frame_shift_ms,
             frame_length_ms, preemphasis, ref_level_db, num_freq, power,
             min_seq_len=0):
    with open(csv_file, "r") as f:
        self.frames = [line.split('\t') for line in f]
    self.root_dir = root_dir
    self.outputs_per_step = outputs_per_step
    self.sample_rate = sample_rate
    self.cleaners = text_cleaner
    self.min_seq_len = min_seq_len
    self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                             frame_shift_ms, frame_length_ms, preemphasis,
                             ref_level_db, num_freq, power)
    print(" > Reading TWEB from - {}".format(root_dir))
    print(" | > Number of instances : {}".format(len(self.frames)))
    self._sort_frames()
def test_speaker_embedding():
    # load config
    config = load_config(encoder_config_path)
    config.audio.resample = True

    # create a dummy speaker encoder
    model = setup_speaker_encoder_model(config)
    save_checkpoint(model, None, None, get_tests_input_path(), 0)

    # load audio processor and speaker encoder
    ap = AudioProcessor(**config.audio)
    manager = SpeakerManager(encoder_model_path=encoder_model_path,
                             encoder_config_path=encoder_config_path)

    # load a sample audio and compute embedding
    waveform = ap.load_wav(sample_wav_path)
    mel = ap.melspectrogram(waveform)
    d_vector = manager.compute_d_vector(mel)
    assert d_vector.shape[1] == 256

    # compute d_vector directly from an input file
    d_vector = manager.compute_d_vector_from_clip(sample_wav_path)
    d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path)
    d_vector = torch.FloatTensor(d_vector)
    d_vector2 = torch.FloatTensor(d_vector2)
    assert d_vector.shape[0] == 256
    assert (d_vector - d_vector2).sum() == 0.0

    # compute d_vector from a list of wav files
    d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2])
    d_vector3 = torch.FloatTensor(d_vector3)
    assert d_vector3.shape[0] == 256
    assert (d_vector - d_vector3).sum() != 0.0

    # remove dummy model
    os.remove(encoder_model_path)
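# A minimal sketch (not from the snippets above) of the AudioProcessor calls exercised by
# the test: load a clip and compute its mel spectrogram. Import paths follow recent Coqui
# TTS releases; the config and wav paths are placeholders.
from TTS.config import load_config
from TTS.utils.audio import AudioProcessor

config = load_config("config.json")    # placeholder config with an `audio` section
ap = AudioProcessor(**config.audio)
waveform = ap.load_wav("sample.wav")   # placeholder audio clip
mel = ap.melspectrogram(waveform)      # numpy array of shape (num_mels, num_frames)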
def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
    """Load the vocoder model.

    Args:
        model_file (str): path to the model checkpoint.
        model_config (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.
    """
    self.vocoder_config = load_config(model_config)
    # Patch stats_path
    stats_path = self.vocoder_config["audio"].get("stats_path", "")
    if stats_path and (not os.path.isfile(stats_path)):
        stats_path = os.path.join(os.path.dirname(model_file), os.path.split(stats_path)[1])
        self.vocoder_config["audio"]["stats_path"] = stats_path
    self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio)
    self.vocoder_model = setup_vocoder_model(self.vocoder_config)
    self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
    if use_cuda:
        self.vocoder_model.cuda()
def load_model(self, model_path, model_name, model_config, use_cuda):
    model_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)
    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda
    self.model = Tacotron(config.embedding_size, config.num_freq,
                          config.num_mels, config.r)
    self.ap = AudioProcessor(config.sample_rate, config.num_mels,
                             config.min_level_db, config.frame_shift_ms,
                             config.frame_length_ms, config.preemphasis,
                             config.ref_level_db, config.num_freq,
                             config.power, griffin_lim_iters=60)
    # load model state
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)
    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
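# A standalone sketch of the CPU-fallback checkpoint loading pattern used above: the
# `map_location` lambda keeps every tensor on the CPU when CUDA is unavailable. The
# checkpoint path is a placeholder.
import torch

checkpoint = torch.load("best_model.pth.tar",
                        map_location=lambda storage, loc: storage)
state_dict = checkpoint['model']  # these snippets store the network weights under 'model'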
def load(self, t_checkpoint_path, v_checkpoint_path, t_config_path=None,
         v_config_path=None, model_name='tacotron'):
    if t_checkpoint_path.endswith('.pt'):
        self.model_name = 'nvidia'
        print('Constructing model: %s' % self.model_name)
        # set up params
        hparams = create_hparams()
        # load model from checkpoint
        self.model = Tacotron2(hparams)
        self.model.load_state_dict(
            torch.load(t_checkpoint_path, map_location='cpu')['state_dict'])
        _ = self.model.eval()
    else:  # t_checkpoint_path.endswith('.pth.tar')
        self.model_name = 'coqui'
        print('Constructing model: %s' % self.model_name)
        # load tts config and audio processor
        self.tts_config = load_config(t_config_path)
        self.tts_model = setup_tts_model(config=self.tts_config)
        self.tts_model.load_checkpoint(self.tts_config, t_checkpoint_path, eval=True)
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

    # load vocoder config and audio processor
    vocoder_config = load_config(v_config_path)
    self.vocoder_ap = AudioProcessor(verbose=False, **vocoder_config.audio)
    # load NeurIPS MelGAN for mel2audio synthesis
    self.vocoder = torch.hub.load('descriptinc/melgan-neurips', 'load_melgan')
    melgan_ckpt = torch.load(v_checkpoint_path, map_location='cpu')
    self.vocoder.mel2wav.load_state_dict(melgan_ckpt)
def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
    self.speaker_encoder_config = load_config(config_path)
    self.speaker_encoder = setup_model(self.speaker_encoder_config)
    self.speaker_encoder.load_checkpoint(config_path, model_path, True)
    self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
    # normalize the input audio level and trim silences
    self.speaker_encoder_ap.do_sound_norm = True
    self.speaker_encoder_ap.do_trim_silence = True
def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
    """Initialize a speaker encoder model.

    Args:
        model_path (str): Model file path.
        config_path (str): Model config file path.
    """
    self.speaker_encoder_config = load_config(config_path)
    self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config)
    self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda)
    self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
def setup():
    use_cuda = True

    # model paths
    TTS_MODEL = "tts_model.pth.tar"
    TTS_CONFIG = "config.json"
    VOCODER_MODEL = "vocoder_model.pth.tar"
    VOCODER_CONFIG = "config_vocoder.json"

    # Load configs
    TTS_CONFIG = load_config(TTS_CONFIG)
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)
    ap = AudioProcessor(**TTS_CONFIG.audio)

    # LOAD TTS MODEL
    # multi speaker
    speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, len(speakers), TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    # set model stepsize
    if 'r' in cp:
        model.decoder.set_r(cp['r'])

    from TTS.vocoder.utils.generic_utils import setup_generator

    # LOAD VOCODER MODEL
    vocoder_model = setup_generator(VOCODER_CONFIG)
    vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0
    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

    return model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap
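# Hypothetical call site for the setup() helper above; it assumes tts_model.pth.tar,
# config.json, vocoder_model.pth.tar and config_vocoder.json exist in the working directory.
model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap = setup()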
def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
             text_cleaner, num_mels, min_level_db, frame_shift_ms,
             frame_length_ms, preemphasis, ref_level_db, num_freq, power):
    self.frames = pd.read_csv(csv_file, sep='|', header=None)
    self.root_dir = root_dir
    self.outputs_per_step = outputs_per_step
    self.sample_rate = sample_rate
    self.cleaners = text_cleaner
    self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                             frame_shift_ms, frame_length_ms, preemphasis,
                             ref_level_db, num_freq, power)
    print(" > Reading LJSpeech from - {}".format(root_dir))
    print(" | > Number of instances : {}".format(len(self.frames)))
def load_tts_model():
    MODEL_PATH = dirpath + '/tts_model/best_model.pth.tar'
    CONFIG_PATH = dirpath + '/tts_model/config.json'
    CONFIG = load_config(CONFIG_PATH)
    use_cuda = False

    num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
    model = Tacotron(num_chars, CONFIG.embedding_size, CONFIG.audio['num_freq'],
                     CONFIG.audio['num_mels'], CONFIG.r, attn_windowing=False)

    # load the audio processor
    # CONFIG.audio["power"] = 1.3
    CONFIG.audio["preemphasis"] = 0.97
    ap = AudioProcessor(**CONFIG.audio)

    # load model state
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    # model.eval()
    model.decoder.max_decoder_steps = 1000
    return model, ap, MODEL_PATH, CONFIG, use_cuda
def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict):  # pylint: disable=no-self-use
    alignments = outputs["alignments"]
    text_input = batch["text_input"]
    text_lengths = batch["text_lengths"]
    mel_input = batch["mel_input"]
    d_vectors = batch["d_vectors"]
    speaker_ids = batch["speaker_ids"]

    # model runs reverse flow to predict spectrograms
    pred_outputs = self.inference(
        text_input[:1],
        aux_input={"x_lengths": text_lengths[:1], "d_vectors": d_vectors, "speaker_ids": speaker_ids},
    )
    model_outputs = pred_outputs["model_outputs"]

    pred_spec = model_outputs[0].data.cpu().numpy()
    gt_spec = mel_input[0].data.cpu().numpy()
    align_img = alignments[0].data.cpu().numpy()

    figures = {
        "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
        "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
        "alignment": plot_alignment(align_img, output_fig=False),
    }

    # Sample audio
    train_audio = ap.inv_melspectrogram(pred_spec.T)
    return figures, {"audio": train_audio}
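# A small sketch of the plotting utility used by train_log() above, with a random
# spectrogram standing in for real model output (import path as in Coqui TTS; `ap=None`
# skips denormalization).
import numpy as np
from TTS.tts.utils.visual import plot_spectrogram

fake_spec = np.random.rand(200, 80)  # placeholder (num_frames, num_mels) spectrogram
fig = plot_spectrogram(fake_spec, ap=None, output_fig=False)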
def load_tacotron2(use_cuda):
    """Loads the Tacotron2 model.

    Parameters
    ----------
    use_cuda : bool
        whether to use the gpu

    Returns
    -------
    model, audio processor, model config
    """
    TTS_MODEL = model_path / 'model.pth.tar'
    TTS_CONFIG = model_path / 'config.json'
    TTS_CONFIG = load_config(TTS_CONFIG)
    TTS_CONFIG.audio['stats_path'] = str(model_path / 'scale_stats.npy')
    ap = AudioProcessor(**TTS_CONFIG.audio)
    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, TTS_CONFIG)
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    if 'r' in cp:
        model.decoder.set_r(cp['r'])
    model.eval()
    return model, ap, TTS_CONFIG
def load_vocoder(use_cuda):
    """Loads the Vocoder model.

    Parameters
    ----------
    use_cuda : bool
        whether to use the gpu

    Returns
    -------
    model, audio processor, model config
    """
    VOCODER_MODEL = model_path / 'vocoder_model.pth.tar'
    VOCODER_CONFIG = model_path / 'vocoder_config.json'
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)
    VOCODER_CONFIG.audio['stats_path'] = str(model_path / 'vocoder_scale_stats.npy')
    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
    vocoder_model = setup_generator(VOCODER_CONFIG)
    cp = torch.load(VOCODER_MODEL, map_location=torch.device('cpu'))
    vocoder_model.load_state_dict(cp['model'])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()
    return vocoder_model, ap_vocoder, VOCODER_CONFIG
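# Hypothetical call site for the two loaders above; `model_path` must already point at a
# directory containing the checkpoints, configs and scale-stats files they expect.
model, ap, TTS_CONFIG = load_tacotron2(use_cuda=False)
vocoder_model, ap_vocoder, VOCODER_CONFIG = load_vocoder(use_cuda=False)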
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = setup_model(c)
    optimizer = RAdam(model.parameters(), lr=c.lr)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=False)

    data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True)

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method="softmax")
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    elif c.loss == "softmaxproto":
        criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers)
    else:
        raise Exception("The %s is not a supported loss" % c.loss)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint["model"])
            if "criterion" in checkpoint:
                criterion.load_state_dict(checkpoint["criterion"])
        except (KeyError, RuntimeError):
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model"], c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group["lr"] = c.lr
        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    global_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion, data_loader, global_step)
def load_model(self, MODEL_PATH, sentence, CONFIG, use_cuda, OUT_FILE):
    # load the model
    num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
    model = Tacotron(num_chars, CONFIG.embedding_size, CONFIG.audio['num_freq'],
                     CONFIG.audio['num_mels'], CONFIG.r, attn_windowing=False)

    # load the audio processor
    # CONFIG.audio["power"] = 1.3
    CONFIG.audio["preemphasis"] = 0.97
    ap = AudioProcessor(**CONFIG.audio)

    # load model state
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 1000
    align, spec, stop_tokens, wav_norm = self.tts(model, sentence, CONFIG, use_cuda, ap, OUT_FILE)
    return wav_norm
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = SpeakerEncoder(
        input_dim=c.model["input_dim"],
        proj_dim=c.model["proj_dim"],
        lstm_dim=c.model["lstm_dim"],
        num_lstm_layers=c.model["num_lstm_layers"],
    )
    optimizer = RAdam(model.parameters(), lr=c.lr)

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method="softmax")
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    else:
        raise Exception("The %s is not a supported loss" % c.loss)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint["model"])
        except KeyError:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group["lr"] = c.lr
        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    global_step = args.restore_step
    _, global_step = train(model, criterion, optimizer, scheduler, ap, global_step)
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # init trainer args
    train_args = TrainVocoderArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    if "feature_path" in config and config.feature_path:
        # load pre-computed features
        print(f" > Loading features from: {config.feature_path}")
        eval_samples, train_samples = load_wav_feat_data(
            config.data_path, config.feature_path, config.eval_split_size)
    else:
        # load raw wav files
        eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init the model from config
    model = setup_model(config)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None:
    """Initialize a speaker encoder model.

    Args:
        model_path (str): Model file path.
        config_path (str): Model config file path.
        use_cuda (bool, optional): Use CUDA. Defaults to False.
    """
    self.use_cuda = use_cuda
    self.encoder_config = load_config(config_path)
    self.encoder = setup_encoder_model(self.encoder_config)
    self.encoder_criterion = self.encoder.load_checkpoint(
        self.encoder_config, model_path, eval=True, use_cuda=use_cuda)
    self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
def init_from_config(config: Coqpit):
    """Initialize model from config."""
    from TTS.utils.audio import AudioProcessor

    ap = AudioProcessor.init_from_config(config)
    tokenizer = TTSTokenizer.init_from_config(config)
    speaker_manager = SpeakerManager.init_from_config(config)
    return BaseTacotron(config, ap, tokenizer, speaker_manager)
class Synthesizer(object):
    def load_model(self, model_path, model_name, model_config, use_cuda):
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq,
                              config.num_mels, config.r)
        self.ap = AudioProcessor(config.sample_rate, config.num_mels,
                                 config.min_level_db, config.frame_shift_ms,
                                 config.frame_length_ms, config.preemphasis,
                                 config.ref_level_db, config.num_freq,
                                 config.power, griffin_lim_iters=60)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        # sf.write(path, wav.astype(np.int32), self.config.sample_rate, format='wav')
        # wav = librosa.util.normalize(wav.astype(np.float), norm=np.inf, axis=None)
        # wav = wav / wav.max()
        # sf.write(path, wav.astype('float'), self.config.sample_rate, format='ogg')
        scipy.io.wavfile.write(path, self.config.sample_rate, wav.astype(np.int16))
        # librosa.output.write_wav(path, wav.astype(np.int16), self.config.sample_rate, norm=True)

    def tts(self, text):
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip()
            sen += '.'
            print(sen)
            sen = sen.strip()
            seq = np.array(text_to_sequence(sen, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            # wav = wav[:self.ap.find_endpoint(wav)]
            out = io.BytesIO()
            wavs.append(wav)
            wavs.append(np.zeros(10000))
            self.save_wav(wav, out)
        return out
def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
    """Load the vocoder model.

    Args:
        model_file (str): path to the model checkpoint.
        model_config (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.
    """
    self.vocoder_config = load_config(model_config)
    self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config["audio"])
    self.vocoder_model = setup_generator(self.vocoder_config)
    self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
    if use_cuda:
        self.vocoder_model.cuda()
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval
    global train_classes

    ap = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)
    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

    train_data_loader, train_classes, map_classid_to_classname = setup_loader(
        ap, is_val=False, verbose=True)
    if c.run_eval:
        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
    else:
        eval_data_loader = None

    num_classes = len(train_classes)
    criterion = model.get_criterion(c, num_classes)

    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH)

    if args.restore_path:
        criterion, args.restore_step = model.load_checkpoint(
            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion)
        print(" > Model restored from step %d" % args.restore_step, flush=True)
    else:
        args.restore_step = 0

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    global_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader,
                           eval_data_loader, global_step)
def __init__(self, use_cuda=False, verbose=False):
    self.use_cuda = use_cuda
    self.verbose = verbose

    # load configs
    self.TTS_CONFIG = load_config(TTS_CONFIG)
    self.VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    # load the audio processor
    self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

    # LOAD TTS MODEL
    self.speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
    self.model = setup_model(num_chars, len(speakers), self.TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model
    self.model.load_state_dict(cp['model'])
    if self.use_cuda:
        self.model.cuda()
    self.model.eval()

    # set model stepsize
    if 'r' in cp:
        self.model.decoder.set_r(cp['r'])

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
    self.vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0
    ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
    if self.use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()
def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
    """Load the vocoder model.

    1. Load the vocoder config.
    2. Init the AudioProcessor for the vocoder.
    3. Init the vocoder model from the config.
    4. Move the model to the GPU if CUDA is enabled.

    Args:
        model_file (str): path to the model checkpoint.
        model_config (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.
    """
    self.vocoder_config = load_config(model_config)
    self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio)
    self.vocoder_model = setup_vocoder_model(self.vocoder_config)
    self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
    if use_cuda:
        self.vocoder_model.cuda()
def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
    """Load the TTS model.

    Args:
        tts_checkpoint (str): path to the model checkpoint.
        tts_config_path (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.
    """
    # pylint: disable=global-statement
    global symbols, phonemes

    self.tts_config = load_config(tts_config_path)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

    if "characters" in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    if self.tts_config.use_speaker_embedding is True:
        self.tts_speakers_file = (
            self.tts_speakers_file
            if self.tts_speakers_file
            else self.tts_config["external_speaker_embedding_file"]
        )
        self._load_speakers(self.tts_speakers_file)

    self.tts_model = setup_model(
        self.input_size,
        num_speakers=self.num_speakers,
        c=self.tts_config,
        speaker_embedding_dim=self.speaker_embedding_dim,
    )
    self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
    if use_cuda:
        self.tts_model.cuda()