Example no. 1
    def setup(self):
        # load configs
        self.TTS_CONFIG = load_config(self.TTS_CONFIG)
        self.VOCODER_CONFIG = load_config(self.VOCODER_CONFIG)

        # load the audio processor
        self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

        # load the model
        num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(
            symbols)
        self.model = setup_model(num_chars, len(self.speakers),
                                 self.TTS_CONFIG)

        self.model, _ = load_checkpoint(self.model,
                                        self.TTS_MODEL,
                                        use_cuda=self.use_cuda)
        self.model.eval()

        # LOAD VOCODER MODEL
        self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
        self.vocoder_model, _ = load_vocoder_checkpoint(
            self.vocoder_model, checkpoint_path=self.VOCODER_MODEL)
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        self.ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
        if self.use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()
Example no. 2
 def load_vocoder(self, model_file, model_config, use_cuda):
     self.vocoder_config = load_config(model_config)
     self.vocoder_ap = AudioProcessor(**self.vocoder_config['audio'])
     self.vocoder_model = setup_generator(self.vocoder_config)
     self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
     if use_cuda:
         self.vocoder_model.cuda()
Example no. 3
 def load_tts(self, model_path, model_file, model_config, use_cuda):
     tts_config = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_file)
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > model file: ", model_file)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     if self.use_phonemes:
         self.input_size = len(phonemes)
     else:
         self.input_size = len(symbols)
     # load speakers
     if self.config.tts_speakers is not None:
         self.tts_speakers = load_speaker_mapping(
             os.path.join(model_path, self.config.tts_speakers))
         num_speakers = len(self.tts_speakers)
     else:
         num_speakers = 0
     self.tts_model = setup_model(self.input_size,
                                  num_speakers=num_speakers,
                                  c=self.tts_config)
     # load model state
     cp = torch.load(self.model_file)
     # load the model
     self.tts_model.load_state_dict(cp['model'])
     if use_cuda:
         self.tts_model.cuda()
     self.tts_model.eval()
     self.tts_model.decoder.max_decoder_steps = 3000
     if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
         self.tts_model.decoder.set_r(cp['r'])
Example no. 4
 def load_tts(self, tts_checkpoint, tts_config, use_cuda):
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > checkpoint file: ", tts_checkpoint)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     if self.use_phonemes:
         self.input_size = len(phonemes)
     else:
         self.input_size = len(symbols)
     # TODO: fix this for multi-speaker model - load speakers
     if self.config.tts_speakers is not None:
         self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
         num_speakers = len(self.tts_speakers)
     else:
         num_speakers = 0
     self.tts_model = setup_model(self.input_size,
                                  num_speakers=num_speakers,
                                  c=self.tts_config)
     # load model state
     cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
     # load the model
     self.tts_model.load_state_dict(cp['model'])
     if use_cuda:
         self.tts_model.cuda()
     self.tts_model.eval()
     self.tts_model.decoder.max_decoder_steps = 3000
     if 'r' in cp:
         self.tts_model.decoder.set_r(cp['r'])
Example no. 5
    def say(self, text, output):
        # load the model
        model = Tacotron(self.CONFIG.embedding_size, self.CONFIG.num_freq,
                         self.CONFIG.num_mels, self.CONFIG.r)

        # load the audio processor

        ap = AudioProcessor(self.CONFIG.sample_rate, self.CONFIG.num_mels,
                            self.CONFIG.min_level_db,
                            self.CONFIG.frame_shift_ms,
                            self.CONFIG.frame_length_ms,
                            self.CONFIG.ref_level_db, self.CONFIG.num_freq,
                            self.CONFIG.power, self.CONFIG.preemphasis, 60)

        # load model state
        if self.use_cuda:
            cp = torch.load(self.MODEL_PATH)
        else:
            cp = torch.load(self.MODEL_PATH,
                            map_location=lambda storage, loc: storage)

        # load the model
        model.load_state_dict(cp['model'])
        if self.use_cuda:
            model.cuda()
        model.eval()

        model.decoder.max_decoder_steps = 400
        wavs = self.text2audio(text, model, self.CONFIG, self.use_cuda, ap)

        audio = np.concatenate(wavs)
        ap.save_wav(audio, output)

        return
Example no. 6
    def __init__(self,
                 csv_file,
                 root_dir,
                 outputs_per_step,
                 sample_rate,
                 text_cleaner,
                 num_mels,
                 min_level_db,
                 frame_shift_ms,
                 frame_length_ms,
                 preemphasis,
                 ref_level_db,
                 num_freq,
                 power,
                 min_seq_len=0):

        with open(csv_file, "r") as f:
            self.frames = [line.split('\t') for line in f]
        self.root_dir = root_dir
        self.outputs_per_step = outputs_per_step
        self.sample_rate = sample_rate
        self.cleaners = text_cleaner
        self.min_seq_len = min_seq_len
        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db, num_freq, power)
        print(" > Reading TWEB from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        self._sort_frames()
Example no. 7
    def test_speaker_embedding():
        # load config
        config = load_config(encoder_config_path)
        config.audio.resample = True

        # create a dummy speaker encoder
        model = setup_speaker_encoder_model(config)
        save_checkpoint(model, None, None, get_tests_input_path(), 0)

        # load audio processor and speaker encoder
        ap = AudioProcessor(**config.audio)
        manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)

        # load a sample audio and compute embedding
        waveform = ap.load_wav(sample_wav_path)
        mel = ap.melspectrogram(waveform)
        d_vector = manager.compute_d_vector(mel)
        assert d_vector.shape[1] == 256

        # compute d_vector directly from an input file
        d_vector = manager.compute_d_vector_from_clip(sample_wav_path)
        d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path)
        d_vector = torch.FloatTensor(d_vector)
        d_vector2 = torch.FloatTensor(d_vector2)
        assert d_vector.shape[0] == 256
        assert (d_vector - d_vector2).sum() == 0.0

        # compute d_vector from a list of wav files.
        d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2])
        d_vector3 = torch.FloatTensor(d_vector3)
        assert d_vector3.shape[0] == 256
        assert (d_vector - d_vector3).sum() != 0.0

        # remove dummy model
        os.remove(encoder_model_path)
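The assertions above only check that embeddings from different inputs differ; a natural follow-up is to score how close two d-vectors are. A minimal sketch using only standard PyTorch cosine similarity on the 256-dim tensors computed in the test above:

    import torch.nn.functional as F

    # d_vector and d_vector3 are the 256-dim FloatTensors built in the test above
    similarity = F.cosine_similarity(d_vector, d_vector3, dim=0)
    print(f"speaker similarity: {similarity.item():.3f}")  # 1.0 means identical embeddings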
Example no. 8
    def _load_vocoder(self, model_file: str, model_config: str,
                      use_cuda: bool) -> None:
        """Load the vocoder model.

        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)

        # Patch stats_path
        stats_path = self.vocoder_config["audio"].get("stats_path", "")
        if stats_path and (not os.path.isfile(stats_path)):
            stats_path = os.path.join(os.path.dirname(model_file),
                                      os.path.split(stats_path)[1])
            self.vocoder_config["audio"]["stats_path"] = stats_path

        self.vocoder_ap = AudioProcessor(verbose=False,
                                         **self.vocoder_config.audio)
        self.vocoder_model = setup_vocoder_model(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config,
                                           model_file,
                                           eval=True)
        if use_cuda:
            self.vocoder_model.cuda()
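The stats_path patch above retargets a stale absolute path (recorded on the training machine) to a file of the same name sitting next to the checkpoint. A worked example of the path arithmetic; the concrete paths are illustrative only:

    import os

    model_file = "/models/vocoder/checkpoint.pth.tar"  # illustrative checkpoint path
    stats_path = "/data/train/scale_stats.npy"         # stale path stored in the config
    patched = os.path.join(os.path.dirname(model_file), os.path.split(stats_path)[1])
    print(patched)  # -> /models/vocoder/scale_stats.npy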
Example no. 9
 def load_model(self, model_path, model_name, model_config, use_cuda):
     model_config = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_name)
     print(" > Loading model ...")
     print(" | > model config: ", model_config)
     print(" | > model file: ", self.model_file)
     config = load_config(model_config)
     self.config = config
     self.use_cuda = use_cuda
     self.model = Tacotron(config.embedding_size, config.num_freq,
                           config.num_mels, config.r)
     self.ap = AudioProcessor(config.sample_rate,
                              config.num_mels,
                              config.min_level_db,
                              config.frame_shift_ms,
                              config.frame_length_ms,
                              config.preemphasis,
                              config.ref_level_db,
                              config.num_freq,
                              config.power,
                              griffin_lim_iters=60)
     # load model state
     if use_cuda:
         cp = torch.load(self.model_file)
     else:
         cp = torch.load(self.model_file,
                         map_location=lambda storage, loc: storage)
     # load the model
     self.model.load_state_dict(cp['model'])
     if use_cuda:
         self.model.cuda()
     self.model.eval()
Example no. 10
  def load(self, t_checkpoint_path, v_checkpoint_path,
           t_config_path=None, v_config_path=None, model_name='tacotron'):
    if t_checkpoint_path.endswith('.pt'):
        self.model_name = 'nvidia'
        print('Constructing model: %s' % self.model_name)

        # set-up params
        hparams = create_hparams()

        # load model from checkpoint
        self.model = Tacotron2(hparams)
        self.model.load_state_dict(torch.load(t_checkpoint_path,
                                              map_location='cpu')['state_dict'])
        _ = self.model.eval()
    else: # elif t_checkpoint_path.endswith('.pth.tar'):
        self.model_name = 'coqui'
        print('Constructing model: %s' % self.model_name)

        # load tts config and audio processor
        self.tts_config = load_config(t_config_path)
        self.tts_model = setup_tts_model(config=self.tts_config)
        self.tts_model.load_checkpoint(self.tts_config,
                                       t_checkpoint_path, eval=True)
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

        # load vocoder config and audio processor
        vocoder_config = load_config(v_config_path)
        self.vocoder_ap = AudioProcessor(verbose=False, **vocoder_config.audio)

    # Load the NeurIPS MelGAN for mel-to-audio synthesis
    self.vocoder = torch.hub.load('descriptinc/melgan-neurips', 'load_melgan')
    melgan_ckpt = torch.load(v_checkpoint_path, map_location='cpu')
    self.vocoder.mel2wav.load_state_dict(melgan_ckpt)
Example no. 11
 def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
     self.speaker_encoder_config = load_config(config_path)
     self.speaker_encoder = setup_model(self.speaker_encoder_config)
     self.speaker_encoder.load_checkpoint(config_path, model_path, True)
     self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
     # normalize the input audio level and trim silences
     self.speaker_encoder_ap.do_sound_norm = True
     self.speaker_encoder_ap.do_trim_silence = True
Example no. 12
    def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
        """Initialize a speaker encoder model.

        Args:
            model_path (str): Model file path.
            config_path (str): Model config file path.
        """
        self.speaker_encoder_config = load_config(config_path)
        self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config)
        self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda)
        self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
Example no. 13
def setup():
    use_cuda = True

    # model paths
    TTS_MODEL = "tts_model.pth.tar"
    TTS_CONFIG = "config.json"
    VOCODER_MODEL = "vocoder_model.pth.tar"
    VOCODER_CONFIG = "config_vocoder.json"

    # Load configs
    TTS_CONFIG = load_config(TTS_CONFIG)
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    ap = AudioProcessor(**TTS_CONFIG.audio)


    # LOAD TTS MODEL
    # multi speaker 
    speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, len(speakers), TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    # set model stepsize
    if 'r' in cp:
        model.decoder.set_r(cp['r'])


    from TTS.vocoder.utils.generic_utils import setup_generator

    # LOAD VOCODER MODEL
    vocoder_model = setup_generator(VOCODER_CONFIG)
    vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0

    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])    
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

    return model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap
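A minimal usage sketch for the loader above, unpacking its return values; the checkpoint and config files it hard-codes ("tts_model.pth.tar", "config.json", and the vocoder pair) are assumed to sit in the working directory:

    # run the full setup and unpack the objects needed for inference
    model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap = setup()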
Example no. 14
 def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
              text_cleaner, num_mels, min_level_db, frame_shift_ms,
              frame_length_ms, preemphasis, ref_level_db, num_freq, power):
     self.frames = pd.read_csv(csv_file, sep='|', header=None)
     self.root_dir = root_dir
     self.outputs_per_step = outputs_per_step
     self.sample_rate = sample_rate
     self.cleaners = text_cleaner
     self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                              frame_shift_ms, frame_length_ms, preemphasis,
                              ref_level_db, num_freq, power)
     print(" > Reading LJSpeech from - {}".format(root_dir))
     print(" | > Number of instances : {}".format(len(self.frames)))
Example no. 15
def load_tts_model():

    MODEL_PATH = dirpath + '/tts_model/best_model.pth.tar'
    CONFIG_PATH = dirpath + '/tts_model/config.json'
    CONFIG = load_config(CONFIG_PATH)
    use_cuda = False

    num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
    model = Tacotron(num_chars, CONFIG.embedding_size, CONFIG.audio['num_freq'], CONFIG.audio['num_mels'], CONFIG.r, attn_windowing=False)

    # load the audio processor
    # CONFIG.audio["power"] = 1.3
    CONFIG.audio["preemphasis"] = 0.97
    ap = AudioProcessor(**CONFIG.audio)

    # load model state
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()

    #model.eval()
    model.decoder.max_decoder_steps = 1000
    return model, ap, MODEL_PATH, CONFIG, use_cuda
Example no. 16
    def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict):  # pylint: disable=no-self-use
        alignments = outputs["alignments"]
        text_input = batch["text_input"]
        text_lengths = batch["text_lengths"]
        mel_input = batch["mel_input"]
        d_vectors = batch["d_vectors"]
        speaker_ids = batch["speaker_ids"]

        # model runs reverse flow to predict spectrograms
        pred_outputs = self.inference(
            text_input[:1],
            aux_input={
                "x_lengths": text_lengths[:1],
                "d_vectors": d_vectors,
                "speaker_ids": speaker_ids
            },
        )
        model_outputs = pred_outputs["model_outputs"]

        pred_spec = model_outputs[0].data.cpu().numpy()
        gt_spec = mel_input[0].data.cpu().numpy()
        align_img = alignments[0].data.cpu().numpy()

        figures = {
            "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
            "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
            "alignment": plot_alignment(align_img, output_fig=False),
        }

        # Sample audio
        train_audio = ap.inv_melspectrogram(pred_spec.T)
        return figures, {"audio": train_audio}
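train_log() relies on AudioProcessor.inv_melspectrogram (Griffin-Lim) to turn the predicted mel back into a waveform. A minimal round-trip sketch built only from AudioProcessor methods already used in these examples; the import paths follow recent Coqui releases and may differ in older versions, and the file paths are placeholders:

    from TTS.config import load_config
    from TTS.utils.audio import AudioProcessor

    config = load_config("config.json")        # placeholder config path
    ap = AudioProcessor(**config.audio)
    wav = ap.load_wav("sample.wav")            # placeholder input path
    mel = ap.melspectrogram(wav)               # waveform -> mel spectrogram
    wav_hat = ap.inv_melspectrogram(mel)       # Griffin-Lim reconstruction
    ap.save_wav(wav_hat, "reconstructed.wav")  # placeholder output path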
Example no. 17
def load_tacotron2(use_cuda):
    """
    Loads the Tacotron2 model

    Parameters
    ----------
    use_cuda : bool
        whether to use the gpu

    Returns
    -------
    model, audio processor, model config
    """
    TTS_MODEL = model_path / 'model.pth.tar'
    TTS_CONFIG = model_path / 'config.json'

    TTS_CONFIG = load_config(TTS_CONFIG)
    TTS_CONFIG.audio['stats_path'] = str(model_path / 'scale_stats.npy')

    ap = AudioProcessor(**TTS_CONFIG.audio)

    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, TTS_CONFIG)
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()

    if 'r' in cp:
        model.decoder.set_r(cp['r'])

    model.eval()

    return model, ap, TTS_CONFIG
Example no. 18
def load_vocoder(use_cuda):
    """
    Loads the Vocoder model

    Parameters
    ----------
    use_cuda : bool
        whether to use the gpu

    Returns
    -------
    model, audio processor, model config
    """
    VOCODER_MODEL = model_path / 'vocoder_model.pth.tar'
    VOCODER_CONFIG = model_path / 'vocoder_config.json'

    VOCODER_CONFIG = load_config(VOCODER_CONFIG)
    VOCODER_CONFIG.audio['stats_path'] = str(model_path /
                                             'vocoder_scale_stats.npy')

    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])

    vocoder_model = setup_generator(VOCODER_CONFIG)
    cp = torch.load(VOCODER_MODEL, map_location=torch.device('cpu'))
    vocoder_model.load_state_dict(cp['model'])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0

    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

    return vocoder_model, ap_vocoder, VOCODER_CONFIG
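A hedged sketch combining the two loaders above (Examples 17 and 18); model_path is assumed to be defined at module level, as both snippets imply:

    # load the Tacotron2 acoustic model and the vocoder side by side
    model, ap, TTS_CONFIG = load_tacotron2(use_cuda=False)
    vocoder_model, ap_vocoder, VOCODER_CONFIG = load_vocoder(use_cuda=False)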
Example no. 19
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = setup_model(c)

    optimizer = RAdam(model.parameters(), lr=c.lr)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=False)

    data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True)

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method="softmax")
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    elif c.loss == "softmaxproto":
        criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers)
    else:
        raise Exception("%s is not a supported loss" % c.loss)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint["model"])

            if "criterion" in checkpoint:
                criterion.load_state_dict(checkpoint["criterion"])

        except (KeyError, RuntimeError):
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model"], c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group["lr"] = c.lr

        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    global_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion, data_loader, global_step)
Example no. 20
    def load_model(self, MODEL_PATH, sentence, CONFIG, use_cuda, OUT_FILE):
        # load the model
        num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
        model = Tacotron(num_chars,
                         CONFIG.embedding_size,
                         CONFIG.audio['num_freq'],
                         CONFIG.audio['num_mels'],
                         CONFIG.r,
                         attn_windowing=False)

        # load the audio processor
        # CONFIG.audio["power"] = 1.3
        CONFIG.audio["preemphasis"] = 0.97
        ap = AudioProcessor(**CONFIG.audio)

        # load model state
        if use_cuda:
            cp = torch.load(MODEL_PATH)
        else:
            cp = torch.load(MODEL_PATH,
                            map_location=lambda storage, loc: storage)

        # load the model
        model.load_state_dict(cp['model'])
        if use_cuda:
            model.cuda()

        model.eval()
        model.decoder.max_decoder_steps = 1000
        align, spec, stop_tokens, wav_norm = self.tts(model, sentence, CONFIG,
                                                      use_cuda, ap, OUT_FILE)
        return wav_norm
Example no. 21
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = SpeakerEncoder(
        input_dim=c.model["input_dim"],
        proj_dim=c.model["proj_dim"],
        lstm_dim=c.model["lstm_dim"],
        num_lstm_layers=c.model["num_lstm_layers"],
    )
    optimizer = RAdam(model.parameters(), lr=c.lr)

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method="softmax")
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    else:
        raise Exception("%s is not a supported loss" % c.loss)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint["model"])
        except KeyError:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group["lr"] = c.lr
        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    global_step = args.restore_step
    _, global_step = train(model, criterion, optimizer, scheduler, ap, global_step)
Example no. 22
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # init trainer args
    train_args = TrainVocoderArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(
                os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    if "feature_path" in config and config.feature_path:
        # load pre-computed features
        print(f" > Loading features from: {config.feature_path}")
        eval_samples, train_samples = load_wav_feat_data(
            config.data_path, config.feature_path, config.eval_split_size)
    else:
        # load data raw wav files
        eval_samples, train_samples = load_wav_data(config.data_path,
                                                    config.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init the model from config
    model = setup_model(config)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
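The entry point above is meant to be run from the command line; the flag name below is taken from args.config_path in the snippet, though the exact CLI wiring depends on TrainVocoderArgs and may vary by release:

    # Typical invocation (assuming this file is train_vocoder.py):
    #   python train_vocoder.py --config_path config.json
    if __name__ == "__main__":
        main()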
Example no. 23
    def init_encoder(self,
                     model_path: str,
                     config_path: str,
                     use_cuda=False) -> None:
        """Initialize a speaker encoder model.

        Args:
            model_path (str): Model file path.
            config_path (str): Model config file path.
            use_cuda (bool, optional): Use CUDA. Defaults to False.
        """
        self.use_cuda = use_cuda
        self.encoder_config = load_config(config_path)
        self.encoder = setup_encoder_model(self.encoder_config)
        self.encoder_criterion = self.encoder.load_checkpoint(
            self.encoder_config, model_path, eval=True, use_cuda=use_cuda)
        self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
Example no. 24
    def init_from_config(config: Coqpit):
        """Initialize model from config."""
        from TTS.utils.audio import AudioProcessor

        ap = AudioProcessor.init_from_config(config)
        tokenizer = TTSTokenizer.init_from_config(config)
        speaker_manager = SpeakerManager.init_from_config(config)
        return BaseTacotron(config, ap, tokenizer, speaker_manager)
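A hedged usage sketch for the factory above; load_config is importable from TTS.config in recent Coqui releases, and the path is a placeholder:

    from TTS.config import load_config

    config = load_config("config.json")            # placeholder config path
    model = BaseTacotron.init_from_config(config)  # builds ap, tokenizer, speaker_manager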
Example no. 25
class Synthesizer(object):

    def load_model(self, model_path, model_name, model_config, use_cuda):
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)        
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq, config.num_mels, config.r)
        self.ap = AudioProcessor(config.sample_rate, config.num_mels, config.min_level_db,
                                 config.frame_shift_ms, config.frame_length_ms, config.preemphasis,
                                 config.ref_level_db, config.num_freq, config.power, griffin_lim_iters=60)  
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()       
    
    def save_wav(self, wav, path):
        wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        # sf.write(path, wav.astype(np.int32), self.config.sample_rate, format='wav')
        # wav = librosa.util.normalize(wav.astype(np.float), norm=np.inf, axis=None)
        # wav = wav / wav.max()
        # sf.write(path, wav.astype('float'), self.config.sample_rate, format='ogg')
        scipy.io.wavfile.write(path, self.config.sample_rate, wav.astype(np.int16))
        # librosa.output.write_wav(path, wav.astype(np.int16), self.config.sample_rate, norm=True)

    def tts(self, text):
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip() + '.'
            print(sen)
            # tokenize the sentence being synthesized, not the whole input text
            seq = np.array(text_to_sequence(sen, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            # wav = wav[:self.ap.find_endpoint(wav)]
            wavs.append(wav)
            wavs.append(np.zeros(10000))
        # write all synthesized sentences, not just the last one, to the buffer
        out = io.BytesIO()
        self.save_wav(np.concatenate(wavs), out)
        return out
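A minimal usage sketch for the class above, using only the methods it defines; the directory and file names are placeholders:

    synth = Synthesizer()
    synth.load_model("./model_dir", "checkpoint.pth.tar", "config.json", use_cuda=False)
    wav_buffer = synth.tts("Hello world. This is a test.")  # returns an io.BytesIO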
Example no. 26
    def _load_vocoder(self, model_file: str, model_config: str,
                      use_cuda: bool) -> None:
        """Load the vocoder model.

        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(verbose=False,
                                         **self.vocoder_config["audio"])
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config,
                                           model_file,
                                           eval=True)
        if use_cuda:
            self.vocoder_model.cuda()
Example no. 27
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval
    global train_classes

    ap = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)

    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets,
                                                       eval_split=True)

    train_data_loader, train_classes, map_classid_to_classname = setup_loader(
        ap, is_val=False, verbose=True)
    if c.run_eval:
        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
    else:
        eval_data_loader = None

    num_classes = len(train_classes)
    criterion = model.get_criterion(c, num_classes)

    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH)

    if args.restore_path:
        criterion, args.restore_step = model.load_checkpoint(
            c,
            args.restore_path,
            eval=False,
            use_cuda=use_cuda,
            criterion=criterion)
        print(" > Model restored from step %d" % args.restore_step, flush=True)
    else:
        args.restore_step = 0

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    global_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion,
                           train_data_loader, eval_data_loader, global_step)
Example no. 28
    def __init__(self, use_cuda=False, verbose=False):
        self.use_cuda = use_cuda
        self.verbose = verbose

        # load configs
        self.TTS_CONFIG = load_config(TTS_CONFIG)
        self.VOCODER_CONFIG = load_config(VOCODER_CONFIG)

        # load the audio processor
        self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

        # LOAD TTS MODEL
        self.speaker_id = None
        speakers = []

        # load the model
        num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
        self.model = setup_model(num_chars, len(speakers), self.TTS_CONFIG)

        # load model state
        cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

        # load the model
        self.model.load_state_dict(cp['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.eval()

        # set model stepsize
        if 'r' in cp:
            self.model.decoder.set_r(cp['r'])

        # LOAD VOCODER MODEL
        self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
        self.vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
        if self.use_cuda:
            self.vocoder_model.cuda()

        self.vocoder_model.eval()
Example no. 29
    def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
        """Load the vocoder model.

        1. Load the vocoder config.
        2. Init the AudioProcessor for the vocoder.
        3. Init the vocoder model from the config.
        4. Move the model to the GPU if CUDA is enabled.

        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio)
        self.vocoder_model = setup_vocoder_model(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
        if use_cuda:
            self.vocoder_model.cuda()
Example no. 30
    def _load_tts(self, tts_checkpoint: str, tts_config_path: str,
                  use_cuda: bool) -> None:
        """Load the TTS model.

        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        # pylint: disable=global-statement

        global symbols, phonemes

        self.tts_config = load_config(tts_config_path)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

        if "characters" in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)

        if self.tts_config.use_speaker_embedding is True:
            self.tts_speakers_file = (
                self.tts_speakers_file if self.tts_speakers_file else
                self.tts_config["external_speaker_embedding_file"])
            self._load_speakers(self.tts_speakers_file)

        self.tts_model = setup_model(
            self.input_size,
            num_speakers=self.num_speakers,
            c=self.tts_config,
            speaker_embedding_dim=self.speaker_embedding_dim,
        )
        self.tts_model.load_checkpoint(self.tts_config,
                                       tts_checkpoint,
                                       eval=True)
        if use_cuda:
            self.tts_model.cuda()