Example #1
 def _create_random_model(self):
     config = load_config(
         os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
     num_chars = len(phonemes) if config.use_phonemes else len(symbols)
     model = setup_model(num_chars, 0, config)
     output_path = get_tests_output_path()
     save_checkpoint(model, None, None, None, output_path, 10, 10)
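
A minimal reload sketch for the checkpoint written above. The file name under output_path is an assumption; this example does not show save_checkpoint's naming convention.

# hypothetical reload of the checkpoint saved above; the file name
# 'checkpoint_10.pth.tar' is assumed, not shown by this example
cp = torch.load(os.path.join(output_path, 'checkpoint_10.pth.tar'),
                map_location=torch.device('cpu'))
model.load_state_dict(cp['model'])
model.eval()
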
Example #2
 def load_tts(self, tts_checkpoint, tts_config, use_cuda):
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > checkpoint file: ", tts_checkpoint)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     if self.use_phonemes:
         self.input_size = len(phonemes)
     else:
         self.input_size = len(symbols)
     # TODO: fix this for multi-speaker model - load speakers
     if self.config.tts_speakers is not None:
         self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
         num_speakers = len(self.tts_speakers)
     else:
         num_speakers = 0
     self.tts_model = setup_model(self.input_size,
                                  num_speakers=num_speakers,
                                  c=self.tts_config)
     # load model state
     cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
     # load the model
     self.tts_model.load_state_dict(cp['model'])
     if use_cuda:
         self.tts_model.cuda()
     self.tts_model.eval()
     self.tts_model.decoder.max_decoder_steps = 3000
     if 'r' in cp:
         self.tts_model.decoder.set_r(cp['r'])
Example #3
 def load_tts(self, model_path, model_file, model_config, use_cuda):
     tts_config = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_file)
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > model file: ", model_file)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     if self.use_phonemes:
         self.input_size = len(phonemes)
     else:
         self.input_size = len(symbols)
     # load speakers
     if self.config.tts_speakers is not None:
         self.tts_speakers = load_speaker_mapping(
             os.path.join(model_path, self.config.tts_speakers))
         num_speakers = len(self.tts_speakers)
     else:
         num_speakers = 0
     self.tts_model = setup_model(self.input_size,
                                  num_speakers=num_speakers,
                                  c=self.tts_config)
     # load model state
     cp = torch.load(self.model_file)
     # load the model
     self.tts_model.load_state_dict(cp['model'])
     if use_cuda:
         self.tts_model.cuda()
     self.tts_model.eval()
     self.tts_model.decoder.max_decoder_steps = 3000
     if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
         self.tts_model.decoder.set_r(cp['r'])
Example #4
    def _create_random_model(self):
        # pylint: disable=global-statement
        global symbols, phonemes
        config = load_config(
            os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
        if 'characters' in config.keys():
            symbols, phonemes = make_symbols(**config.characters)

        num_chars = len(phonemes) if config.use_phonemes else len(symbols)
        model = setup_model(num_chars, 0, config)
        output_path = get_tests_output_path()
        save_checkpoint(model, None, None, None, output_path, 10, 10)
Example #5
def setup():
    use_cuda = True

    # model paths
    TTS_MODEL = "tts_model.pth.tar"
    TTS_CONFIG = "config.json"
    VOCODER_MODEL = "vocoder_model.pth.tar"
    VOCODER_CONFIG = "config_vocoder.json"

    # Load configs
    TTS_CONFIG = load_config(TTS_CONFIG)
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    ap = AudioProcessor(**TTS_CONFIG.audio)


    # LOAD TTS MODEL
    # multi speaker 
    speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, len(speakers), TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    # set model stepsize
    if 'r' in cp:
        model.decoder.set_r(cp['r'])


    from TTS.vocoder.utils.generic_utils import setup_generator

    # LOAD VOCODER MODEL
    vocoder_model = setup_generator(VOCODER_CONFIG)
    vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0

    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])    
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

    return model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap
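
A usage sketch for the setup() helper above, assuming the four model and config files named at its top exist in the working directory:

# unpack everything needed for synthesis from the helper above
model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap = setup()
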
Example #6
    def __init__(self, use_cuda=False, verbose=False):
        self.use_cuda = use_cuda
        self.verbose = verbose

        # load configs
        self.TTS_CONFIG = load_config(TTS_CONFIG)
        self.VOCODER_CONFIG = load_config(VOCODER_CONFIG)

        # load the audio processor
        self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

        # LOAD TTS MODEL
        self.speaker_id = None
        speakers = []

        # load the model
        num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
        self.model = setup_model(num_chars, len(speakers), self.TTS_CONFIG)

        # load model state
        cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

        # load the model
        self.model.load_state_dict(cp['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.eval()

        # set model stepsize
        if 'r' in cp:
            self.model.decoder.set_r(cp['r'])

        # LOAD VOCODER MODEL
        self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
        self.vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
        if self.use_cuda:
            self.vocoder_model.cuda()

        self.vocoder_model.eval()
Example #7
 def __init__(self, TTS_MODEL, TTS_CONFIG, VOCODER_MODEL, VOCODER_CONFIG,
              use_cuda, use_gl):
     self.use_cuda = use_cuda
     self.use_gl = use_gl
     # model paths
     self.tts_config = load_config(TTS_CONFIG)
     vocoder_config = load_config(VOCODER_CONFIG)
     # load audio processor
     self.ap = AudioProcessor(**self.tts_config.audio)
     # LOAD TTS MODEL
     # multi speaker
     self.speaker_id = None
     speakers = []
     # load the model
     num_chars = len(phonemes) if self.tts_config.use_phonemes else len(
         symbols)
     self.model = setup_model(num_chars, len(speakers), self.tts_config)
     # load model state
     self.cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
     # load the model
     self.model.load_state_dict(self.cp['model'])
     if self.use_cuda:
         self.model.cuda()
     self.model.eval()
     # set model stepsize
     if 'r' in self.cp:
         self.model.decoder.set_r(self.cp['r'])
     # LOAD VOCODER MODEL
     self.vocoder_model = setup_generator(vocoder_config)
     self.vocoder_model.load_state_dict(
         torch.load(VOCODER_MODEL, map_location="cpu")["model"])
     self.vocoder_model.remove_weight_norm()
     self.vocoder_model.inference_padding = 0
     #ap_vocoder = AudioProcessor(**vocoder_config['audio'])
     if use_cuda:
         self.vocoder_model.cuda()
     self.vocoder_model.eval()
     # get the sample rate
     self.sample_rate = self.ap.sample_rate
     gc.collect(2)
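
The enclosing class is not shown in this excerpt; assuming it were named Synthesizer, construction might look like the sketch below (the class name and file paths are placeholders):

# hypothetical usage; the class name and paths are placeholders
synth = Synthesizer('tts_model.pth.tar', 'config.json',
                    'vocoder_model.pth.tar', 'config_vocoder.json',
                    use_cuda=False, use_gl=False)
print(synth.sample_rate)
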
Example #8
    def _load_tts(self):
        # LOAD TTS MODEL
        from TTS.utils.text.symbols import symbols, phonemes
        from TTS.utils.audio import AudioProcessor
        from TTS.utils.generic_utils import setup_model

        # load the model
        num_chars = len(phonemes) if self.tts_config.use_phonemes else len(symbols)
        self.tts_model = setup_model(num_chars, self.tts_config)

        # load the audio processor
        self._ap = AudioProcessor(**self.tts_config.audio)         

        # load model state
        cp = torch.load(self.tts_model_path, map_location=lambda storage, loc: storage)

        # load the model
        self.tts_model.load_state_dict(cp['model'])
        self.tts_model.to(self.device)
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 2000
Example #9
    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)

        if 'text' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.text)

        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # load speakers; the mapping file is assumed to live next to the checkpoint
        if self.config.tts_speakers is not None:
            model_path = os.path.dirname(tts_checkpoint)
            self.tts_speakers = load_speaker_mapping(
                os.path.join(model_path, self.config.tts_speakers))
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
Example #10
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load speakers
    if args.speakers_json != '':
        speakers = json.load(open(args.speakers_json, 'r'))
        num_speakers = len(speakers)
    else:
        num_speakers = 0

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)
    cp = torch.load(args.model_path)
    model.load_state_dict(cp['model'])
    model.eval()
    if args.use_cuda:
        model.cuda()
    model.decoder.set_r(cp['r'])

    # load vocoder model
    if args.vocoder_path != "":
        VC = load_config(args.vocoder_config_path)
        ap_vocoder = AudioProcessor(**VC.audio)
        bits = 10
        vocoder_model = VocoderModel(rnn_dims=512,
                                     fc_dims=512,
                                     mode=VC.mode,
Example #11
def main(args):  #pylint: disable=redefined-outer-name
    # Audio processor
    ap = AudioProcessor(**c.audio)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    if c.use_speaker_embedding:
        speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), "As of now, you cannot " \
                                                   "introduce new speakers to " \
                                                   "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    if c.loss_masking:
        criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"
                                                  ] else MSELossMasked()
    else:
        criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"
                                               ] else nn.MSELoss()
    criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()
        if criterion_st:
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
        print(" > Number of outputs per iteration:", model.decoder.r)

        train_loss, global_step = train(model, criterion, criterion_st,
                                        optimizer, optimizer_st, scheduler, ap,
                                        global_step, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, global_step,
                            epoch)
        print(" | > Training Loss: {:.5f}   Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, global_step, epoch)
Example #12

    print(" >  Run-time: {}".format(time.time() - t_1))
    return alignment, mel_postnet_spec, stop_tokens, waveform


use_cuda = True
batched_wavernn = True

# initialize TTS
CONFIG = load_config(tts_pretrained_model_config)
print(CONFIG)

# load the model
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, CONFIG)
# load the audio processor
ap = AudioProcessor(**CONFIG.audio)
# load model state
if use_cuda:
    cp = torch.load(tts_pretrained_model)
else:
    cp = torch.load(tts_pretrained_model, map_location=lambda storage, loc: storage)

# load the model
model.load_state_dict(cp["model"])
if use_cuda:
    model.cuda()
model.eval()
print(cp["step"])
model.decoder.max_decoder_steps = 2000
Example #13
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), "As of now, you cannot " \
                                                   "introduce new speakers to " \
                                                   "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)

        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer,
                                    global_step, epoch, c.r, OUT_PATH)
Example #14
def main(**kwargs):
    global symbols, phonemes # pylint: disable=global-statement
    current_date = date.today()
    current_date = current_date.strftime("%B %d %Y")
    start_time = time.time()

    # read passed variables from gui
    text = kwargs['text']                           # text to generate speech from
    use_cuda = kwargs['use_cuda']                   # use the GPU if available
    project = kwargs['project']                     # path to project folder
    vocoder_type = kwargs['vocoder']                # vocoder type, default is GL
    use_gst = kwargs['use_gst']                     # whether to use global style tokens (GST)
    style_dict = kwargs['style_input']              # style input (e.g. a style wav) for prosody
    speaker_id = kwargs['speaker_id']               # name of the selected speaker
    sentence_file = kwargs['sentence_file']         # path to file if generate from file
    out_path = kwargs['out_path']                   # path to save the output wav

    batched_vocoder = True

    # load speakers
    speakers_file_path = Path(project, "speakers.json")
    if speakers_file_path.is_file():
        speaker_data = json.load(open(speakers_file_path, 'r'))
        num_speakers = len(speaker_data)
        # get the speaker name for the selected speaker id
        if speaker_id >= num_speakers:
            print('Speaker ID outside of number of speakers range. Using default 0.')
            speaker_id = 0
        speaker_name = [speaker for speaker, id in speaker_data.items() if speaker_id == id][0]
    else:
        speaker_name = 'Default'
        num_speakers = 0
        speaker_id = None

    # load the config
    config_path = Path(project, "config.json")
    C = load_config(config_path)

    if use_gst and style_dict is not None:
        style_input = style_dict
    else:
        style_input = None

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)
        

    # find the tts model file in project folder
    try:
        tts_model_file = glob(str(Path(project, '*.pth.tar')))
        if not tts_model_file:
            raise FileNotFoundError
        model_path = tts_model_file[0]
    except FileNotFoundError:
        print('[!] TTS Model not found in path: "{}"'.format(project))
        sys.exit(1)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)

    # if gpu is not available use cpu
    model, state = load_checkpoint(model, model_path, use_cuda=use_cuda)

    model.decoder.max_decoder_steps = 2000

    model.eval()
    print(' > Model step:', state['step'])
    print(' > Model r: ', state['r'])

    # load vocoder
    if vocoder_type == 'MelGAN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pth.tar')))
            vocoder, ap_vocoder = load_vocoder(str(Path('TTS')),
                                               str(model_file[0]),
                                               str(Path(project, 'vocoder/config.json')),
                                               use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)

    elif vocoder_type == 'WaveRNN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pkl')))
            vocoder, ap_vocoder = load_vocoder(str(Path('TTS')), str(model_file[0]), str(Path(project, 'config.yml')), use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)
    else:
        vocoder, ap_vocoder = None, None

    print(" > Vocoder: {}".format(vocoder_type))
    print(' > Using style input: {}\n'.format(style_input))

    if sentence_file != '':
        with open(sentence_file, "r", encoding='utf8') as f:
            list_of_sentences = [s.strip() for s in f.readlines()]
    else:
        list_of_sentences = [text.strip()]

    # iterate over every passed sentence and synthesize
    for _, tts_sentence in enumerate(list_of_sentences):
        wav_list = []
        # remove characters that are not alphanumeric or in ',. '
        tts_sentence = clean_sentence(tts_sentence)
        print(" > Text: {}".format(tts_sentence))
        # build filename
        current_time = datetime.now().strftime("%H%M%S")
        file_name = ' '.join(tts_sentence.split(" ")[:10])
        # if multiple sentences in one line -> split them
        tts_sentence = split_into_sentences(tts_sentence)
        
        # if sentence was split in sub-sentences -> iterate over them
        for sentence in tts_sentence:
            # synthesize voice
            _, _, _, wav = tts(model,
                               vocoder,
                               C,
                               None,
                               sentence,
                               ap,
                               ap_vocoder,
                               use_cuda,
                               batched_vocoder,
                               speaker_id=speaker_id,
                               style_input=style_input,
                               figures=False)

            # join sub-sentences back together and add a filler between them
            wav_list += list(wav)
            wav_list += [0] * 10000

        wav = np.array(wav_list)

        # finalize filename
        file_name = "_".join([str(current_time), file_name])
        file_name = file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'

        if out_path == "":
            out_dir = str(Path(project, 'output', current_date, speaker_name))
            out_path = os.path.join(out_dir, file_name)
        else:
            out_dir = os.path.dirname(out_path)

        # create output directory if it doesn't exist
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)

        # save generated wav to disk
        ap.save_wav(wav, out_path)
        end_time = time.time()
        print(" > Run-time: {}".format(end_time - start_time))
        print(" > Saving output to {}\n".format(out_path))
Example #15
# load configs
TTS_CONFIG = load_config(TTS_CONFIG)
VOCODER_CONFIG = load_config(VOCODER_CONFIG)

# load the audio processor
ap = AudioProcessor(**TTS_CONFIG.audio)

# LOAD TTS MODEL
# multi speaker
speaker_id = None
speakers = []

# load the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)

# load model state
cp = torch.load(TTS_MODEL, map_location=torch.device("cpu"))

# load the model
model.load_state_dict(cp["model"])
if use_cuda:
    model.cuda()

model.eval()

# set model stepsize
if "r" in cp:
    model.decoder.set_r(cp["r"])