Example #1
    def setup(self):
        # load configs
        self.TTS_CONFIG = load_config(self.TTS_CONFIG)
        self.VOCODER_CONFIG = load_config(self.VOCODER_CONFIG)

        # load the audio processor
        self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

        # load the model
        num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(
            symbols)
        self.model = setup_model(num_chars, len(self.speakers),
                                 self.TTS_CONFIG)

        self.model, _ = load_checkpoint(self.model,
                                        self.TTS_MODEL,
                                        use_cuda=self.use_cuda)
        self.model.eval()

        # LOAD VOCODER MODEL
        self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
        self.vocoder_model, _ = load_vocoder_checkpoint(
            self.vocoder_model, checkpoint_path=self.VOCODER_MODEL)
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        self.ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
        if self.use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()
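At inference time the two models loaded above are chained: the TTS model predicts a mel spectrogram and the vocoder renders it to a waveform. A minimal sketch of that hand-off, assuming torch is imported and mel_postnet_spec is a (T, num_mels) numpy array produced by the TTS model; the method name is illustrative, not part of the class above.

    def mel_to_wav(self, mel_postnet_spec):
        # hypothetical helper; the vocoder expects a (1, num_mels, T) tensor
        vocoder_input = torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0)
        if self.use_cuda:
            vocoder_input = vocoder_input.cuda()
        with torch.no_grad():
            waveform = self.vocoder_model.inference(vocoder_input)
        return waveform.squeeze().cpu().numpy()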
Example #2
    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()
Example #3
def details():
    model_config = load_config(args.tts_config)
    if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
        vocoder_config = load_config(args.vocoder_config)
    else:
        vocoder_config = None

    return render_template('details.html',
                           show_details=args.show_details,
                           model_config=model_config,
                           vocoder_config=vocoder_config,
                           args=args.__dict__)
Example #4
def setup():
    use_cuda = True

    # model paths
    TTS_MODEL = "tts_model.pth.tar"
    TTS_CONFIG = "config.json"
    VOCODER_MODEL = "vocoder_model.pth.tar"
    VOCODER_CONFIG = "config_vocoder.json"

    # Load configs
    TTS_CONFIG = load_config(TTS_CONFIG)
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    ap = AudioProcessor(**TTS_CONFIG.audio)


    # LOAD TTS MODEL
    # multi speaker 
    speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, len(speakers), TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    # set model stepsize
    if 'r' in cp:
        model.decoder.set_r(cp['r'])


    from TTS.vocoder.utils.generic_utils import setup_generator

    # LOAD VOCODER MODEL
    vocoder_model = setup_generator(VOCODER_CONFIG)
    vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0

    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])    
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

    return model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap
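A hedged caller for the setup() above; the synthesis() helper and its long return tuple follow the pattern used in the Mozilla TTS demo notebooks, and both its import path and signature may differ between releases.

from TTS.tts.utils.synthesis import synthesis  # assumed location of the helper

model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap = setup()
waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
    model, "Hello world.", TTS_CONFIG, use_cuda, ap, speaker_id,
    style_wav=None, truncated=False,
    enable_eos_bos_chars=TTS_CONFIG.enable_eos_bos_chars)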
Example #5
def do_phonemize(args):
    """Generate phonemes for text using config"""
    from TTS.utils.io import load_config
    from TTS.tts.utils.text import make_symbols, phoneme_to_sequence

    c = load_config(args.config)
    _, phonemes = make_symbols(**c.characters)

    if args.text:
        # Use arguments
        texts = args.text
    else:
        # Use stdin
        texts = sys.stdin

        if os.isatty(sys.stdin.fileno()):
            print("Reading text from stdin...", file=sys.stderr)

    for line in texts:
        line = line.strip()
        if not line:
            continue

        line_indexes = phoneme_to_sequence(
            line,
            [c.text_cleaner],
            language=c.phoneme_language,
            enable_eos_bos=False,
            tp=c.characters if "characters" in c.keys() else None,
            backend=c.phoneme_backend,
        )

        line_phonemes = [phonemes[i] for i in line_indexes]

        print(args.separator.join(line_phonemes))
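do_phonemize reads everything from an argparse-style namespace, so it can also be driven programmatically; a small sketch, with the config path as a placeholder.

import argparse

# hypothetical invocation; "config.json" is a placeholder path
args = argparse.Namespace(
    config="config.json",   # model config defining `characters`
    text=["hello world"],   # when empty, the function falls back to stdin
    separator=" ",          # joins the phoneme symbols on output
)
do_phonemize(args)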
Example #6
def load_vocoder(lib_path, model_file, model_config, use_cuda):
    sys.path.append(lib_path) # set this if ParallelWaveGAN is not installed globally
    #pylint: disable=import-outside-toplevel
    vocoder_config = load_config(model_config)
    vocoder_model = setup_generator(vocoder_config)
    checkpoint = torch.load(model_file, map_location='cpu')
    print(' > Model step:', checkpoint['step'])
    vocoder_model.load_state_dict(checkpoint['model'])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0
    ap_vocoder = AudioProcessor(**vocoder_config['audio'])

    if use_cuda:
        vocoder_model.cuda()
    return vocoder_model.eval(), ap_vocoder
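A hedged call site for the load_vocoder() above; all paths are placeholders.

import torch

vocoder_model, ap_vocoder = load_vocoder(
    lib_path="../ParallelWaveGAN",       # only needed if PWGAN is not installed
    model_file="vocoder_model.pth.tar",  # placeholder checkpoint path
    model_config="config_vocoder.json",  # placeholder config path
    use_cuda=torch.cuda.is_available(),
)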
Example #7
    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(
            lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()
Example #8
def load_tacotron2(use_cuda):
    """
    Loads the Tacotron2 model

    Parameters
    ----------
    use_cuda : bool
        whether to use the gpu

    Returns
    -------
    model, audio processor, model config
    """
    TTS_MODEL = model_path / 'model.pth.tar'
    TTS_CONFIG = model_path / 'config.json'

    TTS_CONFIG = load_config(TTS_CONFIG)
    TTS_CONFIG.audio['stats_path'] = str(model_path / 'scale_stats.npy')

    ap = AudioProcessor(**TTS_CONFIG.audio)

    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, TTS_CONFIG)
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()

    if 'r' in cp:
        model.decoder.set_r(cp['r'])

    model.eval()

    return model, ap, TTS_CONFIG
Example #9
def load_vocoder(use_cuda):
    """
    Loads the Vocoder model

    Parameters
    ----------
    use_cuda : bool
        whether to use the gpu

    Returns
    -------
    model, audio processor, model config
    """
    VOCODER_MODEL = model_path / 'vocoder_model.pth.tar'
    VOCODER_CONFIG = model_path / 'vocoder_config.json'

    VOCODER_CONFIG = load_config(VOCODER_CONFIG)
    VOCODER_CONFIG.audio['stats_path'] = str(model_path /
                                             'vocoder_scale_stats.npy')

    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])

    vocoder_model = setup_generator(VOCODER_CONFIG)
    cp = torch.load(VOCODER_MODEL, map_location=torch.device('cpu'))
    vocoder_model.load_state_dict(cp['model'])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0

    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

    return vocoder_model, ap_vocoder, VOCODER_CONFIG
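Examples #8 and #9 come from the same script and pair naturally; a sketch of wiring them together, assuming the module-level model_path both functions rely on is already set.

import torch

use_cuda = torch.cuda.is_available()
model, ap, tts_config = load_tacotron2(use_cuda)
vocoder_model, ap_vocoder, vocoder_config = load_vocoder(use_cuda)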
Example #10
 def load_vocoder(self, model_file, model_config, use_cuda):
     self.vocoder_config = load_config(model_config)
     self.vocoder_ap = AudioProcessor(**self.vocoder_config['audio'])
     self.vocoder_model = setup_generator(self.vocoder_config)
     self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
     if use_cuda:
         self.vocoder_model.cuda()
Example #11
 def test_in_out(self):
     self._create_random_model()
     config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
     tts_root_path = get_tests_output_path()
     config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
     config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
     synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
     synthesizer.tts("Better this test works!!")
Example #12
 def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
     self.speaker_encoder_config = load_config(config_path)
     self.speaker_encoder = setup_model(self.speaker_encoder_config)
     self.speaker_encoder.load_checkpoint(config_path, model_path, True)
     self.speaker_encoder_ap = AudioProcessor(
         **self.speaker_encoder_config.audio)
     # normalize the input audio level and trim silences
     self.speaker_encoder_ap.do_sound_norm = True
     self.speaker_encoder_ap.do_trim_silence = True
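A hedged usage sketch for init_speaker_encoder(); the paths are placeholders, and compute_x_vector_from_clip() is the method exercised in Example #22 below.

# hypothetical usage on an existing SpeakerManager-style object;
# paths are placeholders
manager.init_speaker_encoder("encoder_model.pth.tar", "encoder_config.json")
x_vector = manager.compute_x_vector_from_clip("sample.wav")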
Example #13
    def download_model(self, model_name):
        """Download model files given the full model name.
        Model name is in the format
            'type/language/dataset/model'
            e.g. 'tts_model/en/ljspeech/tacotron'

        Every model must have the following files
            - *.pth.tar : pytorch model checkpoint file.
            - config.json : model config file.
            - scale_stats.npy (if it exists): scale values for preprocessing.

        Args:
            model_name (str): model name as explained above.

        TODO: support multi-speaker models
        """
        # fetch model info from the dict
        model_type, lang, dataset, model = model_name.split("/")
        model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
        model_item = self.models_dict[model_type][lang][dataset][model]
        # set the model specific output path
        output_path = os.path.join(self.output_prefix, model_full_name)
        output_model_path = os.path.join(output_path, "model_file.pth.tar")
        output_config_path = os.path.join(output_path, "config.json")
        if os.path.exists(output_path):
            print(f" > {model_name} is already downloaded.")
        else:
            os.makedirs(output_path, exist_ok=True)
            print(f" > Downloading model to {output_path}")
            output_stats_path = os.path.join(output_path, 'scale_stats.npy')
            # download files to the output path
            if self._check_dict_key(model_item, 'github_rls_url'):
                # download from github release
                # TODO: pass output_path
                self._download_zip_file(model_item['github_rls_url'],
                                        output_path)
            else:
                # download from gdrive
                self._download_gdrive_file(model_item['model_file'],
                                           output_model_path)
                self._download_gdrive_file(model_item['config_file'],
                                           output_config_path)
                if self._check_dict_key(model_item, 'stats_file'):
                    self._download_gdrive_file(model_item['stats_file'],
                                               output_stats_path)

            # set the scale_stats.npy file path in the model config.json
            if self._check_dict_key(
                    model_item,
                    'stats_file') or os.path.exists(output_stats_path):
                # set scale stats path in config.json
                config_path = output_config_path
                config = load_config(config_path)
                config["audio"]['stats_path'] = output_stats_path
                with open(config_path, "w") as jf:
                    json.dump(config, jf)
        return output_model_path, output_config_path, model_item
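Given the name format the docstring pins down, usage reduces to one call; a sketch, assuming an already constructed manager object with models_dict and output_prefix populated.

# hypothetical call; the model name follows 'type/language/dataset/model'
model_path, config_path, model_item = manager.download_model(
    "tts_model/en/ljspeech/tacotron")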
Example #14
    def __init__(self, use_cuda=False, verbose=False):
        self.use_cuda = use_cuda
        self.verbose = verbose

        # load configs
        self.TTS_CONFIG = load_config(TTS_CONFIG)
        self.VOCODER_CONFIG = load_config(VOCODER_CONFIG)

        # load the audio processor
        self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

        # LOAD TTS MODEL
        self.speaker_id = None
        speakers = []

        # load the model
        num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
        self.model = setup_model(num_chars, len(speakers), self.TTS_CONFIG)

        # load model state
        cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

        # load the model
        self.model.load_state_dict(cp['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.eval()

        # set model stepsize
        if 'r' in cp:
            self.model.decoder.set_r(cp['r'])

        # LOAD VOCODER MODEL
        self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
        self.vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
        if self.use_cuda:
            self.vocoder_model.cuda()

        self.vocoder_model.eval()
Example #15
 def __init__(self, TTS_MODEL, TTS_CONFIG, VOCODER_MODEL, VOCODER_CONFIG,
              use_cuda, use_gl):
     self.use_cuda = use_cuda
     self.use_gl = use_gl
     # model paths
     self.tts_config = load_config(TTS_CONFIG)
     vocoder_config = load_config(VOCODER_CONFIG)
     # load audio processor
     self.ap = AudioProcessor(**self.tts_config.audio)
     # LOAD TTS MODEL
     # multi speaker
     self.speaker_id = None
     speakers = []
     # load the model
     num_chars = len(phonemes) if self.tts_config.use_phonemes else len(
         symbols)
     self.model = setup_model(num_chars, len(speakers), self.tts_config)
     # load model state
     self.cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
     # load the model
     self.model.load_state_dict(self.cp['model'])
     if self.use_cuda:
         self.model.cuda()
     self.model.eval()
     # set model stepsize
     if 'r' in self.cp:
         self.model.decoder.set_r(self.cp['r'])
     # LOAD VOCODER MODEL
     self.vocoder_model = setup_generator(vocoder_config)
     self.vocoder_model.load_state_dict(
         torch.load(VOCODER_MODEL, map_location="cpu")["model"])
     self.vocoder_model.remove_weight_norm()
     self.vocoder_model.inference_padding = 0
     #ap_vocoder = AudioProcessor(**vocoder_config['audio'])
     if use_cuda:
         self.vocoder_model.cuda()
     self.vocoder_model.eval()
     # get sample rate
     self.sample_rate = self.ap.sample_rate
     gc.collect(2)
Example #16
    def _create_random_model(self):
        # pylint: disable=global-statement
        global symbols, phonemes
        config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
        if 'characters' in config.keys():
            symbols, phonemes = make_symbols(**config.characters)

        num_chars = len(phonemes) if config.use_phonemes else len(symbols)
        model = setup_model(num_chars, 0, config)
        output_path = os.path.join(get_tests_output_path())
        save_checkpoint(model, None, 10, 10, 1, output_path)
Example #17
    def _load_vocoder(self, model_file: str, model_config: str,
                      use_cuda: bool) -> None:
        """Load the vocoder model.

        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(verbose=False,
                                         **self.vocoder_config["audio"])
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config,
                                           model_file,
                                           eval=True)
        if use_cuda:
            self.vocoder_model.cuda()
Example #18
    def _load_tts(self, tts_checkpoint: str, tts_config_path: str,
                  use_cuda: bool) -> None:
        """Load the TTS model.

        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        # pylint: disable=global-statement

        global symbols, phonemes

        self.tts_config = load_config(tts_config_path)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

        if "characters" in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)

        if self.tts_config.use_speaker_embedding is True:
            self.tts_speakers_file = (
                self.tts_speakers_file if self.tts_speakers_file else
                self.tts_config["external_speaker_embedding_file"])
            self._load_speakers(self.tts_speakers_file)

        self.tts_model = setup_model(
            self.input_size,
            num_speakers=self.num_speakers,
            c=self.tts_config,
            speaker_embedding_dim=self.speaker_embedding_dim,
        )
        self.tts_model.load_checkpoint(self.tts_config,
                                       tts_checkpoint,
                                       eval=True)
        if use_cuda:
            self.tts_model.cuda()
Example #19
    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)

        self.tts_model = setup_model(self.input_size, num_speakers=self.num_speakers, c=self.tts_config)
        self.tts_model.load_checkpoint(tts_config, tts_checkpoint, eval=True)
        if use_cuda:
            self.tts_model.cuda()
Example #20
    def download_model(self, model_name):
        """Download model files given the full model name.
        Model name is in the format
            'type/language/dataset/model'
            e.g. 'tts_model/en/ljspeech/tacotron'

        Args:
            model_name (str): model name as explained above.

        TODO: support multi-speaker models
        """
        # fetch model info from the dict
        model_type, lang, dataset, model = model_name.split("/")
        model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
        model_item = self.models_dict[model_type][lang][dataset][model]
        # set the model specific output path
        output_path = os.path.join(self.output_prefix, model_full_name)
        output_model_path = os.path.join(output_path, "model_file.pth.tar")
        output_config_path = os.path.join(output_path, "config.json")
        if os.path.exists(output_path):
            print(f" > {model_name} is already downloaded.")
        else:
            os.makedirs(output_path, exist_ok=True)
            print(f" > Downloading model to {output_path}")
            output_stats_path = None
            # download files to the output path
            self._download_file(model_item['model_file'], output_model_path)
            self._download_file(model_item['config_file'], output_config_path)
            if model_item['stats_file'] is not None and len(
                    model_item['stats_file']) > 1:
                output_stats_path = os.path.join(output_path,
                                                 'scale_stats.npy')
                self._download_file(model_item['stats_file'],
                                    output_stats_path)
                # set scale stats path in config.json
                config_path = output_config_path
                config = load_config(config_path)
                config["audio"]['stats_path'] = output_stats_path
                with open(config_path, "w") as jf:
                    json.dump(config, jf)
        return output_model_path, output_config_path
Example #21
    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")
Example #22
    def test_speaker_embedding():
        # load config
        config = load_config(encoder_config_path)
        config["audio"]["resample"] = True

        # create a dummy speaker encoder
        model = SpeakerEncoder(**config.model)
        save_checkpoint(model, None, None, get_tests_input_path(), 0, 0)

        # load audio processor and speaker encoder
        ap = AudioProcessor(**config.audio)
        manager = SpeakerManager(encoder_model_path=encoder_model_path,
                                 encoder_config_path=encoder_config_path)

        # load a sample audio and compute embedding
        waveform = ap.load_wav(sample_wav_path)
        mel = ap.melspectrogram(waveform)
        x_vector = manager.compute_x_vector(mel.T)
        assert x_vector.shape[1] == 256

        # compute x_vector directly from an input file
        x_vector = manager.compute_x_vector_from_clip(sample_wav_path)
        x_vector2 = manager.compute_x_vector_from_clip(sample_wav_path)
        x_vector = torch.FloatTensor(x_vector)
        x_vector2 = torch.FloatTensor(x_vector2)
        assert x_vector.shape[0] == 256
        assert (x_vector - x_vector2).sum() == 0.0

        # compute x_vector from a list of wav files.
        x_vector3 = manager.compute_x_vector_from_clip(
            [sample_wav_path, sample_wav_path2])
        x_vector3 = torch.FloatTensor(x_vector3)
        assert x_vector3.shape[0] == 256
        assert (x_vector - x_vector3).sum() != 0.0

        # remove dummy model
        os.remove(encoder_model_path)
Example #23
import os
import unittest

import torch as T
from tests import get_tests_input_path

from TTS.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.io import load_config

file_path = get_tests_input_path()
c = load_config(os.path.join(file_path, "test_config.json"))


class SpeakerEncoderTests(unittest.TestCase):
    # pylint: disable=R0201
    def test_in_out(self):
        dummy_input = T.rand(4, 20, 80)  # B x T x D
        dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
        model = SpeakerEncoder(input_dim=80,
                               proj_dim=256,
                               lstm_dim=768,
                               num_lstm_layers=3)
        # computing d vectors
        output = model.forward(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        output = model.inference(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        # compute d vectors by passing LSTM hidden
Example #24
import argparse

from TTS.utils.io import load_config
from TTS.vocoder.tf.utils.generic_utils import setup_generator
from TTS.vocoder.tf.utils.io import load_checkpoint
from TTS.vocoder.tf.utils.tflite import convert_melgan_to_tflite

parser = argparse.ArgumentParser()
parser.add_argument("--tf_model",
                    type=str,
                    help="Path to target torch model to be converted to TF.")
parser.add_argument("--config_path",
                    type=str,
                    help="Path to config file of torch model.")
parser.add_argument("--output_path",
                    type=str,
                    help="path to tflite output binary.")
args = parser.parse_args()

# Set constants
CONFIG = load_config(args.config_path)

# load the model
model = setup_generator(CONFIG)
model.build_inference()
model = load_checkpoint(model, args.tf_model)

# create tflite model
tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)
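The produced binary can be smoke-tested with TensorFlow's bundled TFLite interpreter; a minimal sketch, with the input shape taken from the model itself rather than assumed.

import numpy as np
import tensorflow as tf

# hypothetical smoke test for the converted vocoder
interpreter = tf.lite.Interpreter(model_path=args.output_path)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

dummy_mel = np.random.rand(*input_details[0]["shape"]).astype(np.float32)
interpreter.set_tensor(input_details[0]["index"], dummy_mel)
interpreter.invoke()
waveform = interpreter.get_tensor(output_details[0]["index"])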
Example #25
                        default="",
                        help='DISTRIBUTED: process group id.')
    args = parser.parse_args()

    if args.continue_path != '':
        args.output_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, 'config.json')
        list_of_files = glob.glob(
            args.continue_path +
            "/*.pth.tar")  # match all checkpoint files in the continue path
        latest_model_file = max(list_of_files, key=os.path.getctime)
        args.restore_path = latest_model_file
        print(f" > Training continues for {args.restore_path}")

    # setup output paths and read configs
    c = load_config(args.config_path)
    # check_config(c)
    _ = os.path.dirname(os.path.realpath(__file__))

    # DISTRIBUTED
    if c.mixed_precision:
        print("   >  Mixed precision is enabled")

    OUT_PATH = args.continue_path
    if args.continue_path == '':
        OUT_PATH = create_experiment_folder(c.output_path, c.run_name,
                                            args.debug)

    AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')

    c_logger = ConsoleLogger()
Example #26
        "if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if the model is multi-speaker.",
        default=None)
    parser.add_argument('--gst_style',
                        help="Wav path file for GST stylereference.",
                        default=None)
    parser.add_argument(
        '--save_spectogram',
        type=bool,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False)

    args = parser.parse_args()

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    speaker_embedding = None
    speaker_embedding_dim = None
    num_speakers = 0

    # load speakers
    if args.speakers_json != '':
Example #27
import os
import unittest

import torch
from torch import nn, optim

from tests import get_tests_input_path
from TTS.tts.layers.losses import MSELossMasked
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

# pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))

ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")


class TacotronTrainTest(unittest.TestCase):
    def test_train_step(self):  # pylint: disable=no-self-use
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
        input_lengths = torch.sort(input_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        mel_lengths[0] = 30
        stop_targets = torch.zeros(8, 30, 1).float().to(device)
Example #28
def process_args(args, model_class):
    """Process parsed comand line arguments based on model class (tts or vocoder).

    Args:
        args (argparse.Namespace or dict like): Parsed input arguments.
        model_type (str): Model type used to check config parameters and setup
            the TensorBoard logger. One of ['tts', 'vocoder'].

    Raises:
        ValueError: If `model_type` is not one of implemented choices.

    Returns:
        c (TTS.utils.io.AttrDict): Config paramaters.
        out_path (str): Path to save models and logging.
        audio_path (str): Path to save generated test audios.
        c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
            logging to the console.
        tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does
            the TensorBoard loggind.
    """
    if args.continue_path:
        args.output_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, "config.json")
        args.restore_path, best_model = get_last_checkpoint(args.continue_path)
        if not args.best_path:
            args.best_path = best_model

    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))

    if 'mixed_precision' in c and c.mixed_precision:
        print("   >  Mixed precision mode is ON")

    out_path = args.continue_path
    if not out_path:
        out_path = create_experiment_folder(c.output_path, c.run_name,
                                            args.debug)

    audio_path = os.path.join(out_path, "test_audios")

    c_logger = ConsoleLogger()
    tb_logger = None

    if args.rank == 0:
        os.makedirs(audio_path, exist_ok=True)
        new_fields = {}
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        # if model characters are not set in the config file
        # save the default set to the config file for future
        # compatibility.
        if model_class == 'tts' and 'characters' not in c:
            used_characters = parse_symbols()
            new_fields['characters'] = used_characters
        copy_model_files(c, args.config_path, out_path, new_fields)
        os.chmod(audio_path, 0o775)
        os.chmod(out_path, 0o775)

        log_path = out_path

        tb_logger = TensorboardLogger(log_path, model_name=model_class.upper())

        # write model desc to tensorboard
        tb_logger.tb_add_text("model-description", c["run_description"], 0)

    return c, out_path, audio_path, c_logger, tb_logger
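A sketch of driving process_args() from a training script's entry point; only the arguments the function actually reads are included, and the defaults are illustrative.

parser = argparse.ArgumentParser()
parser.add_argument("--continue_path", type=str, default="")
parser.add_argument("--config_path", type=str, default="config.json")
parser.add_argument("--best_path", type=str, default="")
parser.add_argument("--restore_path", type=str, default="")
parser.add_argument("--debug", type=bool, default=False)
parser.add_argument("--rank", type=int, default=0)
args = parser.parse_args()

c, out_path, audio_path, c_logger, tb_logger = process_args(args, "tts")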
Example #29
parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',
                    type=str,
                    help='Path to target torch model to be converted to TF.')
parser.add_argument('--config_path',
                    type=str,
                    help='Path to config file of torch model.')
parser.add_argument(
    '--output_path',
    type=str,
    help='path to output file including file name to save TF model.')
args = parser.parse_args()

# load model config
config_path = args.config_path
c = load_config(config_path)
num_speakers = 0

# init torch model
model = setup_generator(c)
checkpoint = torch.load(args.torch_model_path,
                        map_location=torch.device('cpu'))
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
model.remove_weight_norm()
state_dict = model.state_dict()

# init tf model
model_tf = setup_tf_generator(c)

common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
Example #30
import os
import numpy as np
from torch.utils.data import DataLoader

from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

file_path = os.path.dirname(os.path.realpath(__file__))
OUTPATH = os.path.join(file_path, "../../tests/outputs/loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)

C = load_config(os.path.join(file_path, 'test_config.json'))

test_data_path = os.path.join(file_path, "../../tests/data/ljspeech/")
ok_ljspeech = os.path.exists(test_data_path)


def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments,
                     use_noise_augment, use_cache, num_workers):
    ''' run dataloader with given parameters and check conditions '''
    ap = AudioProcessor(**C.audio)
    _, train_items = load_wav_data(test_data_path, 10)
    dataset = GANDataset(ap,
                         train_items,
                         seq_len=seq_len,
                         hop_len=hop_len,
                         pad_short=2000,
                         conv_pad=conv_pad,
                         return_segments=return_segments,