def setup(self):
    # load configs
    self.TTS_CONFIG = load_config(self.TTS_CONFIG)
    self.VOCODER_CONFIG = load_config(self.VOCODER_CONFIG)

    # load the audio processor
    self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

    # load the TTS model
    num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
    self.model = setup_model(num_chars, len(self.speakers), self.TTS_CONFIG)
    self.model, _ = load_checkpoint(self.model, self.TTS_MODEL, use_cuda=self.use_cuda)
    self.model.eval()

    # load the vocoder model
    self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
    self.vocoder_model, _ = load_vocoder_checkpoint(
        self.vocoder_model, checkpoint_path=self.VOCODER_MODEL)
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0
    self.ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
    if self.use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()

def load_vocoder(self, model_file, model_config, use_cuda):
    self.vocoder_config = load_config(model_config)
    self.vocoder_model = setup_generator(self.vocoder_config)
    self.vocoder_model.load_state_dict(
        torch.load(model_file, map_location="cpu")["model"])
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0
    if use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()

def details():
    model_config = load_config(args.tts_config)
    if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
        vocoder_config = load_config(args.vocoder_config)
    else:
        vocoder_config = None
    return render_template('details.html',
                           show_details=args.show_details,
                           model_config=model_config,
                           vocoder_config=vocoder_config,
                           args=args.__dict__)

def setup():
    use_cuda = True

    # model paths
    TTS_MODEL = "tts_model.pth.tar"
    TTS_CONFIG = "config.json"
    VOCODER_MODEL = "vocoder_model.pth.tar"
    VOCODER_CONFIG = "config_vocoder.json"

    # load configs
    TTS_CONFIG = load_config(TTS_CONFIG)
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    ap = AudioProcessor(**TTS_CONFIG.audio)

    # LOAD TTS MODEL
    # multi speaker
    speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, len(speakers), TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    # set model stepsize
    if 'r' in cp:
        model.decoder.set_r(cp['r'])

    # LOAD VOCODER MODEL
    from TTS.vocoder.utils.generic_utils import setup_generator
    vocoder_model = setup_generator(VOCODER_CONFIG)
    vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0
    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

    return model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap

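# Hedged usage sketch (not in the original snippet): setup() assumes that
# "tts_model.pth.tar", "config.json", "vocoder_model.pth.tar" and
# "config_vocoder.json" sit in the working directory, and it hands back
# everything needed for synthesis in one call.
model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap = setup()
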
def do_phonemize(args):
    """Generate phonemes for text using config"""
    from TTS.utils.io import load_config
    from TTS.tts.utils.text import make_symbols, phoneme_to_sequence

    c = load_config(args.config)
    _, phonemes = make_symbols(**c.characters)

    if args.text:
        # use arguments
        texts = args.text
    else:
        # use stdin
        texts = sys.stdin
        if os.isatty(sys.stdin.fileno()):
            print("Reading text from stdin...", file=sys.stderr)

    for line in texts:
        line = line.strip()
        if not line:
            continue
        line_indexes = phoneme_to_sequence(
            line,
            [c.text_cleaner],
            language=c.phoneme_language,
            enable_eos_bos=False,
            tp=c.characters if "characters" in c.keys() else None,
            backend=c.phoneme_backend,
        )
        line_phonemes = [phonemes[i] for i in line_indexes]
        print(args.separator.join(line_phonemes))

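# Hedged example of driving do_phonemize() without a CLI; the config path and
# separator are hypothetical. Passing args.text skips the stdin branch entirely.
import argparse

do_phonemize(argparse.Namespace(
    config="config.json",     # hypothetical config path
    text=["hello world"],     # list of input lines
    separator=" ",            # printed between phonemes
))
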
def load_vocoder(lib_path, model_file, model_config, use_cuda):
    sys.path.append(lib_path)  # set this if ParallelWaveGAN is not installed globally
    # pylint: disable=import-outside-toplevel
    vocoder_config = load_config(model_config)
    vocoder_model = setup_generator(vocoder_config)
    checkpoint = torch.load(model_file, map_location='cpu')
    print(' > Model step:', checkpoint['step'])
    vocoder_model.load_state_dict(checkpoint['model'])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0
    ap_vocoder = AudioProcessor(**vocoder_config['audio'])
    if use_cuda:
        vocoder_model.cuda()
    return vocoder_model.eval(), ap_vocoder

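# Hedged usage sketch (all paths hypothetical): load_vocoder() returns the
# generator in eval mode together with a matching AudioProcessor.
vocoder_model, ap_vocoder = load_vocoder(
    lib_path="./ParallelWaveGAN",             # only needed if not installed globally
    model_file="vocoder_checkpoint.pth.tar",
    model_config="vocoder_config.json",
    use_cuda=False,
)
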
def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
    # TODO: set a function in the WaveRNN code base for model setup and call it here.
    sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
    # pylint: disable=import-outside-toplevel
    from WaveRNN.models.wavernn import Model
    print(" > Loading WaveRNN model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", model_file)
    self.wavernn_config = load_config(model_config)
    # This is the default architecture we use for our models.
    # You might need to update it.
    self.wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode=self.wavernn_config.mode,
        mulaw=self.wavernn_config.mulaw,
        pad=self.wavernn_config.pad,
        use_aux_net=self.wavernn_config.use_aux_net,
        use_upsample_net=self.wavernn_config.use_upsample_net,
        upsample_factors=self.wavernn_config.upsample_factors,
        feat_dims=80,
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=self.ap.hop_length,
        sample_rate=self.ap.sample_rate,
    )
    check = torch.load(model_file, map_location="cpu")
    self.wavernn.load_state_dict(check['model'])
    # note: the model is built on CPU and only moved to GPU when requested,
    # so CPU-only hosts do not crash on an unconditional .cuda() call
    if use_cuda:
        self.wavernn.cuda()
    self.wavernn.eval()

def load_tacotron2(use_cuda):
    """Load the Tacotron2 model.

    Parameters
    ----------
    use_cuda : bool
        Whether to use the GPU.

    Returns
    -------
    model, audio processor, model config
    """
    TTS_MODEL = model_path / 'model.pth.tar'
    TTS_CONFIG = model_path / 'config.json'

    TTS_CONFIG = load_config(TTS_CONFIG)
    TTS_CONFIG.audio['stats_path'] = str(model_path / 'scale_stats.npy')

    ap = AudioProcessor(**TTS_CONFIG.audio)

    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, TTS_CONFIG)

    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    if 'r' in cp:
        model.decoder.set_r(cp['r'])
    model.eval()
    return model, ap, TTS_CONFIG

def load_vocoder(use_cuda):
    """Load the vocoder model.

    Parameters
    ----------
    use_cuda : bool
        Whether to use the GPU.

    Returns
    -------
    model, audio processor, model config
    """
    VOCODER_MODEL = model_path / 'vocoder_model.pth.tar'
    VOCODER_CONFIG = model_path / 'vocoder_config.json'

    VOCODER_CONFIG = load_config(VOCODER_CONFIG)
    VOCODER_CONFIG.audio['stats_path'] = str(model_path / 'vocoder_scale_stats.npy')

    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])

    vocoder_model = setup_generator(VOCODER_CONFIG)
    cp = torch.load(VOCODER_MODEL, map_location=torch.device('cpu'))
    vocoder_model.load_state_dict(cp['model'])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()
    return vocoder_model, ap_vocoder, VOCODER_CONFIG

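# Hedged usage sketch: the two loaders above are designed to be used together.
# `model_path` is assumed to be a pathlib.Path already defined at module level,
# pointing at the directory that holds the checkpoint/config/stats files.
use_cuda = torch.cuda.is_available()
model, ap, tts_config = load_tacotron2(use_cuda)
vocoder_model, ap_vocoder, vocoder_config = load_vocoder(use_cuda)
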
def load_vocoder(self, model_file, model_config, use_cuda):
    self.vocoder_config = load_config(model_config)
    self.vocoder_ap = AudioProcessor(**self.vocoder_config['audio'])
    self.vocoder_model = setup_generator(self.vocoder_config)
    self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
    if use_cuda:
        self.vocoder_model.cuda()

def test_in_out(self):
    self._create_random_model()
    config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
    tts_root_path = get_tests_output_path()
    config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
    config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
    synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
    synthesizer.tts("Better this test works!!")

def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
    self.speaker_encoder_config = load_config(config_path)
    self.speaker_encoder = setup_model(self.speaker_encoder_config)
    self.speaker_encoder.load_checkpoint(config_path, model_path, True)
    self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
    # normalize the input audio level and trim silences
    self.speaker_encoder_ap.do_sound_norm = True
    self.speaker_encoder_ap.do_trim_silence = True

def download_model(self, model_name):
    """Download model files given the full model name.

    Model name is in the format
        'type/language/dataset/model'
    e.g. 'tts_model/en/ljspeech/tacotron'

    Every model must have the following files:
        - *.pth.tar : pytorch model checkpoint file.
        - config.json : model config file.
        - scale_stats.npy (if present): scale values for preprocessing.

    Args:
        model_name (str): model name as explained above.

    TODO: support multi-speaker models
    """
    # fetch model info from the dict
    model_type, lang, dataset, model = model_name.split("/")
    model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
    model_item = self.models_dict[model_type][lang][dataset][model]
    # set the model specific output path
    output_path = os.path.join(self.output_prefix, model_full_name)
    output_model_path = os.path.join(output_path, "model_file.pth.tar")
    output_config_path = os.path.join(output_path, "config.json")
    if os.path.exists(output_path):
        print(f" > {model_name} is already downloaded.")
    else:
        os.makedirs(output_path, exist_ok=True)
        print(f" > Downloading model to {output_path}")
        output_stats_path = os.path.join(output_path, 'scale_stats.npy')
        # download files to the output path
        if self._check_dict_key(model_item, 'github_rls_url'):
            # download from github release
            # TODO: pass output_path
            self._download_zip_file(model_item['github_rls_url'], output_path)
        else:
            # download from gdrive
            self._download_gdrive_file(model_item['model_file'], output_model_path)
            self._download_gdrive_file(model_item['config_file'], output_config_path)
            if self._check_dict_key(model_item, 'stats_file'):
                self._download_gdrive_file(model_item['stats_file'], output_stats_path)
        # set the scale_stats.npy file path in the model config.json
        if self._check_dict_key(model_item, 'stats_file') or os.path.exists(output_stats_path):
            # set scale stats path in config.json
            config_path = output_config_path
            config = load_config(config_path)
            config["audio"]['stats_path'] = output_stats_path
            with open(config_path, "w") as jf:
                json.dump(config, jf)
    return output_model_path, output_config_path, model_item

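# Hedged usage example following the docstring's naming scheme. `manager` stands
# in for an instance of whatever manager class defines download_model(); the
# model name below is illustrative, not a guaranteed entry in models_dict.
model_path, config_path, model_item = manager.download_model(
    "tts_model/en/ljspeech/tacotron")
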
def __init__(self, use_cuda=False, verbose=False):
    self.use_cuda = use_cuda
    self.verbose = verbose

    # load configs
    self.TTS_CONFIG = load_config(TTS_CONFIG)
    self.VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    # load the audio processor
    self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

    # LOAD TTS MODEL
    self.speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
    self.model = setup_model(num_chars, len(speakers), self.TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
    self.model.load_state_dict(cp['model'])
    if self.use_cuda:
        self.model.cuda()
    self.model.eval()

    # set model stepsize
    if 'r' in cp:
        self.model.decoder.set_r(cp['r'])

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
    self.vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0
    # keep the vocoder audio processor as an attribute so it is not discarded
    self.ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
    if self.use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()

def __init__(self, TTS_MODEL, TTS_CONFIG, VOCODER_MODEL, VOCODER_CONFIG, use_cuda, use_gl):
    self.use_cuda = use_cuda
    self.use_gl = use_gl

    # model configs
    self.tts_config = load_config(TTS_CONFIG)
    vocoder_config = load_config(VOCODER_CONFIG)

    # load audio processor
    self.ap = AudioProcessor(**self.tts_config.audio)

    # LOAD TTS MODEL
    # multi speaker
    self.speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if self.tts_config.use_phonemes else len(symbols)
    self.model = setup_model(num_chars, len(speakers), self.tts_config)

    # load model state
    self.cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
    self.model.load_state_dict(self.cp['model'])
    if self.use_cuda:
        self.model.cuda()
    self.model.eval()  # eval() already sets train(False)

    # set model stepsize
    if 'r' in self.cp:
        self.model.decoder.set_r(self.cp['r'])

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(vocoder_config)
    self.vocoder_model.load_state_dict(
        torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0
    # ap_vocoder = AudioProcessor(**vocoder_config['audio'])
    if use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()

    # get sample rate
    self.sample_rate = self.ap.sample_rate
    gc.collect(2)

def _create_random_model(self):
    # pylint: disable=global-statement
    global symbols, phonemes
    config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
    if 'characters' in config.keys():
        symbols, phonemes = make_symbols(**config.characters)

    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, config)
    output_path = get_tests_output_path()
    save_checkpoint(model, None, 10, 10, 1, output_path)

def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
    """Load the vocoder model.

    Args:
        model_file (str): path to the model checkpoint.
        model_config (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.
    """
    self.vocoder_config = load_config(model_config)
    self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config["audio"])
    self.vocoder_model = setup_generator(self.vocoder_config)
    self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
    if use_cuda:
        self.vocoder_model.cuda()

def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
    """Load the TTS model.

    Args:
        tts_checkpoint (str): path to the model checkpoint.
        tts_config_path (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.
    """
    # pylint: disable=global-statement
    global symbols, phonemes

    self.tts_config = load_config(tts_config_path)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

    if "characters" in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    if self.tts_config.use_speaker_embedding is True:
        self.tts_speakers_file = (
            self.tts_speakers_file
            if self.tts_speakers_file
            else self.tts_config["external_speaker_embedding_file"])
        self._load_speakers(self.tts_speakers_file)

    self.tts_model = setup_model(
        self.input_size,
        num_speakers=self.num_speakers,
        c=self.tts_config,
        speaker_embedding_dim=self.speaker_embedding_dim,
    )
    self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
    if use_cuda:
        self.tts_model.cuda()

def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    # pylint: disable=global-statement
    global symbols, phonemes

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)

    if 'characters' in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    self.tts_model = setup_model(self.input_size,
                                 num_speakers=self.num_speakers,
                                 c=self.tts_config)
    self.tts_model.load_checkpoint(tts_config, tts_checkpoint, eval=True)
    if use_cuda:
        self.tts_model.cuda()

def download_model(self, model_name):
    """Download model files given the full model name.

    Model name is in the format
        'type/language/dataset/model'
    e.g. 'tts_model/en/ljspeech/tacotron'

    Args:
        model_name (str): model name as explained above.

    TODO: support multi-speaker models
    """
    # fetch model info from the dict
    model_type, lang, dataset, model = model_name.split("/")
    model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
    model_item = self.models_dict[model_type][lang][dataset][model]
    # set the model specific output path
    output_path = os.path.join(self.output_prefix, model_full_name)
    output_model_path = os.path.join(output_path, "model_file.pth.tar")
    output_config_path = os.path.join(output_path, "config.json")
    if os.path.exists(output_path):
        print(f" > {model_name} is already downloaded.")
    else:
        os.makedirs(output_path, exist_ok=True)
        print(f" > Downloading model to {output_path}")
        output_stats_path = None
        # download files to the output path
        self._download_file(model_item['model_file'], output_model_path)
        self._download_file(model_item['config_file'], output_config_path)
        if model_item['stats_file'] is not None and len(model_item['stats_file']) > 1:
            output_stats_path = os.path.join(output_path, 'scale_stats.npy')
            self._download_file(model_item['stats_file'], output_stats_path)
            # set scale stats path in config.json
            config_path = output_config_path
            config = load_config(config_path)
            config["audio"]['stats_path'] = output_stats_path
            with open(config_path, "w") as jf:
                json.dump(config, jf)
    return output_model_path, output_config_path

def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    # pylint: disable=global-statement
    global symbols, phonemes

    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > checkpoint file: ", tts_checkpoint)

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)

    if 'characters' in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    # TODO: fix this for multi-speaker model - load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0

    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)
    # load model state
    cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp:
        self.tts_model.decoder.set_r(cp['r'])
        print(f" > model reduction factor: {cp['r']}")

def test_speaker_embedding():
    # load config
    config = load_config(encoder_config_path)
    config["audio"]["resample"] = True

    # create a dummy speaker encoder
    model = SpeakerEncoder(**config.model)
    save_checkpoint(model, None, None, get_tests_input_path(), 0, 0)

    # load audio processor and speaker encoder
    ap = AudioProcessor(**config.audio)
    manager = SpeakerManager(encoder_model_path=encoder_model_path,
                             encoder_config_path=encoder_config_path)

    # load a sample audio and compute an embedding
    waveform = ap.load_wav(sample_wav_path)
    mel = ap.melspectrogram(waveform)
    x_vector = manager.compute_x_vector(mel.T)
    assert x_vector.shape[1] == 256

    # compute x_vector directly from an input file
    x_vector = manager.compute_x_vector_from_clip(sample_wav_path)
    x_vector2 = manager.compute_x_vector_from_clip(sample_wav_path)
    x_vector = torch.FloatTensor(x_vector)
    x_vector2 = torch.FloatTensor(x_vector2)
    assert x_vector.shape[0] == 256
    assert (x_vector - x_vector2).sum() == 0.0

    # compute x_vector from a list of wav files
    x_vector3 = manager.compute_x_vector_from_clip([sample_wav_path, sample_wav_path2])
    x_vector3 = torch.FloatTensor(x_vector3)
    assert x_vector3.shape[0] == 256
    assert (x_vector - x_vector3).sum() != 0.0

    # remove the dummy model
    os.remove(encoder_model_path)

import os
import unittest

import torch as T

from tests import get_tests_input_path
from TTS.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.io import load_config

file_path = get_tests_input_path()
c = load_config(os.path.join(file_path, "test_config.json"))


class SpeakerEncoderTests(unittest.TestCase):
    # pylint: disable=R0201
    def test_in_out(self):
        dummy_input = T.rand(4, 20, 80)  # B x T x D
        dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
        model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
        # computing d vectors
        output = model.forward(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        output = model.inference(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        # compute d vectors by passing LSTM hidden

import argparse

from TTS.utils.io import load_config
from TTS.vocoder.tf.utils.generic_utils import setup_generator
from TTS.vocoder.tf.utils.io import load_checkpoint
from TTS.vocoder.tf.utils.tflite import convert_melgan_to_tflite

parser = argparse.ArgumentParser()
parser.add_argument("--tf_model",
                    type=str,
                    help="Path to the target TF model checkpoint to be converted to TFLite.")
parser.add_argument("--config_path",
                    type=str,
                    help="Path to the config file of the torch model.")
parser.add_argument("--output_path",
                    type=str,
                    help="Path to the TFLite output binary.")
args = parser.parse_args()

# set constants
CONFIG = load_config(args.config_path)

# load the model
model = setup_generator(CONFIG)
model.build_inference()
model = load_checkpoint(model, args.tf_model)

# create the tflite model
tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)

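# Hedged CLI usage for the script above (the script and file names are
# hypothetical, not taken from the original):
#
#   python convert_melgan_tflite.py \
#       --tf_model melgan_tf_checkpoint.pkl \
#       --config_path config.json \
#       --output_path melgan.tflite
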
default="", help='DISTRIBUTED: process group id.') args = parser.parse_args() if args.continue_path != '': args.output_path = args.continue_path args.config_path = os.path.join(args.continue_path, 'config.json') list_of_files = glob.glob( args.continue_path + "/*.pth.tar") # * means all if need specific format then *.csv latest_model_file = max(list_of_files, key=os.path.getctime) args.restore_path = latest_model_file print(f" > Training continues for {args.restore_path}") # setup output paths and read configs c = load_config(args.config_path) # check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) # DISTRIBUTED if c.mixed_precision: print(" > Mixed precision is enabled") OUT_PATH = args.continue_path if args.continue_path == '': OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') c_logger = ConsoleLogger()
"if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if the model is multi-speaker.", default=None) parser.add_argument('--gst_style', help="Wav path file for GST stylereference.", default=None) parser.add_argument( '--save_spectogram', type=bool, help= "If true save raw spectogram for further (vocoder) processing in out_path.", default=False) args = parser.parse_args() # load the config C = load_config(args.config_path) C.forward_attn_mask = True # load the audio processor ap = AudioProcessor(**C.audio) # if the vocabulary was passed, replace the default if 'characters' in C.keys(): symbols, phonemes = make_symbols(**C.characters) speaker_embedding = None speaker_embedding_dim = None num_speakers = 0 # load speakers if args.speakers_json != '':
import os
import unittest

import torch
from torch import nn, optim

from tests import get_tests_input_path
from TTS.tts.layers.losses import MSELossMasked
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

# pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")


class TacotronTrainTest(unittest.TestCase):
    def test_train_step(self):  # pylint: disable=no-self-use
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
        input_lengths = torch.sort(input_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        mel_lengths[0] = 30
        stop_targets = torch.zeros(8, 30, 1).float().to(device)

def process_args(args, model_class):
    """Process parsed command line arguments based on model class (tts or vocoder).

    Args:
        args (argparse.Namespace or dict like): Parsed input arguments.
        model_class (str): Model type used to check config parameters and setup
            the TensorBoard logger. One of ['tts', 'vocoder'].

    Raises:
        ValueError: If `model_class` is not one of the implemented choices.

    Returns:
        c (TTS.utils.io.AttrDict): Config parameters.
        out_path (str): Path to save models and logs.
        audio_path (str): Path to save generated test audios.
        c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
            logging to the console.
        tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does
            the TensorBoard logging.
    """
    if args.continue_path:
        args.output_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, "config.json")
        args.restore_path, best_model = get_last_checkpoint(args.continue_path)
        if not args.best_path:
            args.best_path = best_model

    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))

    if 'mixed_precision' in c and c.mixed_precision:
        print(" > Mixed precision mode is ON")

    out_path = args.continue_path
    if not out_path:
        out_path = create_experiment_folder(c.output_path, c.run_name, args.debug)

    audio_path = os.path.join(out_path, "test_audios")

    c_logger = ConsoleLogger()
    tb_logger = None

    if args.rank == 0:
        os.makedirs(audio_path, exist_ok=True)
        new_fields = {}
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        # if model characters are not set in the config file,
        # save the default set to the config file for future compatibility.
        if model_class == 'tts' and 'characters' not in c:
            used_characters = parse_symbols()
            new_fields['characters'] = used_characters
        copy_model_files(c, args.config_path, out_path, new_fields)
        os.chmod(audio_path, 0o775)
        os.chmod(out_path, 0o775)
        log_path = out_path
        tb_logger = TensorboardLogger(log_path, model_name=model_class.upper())
        # write model description to tensorboard
        tb_logger.tb_add_text("model-description", c["run_description"], 0)

    return c, out_path, audio_path, c_logger, tb_logger

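# Hedged example of the minimal argparse fields process_args() touches; the
# values are hypothetical, and config.json must be a valid training config.
import argparse

args = argparse.Namespace(
    continue_path="",          # empty string starts a fresh run
    config_path="config.json",
    restore_path="",
    best_path="",
    rank=0,                    # only rank 0 creates folders and the TB logger
    debug=False,
)
c, out_path, audio_path, c_logger, tb_logger = process_args(args, "tts")
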
parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',
                    type=str,
                    help='Path to the target torch model to be converted to TF.')
parser.add_argument('--config_path',
                    type=str,
                    help='Path to the config file of the torch model.')
parser.add_argument(
    '--output_path',
    type=str,
    help='Path to the output file, including the file name, to save the TF model.')
args = parser.parse_args()

# load model config
config_path = args.config_path
c = load_config(config_path)
num_speakers = 0

# init torch model
model = setup_generator(c)
checkpoint = torch.load(args.torch_model_path, map_location=torch.device('cpu'))
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
model.remove_weight_norm()
state_dict = model.state_dict()

# init tf model
model_tf = setup_tf_generator(c)

common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'

import os

import numpy as np
from torch.utils.data import DataLoader

from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

file_path = os.path.dirname(os.path.realpath(__file__))
OUTPATH = os.path.join(file_path, "../../tests/outputs/loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)

C = load_config(os.path.join(file_path, 'test_config.json'))

test_data_path = os.path.join(file_path, "../../tests/data/ljspeech/")
ok_ljspeech = os.path.exists(test_data_path)


def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments,
                     use_noise_augment, use_cache, num_workers):
    """Run the dataloader with the given parameters and check the conditions."""
    ap = AudioProcessor(**C.audio)
    _, train_items = load_wav_data(test_data_path, 10)
    dataset = GANDataset(ap,
                         train_items,
                         seq_len=seq_len,
                         hop_len=hop_len,
                         pad_short=2000,
                         conv_pad=conv_pad,
                         return_segments=return_segments,