class SpeakerEncoderConfig(BaseTrainingConfig):
    """Defines parameters for Speaker Encoder model."""

    model: str = "speaker_encoder"
    # audio feature-extraction settings; `num_mels` must match model_params["input_dim"]
    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
    # model params
    model_params: Dict = field(
        default_factory=lambda: {
            "model_name": "lstm",
            "input_dim": 80,
            "proj_dim": 256,
            "lstm_dim": 768,
            "num_lstm_layers": 3,
            "use_lstm_with_projection": True,
        }
    )
    # augmentation pipeline config; empty dict disables augmentation
    audio_augmentation: Dict = field(default_factory=lambda: {})
    storage: Dict = field(
        default_factory=lambda: {
            "sample_from_storage_p": 0.66,  # the probability with which we'll sample from the DataSet in-memory storage
            "storage_size": 15,  # the size of the in-memory storage with respect to a single batch
        }
    )
    # training params
    max_train_step: int = 1000000  # end training when number of training steps reaches this value.
    loss: str = "angleproto"
    grad_clip: float = 3.0
    lr: float = 0.0001
    lr_decay: bool = False
    warmup_steps: int = 4000
    wd: float = 1e-6  # weight decay
    # logging params
    tb_model_param_stats: bool = False
    steps_plot_stats: int = 10
    checkpoint: bool = True
    save_step: int = 1000
    print_step: int = 20
    # data loader
    num_speakers_in_batch: int = MISSING
    num_utters_per_speaker: int = MISSING
    num_loader_workers: int = MISSING
    skip_speakers: bool = False
    voice_len: float = 1.6  # utterance length in seconds used per sample

    def check_values(self):
        """Validate config consistency after parsing.

        Raises:
            AssertionError: If ``model_params["input_dim"]`` does not equal
                ``audio.num_mels`` — the encoder consumes melspectrogram
                frames, so the two must agree.
        """
        super().check_values()
        c = asdict(self)
        # Fixed typo in the original message ("dimendion" -> "dimension").
        assert (
            c["model_params"]["input_dim"] == self.audio.num_mels
        ), " [!] model input dimension must be equal to melspectrogram dimension."
class BaseEncoderConfig(BaseTrainingConfig):
    """Defines parameters for a Generic Encoder model."""

    # subclasses are expected to set a concrete model name
    model: str = None
    # audio feature-extraction settings; `num_mels` must match model_params["input_dim"]
    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
    # model params
    model_params: Dict = field(
        default_factory=lambda: {
            "model_name": "lstm",
            "input_dim": 80,
            "proj_dim": 256,
            "lstm_dim": 768,
            "num_lstm_layers": 3,
            "use_lstm_with_projection": True,
        }
    )
    # augmentation pipeline config; empty dict disables augmentation
    audio_augmentation: Dict = field(default_factory=lambda: {})
    # training params
    epochs: int = 10000
    loss: str = "angleproto"
    grad_clip: float = 3.0
    lr: float = 0.0001
    optimizer: str = "radam"
    optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
    lr_decay: bool = False
    warmup_steps: int = 4000
    # logging params
    tb_model_param_stats: bool = False
    steps_plot_stats: int = 10
    save_step: int = 1000
    print_step: int = 20
    run_eval: bool = False
    # data loader
    num_classes_in_batch: int = MISSING
    num_utter_per_class: int = MISSING
    # eval counts default to None; presumably they fall back to the training
    # values downstream — confirm against the data loader.
    eval_num_classes_in_batch: int = None
    eval_num_utter_per_class: int = None
    num_loader_workers: int = MISSING
    voice_len: float = 1.6  # utterance length in seconds used per sample

    def check_values(self):
        """Validate config consistency after parsing.

        Raises:
            AssertionError: If ``model_params["input_dim"]`` does not equal
                ``audio.num_mels`` — the encoder consumes melspectrogram
                frames, so the two must agree.
        """
        super().check_values()
        c = asdict(self)
        # Fixed typo in the original message ("dimendion" -> "dimension").
        assert (
            c["model_params"]["input_dim"] == self.audio.num_mels
        ), " [!] model input dimension must be equal to melspectrogram dimension."
import unittest import torch from tests import get_tests_output_path, run_cli from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig torch.manual_seed(1) config_path = os.path.join(get_tests_output_path(), "test_model_config.json") dataset_config_en = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en", ) dataset_config_pt = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="pt-br", ) # pylint: disable=protected-access class TestFindUniquePhonemes(unittest.TestCase):
import os

from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.fast_pitch_config import FastPitchConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.utils.audio import AudioProcessor
from TTS.utils.manage import ModelManager

# Training artifacts land next to this script.
output_path = os.path.dirname(os.path.abspath(__file__))

# init configs
# LJSpeech dataset laid out one directory above this script.
dataset_config = BaseDatasetConfig(
    name="ljspeech",
    path=os.path.join(output_path, "../LJSpeech-1.1/"),
    meta_file_train="metadata.csv",
    # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
)

# Audio processing settings for the 22.05 kHz LJSpeech recordings.
audio_config = BaseAudioConfig(
    sample_rate=22050,
    preemphasis=0.0,
    ref_level_db=20,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    log_func="np.log",
    spec_gain=1.0,
    mel_fmin=0.0,
    mel_fmax=8000,
)