async def _send_message(self, socket_id, response, **kwargs: Any):
    # type: (Text, Any) -> None
    """Emit the bot's reply to one client over the socket.

    Before emitting, the reply text is handed to ``self.load_model`` together
    with the TTS checkpoint path, the loaded config, the CUDA flag and the
    output file name.  The value returned by that call is not used here.
    NOTE(review): presumably that call synthesizes ``tts_out.wav`` -- confirm.

    Args:
        socket_id: room identifier the event is emitted to.
        response: mapping that must contain a ``'text'`` entry.
        **kwargs: accepted but ignored.
    """
    tts_config = load_config('./tts_model/config.json')
    reply_text = response['text']

    # Positional order: checkpoint, text, config, use_cuda, output file.
    self.load_model('./tts_model/best_model.pth.tar', reply_text, tts_config,
                    False, 'tts_out.wav')

    # The link is a fixed sample URL, not the file named above.
    payload = {
        'text': reply_text,
        "link": "https://file-examples.com/wp-content/uploads/2017/11/file_example_WAV_1MG.wav",
    }
    await self.sio.emit(self.bot_message_evt, payload, room=socket_id)
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    """Build the TTS model from a config file and restore its checkpoint.

    Populates ``self.tts_config``, ``self.use_phonemes``, ``self.ap``,
    ``self.input_size``, ``self.tts_model`` and, when
    ``self.config.tts_speakers`` is set, ``self.tts_speakers``.

    Args:
        tts_checkpoint: path of the checkpoint passed to ``torch.load``.
        tts_config: path of the config passed to ``load_config``.
        use_cuda: when true, the model is moved to the GPU after loading.
    """
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > checkpoint file: ", tts_checkpoint)

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    self.input_size = len(phonemes) if self.use_phonemes else len(symbols)

    # TODO: fix this for multi-speaker model - load speakers
    num_speakers = 0
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
        num_speakers = len(self.tts_speakers)

    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)

    # The checkpoint is always read onto the CPU; the model is moved later.
    checkpoint = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
    self.tts_model.load_state_dict(checkpoint['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()

    decoder = self.tts_model.decoder
    decoder.max_decoder_steps = 3000
    if 'r' in checkpoint:
        decoder.set_r(checkpoint['r'])
def test_in_out(self):
    """Smoke test: synthesize one sentence with a freshly created model.

    A random model is written first via ``_create_random_model``; the server
    config is then pointed at the tests output directory through its
    ``'tts_path'`` entry before a ``Synthesizer`` is built from it.
    """
    self._create_random_model()

    config_file = os.path.join(get_tests_input_path(), 'server_config.json')
    config = load_config(config_file)
    config['tts_path'] = get_tests_output_path()

    Synthesizer(config).tts("Better this test works!!")
def load_tts(self, model_path, model_file, model_config, use_cuda):
    """Build the TTS model from files under ``model_path`` and restore it.

    Populates ``self.model_file``, ``self.tts_config``, ``self.use_phonemes``,
    ``self.ap``, ``self.input_size``, ``self.tts_model`` and, when
    ``self.config.tts_speakers`` is set, ``self.tts_speakers``.

    Args:
        model_path: directory that ``model_file``, ``model_config`` and the
            speaker file named by ``self.config.tts_speakers`` are joined to.
        model_file: checkpoint file name inside ``model_path``.
        model_config: config file name inside ``model_path``.
        use_cuda: when true, the model is moved to the GPU after loading.
    """
    tts_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > model file: ", model_file)

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    # load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(
            os.path.join(model_path, self.config.tts_speakers))
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0

    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)

    # load model state
    # Map the checkpoint to the CPU so that a checkpoint saved from a GPU
    # can be restored on a host without CUDA (or with fewer devices). The
    # model is moved to the GPU below when requested.
    cp = torch.load(self.model_file, map_location=torch.device('cpu'))

    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    # Only the Tacotron variants restore the reduction factor stored in the
    # checkpoint.
    if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
        self.tts_model.decoder.set_r(cp['r'])
def __init__(self, message):
    """Store the message and the fixed model settings, then load the config.

    Args:
        message: stored unchanged on ``self.message``.
    """
    config_path = './stt_models/config.json'

    self.message = message
    self.MODEL_PATH = './stt_models/best_model.pth.tar'
    self.CONFIG_PATH = config_path
    self.OUT_FOLDER = '/output'
    # The config is read eagerly, at construction time.
    self.CONFIG = load_config(config_path)
    self.use_cuda = False
def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
    """Build the WaveRNN vocoder and restore its checkpoint.

    Populates ``self.wavernn_config`` and ``self.wavernn``.  ``self.ap`` must
    already be set: its ``hop_length`` and ``sample_rate`` are passed to the
    vocoder constructor.

    Args:
        lib_path: directory appended to ``sys.path`` so that the ``WaveRNN``
            package can be imported.
        model_path: directory that ``model_file`` and ``model_config`` are
            joined to.
        model_file: checkpoint file name inside ``model_path``.
        model_config: config file name inside ``model_path``.
        use_cuda: when true, ``.cuda()`` is called on the model after loading.
    """
    # TODO: set a function in wavernn code base for model setup and call it here.
    sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
    from WaveRNN.models.wavernn import Model
    wavernn_config = os.path.join(model_path, model_config)
    model_file = os.path.join(model_path, model_file)
    print(" > Loading WaveRNN model ...")
    print(" | > model config: ", wavernn_config)
    print(" | > model file: ", model_file)
    self.wavernn_config = load_config(wavernn_config)
    # NOTE(review): the model is moved to the GPU here regardless of
    # `use_cuda`. Kept as-is because callers outside this block may feed it
    # CUDA tensors unconditionally -- confirm before changing.
    self.wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode=self.wavernn_config.mode,
        mulaw=self.wavernn_config.mulaw,
        pad=self.wavernn_config.pad,
        use_aux_net=self.wavernn_config.use_aux_net,
        use_upsample_net=self.wavernn_config.use_upsample_net,
        upsample_factors=self.wavernn_config.upsample_factors,
        feat_dims=80,
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=self.ap.hop_length,
        sample_rate=self.ap.sample_rate,
    ).cuda()

    # Read the checkpoint onto the CPU: without `map_location` the tensors
    # are restored to the device they were saved from, which may not exist
    # on this host. `load_state_dict` copies them into the model's device.
    check = torch.load(model_file, map_location="cpu")
    self.wavernn.load_state_dict(check['model'])
    if use_cuda:
        self.wavernn.cuda()
    self.wavernn.eval()
def _create_random_model(self):
    """Create an untrained model from the dummy config and checkpoint it.

    Both the config (``dummy_model_config.json``) and the saved checkpoint
    live in the tests output directory.
    """
    config_file = os.path.join(get_tests_output_path(),
                               'dummy_model_config.json')
    config = load_config(config_file)

    if config.use_phonemes:
        num_chars = len(phonemes)
    else:
        num_chars = len(symbols)

    model = setup_model(num_chars, 0, config)
    checkpoint_dir = os.path.join(get_tests_output_path())
    save_checkpoint(model, None, None, None, checkpoint_dir, 10, 10)
def load_tts_model():
    """Build the Tacotron model and audio processor and restore the weights.

    Paths are resolved against the module-level ``dirpath``.  CUDA is
    disabled by a local constant.

    Returns:
        Tuple ``(model, ap, MODEL_PATH, CONFIG, use_cuda)``.
    """
    MODEL_PATH = dirpath + '/tts_model/best_model.pth.tar'
    CONFIG_PATH = dirpath + '/tts_model/config.json'
    CONFIG = load_config(CONFIG_PATH)
    use_cuda = False

    # The network is built once; building it a second time with the same
    # arguments only discarded the first instance.
    num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
    model = Tacotron(num_chars,
                     CONFIG.embedding_size,
                     CONFIG.audio['num_freq'],
                     CONFIG.audio['num_mels'],
                     CONFIG.r,
                     attn_windowing=False)

    # load the audio processor
    # CONFIG.audio["power"] = 1.3
    CONFIG.audio["preemphasis"] = 0.97  # overrides the value from the config file
    ap = AudioProcessor(**CONFIG.audio)

    # load model state
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        # Keep every tensor on the CPU regardless of where it was saved from.
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    # NOTE(review): eval mode is intentionally left disabled here, as in the
    # original (`model.eval()` was commented out) -- confirm this is wanted.
    #model.eval()
    model.decoder.max_decoder_steps = 1000
    return model, ap, MODEL_PATH, CONFIG, use_cuda
def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
    """Build the WaveRNN vocoder and restore its checkpoint.

    Populates ``self.wavernn_config`` and ``self.wavernn``; reads
    ``self.ap.hop_length`` and ``self.ap.sample_rate``.

    Args:
        lib_path: directory appended to ``sys.path`` before importing
            ``WaveRNN``.
        model_file: checkpoint path passed to ``torch.load``.
        model_config: config path passed to ``load_config``.
        use_cuda: when true, ``.cuda()`` is called again after loading.
    """
    # TODO: set a function in wavernn code base for model setup and call it here.
    sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
    #pylint: disable=import-outside-toplevel
    from WaveRNN.models.wavernn import Model

    print(" > Loading WaveRNN model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", model_file)

    vocoder_cfg = load_config(model_config)
    self.wavernn_config = vocoder_cfg

    # This is the default architecture we use for our models.
    # You might need to update it
    architecture = dict(
        rnn_dims=512,
        fc_dims=512,
        mode=vocoder_cfg.mode,
        mulaw=vocoder_cfg.mulaw,
        pad=vocoder_cfg.pad,
        use_aux_net=vocoder_cfg.use_aux_net,
        use_upsample_net=vocoder_cfg.use_upsample_net,
        upsample_factors=vocoder_cfg.upsample_factors,
        feat_dims=80,
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=self.ap.hop_length,
        sample_rate=self.ap.sample_rate,
    )
    # The model is placed on the GPU here whatever `use_cuda` says.
    self.wavernn = Model(**architecture).cuda()

    state = torch.load(model_file, map_location="cpu")
    self.wavernn.load_state_dict(state['model'])
    if use_cuda:
        self.wavernn.cuda()
    self.wavernn.eval()
def load_model(self, model_path, model_name, model_config, use_cuda):
    """Build the Tacotron model and audio processor and restore the weights.

    Populates ``self.model_file``, ``self.config``, ``self.use_cuda``,
    ``self.model`` and ``self.ap``.

    Args:
        model_path: directory that ``model_name`` and ``model_config`` are
            joined to.
        model_name: checkpoint file name inside ``model_path``.
        model_config: config file name inside ``model_path``.
        use_cuda: when true, the checkpoint is loaded without remapping and
            the model is moved to the GPU.
    """
    model_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)

    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda

    self.model = Tacotron(config.embedding_size, config.num_freq,
                          config.num_mels, config.r)
    self.ap = AudioProcessor(config.sample_rate,
                             config.num_mels,
                             config.min_level_db,
                             config.frame_shift_ms,
                             config.frame_length_ms,
                             config.preemphasis,
                             config.ref_level_db,
                             config.num_freq,
                             config.power,
                             griffin_lim_iters=60)

    # Without CUDA every tensor stays on the CPU; with CUDA the default
    # placement (map_location=None) is used.
    map_location = None if use_cuda else (lambda storage, loc: storage)
    checkpoint = torch.load(self.model_file, map_location=map_location)

    self.model.load_state_dict(checkpoint['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
def main():
    """
    Call train.py as a new process and pass command arguments

    One ``python3 train.py`` process is started per visible CUDA device.
    Rank 0 inherits this process's stdout; every other rank writes to
    ``process_<rank>.log`` inside ``<experiment folder>/process_stdout/``.
    The function returns once all child processes have exited.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore_path',
                        type=str,
                        help='Folder path to checkpoints',
                        default='')
    # The config is loaded unconditionally below, so the option is mandatory.
    parser.add_argument(
        '--config_path',
        type=str,
        required=True,
        help='path to config file for training',
    )
    parser.add_argument('--data_path',
                        type=str,
                        help='dataset path.',
                        default='')

    args = parser.parse_args()

    CONFIG = load_config(args.config_path)
    OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
                                        True)
    stdout_path = os.path.join(OUT_PATH, "process_stdout/")

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    command = ['train.py']
    command.append('--restore_path={}'.format(args.restore_path))
    command.append('--config_path={}'.format(args.config_path))
    command.append('--group_id=group_{}'.format(group_id))
    command.append('--data_path={}'.format(args.data_path))
    command.append('--output_path={}'.format(OUT_PATH))
    # Placeholder for the per-process '--rank' argument, filled in below.
    command.append('')

    if not os.path.isdir(stdout_path):
        os.makedirs(stdout_path)
        os.chmod(stdout_path, 0o775)

    # run processes
    processes = []
    log_files = []
    try:
        for i in range(num_gpus):
            my_env = os.environ.copy()
            my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
            command[-1] = '--rank={}'.format(i)
            if i == 0:
                stdout = None
            else:
                stdout = open(
                    os.path.join(stdout_path, "process_{}.log".format(i)),
                    "w")
                log_files.append(stdout)
            p = subprocess.Popen(['python3'] + command,
                                 stdout=stdout,
                                 env=my_env)
            processes.append(p)
            print(command)

        for p in processes:
            p.wait()
    finally:
        # The children hold their own copies of the descriptors; release
        # the parent's handles so they are not leaked.
        for log_file in log_files:
            log_file.close()
def __init__(self):
    """Load the TTS config and record the checkpoint path.

    All locations are derived from a root directory hard-coded below; CUDA
    is disabled.
    """
    # Set constants
    #ROOT_PATH = '/home/avnerus/Code/TTS-Data'
    ROOT_PATH = '/Users/avnerus/Code/TTS-Data'
    CONFIG_PATH = '/'.join((ROOT_PATH, 'config.json'))
    OUT_FOLDER = '/'.join((ROOT_PATH, 'test'))  # not used in this method

    self.CONFIG = load_config(CONFIG_PATH)
    self.MODEL_PATH = '/'.join((ROOT_PATH, 'best_model.pth.tar'))
    self.use_cuda = False
def __init__(self, tts_model, tts_config, wavernn_model=None, wavernn_config=None, device="cpu"):
    """Load the TTS model and, when both vocoder arguments are given, WaveRNN.

    Args:
        tts_model: stored on ``self.tts_model_path`` before ``_load_tts()``.
        tts_config: path handed to ``load_config``.
        wavernn_model: optional; stored on ``self.wavernn_model_path``.
        wavernn_config: optional; path handed to ``load_config``.
        device: requested device string; replaced by ``"cpu"`` when CUDA is
            not available.

    ``self.use_gl`` ends up true unless both ``wavernn_model`` and
    ``wavernn_config`` are truthy.
    """
    from TTS.utils.generic_utils import load_config

    self.tts_config = load_config(tts_config)
    self.tts_config.windowing = True

    # Fall back to the CPU when no CUDA device can be used.
    device = device if torch.cuda.is_available() else "cpu"
    self.use_cuda = device != "cpu"
    self.device = torch.device(device)

    self.tts_model_path = tts_model
    self._load_tts()

    has_vocoder = bool(wavernn_model and wavernn_config)
    self.use_gl = not has_vocoder
    if not has_vocoder:
        return

    self.batched_wavernn = True
    self.wavernn_model_path = wavernn_model
    self.wavernn_config = load_config(wavernn_config)
    self._load_wavernn()
def _create_random_model(self):
    """Create an untrained model from the dummy config and checkpoint it.

    When the config has a ``characters`` entry, the module-level ``symbols``
    and ``phonemes`` are rebuilt from it first, so the model's input size
    matches the configured character set.
    """
    # pylint: disable=global-statement
    global symbols, phonemes

    config_file = os.path.join(get_tests_output_path(),
                               'dummy_model_config.json')
    config = load_config(config_file)
    if 'characters' in config.keys():
        symbols, phonemes = make_symbols(**config.characters)

    if config.use_phonemes:
        num_chars = len(phonemes)
    else:
        num_chars = len(symbols)

    model = setup_model(num_chars, 0, config)
    checkpoint_dir = os.path.join(get_tests_output_path())
    save_checkpoint(model, None, None, None, checkpoint_dir, 10, 10)
def tts(text,
        model_path='model/best_model.pth.tar',
        config_path='model/config.json',
        use_cuda=False):
    """Synthesize ``text`` and save it as a wav file under ``static/samples/``.

    The model and audio processor are rebuilt from ``config_path`` and
    ``model_path`` on every call.

    Args:
        text: sentence to synthesize; also used to derive the file name
            (spaces become underscores, dots are removed).
        model_path: path of the checkpoint file.
        config_path: path of the config file.
        use_cuda: when true, the checkpoint is mapped to ``cuda:0`` and the
            model and input are moved to the GPU.

    Returns:
        The name of the written wav file, without its directory.
    """
    CONFIG = load_config(config_path)
    model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels,
                     CONFIG.r)

    # Both branches load the same checkpoint file; only the target device
    # differs. (The CUDA branch used to append a file-name variable that is
    # not defined in this function to `model_path`.)
    if use_cuda:
        cp = torch.load(model_path, map_location='cuda:0')
    else:
        cp = torch.load(model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 250

    ap = AudioProcessor(CONFIG.sample_rate,
                        CONFIG.num_mels,
                        CONFIG.min_level_db,
                        CONFIG.frame_shift_ms,
                        CONFIG.frame_length_ms,
                        CONFIG.ref_level_db,
                        CONFIG.num_freq,
                        CONFIG.power,
                        CONFIG.preemphasis,
                        griffin_lim_iters=50)

    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()

    linear_out = model.forward(chars_var.long())
    linear_out = linear_out[0].data.cpu().numpy()
    waveform = ap.inv_spectrogram(linear_out.T)
    # Cut the waveform at the end point reported by the audio processor.
    waveform = waveform[:ap.find_endpoint(waveform)]

    out_path = 'static/samples/'
    os.makedirs(out_path, exist_ok=True)
    # NOTE(review): `text` goes into the file name verbatim apart from the
    # two replacements below; path separators are not stripped.
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(out_path, file_name)
    ap.save_wav(waveform, out_path)
    return file_name
def __init__(self):
    """Load the Tacotron model, the audio processor and the spaCy pipeline.

    Paths are hard-coded relative to ``TTS/tts_model/`` and CUDA is switched
    on unconditionally.
    """
    # Set constants
    ROOT_PATH = 'TTS/tts_model/'
    MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'
    # MODEL_PATH_TMP = ROOT_PATH + '/best_model.pth.tar'
    CONFIG_PATH = ROOT_PATH + '/config.json'
    OUT_FOLDER = ROOT_PATH + '/test'  # not used in this method

    config = load_config(CONFIG_PATH)
    self.CONFIG = config
    self.use_cuda = True  # True

    # load the model
    self.model = Tacotron(config.embedding_size, config.num_freq,
                          config.num_mels, config.r)

    # load the audio processor
    self.ap = AudioProcessor(config.sample_rate, config.num_mels,
                             config.min_level_db, config.frame_shift_ms,
                             config.frame_length_ms, config.ref_level_db,
                             config.num_freq, config.power,
                             config.preemphasis, 60)

    # load model state
    # With CUDA the default placement is used; otherwise tensors stay on CPU.
    map_location = None if self.use_cuda else (lambda storage, loc: storage)
    checkpoint = torch.load(MODEL_PATH, map_location=map_location)

    # load the model
    self.model.load_state_dict(checkpoint['model'])
    if self.use_cuda:
        self.model.cuda()
    self.model.eval()
    self.model.decoder.max_decoder_steps = 500

    self.nlp = spacy.load("en")
def load_tts_model(self):
    """Build the Tacotron model and audio processor and restore the weights.

    Relies on the names ``CONFIG_PATH``, ``MODEL_PATH`` and ``use_cuda``
    being defined outside this method.  The checkpoint is always loaded onto
    the CPU and the model is not moved afterwards.

    Returns:
        Tuple ``(model, ap, MODEL_PATH, CONFIG, use_cuda)``.
    """
    CONFIG = load_config(CONFIG_PATH)
    audio_cfg = CONFIG.audio

    model = Tacotron(len(phonemes),
                     CONFIG.embedding_size,
                     audio_cfg["num_freq"],
                     audio_cfg["num_mels"],
                     CONFIG.r,
                     attn_windowing=False)

    # load the audio processor
    ap = AudioProcessor(**audio_cfg)

    # load model state
    checkpoint = torch.load(MODEL_PATH,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["model"])
    model.decoder.max_decoder_steps = 650

    return model, ap, MODEL_PATH, CONFIG, use_cuda
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    """Build the TTS model from a config file and restore its checkpoint.

    Populates ``self.tts_config``, ``self.use_phonemes``, ``self.ap``,
    ``self.input_size``, ``self.tts_model`` and, when
    ``self.config.tts_speakers`` is set, ``self.tts_speakers``.  When the
    config has a ``text`` entry, the module-level ``symbols`` and
    ``phonemes`` are rebuilt from it before the input size is computed.

    Args:
        tts_checkpoint: path of the checkpoint passed to ``torch.load``.
        tts_config: path of the config passed to ``load_config``.
        use_cuda: when true, the model is moved to the GPU after loading.
    """
    # pylint: disable=global-statement
    global symbols, phonemes

    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > checkpoint file: ", tts_checkpoint)

    self.tts_config = load_config(tts_config)
    if 'text' in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.text)

    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    # load speakers
    if self.config.tts_speakers is not None:
        # This method receives no model directory, so the configured path is
        # used as given. (It used to be joined to `model_path`, a name that
        # is not defined in this function.)
        self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0

    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)

    # load model state
    cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))

    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp:
        self.tts_model.decoder.set_r(cp['r'])
import os import unittest import numpy as np from torch.utils.data import DataLoader from TTS.utils.generic_utils import load_config from TTS.datasets.LJSpeech import LJSpeechDataset file_path = os.path.dirname(os.path.realpath(__file__)) c = load_config(os.path.join(file_path, 'test_config.json')) class TestDataset(unittest.TestCase): def __init__(self, *args, **kwargs): super(TestDataset, self).__init__(*args, **kwargs) self.max_loader_iter = 4 def test_loader(self): dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'), os.path.join(c.data_path, 'wavs'), c.r, c.sample_rate, c.text_cleaner, c.num_mels, c.min_level_db, c.frame_shift_ms, c.frame_length_ms, c.preemphasis, c.ref_level_db, c.num_freq, c.power) dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=dataset.collate_fn, drop_last=True, num_workers=c.num_loader_workers)
import os import unittest import torch as T from TTS.speaker_encoder.model import SpeakerEncoder from TTS.speaker_encoder.loss import GE2ELoss from TTS.utils.generic_utils import load_config file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/" c = load_config(os.path.join(file_path, "test_config.json")) class SpeakerEncoderTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): dummy_input = T.rand(4, 20, 80) # B x T x D dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)] model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3) # computing d vectors output = model.forward(dummy_input) assert output.shape[0] == 4 assert output.shape[1] == 256 output = model.inference(dummy_input) assert output.shape[0] == 4 assert output.shape[1] == 256 # compute d vectors by passing LSTM hidden # output = model.forward(dummy_input, dummy_hidden) # assert output.shape[0] == 4
get_ipython().run_line_magic('matplotlib', 'inline') from TTS.utils.audio import AudioProcessor from TTS.utils.visual import plot_spectrogram from TTS.utils.generic_utils import load_config import glob import IPython.display as ipd # In[ ]: config_path = "/media/erogol/data_ssd/Data/models/tr/TTS-phoneme-January-14-2019_06+52PM-4ad64a7/config.json" data_path = "/home/erogol/Data/Mozilla/" file_paths = glob.glob(data_path + "/**/*.wav", recursive=True) CONFIG = load_config(config_path) # ### Setup Audio Processor # Play with the AP parameters until you find a good fit with the synthesis speech below. # In[ ]: audio={ 'audio_processor': 'audio', 'num_mels': 80, # In general, you don'tneed to change it 'num_freq': 1025, # In general, you don'tneed to change it 'sample_rate': 22050, # It depends to the sample rate of the dataset. 'frame_length_ms': 50, # In general, you don'tneed to change it 'frame_shift_ms': 12.5, # In general, you don'tneed to change it
from TTS.utils.generic_utils import load_config
from TTS.synthesizer import Synthesizer

# Read once at import time and shared by every engine instance.
config = load_config('./model/conf.json')


class TTSEngine(object):
    """Thin wrapper that owns one ``Synthesizer`` built from ``config``."""

    def __init__(self):
        self.synthesizer = Synthesizer(config)

    def translate(self, text):
        """Return whatever ``Synthesizer.tts`` produces for ``text``."""
        return self.synthesizer.tts(text)
waveform = wavernn.generate( torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=11000, overlap=550, ) print(" > Run-time: {}".format(time.time() - t_1)) return alignment, mel_postnet_spec, stop_tokens, waveform use_cuda = True batched_wavernn = True # initialize TTS CONFIG = load_config(tts_pretrained_model_config) print(CONFIG) # load the model num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols) model = setup_model(num_chars, CONFIG) # load the audio processor ap = AudioProcessor(**CONFIG.audio) # load model state if use_cuda: cp = torch.load(tts_pretrained_model) else: cp = torch.load(tts_pretrained_model, map_location=lambda storage, loc: storage) # load the model model.load_state_dict(cp["model"])
import os import unittest from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config TESTS_PATH = get_tests_path() OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") os.makedirs(OUT_PATH, exist_ok=True) conf = load_config(os.path.join(TESTS_PATH, 'test_config.json')) class TestAudio(unittest.TestCase): def __init__(self, *args, **kwargs): super(TestAudio, self).__init__(*args, **kwargs) self.ap = AudioProcessor(**conf.audio) def test_audio_synthesis(self): """ 1. load wav 2. set normalization parameters 3. extract mel-spec 4. invert to wav and save the output """ print(" > Sanity check for the process wav -> mel -> wav") def _test(max_norm, signal_norm, symmetric_norm, clip_norm): self.ap.max_norm = max_norm self.ap.signal_norm = signal_norm
wav = process_audio(wav) fp = 'audio' wav.export('{}.wav'.format(fp), format='wav') end = time.time() print('\n', end - start, 'segundos') MODEL_PATH = 'checkpoint.pth.tar' CONFIG_PATH = 'TTS/config.json' OUT_FOLDER = 'samples/' try: os.mkdir(OUT_FOLDER) except: pass CONFIG = load_config(CONFIG_PATH) use_cuda = torch.cuda.is_available() VOCODER_MODEL_PATH = 'WaveRNN/saver.pth.tar' VOCODER_CONFIG_PATH = 'WaveRNN/config_16K.json' VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH) # load the model ap2 = AudioProcessor(**VOCODER_CONFIG.audio) ap = AudioProcessor(**CONFIG.audio) num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols) model = Tacotron(num_chars, CONFIG.embedding_size, ap.num_freq, ap.num_mels, CONFIG.r, CONFIG.memory_size) # load model state
wav = process_audio(wav) fp = 'audio' wav.export('{}.wav'.format(fp), format='wav') end = time.time() print('\n', end - start, 'segundos') MODEL_PATH = 'checkpoint.pth.tar' CONFIG_PATH = 'TTS/config.json' OUT_FOLDER = 'samples/' try: os.mkdir(OUT_FOLDER) except: pass CONFIG = load_config(CONFIG_PATH) use_cuda = torch.cuda.is_available() # load the model ap = AudioProcessor(**CONFIG.audio) num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols) model = Tacotron(num_chars, CONFIG.embedding_size, ap.num_freq, ap.num_mels, CONFIG.r, CONFIG.memory_size) # load model state if use_cuda: cp = torch.load(MODEL_PATH) else: cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)
type=str, help="JSON file for multi-speaker model.", default="") parser.add_argument( '--speaker_id', type=int, help="target speaker_id if the model is multi-speaker.", default=None) args = parser.parse_args() if args.vocoder_path != "": assert args.use_cuda, " [!] Enable cuda for vocoder." from WaveRNN.models.wavernn import Model as VocoderModel # load the config C = load_config(args.config_path) C.forward_attn_mask = True # load the audio processor ap = AudioProcessor(**C.audio) # load speakers if args.speakers_json != '': speakers = json.load(open(args.speakers_json, 'r')) num_speakers = len(speakers) else: num_speakers = 0 # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) model = setup_model(num_chars, num_speakers, C)
use_griffin_lim=True, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars, do_trim_silence=False) OUT_FOLDER = "/content/output" #Path where the audio files will be saved os.makedirs(OUT_FOLDER, exist_ok=True) file_name = text.replace(" ", "_").replace(".", "") + ".wav" out_path = os.path.join(OUT_FOLDER, file_name) ap.save_wav(waveform, out_path) return alignment, mel_postnet_spec, stop_tokens, waveform # model paths TTS_MODEL = "/content/ttsmodel/checkpoint_290000.pth.tar" TTS_CONFIG = "/content/ttsmodel/config.json" TTS_CONFIG = load_config(TTS_CONFIG) # Run FLAGs use_cuda = False # Set some config fields manually for testing TTS_CONFIG.windowing = False TTS_CONFIG.use_forward_attn = True # Set the vocoder use_gl = True # use GL if True batched_wavernn = False # use batched wavernn inference if True speaker_id = None speakers = [] # load the model num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
help='folder name for training outputs.') # DISTRUBUTED parser.add_argument( '--rank', type=int, default=0, help='DISTRIBUTED: process rank for distributed training.') parser.add_argument('--group_id', type=str, default="", help='DISTRIBUTED: process group id.') args = parser.parse_args() # setup output paths and read configs c = load_config(args.config_path) _ = os.path.dirname(os.path.realpath(__file__)) if args.data_path != '': c.data_path = args.data_path if args.output_path == '': OUT_PATH = os.path.join(_, c.output_path) else: OUT_PATH = args.output_path if args.group_id == '' and args.output_folder == '': OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug) else: OUT_PATH = os.path.join(OUT_PATH, args.output_folder) AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
def main():
    """Run preprocessing process.

    Computes the per-bin mean and standard deviation of the mel and linear
    spectrograms over the first split returned by ``load_meta_data`` and
    saves them, together with the audio config used, to
    ``<out_path>/scale_stats.npy``.
    """
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogtram features.")
    parser.add_argument("--config_path",
                        type=str,
                        required=True,
                        help="TTS config file path.")
    parser.add_argument("--out_path",
                        default=None,
                        type=str,
                        help="directory to save the output file.")
    args = parser.parse_args()

    # Fail before the expensive pass over the dataset rather than after it:
    # the output path is joined to a file name at the very end.
    if args.out_path is None:
        parser.error("--out_path is required to save scale_stats.npy")

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio['signal_norm'] = False  # do not apply earlier normalization
    CONFIG.audio['stats_path'] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # load the meta data of target dataset
    dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    if N == 0:
        raise RuntimeError(
            " [!] No frames were processed; cannot compute statistics.")

    # std = sqrt(E[x^2] - E[x]^2). Round-off can push the difference just
    # below zero for (near-)constant bins, so clamp it before the sqrt.
    mel_mean = mel_sum / N
    mel_scale = np.sqrt(np.maximum(mel_square_sum / N - mel_mean**2, 0))
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(
        np.maximum(linear_square_sum / N - linear_mean**2, 0))

    os.makedirs(args.out_path, exist_ok=True)
    output_file_path = os.path.join(args.out_path, "scale_stats.npy")
    stats = {}
    stats['mel_mean'] = mel_mean
    stats['mel_std'] = mel_scale
    stats['linear_mean'] = linear_mean
    stats['linear_std'] = linear_scale

    # set default config values for mean-var scaling
    CONFIG.audio['stats_path'] = output_file_path
    CONFIG.audio['signal_norm'] = True
    # remove redundant values (tolerating configs that do not define them)
    for redundant_key in ('max_norm', 'min_level_db', 'symmetric_norm',
                          'clip_norm'):
        if redundant_key in CONFIG.audio:
            del CONFIG.audio[redundant_key]
    stats['audio_config'] = CONFIG.audio
    np.save(output_file_path, stats, allow_pickle=True)