async def _send_message(self, socket_id, response, **kwargs: Any):
        # type: (Text, Any) -> None
        """Run TTS on the reply text, then emit the bot message event.

        ``response['text']`` is handed to ``self.load_model`` together with
        the checkpoint path, the loaded config, the CUDA flag and the output
        file name. Afterwards an event carrying the text and a ``link`` field
        is emitted to the room ``socket_id``.
        """
        # NOTE(review): the emitted link is a fixed sample-WAV URL, not the
        # file named by ``out_file`` -- looks like a placeholder; confirm.
        checkpoint_path = './tts_model/best_model.pth.tar'
        config_path = './tts_model/config.json'
        out_file = 'tts_out.wav'
        tts_config = load_config(config_path)
        use_cuda = False

        # The returned waveform is not used below; presumably the call is
        # kept because it writes ``out_file`` -- verify against load_model.
        wav_norm = self.load_model(checkpoint_path, response['text'],
                                   tts_config, use_cuda, out_file)

        payload = {
            'text': response['text'],
            "link": "https://file-examples.com/wp-content/uploads/2017/11/file_example_WAV_1MG.wav",
        }
        await self.sio.emit(self.bot_message_evt, payload, room=socket_id)
Exemple #2
0
 def load_tts(self, tts_checkpoint, tts_config, use_cuda):
     """Load the TTS config, audio processor and model weights onto ``self``.

     Args:
         tts_checkpoint: path of the checkpoint file given to ``torch.load``.
         tts_config: path of the config file given to ``load_config``.
         use_cuda: when true, the model is moved to the GPU after loading.
     """
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > checkpoint file: ", tts_checkpoint)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     # Input vocabulary is the phoneme set or the character set.
     self.input_size = len(phonemes) if self.use_phonemes else len(symbols)
     # TODO: fix this for multi-speaker model - load speakers
     num_speakers = 0
     if self.config.tts_speakers is not None:
         self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
         num_speakers = len(self.tts_speakers)
     self.tts_model = setup_model(self.input_size,
                                  num_speakers=num_speakers,
                                  c=self.tts_config)
     # Checkpoint is always deserialized onto the CPU first.
     checkpoint = torch.load(tts_checkpoint,
                             map_location=torch.device('cpu'))
     tts_model = self.tts_model
     tts_model.load_state_dict(checkpoint['model'])
     if use_cuda:
         tts_model.cuda()
     tts_model.eval()
     tts_model.decoder.max_decoder_steps = 3000
     # Restore the decoder's ``r`` value when the checkpoint stores one.
     if 'r' in checkpoint:
         tts_model.decoder.set_r(checkpoint['r'])
 def test_in_out(self):
     """Create a dummy checkpoint and run one sentence through Synthesizer."""
     self._create_random_model()
     config_file = os.path.join(get_tests_input_path(), 'server_config.json')
     server_config = load_config(config_file)
     # Point the synthesizer at the tests output directory.
     server_config['tts_path'] = get_tests_output_path()
     Synthesizer(server_config).tts("Better this test works!!")
 def load_tts(self, model_path, model_file, model_config, use_cuda):
     """Load the TTS config, audio processor and model checkpoint.

     Args:
         model_path: directory holding the config, the checkpoint and
             (optionally) the speaker mapping file.
         model_file: checkpoint file name inside ``model_path``.
         model_config: config file name inside ``model_path``.
         use_cuda: when true, the model is moved to the GPU after the
             weights are loaded.
     """
     tts_config = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_file)
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > model file: ", model_file)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     if self.use_phonemes:
         self.input_size = len(phonemes)
     else:
         self.input_size = len(symbols)
     # load speakers
     if self.config.tts_speakers is not None:
         self.tts_speakers = load_speaker_mapping(
             os.path.join(model_path, self.config.tts_speakers))
         num_speakers = len(self.tts_speakers)
     else:
         num_speakers = 0
     self.tts_model = setup_model(self.input_size,
                                  num_speakers=num_speakers,
                                  c=self.tts_config)
     # Deserialize onto the CPU so a checkpoint saved from a GPU can still
     # be loaded on a CPU-only host; the model is moved to the GPU below
     # only when ``use_cuda`` is set.
     cp = torch.load(self.model_file, map_location=torch.device('cpu'))
     # load the model
     self.tts_model.load_state_dict(cp['model'])
     if use_cuda:
         self.tts_model.cuda()
     self.tts_model.eval()
     self.tts_model.decoder.max_decoder_steps = 3000
     # Only the Tacotron variants get ``r`` restored from the checkpoint.
     if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
         self.tts_model.decoder.set_r(cp['r'])
Exemple #5
0
 def __init__(self, message):
     """Store ``message`` and the fixed model, config and output settings."""
     self.message = message
     self.use_cuda = False
     self.OUT_FOLDER = '/output'
     self.MODEL_PATH = './stt_models/best_model.pth.tar'
     self.CONFIG_PATH = './stt_models/config.json'
     # The config is read from disk at construction time.
     self.CONFIG = load_config(self.CONFIG_PATH)
    def load_wavernn(self, lib_path, model_path, model_file, model_config,
                     use_cuda):
        """Build the WaveRNN vocoder and load its checkpoint.

        Args:
            lib_path: directory appended to ``sys.path`` so that the
                ``WaveRNN`` package can be imported.
            model_path: directory holding the vocoder config and checkpoint.
            model_file: checkpoint file name inside ``model_path``.
            model_config: config file name inside ``model_path``.
            use_cuda: when true, the vocoder is moved to the GPU after the
                weights are loaded; otherwise it stays on the CPU.
        """
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if TTS is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        # The model is built on the CPU; the device move happens once, below,
        # and only when ``use_cuda`` is set. (An unconditional ``.cuda()``
        # here would ignore the flag and fail on hosts without a GPU.)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )

        # Deserialize onto the CPU so GPU-saved checkpoints load anywhere.
        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()
Exemple #7
0
 def _create_random_model(self):
     """Build a model from the dummy config and save it as a checkpoint."""
     tests_out = get_tests_output_path()
     config = load_config(os.path.join(tests_out, 'dummy_model_config.json'))
     if config.use_phonemes:
         num_chars = len(phonemes)
     else:
         num_chars = len(symbols)
     model = setup_model(num_chars, 0, config)
     # The checkpoint is written into the tests output directory.
     output_path = os.path.join(get_tests_output_path())
     save_checkpoint(model, None, None, None, output_path, 10, 10)
def load_tts_model():
    """Build the Tacotron model and audio processor and load the checkpoint.

    Paths are resolved relative to the module-level ``dirpath``.

    Returns:
        A tuple ``(model, ap, MODEL_PATH, CONFIG, use_cuda)``.
    """
    MODEL_PATH = dirpath + '/tts_model/best_model.pth.tar'
    CONFIG_PATH = dirpath + '/tts_model/config.json'
    CONFIG = load_config(CONFIG_PATH)
    use_cuda = False

    # Build the network once (the original constructed it twice and threw
    # the first instance away).
    num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
    model = Tacotron(num_chars, CONFIG.embedding_size, CONFIG.audio['num_freq'], CONFIG.audio['num_mels'], CONFIG.r, attn_windowing=False)

    # load the audio processor
    # CONFIG.audio["power"] = 1.3
    CONFIG.audio["preemphasis"] = 0.97
    ap = AudioProcessor(**CONFIG.audio)

    # load model state; without CUDA every tensor is kept on the CPU
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()

    # NOTE(review): model.eval() is intentionally left disabled here, as in
    # the original -- confirm that inference in train mode is wanted.
    #model.eval()
    model.decoder.max_decoder_steps = 1000
    return model, ap, MODEL_PATH, CONFIG, use_cuda
Exemple #9
0
    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        """Build the WaveRNN vocoder and load its checkpoint.

        Args:
            lib_path: directory appended to ``sys.path`` so that the
                ``WaveRNN`` package can be imported.
            model_file: path of the vocoder checkpoint.
            model_config: path of the vocoder config file.
            use_cuda: when true, the vocoder is moved to the GPU after the
                weights are loaded; otherwise it stays on the CPU.
        """
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path) # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        # The model is built on the CPU; the single device move below honors
        # ``use_cuda`` (an unconditional ``.cuda()`` here would ignore it).
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()
Exemple #10
0
 def load_model(self, model_path, model_name, model_config, use_cuda):
     """Load config, Tacotron model, audio processor and checkpoint.

     Args:
         model_path: directory holding the config and the checkpoint.
         model_name: checkpoint file name inside ``model_path``.
         model_config: config file name inside ``model_path``.
         use_cuda: when true, load and run the model on the GPU.
     """
     config_file = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_name)
     print(" > Loading model ...")
     print(" | > model config: ", config_file)
     print(" | > model file: ", self.model_file)
     cfg = load_config(config_file)
     self.config = cfg
     self.use_cuda = use_cuda
     self.model = Tacotron(cfg.embedding_size, cfg.num_freq, cfg.num_mels,
                           cfg.r)
     # Positional order is the one the original call used.
     ap_args = (
         cfg.sample_rate,
         cfg.num_mels,
         cfg.min_level_db,
         cfg.frame_shift_ms,
         cfg.frame_length_ms,
         cfg.preemphasis,
         cfg.ref_level_db,
         cfg.num_freq,
         cfg.power,
     )
     self.ap = AudioProcessor(*ap_args, griffin_lim_iters=60)
     # Without CUDA, keep every tensor on the storage it is read into (CPU).
     load_kwargs = {}
     if not use_cuda:
         load_kwargs['map_location'] = lambda storage, loc: storage
     checkpoint = torch.load(self.model_file, **load_kwargs)
     self.model.load_state_dict(checkpoint['model'])
     if use_cuda:
         self.model.cuda()
     self.model.eval()
Exemple #11
0
def main():
    """
    Call train.py as a new process and pass command arguments

    One ``python3 train.py`` process is started per visible CUDA device.
    Rank 0 inherits this process's stdout; every other rank writes to
    ``process_<rank>.log`` inside ``<OUT_PATH>/process_stdout/``. The
    function returns after all child processes have exited.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore_path',
                        type=str,
                        help='Folder path to checkpoints',
                        default='')
    parser.add_argument(
        '--config_path',
        type=str,
        help='path to config file for training',
    )
    parser.add_argument('--data_path',
                        type=str,
                        help='dataset path.',
                        default='')

    args = parser.parse_args()

    CONFIG = load_config(args.config_path)
    OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
                                        True)
    stdout_path = os.path.join(OUT_PATH, "process_stdout/")

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # arguments shared by every train.py process; --rank is appended per rank
    base_command = [
        'train.py',
        '--restore_path={}'.format(args.restore_path),
        '--config_path={}'.format(args.config_path),
        '--group_id=group_{}'.format(group_id),
        '--data_path={}'.format(args.data_path),
        '--output_path={}'.format(OUT_PATH),
    ]

    if not os.path.isdir(stdout_path):
        os.makedirs(stdout_path)
        os.chmod(stdout_path, 0o775)

    # run processes
    processes = []
    log_files = []  # opened here, closed once the children are done
    try:
        for i in range(num_gpus):
            my_env = os.environ.copy()
            my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
            command = base_command + ['--rank={}'.format(i)]
            if i == 0:
                stdout = None
            else:
                stdout = open(
                    os.path.join(stdout_path, "process_{}.log".format(i)),
                    "w")
                log_files.append(stdout)
            p = subprocess.Popen(['python3'] + command,
                                 stdout=stdout,
                                 env=my_env)
            processes.append(p)
            print(command)

        for p in processes:
            p.wait()
    finally:
        # The children hold their own descriptors; closing ours only
        # releases the handles this process opened.
        for log_file in log_files:
            log_file.close()
Exemple #12
0
    def __init__(self):
        """Load the config and record the checkpoint path and CUDA flag."""
        # All paths hang off one hard-coded data directory.
        root = '/Users/avnerus/Code/TTS-Data'
        config_file = f'{root}/config.json'
        out_folder = f'{root}/test'  # not used inside this method
        self.CONFIG = load_config(config_file)
        self.MODEL_PATH = f'{root}/best_model.pth.tar'
        self.use_cuda = False
Exemple #13
0
    def __init__(self, tts_model, tts_config, wavernn_model=None, wavernn_config=None, device="cpu"):
        """Load the TTS model and, when both are given, the WaveRNN vocoder.

        Args:
            tts_model: stored as ``self.tts_model_path``.
            tts_config: path given to ``load_config`` for the TTS config.
            wavernn_model: stored as ``self.wavernn_model_path`` when the
                vocoder is enabled.
            wavernn_config: path given to ``load_config`` for the vocoder.
            device: requested device; replaced by ``"cpu"`` when CUDA is
                not available.
        """
        from TTS.utils.generic_utils import load_config
        self.tts_config = load_config(tts_config)
        self.tts_config.windowing = True
        device = device if torch.cuda.is_available() else "cpu"
        self.use_cuda = device != "cpu"
        self.device = torch.device(device)
        self.tts_model_path = tts_model

        self._load_tts()

        # ``use_gl`` is true unless both vocoder arguments are supplied.
        if not (wavernn_model and wavernn_config):
            self.use_gl = True
            return

        self.use_gl = False
        self.batched_wavernn = True
        self.wavernn_model_path = wavernn_model
        self.wavernn_config = load_config(wavernn_config)
        self._load_wavernn()
Exemple #14
0
    def _create_random_model(self):
        """Build a model from the dummy config and save it as a checkpoint.

        When the config has a ``characters`` entry, the module-level
        ``symbols`` and ``phonemes`` are rebuilt from it first.
        """
        # pylint: disable=global-statement
        global symbols, phonemes
        tests_out = get_tests_output_path()
        config = load_config(
            os.path.join(tests_out, 'dummy_model_config.json'))
        if 'characters' in config.keys():
            symbols, phonemes = make_symbols(**config.characters)

        if config.use_phonemes:
            num_chars = len(phonemes)
        else:
            num_chars = len(symbols)
        model = setup_model(num_chars, 0, config)
        output_path = os.path.join(get_tests_output_path())
        save_checkpoint(model, None, None, None, output_path, 10, 10)
def tts(text,
        model_path='model/best_model.pth.tar',
        config_path='model/config.json',
        use_cuda=False):
    """Synthesize ``text`` and save the result under ``static/samples/``.

    Args:
        text: sentence to synthesize; also used to derive the file name.
        model_path: checkpoint path.
        config_path: config file path.
        use_cuda: when true, load and run the model on the GPU.

    Returns:
        The WAV file name (without the ``static/samples/`` directory).
    """
    config = load_config(config_path)
    model = Tacotron(config.embedding_size, config.num_freq, config.num_mels,
                     config.r)
    # NOTE(review): the CUDA branch appends ``seq_to_seq_test_model_fname``
    # to ``model_path`` while the CPU branch loads ``model_path`` itself --
    # looks inconsistent; confirm where that name is defined.
    if use_cuda:
        checkpoint = torch.load(model_path + seq_to_seq_test_model_fname,
                                map_location='cuda:0')
    else:
        checkpoint = torch.load(model_path,
                                map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 250
    # Positional order is the one the original call used.
    ap_args = (
        config.sample_rate,
        config.num_mels,
        config.min_level_db,
        config.frame_shift_ms,
        config.frame_length_ms,
        config.ref_level_db,
        config.num_freq,
        config.power,
        config.preemphasis,
    )
    ap = AudioProcessor(*ap_args, griffin_lim_iters=50)
    t_1 = time.time()  # only referenced by the disabled timing print below
    cleaners = [config.text_cleaner]
    sequence = np.array(text_to_sequence(text, cleaners))
    chars_var = torch.from_numpy(sequence).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    # First item of the first model output, moved to a NumPy array.
    linear_out = model.forward(chars_var.long())[0].data.cpu().numpy()
    waveform = ap.inv_spectrogram(linear_out.T)
    waveform = waveform[:ap.find_endpoint(waveform)]
    sample_dir = 'static/samples/'
    os.makedirs(sample_dir, exist_ok=True)
    # File name: spaces become underscores, periods are dropped.
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    ap.save_wav(waveform, os.path.join(sample_dir, file_name))
    # print(" >  Run-time: {}".format(time.time() - t_1))

    return file_name
Exemple #16
0
    def __init__(self):
        """Load config, Tacotron model, audio processor and spaCy pipeline."""
        # Fixed locations of the model artefacts.
        root = 'TTS/tts_model/'
        checkpoint_path = root + '/best_model.pth.tar'
        config_path = root + '/config.json'
        out_folder = root + '/test'  # not used inside this method
        self.CONFIG = load_config(config_path)
        self.use_cuda = True  # True
        cfg = self.CONFIG

        # Network skeleton; weights are filled in from the checkpoint below.
        self.model = Tacotron(cfg.embedding_size, cfg.num_freq, cfg.num_mels,
                              cfg.r)

        # Audio processor; positional order is the one the original used.
        self.ap = AudioProcessor(cfg.sample_rate, cfg.num_mels,
                                 cfg.min_level_db, cfg.frame_shift_ms,
                                 cfg.frame_length_ms, cfg.ref_level_db,
                                 cfg.num_freq, cfg.power, cfg.preemphasis, 60)

        # Without CUDA, keep every tensor on the storage it is read into.
        load_kwargs = {}
        if not self.use_cuda:
            load_kwargs['map_location'] = lambda storage, loc: storage
        checkpoint = torch.load(checkpoint_path, **load_kwargs)

        self.model.load_state_dict(checkpoint['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.eval()

        self.model.decoder.max_decoder_steps = 500

        self.nlp = spacy.load("en")
    def load_tts_model(self):
        """Build the Tacotron model and audio processor and load the weights.

        Reads ``CONFIG_PATH``, ``MODEL_PATH`` and ``use_cuda`` from the
        enclosing scope.

        Returns:
            A tuple ``(model, ap, MODEL_PATH, CONFIG, use_cuda)``.
        """
        config = load_config(CONFIG_PATH)
        audio_cfg = config.audio

        model = Tacotron(len(phonemes),
                         config.embedding_size,
                         audio_cfg["num_freq"],
                         audio_cfg["num_mels"],
                         config.r,
                         attn_windowing=False)

        ap = AudioProcessor(**config.audio)

        # Tensors stay on the storage they are read into (CPU).
        checkpoint = torch.load(MODEL_PATH,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["model"])

        model.decoder.max_decoder_steps = 650
        return model, ap, MODEL_PATH, config, use_cuda
Exemple #18
0
    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        """Load the TTS config, audio processor and model weights.

        When the config has a ``text`` entry, the module-level ``symbols``
        and ``phonemes`` are rebuilt from it before the model is sized.

        Args:
            tts_checkpoint: path of the checkpoint file.
            tts_config: path of the config file.
            use_cuda: when true, the model is moved to the GPU after loading.
        """
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)

        if 'text' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.text)

        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # load speakers
        if self.config.tts_speakers is not None:
            # ``model_path`` is not a parameter or local of this method, so
            # joining with it would likely fail; the configured speakers
            # path is passed as-is.
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state (always deserialized onto the CPU first)
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
Exemple #19
0
import os
import unittest
import numpy as np

from torch.utils.data import DataLoader
from TTS.utils.generic_utils import load_config
from TTS.datasets.LJSpeech import LJSpeechDataset

# Directory of this test module; test_config.json is read from it at import.
file_path = os.path.dirname(os.path.realpath(__file__))
c = load_config(os.path.join(file_path, 'test_config.json'))


class TestDataset(unittest.TestCase):
    """Builds an LJSpeech dataset and a DataLoader from the test config."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_loader_iter = 4

    def test_loader(self):
        """Construct the dataset and wrap it in a DataLoader."""
        metadata_file = os.path.join(c.data_path, 'metadata.csv')
        wav_dir = os.path.join(c.data_path, 'wavs')
        dataset = LJSpeechDataset(metadata_file, wav_dir, c.r, c.sample_rate,
                                  c.text_cleaner, c.num_mels, c.min_level_db,
                                  c.frame_shift_ms, c.frame_length_ms,
                                  c.preemphasis, c.ref_level_db, c.num_freq,
                                  c.power)

        loader_kwargs = dict(batch_size=2,
                             shuffle=True,
                             collate_fn=dataset.collate_fn,
                             drop_last=True,
                             num_workers=c.num_loader_workers)
        dataloader = DataLoader(dataset, **loader_kwargs)
Exemple #20
0
import os
import unittest
import torch as T

from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.loss import GE2ELoss
from TTS.utils.generic_utils import load_config

# Resolve the tests directory relative to this file and load its config.
file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/"
c = load_config(os.path.join(file_path, "test_config.json"))


class SpeakerEncoderTests(unittest.TestCase):
    """Output-shape checks for ``SpeakerEncoder``."""

    # pylint: disable=R0201
    def test_in_out(self):
        """``forward`` and ``inference`` both give a (4, 256) output."""
        dummy_input = T.rand(4, 20, 80)  # B x T x D
        # Only referenced by the disabled hidden-state check below.
        dummy_hidden = [T.rand(2, 4, 128) for _ in range(2)]
        model = SpeakerEncoder(input_dim=80,
                               proj_dim=256,
                               lstm_dim=768,
                               num_lstm_layers=3)
        # d-vectors from the forward pass
        output = model.forward(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        # d-vectors from the inference path
        output = model.inference(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        # compute d vectors by passing LSTM hidden
        # output = model.forward(dummy_input, dummy_hidden)
        # assert output.shape[0] == 4
Exemple #21
0
get_ipython().run_line_magic('matplotlib', 'inline')
from TTS.utils.audio import AudioProcessor
from TTS.utils.visual import plot_spectrogram
from TTS.utils.generic_utils import load_config
import glob 
import IPython.display as ipd


# In[ ]:


# Machine-specific paths: edit both before running this notebook cell.
config_path = "/media/erogol/data_ssd/Data/models/tr/TTS-phoneme-January-14-2019_06+52PM-4ad64a7/config.json"
data_path = "/home/erogol/Data/Mozilla/"
# Collect every .wav file below data_path, searching recursively.
file_paths = glob.glob(data_path + "/**/*.wav", recursive=True)
CONFIG = load_config(config_path)


# ### Setup Audio Processor
# Play with the AP parameters until you find a good fit with the synthesis speech below. 

# In[ ]:


audio={
 'audio_processor': 'audio',
 'num_mels': 80,          # In general, you don'tneed to change it 
 'num_freq': 1025,        # In general, you don'tneed to change it 
 'sample_rate': 22050,    # It depends to the sample rate of the dataset.
 'frame_length_ms': 50,   # In general, you don'tneed to change it 
 'frame_shift_ms': 12.5,  # In general, you don'tneed to change it 
from TTS.utils.generic_utils import load_config
from TTS.synthesizer import Synthesizer

# Loaded once at import time and shared by every TTSEngine instance.
config = load_config('./model/conf.json')

class TTSEngine(object):
    """Thin wrapper around a ``Synthesizer`` built from the module config."""

    def __init__(self):
        self.synthesizer = Synthesizer(config)

    def translate(self, text):
        """Return whatever ``Synthesizer.tts`` produces for ``text``."""
        return self.synthesizer.tts(text)
Exemple #23
0
        waveform = wavernn.generate(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(),
            batched=batched_wavernn,
            target=11000,
            overlap=550,
        )

    print(" >  Run-time: {}".format(time.time() - t_1))
    return alignment, mel_postnet_spec, stop_tokens, waveform


# Run flags for this script.
use_cuda = True
batched_wavernn = True

# initialize TTS
CONFIG = load_config(tts_pretrained_model_config)
print(CONFIG)

# load the model
# Vocabulary size is the phoneme set or the character set, per the config.
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, CONFIG)
# load the audio processor
ap = AudioProcessor(**CONFIG.audio)
# load model state
# Without CUDA, tensors are kept on the storage they are read into (CPU).
if use_cuda:
    cp = torch.load(tts_pretrained_model)
else:
    cp = torch.load(tts_pretrained_model, map_location=lambda storage, loc: storage)

# load the model
model.load_state_dict(cp["model"])
Exemple #24
0
import os
import unittest

from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config

# Locations used by the audio tests: output folder and the input wav file.
TESTS_PATH = get_tests_path()
OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

# The output folder is created at import time if it does not exist yet.
os.makedirs(OUT_PATH, exist_ok=True)
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))


class TestAudio(unittest.TestCase):
    """Tests built around an ``AudioProcessor`` made from the test config."""

    def __init__(self, *args, **kwargs):
        super(TestAudio, self).__init__(*args, **kwargs)
        # One processor instance is shared by the test methods.
        self.ap = AudioProcessor(**conf.audio)

    def test_audio_synthesis(self):
        """ 1. load wav
            2. set normalization parameters
            3. extract mel-spec
            4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        # Helper that overrides normalization settings on the shared
        # processor (its remaining steps are not visible in this excerpt).
        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
Exemple #25
0
    wav = process_audio(wav)
    fp = 'audio'
    wav.export('{}.wav'.format(fp), format='wav')
    end = time.time()
    print('\n', end - start, 'segundos')


# Fixed locations of the TTS checkpoint, its config and the output folder.
MODEL_PATH = 'checkpoint.pth.tar'
CONFIG_PATH = 'TTS/config.json'
OUT_FOLDER = 'samples/'
# Create the output folder if needed. An already-existing folder is fine;
# any other failure (e.g. permissions) is reported instead of being
# swallowed by a bare ``except``.
os.makedirs(OUT_FOLDER, exist_ok=True)

CONFIG = load_config(CONFIG_PATH)
use_cuda = torch.cuda.is_available()

# Vocoder checkpoint and config.
VOCODER_MODEL_PATH = 'WaveRNN/saver.pth.tar'
VOCODER_CONFIG_PATH = 'WaveRNN/config_16K.json'
VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)

# load the model
# ap2 follows the vocoder's audio settings, ap the TTS model's.
ap2 = AudioProcessor(**VOCODER_CONFIG.audio)
ap = AudioProcessor(**CONFIG.audio)

num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = Tacotron(num_chars, CONFIG.embedding_size, ap.num_freq, ap.num_mels,
                 CONFIG.r, CONFIG.memory_size)

# load model state
    wav = process_audio(wav)
    fp = 'audio'
    wav.export('{}.wav'.format(fp), format='wav')
    end = time.time()
    print('\n', end - start, 'segundos')


# Fixed locations of the TTS checkpoint, its config and the output folder.
MODEL_PATH = 'checkpoint.pth.tar'
CONFIG_PATH = 'TTS/config.json'
OUT_FOLDER = 'samples/'
# Create the output folder if needed. An already-existing folder is fine;
# any other failure (e.g. permissions) is reported instead of being
# swallowed by a bare ``except``.
os.makedirs(OUT_FOLDER, exist_ok=True)

CONFIG = load_config(CONFIG_PATH)
use_cuda = torch.cuda.is_available()

# load the model
ap = AudioProcessor(**CONFIG.audio)

num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = Tacotron(num_chars, CONFIG.embedding_size, ap.num_freq, ap.num_mels,
                 CONFIG.r, CONFIG.memory_size)

# load model state
# Without CUDA, tensors are kept on the storage they are read into (CPU).
if use_cuda:
    cp = torch.load(MODEL_PATH)
else:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)
Exemple #27
0
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default="")
    parser.add_argument(
        '--speaker_id',
        type=int,
        help="target speaker_id if the model is multi-speaker.",
        default=None)
    args = parser.parse_args()

    if args.vocoder_path != "":
        assert args.use_cuda, " [!] Enable cuda for vocoder."
        from WaveRNN.models.wavernn import Model as VocoderModel

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # load speakers
    if args.speakers_json != '':
        speakers = json.load(open(args.speakers_json, 'r'))
        num_speakers = len(speakers)
    else:
        num_speakers = 0

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)
Exemple #28
0
        use_griffin_lim=True,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
        do_trim_silence=False)
    OUT_FOLDER = "/content/output"  #Path where the audio files will be saved
    os.makedirs(OUT_FOLDER, exist_ok=True)
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform


# model paths
TTS_MODEL = "/content/ttsmodel/checkpoint_290000.pth.tar"
TTS_CONFIG = "/content/ttsmodel/config.json"

# From here on TTS_CONFIG holds the loaded config, not the file path.
TTS_CONFIG = load_config(TTS_CONFIG)

# Run FLAGs
use_cuda = False
# Set some config fields manually for testing
TTS_CONFIG.windowing = False
TTS_CONFIG.use_forward_attn = True
# Set the vocoder
use_gl = True  # use GL if True
batched_wavernn = False  # use batched wavernn inference if True

# No speaker selected and no speaker list loaded.
speaker_id = None
speakers = []

# load the model
# Vocabulary size is the phoneme set or the character set, per the config.
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
Exemple #29
0
                        help='folder name for training outputs.')

    # DISTRUBUTED
    parser.add_argument(
        '--rank',
        type=int,
        default=0,
        help='DISTRIBUTED: process rank for distributed training.')
    parser.add_argument('--group_id',
                        type=str,
                        default="",
                        help='DISTRIBUTED: process group id.')
    args = parser.parse_args()

    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    if args.data_path != '':
        c.data_path = args.data_path

    if args.output_path == '':
        OUT_PATH = os.path.join(_, c.output_path)
    else:
        OUT_PATH = args.output_path

    if args.group_id == '' and args.output_folder == '':
        OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
    else:
        OUT_PATH = os.path.join(OUT_PATH, args.output_folder)

    AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
Exemple #30
0
def main():
    """Run preprocessing process.

    Accumulates per-bin sums and squared sums of the linear and mel
    spectrograms over the first split returned by ``load_meta_data``, turns
    them into mean and standard deviation, and saves them together with
    the adjusted audio config to ``<out_path>/scale_stats.npy``.
    """
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogtram features.")
    parser.add_argument("--config_path",
                        type=str,
                        required=True,
                        help="TTS config file path.")
    parser.add_argument("--out_path",
                        default=None,
                        type=str,
                        help="directory to save the output file.")
    args = parser.parse_args()

    # Without an output directory the final save cannot succeed; stop here
    # instead of failing only after the full pass over the dataset.
    if args.out_path is None:
        parser.error("--out_path is required to save scale_stats.npy")
    output_file_path = os.path.join(args.out_path, "scale_stats.npy")

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio['signal_norm'] = False  # do not apply earlier normalization
    CONFIG.audio['stats_path'] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # load the meta data of target dataset
    dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0  # total number of frames seen so far
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats: sums run over axis 1, giving one value per row
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    if N == 0:
        raise RuntimeError(
            " [!] No frames found in the dataset; cannot compute statistics.")

    # std = sqrt(E[x^2] - E[x]^2)
    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)

    stats = {
        'mel_mean': mel_mean,
        'mel_std': mel_scale,
        'linear_mean': linear_mean,
        'linear_std': linear_scale,
    }

    # set default config values for mean-var scaling
    CONFIG.audio['stats_path'] = output_file_path
    CONFIG.audio['signal_norm'] = True
    # remove redundant values; a key missing from the config is not an error
    for key in ('max_norm', 'min_level_db', 'symmetric_norm', 'clip_norm'):
        CONFIG.audio.pop(key, None)
    stats['audio_config'] = CONFIG.audio
    np.save(output_file_path, stats, allow_pickle=True)