Example #1
 def test_in_out(self):
     self._create_random_model()
     tts_root_path = get_tests_output_path()
     tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar")
     tts_config = os.path.join(tts_root_path, "dummy_model_config.json")
     synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
     synthesizer.tts("Better this test works!!")
Example #2
 def test_in_out(self):
     self._create_random_model()
     config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
     tts_root_path = get_tests_output_path()
     config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
     config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
     synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
     synthesizer.tts("Better this test works!!")
Example #3
    def __init__(self):

        manager = ModelManager()
        model_path, config_path, model_item = manager.download_model(MODEL)

        vocoder_path, vocoder_config_path, _ = manager.download_model(
            model_item['default_vocoder'])

        # last arg is use_cuda
        self.synth = Synthesizer(model_path, config_path, vocoder_path,
                                 vocoder_config_path, False)
Example #4
def tts(text):
    # pass use_cuda by keyword; the third positional parameter is vocoder_checkpoint
    synthesizer = Synthesizer(MODEL_PATH, CONFIG_PATH, use_cuda=use_cuda)
    wav = synthesizer.tts(text)
    # save the results
    file_name = text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = OUT_FILE
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(wav, out_path)
    playsound(out_path)
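A minimal usage sketch for the function above, assuming the module-level names it references (MODEL_PATH, CONFIG_PATH, use_cuda, OUT_FILE); the paths here are placeholders, not real checkpoints:

MODEL_PATH = "models/glow-tts/best_model.pth.tar"   # placeholder checkpoint path
CONFIG_PATH = "models/glow-tts/config.json"         # placeholder config path
use_cuda = False
OUT_FILE = "tts_output.wav"

tts("Hello from the example above!")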
Example #5
    def __init__(self, auto_start: bool = True):
        super().__init__(auto_start)

        path = Path(
            __file__
        ).parent / "../../.venv/lib/python3.8/site-packages/TTS/.models.json"
        manager = ModelManager(path)

        model_path, config_path = manager.download_model(self.model_name)
        vocoder_path, vocoder_config_path = manager.download_model(
            self.vocoder_name)

        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, self.use_cuda)
Example #6
 def __init__(self):
     path = Path(synthesize.__file__).parent / "../.models.json"
     logger.info("path")
     logger.info("Creating ModelManager")
     self.manager = ModelManager(path)
     logger.info("Downloading model")
     model_path, config_path, _ = self.manager.download_model(
         self.MODEL_NAME)
     logger.info("Downloading vcoder")
     vocoder_path, vocoder_config_path, _ = self.manager.download_model(
         self.VOCODER_NAME)
     logger.info("Finished downloading TTS model & vcoder")
     self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                    vocoder_config_path, False)
     self.tts_lock = threading.Lock()
Example #7
class MozillaTTS():
    def __init__(self):

        manager = ModelManager()
        model_path, config_path, model_item = manager.download_model(MODEL)

        vocoder_path, vocoder_config_path, _ = manager.download_model(
            model_item['default_vocoder'])

        # last arg is use_cuda
        self.synth = Synthesizer(model_path, config_path, vocoder_path,
                                 vocoder_config_path, False)

    def say(self, text):

        # generate wav
        wav = self.synth.tts(text)
        # output TODO would be nice to play this without having to do file I/O...
        out_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'output.wav')
        self.synth.save_wav(
            wav,
            out_file,
        )

        # Open the sound file
        chunk = 1024
        wf = wave.open(out_file, 'rb')

        # play wav
        p = pyaudio.PyAudio()

        # Open a .Stream object to write the WAV file to
        # 'output = True' indicates that the sound will be played rather than recorded
        stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                        channels=wf.getnchannels(),
                        rate=wf.getframerate(),
                        output=True)

        # Play the sound by writing the audio data to the stream
        data = wf.readframes(chunk)
        while data:  # wf.readframes() returns b'' (never '') at end of file
            stream.write(data)
            data = wf.readframes(chunk)

        # Close and terminate the stream
        stream.close()
        p.terminate()
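A minimal usage sketch for the class above; MODEL is the module-level name its __init__ expects (the value here is a placeholder, any model with a default_vocoder entry should work):

MODEL = "tts_models/en/ljspeech/tacotron2-DDC"

speaker = MozillaTTS()
speaker.say("Hello, this is a synthesized voice.")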
Example #8
File: hubconf.py Project: stjordanis/TTS
def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA',
        vocoder_name=None,
        use_cuda=False):
    """TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a give text.

    Example:
        >>> synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github')
        >>> wavs = synthesizer.tts("This is a test! This is also a test!!")
            wavs is a list of waveform sample values for the synthesized speech.

    Args:
        model_name (str, optional): One of the model names from .model.json. Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'.
        vocoder_name (str, optional): One of the model names from .model.json. Defaults to None, in which case the tts model's default vocoder is used.
        use_cuda (bool, optional): Run the model on CUDA. Defaults to False.

    Returns:
        TTS.utils.synthesizer.Synthesizer: Synthesizer object wrapping both vocoder and tts models.
    """
    manager = ModelManager()

    model_path, config_path, model_item = manager.download_model(model_name)
    vocoder_name = model_item[
        'default_vocoder'] if vocoder_name is None else vocoder_name
    vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)

    # create synthesizer
    synt = Synthesizer(tts_checkpoint=model_path,
                       tts_config_path=config_path,
                       vocoder_checkpoint=vocoder_path,
                       vocoder_config=vocoder_config_path,
                       use_cuda=use_cuda)
    return synt
Example #9
    def text_to_speech(self):
        tts_config = CONFIG['tts_config']
        models_folder = Path(tts_config['folder'])

        model_path = str(models_folder / tts_config['model'])
        model_config_path = str(models_folder / tts_config['model_config'])
        vocoder_path = str(models_folder / tts_config['vocoder'])
        vocoder_config_path = str(models_folder / tts_config['vocoder_config'])

        self.mozilla_tts = Synthesizer(model_path, model_config_path,
                                       vocoder_path, vocoder_config_path)

        while True:
            response = self.chatbot_to_tts_queue.get()
            print("TTS:", response)

            sound_arr = np.array(self.mozilla_tts.tts(response))

            sound_arr *= 2**15
            sound_arr = sound_arr.astype('int16')

            sound = bytes(sound_arr)
            sound, _ = audioop.ratecv(sound, 2, 1, self.MOZILLA_TTS_AUDIO_RATE,
                                      self.IN_AUDIO_RATE, None)

            ulaw_sound = audioop.lin2ulaw(sound, 2)

            chunk_len = 540
            chunks = len(ulaw_sound) // chunk_len
            extra = len(ulaw_sound) - (chunks * chunk_len)

            for c in range(chunks):
                chunk = ulaw_sound[c * chunk_len:c * chunk_len + chunk_len]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            if extra != 0:
                chunk = ulaw_sound[-extra:]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            self.transcript.append({
                "speaker": "self",
                "text": response,
                "datetime": dt.datetime.now().isoformat()
            })
Example #10
def generate():
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
    else:
        if not os.path.exists('mozilla-tts-output'):
            try:
                os.makedirs('mozilla-tts-output')
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
        generatebutton.config(state="disabled")
        exportbutton.config(state="disabled")
        model_path = None
        config_path = None
        vocoder_path = None
        vocoder_config_path = None
        path = Path(__file__).parent / "TTS/.models.json"
        manager = ModelManager(path)
        model_name = 'tts_models/' + ttsmodelbox.get()
        print(f'model_name is {model_name}')
        # for dev
        #model_path, config_path, model_item = manager.download_model(model_name)
        # for master
        model_path, config_path = manager.download_model(model_name)
        vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
        print(f'vocoder_name is {vocoder_name}')
        # for dev
        #vocoder_path, vocoder_config_path, model_item = manager.download_model(vocoder_name)
        # for master
        vocoder_path, vocoder_config_path = manager.download_model(
            vocoder_name)
        synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                  vocoder_config_path,
                                  cudacheckbutton.instate(['selected']))
        wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
        synthesizer.save_wav(wav, "mozilla-tts-output/generated.wav")
        playsound("mozilla-tts-output/generated.wav")
        generatebutton.config(state="enabled")
        exportbutton.config(state="enabled")
        print("All done!")
Example #11
def exportaudio():
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
        return  # without this, the code below would run with `f` undefined
    f = filedialog.asksaveasfile(mode='a',
                                 defaultextension=".wav",
                                 filetypes=[("Wave files", ".wav")])
    if f is None:  # asksaveasfile returns `None` if dialog closed with "cancel".
        return
    generatebutton.config(state="disabled")
    exportbutton.config(state="disabled")
    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None
    path = Path(__file__).parent / "TTS/.models.json"
    manager = ModelManager(path)
    model_name = 'tts_models/' + ttsmodelbox.get()
    print(f'model_name is {model_name}')
    # for dev
    #model_path, config_path, model_item = manager.download_model(model_name)
    # for master
    model_path, config_path = manager.download_model(model_name)
    vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
    print(f'vocoder_name is {vocoder_name}')
    # for dev
    #vocoder_path, vocoder_config_path, model_item = manager.download_model(vocoder_name)
    # for master
    vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)
    synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                              vocoder_config_path,
                              cudacheckbutton.instate(['selected']))
    wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
    synthesizer.save_wav(wav, str(f.name))
    generatebutton.config(state="enabled")
    exportbutton.config(state="enabled")
    print("All done!")
Example #12
def text_to_wav(text, lang):
    #class Synthesizer(object):
    #def __init__(self, tts_checkpoint, tts_config, vocoder_checkpoint=None, vocoder_config=None, use_cuda=False):
    tts_checkpoint = "/home/hector/.local/share/tts/tts_models--en--ljspeech--speedy-speech-wn/model_file.pth.tar"
    tts_config = "/home/hector/.local/share/tts/tts_models--en--ljspeech--speedy-speech-wn/config.json"
    #tts_checkpoint = "/home/hector/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar"
    #tts_config = "/home/hector/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json"
    if lang in ["Es"]:
        tts_checkpoint = "/home/hector/.local/share/tts/tts_models--es--mai--tacotron2-DDC/model_file.pth.tar"
        tts_config = "/home/hector/.local/share/tts/tts_models--es--mai--tacotron2-DDC/config.json"

    vocoder_checkpoint = "/home/hector/.local/share/tts/vocoder_models--universal--libri-tts--fullband-melgan/model_file.pth.tar"
    vocoder_config = "/home/hector/.local/share/tts/vocoder_models--universal--libri-tts--fullband-melgan/config.json"

    synthesizer = Synthesizer(tts_checkpoint, tts_config, vocoder_checkpoint,
                              vocoder_config)

    # kick it
    wav = synthesizer.tts(text)

    # save the results
    file_name = 'audio.wav'
    #print(" > Saving output to {}".format(file_name))
    synthesizer.save_wav(wav, file_name)
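A minimal usage sketch, assuming the hard-coded checkpoint paths above exist on disk:

text_to_wav("Hello world", "En")   # English speedy-speech model, writes audio.wav
text_to_wav("Hola mundo", "Es")    # switches to the Spanish tacotron2-DDC model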
Example #13
class Tts:

    MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DCA"
    VOCODER_NAME = "vocoder_models/en/ljspeech/multiband-melgan"

    def __init__(self):
        path = Path(synthesize.__file__).parent / "../.models.json"
        logger.info("path")
        logger.info("Creating ModelManager")
        self.manager = ModelManager(path)
        logger.info("Downloading model")
        model_path, config_path, _ = self.manager.download_model(
            self.MODEL_NAME)
        logger.info("Downloading vcoder")
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(
            self.VOCODER_NAME)
        logger.info("Finished downloading TTS model & vcoder")
        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, False)
        self.tts_lock = threading.Lock()

    def synthesize_speech(self, tts: str):
        """
        This is largely copy-pasted from the TTS library (TTS.utils.synthesizer.Synthesizer.save_wav), slightly modified
        to write into a NamedTemporaryFile instead of a regular file on disk

        :param tts: Text to speech
        :return: Speech in NamedTemporaryFile (wav)
        """
        if not self.tts_lock.acquire(blocking=True, timeout=0.1):
            raise TTSAlreadyProcessingException
        try:
            wav = self.synthesizer.tts(tts)
            wav = np.array(wav)
            wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
            temp_file = NamedTemporaryFile(suffix=".wav")
            scipy.io.wavfile.write(temp_file,
                                   self.synthesizer.output_sample_rate,
                                   wav_norm.astype(np.int16))
            return temp_file
        finally:
            self.tts_lock.release()
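A minimal usage sketch for the class above (assumes the model downloads succeed; the temporary wav is deleted once closed):

tts = Tts()
tmp_wav = tts.synthesize_speech("One short test sentence.")
print(tmp_wav.name)   # path to the temporary wav file
tmp_wav.close()       # deletes the NamedTemporaryFile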
Example #14
    def _get_synthesizer(self, language) -> Synthesizer:
        if '-' in language:
            language = language.split('-')[0]
        stopwatch = Stopwatch()
        with stopwatch:
            model_name = None

            for model in self.models:
                _, lang, dataset, name = model.split('/')
                print(f"{lang}|{name}")
                if language in lang:
                    model_name = model
                    if name == self.preferred_model:
                        break

            model_path, config_path, model_item = self.manager.download_model(
                model_name)
            vocoder_name = model_item.get(
                "default_vocoder",
                "vocoder_models/universal/libri-tts/fullband-melgan")
            vocoder_path, vocoder_config_path, _ = self.manager.download_model(
                vocoder_name)
            speakers_file_path = ''
            encoder_path = ''
            encoder_config_path = ''
            use_cuda = False

            synthesizer = Synthesizer(
                model_path,
                config_path,
                speakers_file_path,
                vocoder_path,
                vocoder_config_path,
                encoder_path,
                encoder_config_path,
                use_cuda,
            )
        LOG.debug(f"Get synthesizer time={stopwatch.time}")
        return synthesizer
Example #15
 def test_split_into_sentences(self):
     """Check demo server sentences split as expected"""
     print("\n > Testing demo server sentence splitting")
     # pylint: disable=attribute-defined-outside-init
     self.seg = Synthesizer.get_segmenter("en")
     sis = Synthesizer.split_into_sentences
     assert sis(self, 'Hello. Two sentences') == ['Hello.', 'Two sentences']
     assert sis(self, 'He went to meet the adviser from Scott, Waltman & Co. next morning.') == ['He went to meet the adviser from Scott, Waltman & Co. next morning.']
     assert sis(self, 'Let\'s run it past Sarah and co. They\'ll want to see this.') == ['Let\'s run it past Sarah and co.', 'They\'ll want to see this.']
     assert sis(self, 'Where is Bobby Jr.\'s rabbit?') == ['Where is Bobby Jr.\'s rabbit?']
     assert sis(self, 'Please inform the U.K. authorities right away.') == ['Please inform the U.K. authorities right away.']
     assert sis(self, 'Were David and co. at the event?') == ['Were David and co. at the event?']
     assert sis(self, 'paging dr. green, please come to theatre four immediately.') == ['paging dr. green, please come to theatre four immediately.']
     assert sis(self, 'The email format is [email protected]. I think you reversed them.') == ['The email format is [email protected].', 'I think you reversed them.']
     assert sis(self, 'The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.') == ['The demo site is: https://top100.example.com/subsection/latestnews.html.', 'Please send us your feedback.']
     assert sis(self, 'Scowling at him, \'You are not done yet!\' she yelled.') == ['Scowling at him, \'You are not done yet!\' she yelled.'] # with the  final lowercase "she" we see it's all one sentence
     assert sis(self, 'Hey!! So good to see you.') == ['Hey!!', 'So good to see you.']
     assert sis(self, 'He went to Yahoo! but I don\'t know the division.') == ['He went to Yahoo! but I don\'t know the division.']
     assert sis(self, 'If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."') == ['If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."']
     assert sis(self, 'The address is not google.com.') == ['The address is not google.com.']
     assert sis(self, '1.) The first item 2.) The second item') == ['1.) The first item', '2.) The second item']
     assert sis(self, '1) The first item 2) The second item') == ['1) The first item', '2) The second item']
     assert sis(self, 'a. The first item b. The second item c. The third list item') == ['a. The first item', 'b. The second item', 'c. The third list item']
Example #16
def make_synthesizer(model_name, use_cuda):
    # load model manager
    path = Path(TTS.__file__).parent / ".models.json"
    manager = ModelManager(path)

    model_path, config_path, model_item = manager.download_model(model_name)
    vocoder_name = model_item["default_vocoder"]
    vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)

    speakers_file_path = None
    encoder_path = None
    encoder_config_path = None

    return Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        use_cuda,
    )
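A minimal usage sketch (the model name is one of the stock Coqui names used elsewhere on this page; the download happens on first use):

synthesizer = make_synthesizer("tts_models/en/ljspeech/tacotron2-DDC", use_cuda=False)
wav = synthesizer.tts("A quick smoke test.")
synthesizer.save_wav(wav, "smoke_test.wav")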
Example #17
if args.model_path is not None:
    model_path = args.model_path
    config_path = args.config_path
    speakers_file_path = args.speakers_file_path

if args.vocoder_path is not None:
    vocoder_path = args.vocoder_path
    vocoder_config_path = args.vocoder_config_path

# load models
synthesizer = Synthesizer(
    tts_checkpoint=model_path,
    tts_config_path=config_path,
    tts_speakers_file=speakers_file_path,
    tts_languages_file=None,
    vocoder_checkpoint=vocoder_path,
    vocoder_config=vocoder_config_path,
    encoder_checkpoint="",
    encoder_config="",
    use_cuda=args.use_cuda,
)

use_multi_speaker = hasattr(
    synthesizer.tts_model,
    "num_speakers") and (synthesizer.tts_model.num_speakers > 1
                         or synthesizer.tts_speakers_file is not None)

speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)
Example #18
def main():
    # pylint: disable=bad-continuation
    parser = argparse.ArgumentParser(description='''Synthesize speech on command line.\n\n'''
    '''You can either use your trained model or choose a model from the provided list.\n\n'''
    '''If you don't specify any models, then it uses LJSpeech based English models.\n\n'''

    '''
    Example runs:

    # list provided models
    ./TTS/bin/synthesize.py --list_models

    # run tts with default models.
    ./TTS/bin/synthesize.py --text "Text for TTS"

    # run a tts model with its default vocoder model.
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"

    # run with specific tts and vocoder models from the list
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path

    # run your own TTS model (Using Griffin-Lim Vocoder)
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav

    # run your own TTS and Vocoder models
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json

    ''',
        formatter_class=RawTextHelpFormatter)

    parser.add_argument(
        '--list_models',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='list available pre-trained tts and vocoder models.')
    parser.add_argument('--text',
                        type=str,
                        default=None,
                        help='Text to generate speech.')

    # Args for running pre-trained TTS models.
    parser.add_argument(
        '--model_name',
        type=str,
        default="tts_models/en/ljspeech/speedy-speech-wn",
        help=
        'Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>'
    )
    parser.add_argument(
        '--vocoder_name',
        type=str,
        default=None,
        help=
        'Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>'
    )

    # Args for running custom models
    parser.add_argument('--config_path',
                        default=None,
                        type=str,
                        help='Path to model config file.')
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to model file.',
    )
    parser.add_argument(
        '--out_path',
        type=str,
        default=Path(__file__).resolve().parent,
        help=
        'Path to save final wav file. Wav file will be named as the given text.',
    )
    parser.add_argument('--use_cuda',
                        type=bool,
                        help='Run model on CUDA.',
                        default=False)
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).',
        default=None,
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        '--speaker_idx',
        type=str,
        help=
        "if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
        default=None)
    parser.add_argument('--gst_style',
                        help="Wav path file for GST stylereference.",
                        default=None)

    # aux args
    parser.add_argument(
        '--save_spectogram',
        type=bool,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False)

    args = parser.parse_args()

    # print the description if either text or list_models is not set
    if args.text is None and not args.list_models:
        parser.parse_args(['-h'])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained models
    if args.model_name is not None:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        args.vocoder_name = model_item[
            'default_vocoder'] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: load custom models
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    # RUN THE SYNTHESIS
    # load models
    synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                              vocoder_config_path, args.use_cuda)

    print(" > Text: {}".format(args.text))

    # # handle multi-speaker setting
    # if not model_config.use_external_speaker_embedding_file and args.speaker_idx is not None:
    #     if args.speaker_idx.isdigit():
    #         args.speaker_idx = int(args.speaker_idx)
    #     else:
    #         args.speaker_idx = None
    # else:
    #     args.speaker_idx = None

    # if args.gst_style is None:
    #     if 'gst' in model_config.keys() and model_config.gst['gst_style_input'] is not None:
    #         gst_style = model_config.gst['gst_style_input']
    #     else:
    #         gst_style = None
    # else:
    #     # check if gst_style string is a dict, if is dict convert  else use string
    #     try:
    #         gst_style = json.loads(args.gst_style)
    #         if max(map(int, gst_style.keys())) >= model_config.gst['gst_style_tokens']:
    #             raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), model_config.gst['gst_style_tokens']))
    #     except ValueError:
    #         gst_style = args.gst_style

    # kick it
    wav = synthesizer.tts(args.text)

    # save the results
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(
        wav,
        out_path,
    )
Example #19
def initsynthesizer(model_name, vocoder_name, use_cuda):
    # `manager` is assumed to be a module-level ModelManager instance
    model_path, config_path = manager.download_model(model_name)
    vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)
    return Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, use_cuda)
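A minimal usage sketch, assuming a module-level `manager = ModelManager()` and the older two-value download_model API this snippet relies on (model names as used elsewhere on this page):

manager = ModelManager()

synth = initsynthesizer("tts_models/en/ljspeech/tacotron2-DCA",
                        "vocoder_models/en/ljspeech/multiband-melgan", False)
wav = synth.tts("Testing the helper above.")
synth.save_wav(wav, "helper_test.wav")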
Example #20
def main():
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Synthesize speech on command line.\n\n"""
        """You can either use your trained model or choose a model from the provided list.\n\n"""
        """If you don't specify any models, then it uses LJSpeech based English model.\n\n"""
        """
    # Example Runs:

    ## Single Speaker Models

    - list provided models

    ```
    $ ./TTS/bin/synthesize.py --list_models
    ```

    - run tts with default models.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS"
    ```

    - run a tts model with its default vocoder model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

    - run with specific tts and vocoder models from the list

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
    ```

    - run your own TTS model (Using Griffin-Lim Vocoder)

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
    ```

    - run your own TTS and Vocoder models
    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
    ```

    ## MULTI-SPEAKER MODELS

    - list the available speakers and choose as <speaker_id> among them.

    ```
    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
    ```

    - run the multi-speaker TTS model with the target speaker ID.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
    ```

    - run your own multi-speaker TTS model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth.tar --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """,
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained tts and vocoder models.",
    )
    parser.add_argument("--text",
                        type=str,
                        default=None,
                        help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help=
        "Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help=
        "Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path",
                        default=None,
                        type=str,
                        help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    parser.add_argument("--use_cuda",
                        type=bool,
                        help="Run model on CUDA.",
                        default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help=
        "Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path",
                        type=str,
                        help="Path to vocoder model config file.",
                        default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path",
                        type=str,
                        help="Path to speaker encoder config file.",
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path",
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help=
        "wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style",
                        help="Wav path file for GST stylereference.",
                        default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=bool,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )

    args = parser.parse_args()

    # print the description if either text or list_models is not set
    if args.text is None and not args.list_models and not args.list_speaker_idxs:
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        args.vocoder_name = model_item[
            "default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.speaker_manager.speaker_ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx
                                          and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav)

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
Example #21
class DetectAndDeter:
    CLASSIFICATION_COUNT = 5
    TELEMARKETER_THRESH = 0.3
    VALID_CALLER_THRESH = 0.1
    IN_AUDIO_RATE = 8000
    DS_AUDIO_RATE = 16000
    MOZILLA_TTS_AUDIO_RATE = 22050
    QUIET_THRESH = 150
    QUIET_LENGTH = 3000

    def __init__(self, name):
        self.name = name  # user's name  e.g. "Bob Ross"
        self.valid_caller_event = Event()
        self.caller_audio_chunk = np.array([], dtype='int16')

        self.audio_in_queue = Queue()
        self.stt_to_classification_queue = Queue()
        self.stt_to_chatbot_queue = Queue()
        self.chatbot_to_tts_queue = Queue()
        self.audio_out_queue = Queue()

        self.manager = Manager()
        self.transcript = self.manager.list()
        self.is_telemarketer = self.manager.Value("is_telemarketer", None)

        self.deep_speech = None
        self.mozilla_tts = None

        self.final_transcript = None
        self.final_predictions = None

        self.speech_to_text_thread = Process(target=self.speech_to_text)
        self.classify_text_thread = Process(target=self.classify_text)
        self.generate_response_thread = Process(target=self.generate_responses)
        self.text_to_speech_thread = Process(target=self.text_to_speech)

        self.log = {
            "start": None,
            "end": None,
            "version": CONFIG['version'],
            "transcript": [],
            "is_telemarketer": None,
            "caller": None
        }

    @property
    def queues(self):
        return self.audio_in_queue, self.audio_out_queue

    def start(self):
        self.speech_to_text_thread.start()
        self.classify_text_thread.start()
        self.generate_response_thread.start()
        self.text_to_speech_thread.start()

        self.log["start"] = dt.datetime.now().isoformat()

    def close(self):
        self.log["transcript"] = [value for value in self.transcript]
        self.log["is_telemarketer"] = self.is_telemarketer.value
        self.log["end"] = dt.datetime.now().isoformat()

        self.speech_to_text_thread.terminate()
        self.speech_to_text_thread.join()
        self.speech_to_text_thread.close()

        self.classify_text_thread.terminate()
        self.classify_text_thread.join()
        self.classify_text_thread.close()

        self.generate_response_thread.terminate()
        self.generate_response_thread.join()
        self.generate_response_thread.close()

        self.text_to_speech_thread.terminate()
        self.text_to_speech_thread.join()
        self.text_to_speech_thread.close()

    def fill_log_info(self, caller_number):
        self.log['caller'] = caller_number
        return self.log

    def classify_text(self):
        predictions = []
        while self.is_telemarketer.value is None:
            idx = self.stt_to_classification_queue.get()
            text = self.transcript[idx]['text']

            preds = model.predict(text)
            transcript_line = self.transcript[idx]
            transcript_line['analysis'] = {
                "prediction": str(preds[0]).lower(),
                "confidence": float(max(preds[2]))
            }
            self.transcript[idx] = transcript_line
            predictions.append(str(preds[0]).lower())

            maybe_telemarketer = predictions.count("persuasion") / len(
                predictions)

            if len(predictions) > self.CLASSIFICATION_COUNT:
                print("CLASS")
                print(maybe_telemarketer, self.TELEMARKETER_THRESH,
                      self.VALID_CALLER_THRESH)
                if maybe_telemarketer > self.TELEMARKETER_THRESH:
                    self.is_telemarketer.value = True
                    break
                elif maybe_telemarketer < self.VALID_CALLER_THRESH:
                    self.is_telemarketer.value = False
                    # self.is_telemarketer.set()
                    break

        if not self.is_telemarketer.value:
            self.valid_caller_event.set()

    def generate_responses(self):
        while True:
            text = self.stt_to_chatbot_queue.get()
            print("Generate Response:", text)
            response = str(chatbot.get_response(text))

            self.chatbot_to_tts_queue.put(response)

    def text_to_speech(self):
        tts_config = CONFIG['tts_config']
        models_folder = Path(tts_config['folder'])

        model_path = str(models_folder / tts_config['model'])
        model_config_path = str(models_folder / tts_config['model_config'])
        vocoder_path = str(models_folder / tts_config['vocoder'])
        vocoder_config_path = str(models_folder / tts_config['vocoder_config'])

        self.mozilla_tts = Synthesizer(model_path, model_config_path,
                                       vocoder_path, vocoder_config_path)

        while True:
            response = self.chatbot_to_tts_queue.get()
            print("TTS:", response)

            sound_arr = np.array(self.mozilla_tts.tts(response))

            sound_arr *= 2**15
            sound_arr = sound_arr.astype('int16')

            sound = bytes(sound_arr)
            sound, _ = audioop.ratecv(sound, 2, 1, self.MOZILLA_TTS_AUDIO_RATE,
                                      self.IN_AUDIO_RATE, None)

            ulaw_sound = audioop.lin2ulaw(sound, 2)

            chunk_len = 540
            chunks = len(ulaw_sound) // chunk_len
            extra = len(ulaw_sound) - (chunks * chunk_len)

            for c in range(chunks):
                chunk = ulaw_sound[c * chunk_len:c * chunk_len + chunk_len]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            if extra != 0:
                chunk = ulaw_sound[-extra:]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            self.transcript.append({
                "speaker": "self",
                "text": response,
                "datetime": dt.datetime.now().isoformat()
            })

    def speech_to_text(self):
        stt_config = CONFIG['stt_config']
        models_folder = Path(stt_config['folder'])
        model_path = str(models_folder / stt_config['model'])
        scorer_path = str(models_folder / stt_config['scorer'])

        self.deep_speech = Model(model_path)
        self.deep_speech.enableExternalScorer(scorer_path)

        stream = self.deep_speech.createStream()

        while True:
            speech = self.audio_in_queue.get()

            while not self.audio_in_queue.empty():
                speech += self.audio_in_queue.get()

            lin_speech = audioop.ulaw2lin(speech, 2)
            ds_speech, _ = audioop.ratecv(lin_speech, 2, 1, self.IN_AUDIO_RATE,
                                          self.DS_AUDIO_RATE, None)

            lin_speech_arr = np.frombuffer(lin_speech, np.int16)
            ds_speech_arr = np.frombuffer(ds_speech, np.int16)

            stream.feedAudioContent(ds_speech_arr)

            self.caller_audio_chunk = np.concatenate(
                (self.caller_audio_chunk, lin_speech_arr))

            chunk_idx = max(0,
                            len(self.caller_audio_chunk) - self.QUIET_LENGTH)
            quiet_chunk = self.caller_audio_chunk[chunk_idx:]
            if (quiet_chunk < self.QUIET_THRESH).all() and (
                    self.caller_audio_chunk > self.QUIET_THRESH).any():
                text = stream.intermediateDecode()

                if text.strip():
                    self.stt_to_chatbot_queue.put(text)

                    idx = len(self.transcript)  # insert to avoid race conditions with indexes
                    self.transcript.insert(
                        idx, {
                            "speaker": "caller",
                            "text": text,
                            "datetime": dt.datetime.now().isoformat()
                        })
                    self.stt_to_classification_queue.put(idx)

                    stream.finishStream()
                    stream = self.deep_speech.createStream()

                self.caller_audio_chunk = np.array([], dtype='int16')

    def make_greeting(self, one_party_consent):
        self.chatbot_to_tts_queue.put(
            f"Hi. This is {self.name} how may I help you?")

        if not one_party_consent:
            self.chatbot_to_tts_queue.put("Keep in mind, I record all calls")
Example #22
def load_synthesizer():
    MODEL_PATH = './voice/models/glow-tts/best_model.pth.tar'
    CONFIG_PATH = './voice/models/glow-tts/config.json'
    use_cuda = False
    # pass use_cuda by keyword; the third positional parameter is vocoder_checkpoint
    synthesizer = Synthesizer(MODEL_PATH, CONFIG_PATH, use_cuda=use_cuda)
    return synthesizer
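A minimal usage sketch (assumes the hard-coded Glow-TTS checkpoint exists; with no vocoder given, output falls back to Griffin-Lim):

synthesizer = load_synthesizer()
wav = synthesizer.tts("Loading the Glow-TTS checkpoint worked.")
synthesizer.save_wav(wav, "glow_tts_test.wav")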
Example #23
File: synthesize.py Project: coqui-ai/TTS
def main():
    description = """Synthesize speech on command line.

You can either use your trained model or choose a model from the provided list.

If you don't specify any models, then it uses LJSpeech based English model.

## Example Runs

### Single Speaker Models

- List provided models:

    ```
    $ tts --list_models
    ```

- Query info for model info by idx:

    ```
    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
    ```

- Query info for model info by full name:

    ```
    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
    ```

- Run TTS with default models:

    ```
    $ tts --text "Text for TTS"
    ```

- Run a TTS model with its default vocoder model:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

- Run with specific TTS and vocoder models from the list:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
    ```

- Run your own TTS model (Using Griffin-Lim Vocoder):

    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
    ```

- Run your own TTS and Vocoder models:
    ```
    $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
    ```

### Multi-speaker Models

- List the available speakers and choose as <speaker_id> among them:

    ```
    $ tts --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
    ```

- Run the multi-speaker TTS model with the target speaker ID:

    ```
    $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
    ```

- Run your own multi-speaker TTS model:

    ```
    $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """
    # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
    # documentation in sync more easily.
    parser = argparse.ArgumentParser(
        description=description.replace("    ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=bool,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )
    args = parser.parse_args()

    # print the description if either text or list_models is not set
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    language_ids_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1 #list : list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2 #info : model info of pre-trained TTS models
    if args.model_info_by_idx:
        model_query = args.model_info_by_idx
        manager.model_info_by_idx(model_query)
        sys.exit()

    if args.model_info_by_name:
        model_query_full_name = args.model_info_by_name
        manager.model_info_by_full_name(model_query_full_name)
        sys.exit()

    # CASE3: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

    # CASE4: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path
        language_ids_file_path = args.language_ids_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        language_ids_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.tts_model.speaker_manager.ids)
        return

    # query language ids of a multi-lingual model.
    if args.list_language_idxs:
        print(
            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
        )
        print(synthesizer.tts_model.language_manager.ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    if args.text:
        print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(
        args.text,
        args.speaker_idx,
        args.language_idx,
        args.speaker_wav,
        reference_wav=args.reference_wav,
        style_wav=args.capacitron_style_wav,
        style_text=args.capacitron_style_text,
        reference_speaker_name=args.reference_speaker_idx,
    )

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
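
A minimal programmatic sketch of the same download-synthesize-save flow, for comparison with the CLI above. The model name and output path here are illustrative, and the five-argument positional Synthesizer call mirrors the shorter signature used in the later examples; exact signatures vary between TTS releases.

from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

manager = ModelManager()
model_path, config_path, model_item = manager.download_model(
    "tts_models/en/ljspeech/tacotron2-DCA")
vocoder_path, vocoder_config_path, _ = manager.download_model(
    model_item["default_vocoder"])

# Positional args: TTS checkpoint, TTS config, vocoder checkpoint,
# vocoder config, use_cuda (False = run on CPU).
synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                          vocoder_config_path, False)
wav = synthesizer.tts("A quick smoke test of the synthesizer.")
synthesizer.save_wav(wav, "output.wav")  # illustrative output path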
Example #24
0
File: server.py Project: y-kamiya/TTS
if args.vocoder_name is not None:
    vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(args.vocoder_name)

# If these were not specified in the CLI args, use default values with embedded model files
if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
    args.tts_checkpoint = tts_checkpoint_file
if not args.tts_config and os.path.isfile(tts_config_file):
    args.tts_config = tts_config_file

if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
    args.vocoder_checkpoint = vocoder_checkpoint_file
if not args.vocoder_config and os.path.isfile(vocoder_config_file):
    args.vocoder_config = vocoder_config_file

synthesizer = Synthesizer(
    args.tts_checkpoint, args.tts_config, args.vocoder_checkpoint, args.vocoder_config, args.use_cuda
)

app = Flask(__name__)


@app.route("/")
def index():
    return render_template("index.html", show_details=args.show_details)


@app.route("/details")
def details():
    model_config = load_config(args.tts_config)
    if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
        vocoder_config = load_config(args.vocoder_config)
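
The excerpt breaks off inside details(). For completeness, a plausible sketch of the companion /api/tts synthesis endpoint built on the synthesizer above; the route, function name, and query parameter are assumptions modeled on the stock demo server, not verbatim project code.

import io

from flask import request, send_file

@app.route("/api/tts", methods=["GET"])
def api_tts():
    text = request.args.get("text")  # text to synthesize
    wav = synthesizer.tts(text)      # list of audio samples
    out = io.BytesIO()
    synthesizer.save_wav(wav, out)   # save_wav also accepts file-like objects
    out.seek(0)
    return send_file(out, mimetype="audio/wav")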
Example #25
0
 def test_split_into_sentences(self):
     """Check demo server sentences split as expected"""
     print("\n > Testing demo server sentence splitting")
     # pylint: disable=attribute-defined-outside-init, protected-access
     self.seg = Synthesizer._get_segmenter("en")
     sis = Synthesizer.split_into_sentences
     assert sis(self, "Hello. Two sentences") == ["Hello.", "Two sentences"]
     assert sis(
         self,
         "He went to meet the adviser from Scott, Waltman & Co. next morning."
     ) == [
         "He went to meet the adviser from Scott, Waltman & Co. next morning."
     ]
     assert sis(
         self,
         "Let's run it past Sarah and co. They'll want to see this.") == [
             "Let's run it past Sarah and co.",
             "They'll want to see this.",
         ]
     assert sis(self, "Where is Bobby Jr.'s rabbit?") == [
         "Where is Bobby Jr.'s rabbit?"
     ]
     assert sis(self, "Please inform the U.K. authorities right away.") == [
         "Please inform the U.K. authorities right away."
     ]
     assert sis(self, "Were David and co. at the event?") == [
         "Were David and co. at the event?"
     ]
     assert sis(
         self,
         "paging dr. green, please come to theatre four immediately.") == [
             "paging dr. green, please come to theatre four immediately."
         ]
     assert sis(
         self,
         "The email format is [email protected]. I think you reversed them."
     ) == [
         "The email format is [email protected].",
         "I think you reversed them.",
     ]
     assert sis(
         self,
         "The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.",
     ) == [
         "The demo site is: https://top100.example.com/subsection/latestnews.html.",
         "Please send us your feedback.",
     ]
     assert sis(
         self, "Scowling at him, 'You are not done yet!' she yelled.") == [
             "Scowling at him, 'You are not done yet!' she yelled."
         ]  # with the final lowercase "she" we see it's all one sentence
     assert sis(self, "Hey!! So good to see you.") == [
         "Hey!!", "So good to see you."
     ]
     assert sis(self,
                "He went to Yahoo! but I don't know the division.") == [
                    "He went to Yahoo! but I don't know the division."
                ]
     assert sis(
         self,
         "If you can't remember a quote, “at least make up a memorable one that's plausible...\""
     ) == [
         "If you can't remember a quote, “at least make up a memorable one that's plausible...\""
     ]
     assert sis(self, "The address is not google.com.") == [
         "The address is not google.com."
     ]
     assert sis(self, "1.) The first item 2.) The second item") == [
         "1.) The first item", "2.) The second item"
     ]
     assert sis(self, "1) The first item 2) The second item") == [
         "1) The first item", "2) The second item"
     ]
     assert sis(
         self,
         "a. The first item b. The second item c. The third list item") == [
             "a. The first item",
             "b. The second item",
             "c. The third list item",
         ]
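
The test calls the splitter unbound, with the test instance standing in for self. The same trick works outside the test suite; a minimal sketch, reusing the _get_segmenter helper the test itself relies on (the stub class name is ours):

from TTS.utils.synthesizer import Synthesizer

class _SegmenterStub:
    # split_into_sentences only touches self.seg, so a bare stub is enough.
    seg = Synthesizer._get_segmenter("en")

sentences = Synthesizer.split_into_sentences(
    _SegmenterStub(), "Hello. Two sentences")
print(sentences)  # ['Hello.', 'Two sentences']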
Example #26
0
            print("Link: " + v.text)
            links.append([v.text.strip(), v.attrs['href']])
    return (txt, links)


if len(sys.argv) == 1:
    print("Please use a URL as the first argument")
    sys.exit()

url = sys.argv[1]
#txt = cleantext(txt)
#txt2 = summ(txt)
#print(txt2)
hashindex = HashIndex(url)
print(hashindex.index)

path = '/home/osiris/.local/share/tts/'

model_path = path + 'tts_models--en--ljspeech--tacotron2-DCA/model_file.pth.tar'
config_path = path + 'tts_models--en--ljspeech--tacotron2-DCA/config.json'
vocoder_path = path + 'vocoder_models--en--ljspeech--multiband-melgan/model_file.pth.tar'
vocoder_config_path = path + 'vocoder_models--en--ljspeech--multiband-melgan/config.json'

synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                          vocoder_config_path, False)
#playall(txt2)

(txt, links) = getPageData(url)
go(txt)
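
go() and playall() are defined elsewhere in the script. A plausible sketch of such a reader loop built on the synthesizer above; the helper body is an assumption, not the original project's code.

import tempfile

from playsound import playsound

def go(text):
    # Read the page aloud sentence by sentence to keep latency low.
    for sentence in synthesizer.split_into_sentences(text):
        wav = synthesizer.tts(sentence)
        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            synthesizer.save_wav(wav, f.name)
            playsound(f.name)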
Example #27
0
File: server.py Project: AyushExel/TTS
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

# CASE3: set custom model paths
if args.model_path is not None:
    model_path = args.model_path
    config_path = args.config_path
    speakers_file_path = args.speakers_file_path

if args.vocoder_path is not None:
    vocoder_path = args.vocoder_path
    vocoder_config_path = args.vocoder_config_path

# load models
synthesizer = Synthesizer(model_path,
                          config_path,
                          speakers_file_path,
                          vocoder_path,
                          vocoder_config_path,
                          use_cuda=args.use_cuda)

use_multi_speaker = synthesizer.speaker_manager is not None
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)


def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
    """Transform a style_wav URI into either a string (a path to a wav file to be used
    for style transfer) or a dict (GST tokens/values to be used for styling).

    Args:
        style_wav (str): uri
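
The example cuts off inside the docstring. Based on the two cases the docstring describes, a plausible body follows; treat it as a sketch, not the project's verbatim implementation (json and os may already be imported by the server).

import json
import os

def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
    if style_wav:
        if os.path.isfile(style_wav) and style_wav.endswith(".wav"):
            return style_wav          # a wav file on the server: use it for style transfer
        return json.loads(style_wav)  # otherwise a JSON dict of GST token weights
    return None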
Example #28
0
class TextToSpeechListenerTask(Listener, Task):

    queue: Queue = Queue(maxsize=20)

    model_name = "tts_models/en/ljspeech/tacotron2-DCA"

    vocoder_name = "vocoder_models/universal/libri-tts/fullband-melgan"

    use_cuda = False

    synthesizer: Synthesizer

    tts_sample_rate = 22050

    output_sample_rate = 16000

    # TODO move to local folder instead of system folder
    def __init__(self, auto_start: bool = True):
        super().__init__(auto_start)

        path = Path(
            __file__
        ).parent / "../../.venv/lib/python3.8/site-packages/TTS/.models.json"
        manager = ModelManager(path)

        model_path, config_path = manager.download_model(self.model_name)
        vocoder_path, vocoder_config_path = manager.download_model(
            self.vocoder_name)

        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, self.use_cuda)

    def run(self):
        while self.running:
            try:
                text = self.queue.get(timeout=2)
                if isinstance(text, Text):
                    self.speak(text)
                self.queue.task_done()
            except EmptyQueueError:
                continue

    def speak(self, text: Text) -> None:
        if not text.value:
            logger.warning("Cannot synthesize empty text")
            return  # skip synthesis for empty input
        data = self.synthesize(text.value)
        publish_message(self, AudioFrame(data), text.src)

    def synthesize(self, text: str) -> bytes:
        """ TTS outputs a sample rate of 22050, so we must desample it to be able to consume it again """
        audio_data = np.array(self.synthesizer.tts(text))
        # Peak-normalize into the int16 range; the 0.01 floor avoids dividing by ~zero on silence.
        audio_data_normalized = audio_data * (
            32767 / max(0.01, np.max(np.abs(audio_data))))
        audio_data_bytes = audio_data_normalized.astype(np.int16).tobytes()

        audio_data_bytes = self.desample_audio_data(audio_data_bytes)

        self.store_audio_data(audio_data_bytes)

        return audio_data_bytes

    def desample_audio_data(self, data: bytes) -> bytes:
        # ratecv(fragment, width_bytes, channels, in_rate, out_rate, state)
        # returns (converted_fragment, state); we only need the fragment.
        converted = audioop.ratecv(data, 2, 1, self.tts_sample_rate,
                                   self.output_sample_rate, None)
        return converted[0]

    def store_audio_data(
        self,
        data: bytes,
        path: str = "/home/jonas/Projects/robot/mind/assets-old/output/tts/output.raw",
    ) -> None:
        with open(path, "wb") as f:
            f.write(data)
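
desample_audio_data is a thin wrapper over audioop.ratecv. A standalone sanity check of that conversion, using an illustrative 440 Hz tone in place of synthesized speech:

import audioop

import numpy as np

TTS_RATE, OUT_RATE = 22050, 16000

# One second of a 440 Hz sine at the TTS rate, as 16-bit mono PCM.
t = np.arange(TTS_RATE) / TTS_RATE
pcm16 = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16).tobytes()

# ratecv args: fragment, sample width in bytes, channels, in rate, out rate, state.
converted, _state = audioop.ratecv(pcm16, 2, 1, TTS_RATE, OUT_RATE, None)
print(len(pcm16) // 2, "->", len(converted) // 2, "samples")  # ~22050 -> ~16000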