Example #1
def __init__(self):
    # Expose only the first two GPUs to TensorFlow and let memory grow on
    # demand instead of pre-allocating all GPU memory up front.
    gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
    tf.config.experimental.set_visible_devices(devices=gpus[0:2], device_type='GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    self._converter_model()
    self.tts_pause = TTSSegPause()
    self.tts_py = TTSPinYin()
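
The same GPU setup recurs in the examples below. As a minimal equivalent sketch, assuming TensorFlow 2.1+, the stable tf.config API can be used and memory growth applied only to the GPUs that remain visible:

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    visible = gpus[:2]  # keep at most the first two GPUs
    tf.config.set_visible_devices(visible, 'GPU')
    for gpu in visible:
        # Allocate GPU memory on demand rather than all at once.
        tf.config.experimental.set_memory_growth(gpu, True)
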
Example #2
# Imports needed by this snippet; TTSPinYin is a project-local helper whose
# module path is not shown in these examples.
import logging

import tensorflow as tf
from tensorflow_tts.inference import AutoConfig, AutoProcessor, TFAutoModel

from conf.config import config
from core.parse_text_add_pause import TTSSegPause


class TTSModel():
    def __init__(self):
        gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
        tf.config.experimental.set_visible_devices(devices=gpus[0:2],
                                                   device_type='GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        self.__init_model()
        self.tts_pause = TTSSegPause()
        self.tts_py = TTSPinYin()

    def __init_model(self):
        tacotron2_config = AutoConfig.from_pretrained(config.tacotron2_baker)
        self.tacotron2 = TFAutoModel.from_pretrained(
            config=tacotron2_config,
            pretrained_path=config.tacotron2_pretrained_path,
            training=False,
            name="tacotron2")
        self.tacotron2.setup_window(win_front=5, win_back=5)

        mb_melgan_config = AutoConfig.from_pretrained(
            config.multiband_melgan_baker)
        self.mb_melgan = TFAutoModel.from_pretrained(
            config=mb_melgan_config,
            pretrained_path=config.multiband_melgan_pretrained_path,
            name="mb_melgan")

        self.processor = AutoProcessor.from_pretrained(
            pretrained_path=config.baker_mapper_pretrained_path)

    def text_to_pinyin_sequence(self, text):
        # pinyin = self.processor.pinyin_parser(text, style=Style.TONE3, errors="ignore")
        pinyin, text = self.tts_py.get_pyin(text)
        new_pinyin = []
        for x in str(pinyin).split(" "):
            if "#" not in x:
                new_pinyin.append(x)
        phonemes = self.processor.get_phoneme_from_char_and_pinyin(
            text, new_pinyin)
        text = " ".join(phonemes)
        print("phoneme seq: {}".format(text))
        logging.info(
            "[TTSModel] [text_to_pinyin_sequence] phoneme seq:{}".format(text))
        input_ids = self.processor.text_to_sequence(text, inference=False)
        return input_ids

    def do_synthesis(self, input_text):
        input_text = self.tts_pause.add_pause(input_text)
        print("input_text>>>>", input_text)
        logging.info(
            "[TTSModel] [do_synthesis] input_text:{}".format(input_text))
        input_ids = self.processor.text_to_sequence(input_text, inference=True)

        _, mel_outputs, stop_token_prediction, alignment_history = self.tacotron2.inference(
            tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
            tf.convert_to_tensor([len(input_ids)], tf.int32),
            tf.convert_to_tensor([0], dtype=tf.int32))

        remove_end = 1024
        audio = self.mb_melgan.inference(mel_outputs)[0, :-remove_end, 0]

        return mel_outputs.numpy(), alignment_history.numpy(), audio.numpy()

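For context, here is a minimal usage sketch of the class above (not part of the original example). The 24 kHz sample rate is an assumption based on the Baker/BZNSYP checkpoints these config names point to, and soundfile is just one convenient way to write the waveform:

import soundfile as sf

tts = TTSModel()
mel, alignment, audio = tts.do_synthesis("欢迎使用语音合成服务。")
# Sample rate assumed to be 24000 Hz (Baker / BZNSYP models).
sf.write("demo.wav", audio, 24000)
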
Example #3
# -*- coding:utf-8 -*-

import re
import yaml
import numpy as np
import pandas as pd
from pypinyin import Style
from conf.config import config
from core.parse_text_add_pause import TTSSegPause
from core.tftts_pinyin import TTSModel

if __name__ == "__main__":
    tts_model = TTSModel()
    tts_seg_pause = TTSSegPause()
    data_pd = pd.read_csv(config.MIX_VOICE_TEXT_DATA_PATH,
                          sep=',',
                          encoding='utf-8')
    mix_voice_text_index_list = list(
        data_pd[config.MIX_VOICE_TEXT_INDEX].values)
    mix_voice_text_list = list(data_pd[config.MIX_VOICE_TEXT].values)

    f2 = "./data/010001-020000.txt"

    with open("./data/000001-010000.txt") as f1:
        lines = f1.readlines()
    with open(f2, "w") as file:
        for idx in range(0, len(lines), 2):
            utt_id, chn_char = lines[idx].strip().split()
            per_text_pinyin = lines[idx + 1].strip().split()
            if "IY1" in per_text_pinyin or "B" in chn_char:
                print(f"Skip this: {utt_id} {chn_char} {per_text_pinyin}")
Example #4
# Imports needed by this snippet; TTSPinYin is a project-local helper whose
# module path is not shown in these examples.
import logging

import yaml
import tensorflow as tf
from tensorflow_tts.configs import Tacotron2Config
from tensorflow_tts.inference import AutoConfig, AutoProcessor, TFAutoModel
from tensorflow_tts.models import TFTacotron2

from conf.config import config
from core.parse_text_add_pause import TTSSegPause


class TTSModel():
    def __init__(self):
        gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
        tf.config.experimental.set_visible_devices(devices=gpus[0:2],
                                                   device_type='GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        self._converter_model()
        self.tts_pause = TTSSegPause()
        self.tts_py = TTSPinYin()

    def _converter_model(self):
        with open(config.tacotron2_baker) as f:
            conf = yaml.load(f, Loader=yaml.Loader)
        conf = Tacotron2Config(**conf["tacotron2_params"])
        self.tacotron2 = TFTacotron2(config=conf,
                                     training=False,
                                     name="tacotron2",
                                     enable_tflite_convertible=True)
        self.tacotron2.setup_window(win_front=5, win_back=5)
        self.tacotron2.setup_maximum_iterations(1000)  # be careful
        self.tacotron2._build()
        self.tacotron2.load_weights(config.tacotron2_pretrained_path)
        tacotron2_concrete_function = (
            self.tacotron2.inference_tflite.get_concrete_function())
        converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [tacotron2_concrete_function])
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
        ]
        tflite_model = converter.convert()
        with open('tacotron2.tflite', 'wb') as f:
            f.write(tflite_model)

        print('Model size is %f MBs.' % (len(tflite_model) / 1024 / 1024.0))

        #tacotron2_config = AutoConfig.from_pretrained( config.tacotron2_baker )
        #self.tacotron2 = TFAutoModel.from_pretrained( config=tacotron2_config, pretrained_path='tacotron2.tflite', training=False,  name="tacotron2" )
        #self.tacotron2.setup_window(win_front=5, win_back=5)
        self.interpreter = tf.lite.Interpreter(model_path='tacotron2.tflite')
        self.interpreter.allocate_tensors()
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()

        mb_melgan_config = AutoConfig.from_pretrained(
            config.multiband_melgan_baker)
        self.mb_melgan = TFAutoModel.from_pretrained(
            config=mb_melgan_config,
            pretrained_path=config.multiband_melgan_pretrained_path,
            name="mb_melgan")

        self.processor = AutoProcessor.from_pretrained(
            pretrained_path=config.baker_mapper_pretrained_path)

    def prepare_input(self, input_ids):
        return (tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32),
                               0),
                tf.convert_to_tensor([len(input_ids)], tf.int32),
                tf.convert_to_tensor([0], dtype=tf.int32))

    def do_synthesis(self, input_text):
        input_text = self.tts_pause.add_pause(input_text)
        print("input_text>>>>", input_text)
        logging.info(
            "[TTSModel] [do_synthesis] input_text:{}".format(input_text))
        input_ids = self.processor.text_to_sequence(input_text, inference=True)
        # input_ids = np.concatenate([input_ids, [219 - 1]], -1)
        self.interpreter.resize_tensor_input(self.input_details[0]['index'],
                                             [1, len(input_ids)])

        self.interpreter.allocate_tensors()
        input_data = self.prepare_input(input_ids)
        for i, detail in enumerate(self.input_details):
            self.interpreter.set_tensor(detail['index'], input_data[i])
        # The TFLite graph must be run before its output tensors are read.
        self.interpreter.invoke()
        # get_tensor() already returns NumPy arrays, so no .numpy() is needed.
        decoder_output_tflite = self.interpreter.get_tensor(
            self.output_details[0]['index'])
        mel_outputs = self.interpreter.get_tensor(
            self.output_details[1]['index'])

        remove_end = 1024
        audio = self.mb_melgan.inference(mel_outputs)[0, :-remove_end, 0]

        return mel_outputs, decoder_output_tflite, audio.numpy()

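A small follow-up sketch (assuming the tacotron2.tflite file written by _converter_model above) that only prints the converted model's input and output signatures, which is a quick way to sanity-check the conversion before wiring it into do_synthesis:

import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path='tacotron2.tflite')
interpreter.allocate_tensors()
for detail in interpreter.get_input_details():
    print("input :", detail['name'], detail['shape'], detail['dtype'])
for detail in interpreter.get_output_details():
    print("output:", detail['name'], detail['shape'], detail['dtype'])
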
Example #5
# print("interpreter_mb_melgan:",interpreter_mb_melgan)
interpreter_mb_melgan.allocate_tensors()

# Get input and output tensors.
input_details_mb_melgan = interpreter_mb_melgan.get_input_details()
# print("input_details_mb_melgan:",input_details_mb_melgan)
output_details_mb_melgan = interpreter_mb_melgan.get_output_details()
# print("output_details_mb_melgan:",output_details_mb_melgan)

# Prepare input data.
def prepare_input(input_ids):
  return (tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
          tf.convert_to_tensor([len(input_ids)], tf.int32),
          tf.convert_to_tensor([0], dtype=tf.int32))
  
tts_pause = TTSSegPause()
# Run the converted models on the given input text.
def infer(input_text):
  processor = AutoProcessor.from_pretrained(pretrained_path=config_lp.baker_mapper_pretrained_path)
  input_text = tts_pause.add_pause(input_text)
  # logging.info( "[TTSModel] [do_synthesis] input_text:{}".format( input_text ) )
  input_ids = processor.text_to_sequence(input_text, inference=True) 
        
  # input_ids = np.concatenate([input_ids, [len(symbols) - 1]], -1)  # eos.
  # 
  interpreter_tacotron.resize_tensor_input(input_details_tacotron[0]['index'],  [1, len(input_ids)])
  interpreter_tacotron.allocate_tensors()
  input_data = prepare_input(input_ids)
  for i, detail in enumerate(input_details_tacotron):
    print(detail)
    input_shape = detail['shape']