def __init_model(self):
    """Prepare the Tacotron2 SavedModel, the MB-MelGAN vocoder and the processor.

    Tacotron2 is created unbuilt, run once on a short sample sentence,
    loaded with the pretrained weights, exported to ``./test_saved`` and
    reloaded from there, so ``self.tacotron2`` ends up holding the object
    returned by ``tf.saved_model.load``.

    Sets ``self.processor``, ``self.tacotron2`` and ``self.mb_melgan``.
    """
    export_dir = "./test_saved"
    input_text = "你好, 很高兴认识你"

    # Loaded once; the original repeated this identical load at the end.
    self.processor = AutoProcessor.from_pretrained(
        pretrained_path=config.baker_mapper_pretrained_path)
    input_ids = self.processor.text_to_sequence(input_text, inference=True)

    tacotron2_config = AutoConfig.from_pretrained(config.tacotron2_baker)
    self.tacotron2 = TFAutoModel.from_pretrained(
        config=tacotron2_config,
        pretrained_path=None,
        # don't build model if you want to save it to pb. (TF related bug)
        is_build=False,
        name="tacotron2")
    self.tacotron2.setup_window(win_front=5, win_back=5)

    # NOTE(review): presumably this forward pass is what creates the
    # variables that load_weights fills below -- confirm. Its outputs are
    # not needed, so they are not bound to names.
    self.tacotron2.inference(
        tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        tf.convert_to_tensor([len(input_ids)], tf.int32),
        tf.convert_to_tensor([0], dtype=tf.int32))
    self.tacotron2.load_weights(config.tacotron2_pretrained_path)

    # Round-trip through a SavedModel and keep the reloaded object.
    tf.saved_model.save(self.tacotron2, export_dir,
                        signatures=self.tacotron2.inference)
    self.tacotron2 = tf.saved_model.load(export_dir)

    mb_melgan_config = AutoConfig.from_pretrained(
        config.multiband_melgan_baker)
    self.mb_melgan = TFAutoModel.from_pretrained(
        config=mb_melgan_config,
        pretrained_path=config.multiband_melgan_pretrained_path,
        name="mb_melgan")
def __init__(self):
    """Load the text-to-mel model, the vocoder and the text processor.

    The model directory comes from the ``TTS_MODEL_DIR`` environment
    variable (``KeyError`` if it is unset). When that directory does not
    exist, ``model_files.zip`` is fetched with ``download_blob`` into its
    parent directory and extracted there.

    Sets ``self.converter``, ``self.text2mel_model``, ``self.vocoder_model``
    and ``self.processor``.
    """
    self.converter = opencc.OpenCC('tw2s.json')

    model_root = os.environ['TTS_MODEL_DIR']
    if not os.path.exists(model_root):
        # Download the archive next to the model directory and unpack it.
        unpack_dir = os.path.dirname(model_root)
        archive_path = os.path.join(unpack_dir, 'model_files.zip')
        download_blob('dailybrief', 'models/model_files.zip', archive_path)
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(unpack_dir)

    def in_model_root(relative_path):
        # Resolve a path relative to the model directory.
        return os.path.join(model_root, relative_path)

    text2mel_config = AutoConfig.from_pretrained(
        in_model_root('config/tacotron2.baker.v1.yaml'))
    vocoder_config = AutoConfig.from_pretrained(
        in_model_root('config/multiband_melgan.baker.v1.yaml'))

    self.text2mel_model = TFAutoModel.from_pretrained(
        config=text2mel_config,
        pretrained_path=in_model_root('models/tacotron-model-100000.h5'),
        name='tacotron2',
    )
    self.vocoder_model = TFAutoModel.from_pretrained(
        config=vocoder_config,
        pretrained_path=in_model_root('models/generator-920000.h5'),
        name='mb_melgan',
    )
    self.processor = AutoProcessor.from_pretrained(
        pretrained_path=in_model_root('models/baker_mapper.json'))
def init_speech():
    """Import the speech libraries, load the models and run one inference.

    The imports happen inside the function and are published as module
    globals (``sf``, ``tf``, ``TFAutoModel``, ``AutoProcessor``), as are the
    loaded objects ``fastspeech2``, ``mb_melgan`` and ``processor``.
    Finishes by calling ``inference("Hello sir")`` and logging through
    ``debug``.
    """
    global sf, tf, TFAutoModel, AutoProcessor
    global fastspeech2, mb_melgan, processor

    import soundfile as sf
    import tensorflow as tf
    from tensorflow_tts.inference import TFAutoModel
    from tensorflow_tts.inference import AutoProcessor

    # The same identifier serves both the text-to-mel model and its processor.
    text2mel_id = "tensorspeech/tts-fastspeech2-ljspeech-en"
    vocoder_id = "tensorspeech/tts-mb_melgan-ljspeech-en"

    fastspeech2 = TFAutoModel.from_pretrained(text2mel_id)
    mb_melgan = TFAutoModel.from_pretrained(vocoder_id)
    processor = AutoProcessor.from_pretrained(text2mel_id)

    # Run one synthesis right away, then report that init finished.
    inference("Hello sir")
    debug("Speech", "init")
def _converter_model(self):
    """Convert Tacotron2 to TFLite and load everything needed for inference.

    Builds a TFLite-convertible Tacotron2 from the YAML config, loads its
    weights, converts it, writes ``tacotron2.tflite`` to the working
    directory and prints the model size. It then opens that file with a
    ``tf.lite.Interpreter`` and loads the MB-MelGAN vocoder and the text
    processor.

    Sets ``self.tacotron2``, ``self.interpreter``, ``self.input_details``,
    ``self.output_details``, ``self.mb_melgan`` and ``self.processor``.
    """
    tflite_path = 'tacotron2.tflite'

    # NOTE(review): yaml.Loader can construct arbitrary Python objects; this
    # is only acceptable while the config file is trusted.
    with open(config.tacotron2_baker) as config_file:
        raw_config = yaml.load(config_file, Loader=yaml.Loader)
    tacotron2_params = Tacotron2Config(**raw_config["tacotron2_params"])

    self.tacotron2 = TFTacotron2(
        config=tacotron2_params,
        training=False,
        name="tacotron2",
        enable_tflite_convertible=True,
    )
    self.tacotron2.setup_window(win_front=5, win_back=5)
    self.tacotron2.setup_maximum_iterations(1000)  # be careful
    self.tacotron2._build()
    self.tacotron2.load_weights(config.tacotron2_pretrained_path)

    # Convert the TFLite inference entry point.
    concrete_fn = self.tacotron2.inference_tflite.get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_fn])
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    tflite_bytes = converter.convert()

    with open(tflite_path, 'wb') as tflite_file:
        tflite_file.write(tflite_bytes)
    print('Model size is %f MBs.' % (len(tflite_bytes) / 1024 / 1024.0))

    # Read the converted model back through the TFLite interpreter.
    self.interpreter = tf.lite.Interpreter(model_path=tflite_path)
    self.interpreter.allocate_tensors()
    self.input_details = self.interpreter.get_input_details()
    self.output_details = self.interpreter.get_output_details()

    vocoder_config = AutoConfig.from_pretrained(config.multiband_melgan_baker)
    self.mb_melgan = TFAutoModel.from_pretrained(
        config=vocoder_config,
        pretrained_path=config.multiband_melgan_pretrained_path,
        name="mb_melgan",
    )
    self.processor = AutoProcessor.from_pretrained(
        pretrained_path=config.baker_mapper_pretrained_path)
def __init__(self):
    """Load the backend selected by the module-level ``Text2SpeechModel``.

    * ``"dc_tts"``: creates the synthesis ``Graph`` in ``self.g``.
    * ``"RTVC"``: loads the encoder, synthesizer and vocoder checkpoints
      under ``<root_file_path>/RTVC`` and stores the embedding of the
      reference recording in ``self.embeds``.
    * ``"AudioSynth"``: loads Tacotron2, MelGAN-STFT and the processor into
      ``self.tacotron2``, ``self.melgan_stft`` and ``self.processor``, then
      runs one synthesis.

    Any other value leaves the instance without model attributes, as
    before.
    """
    if Text2SpeechModel == "dc_tts":
        self.g = Graph(mode="synthesize")
        print("Text2Speech Tensorflow Graph loaded")

    elif Text2SpeechModel == "RTVC":
        enc_model_fpath = os.path.join(
            root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
        syn_model_dir = os.path.join(
            root_file_path, "RTVC", "synthesizer/saved_models/logs-pretrained")
        voc_model_fpath = os.path.join(
            root_file_path, "RTVC",
            "vocoder/saved_models/pretrained/pretrained.pt")

        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(
            os.path.join(syn_model_dir, "taco_pretrained"), low_mem=False)
        vocoder.load_model(voc_model_fpath)

        # The reference recording sits one directory above root_file_path.
        in_fpath = os.path.join("/", *root_file_path.split("/")[:-1],
                                "REF/refaudioRTVC/ref.wav")

        # Preprocess the reference once. The original also preprocessed it
        # straight from the path and then discarded that result.
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        self.embeds = [encoder.embed_utterance(preprocessed_wav)]

    elif Text2SpeechModel == "AudioSynth":
        taco_pretrained_config_path = os.path.join(
            root_file_path,
            'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml'
        )
        tacotron2_config = AutoConfig.from_pretrained(
            taco_pretrained_config_path)
        taco_path = os.path.join(root_file_path,
                                 "AudioSynth/tacotron2-120k.h5")
        self.tacotron2 = TFAutoModel.from_pretrained(
            config=tacotron2_config,
            pretrained_path=taco_path,
            training=False,
            name="tacotron2")

        melgan_stft_pretrained_config_path = os.path.join(
            root_file_path,
            'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml'
        )
        melgan_stft_config = AutoConfig.from_pretrained(
            melgan_stft_pretrained_config_path)
        melgan_stft_path = os.path.join(root_file_path,
                                        "AudioSynth/melgan.stft-2M.h5")
        self.melgan_stft = TFAutoModel.from_pretrained(
            config=melgan_stft_config,
            pretrained_path=melgan_stft_path,
            name="melgan_stft")

        self.processor = AutoProcessor.from_pretrained(
            pretrained_path=os.path.join(
                root_file_path, "AudioSynth/ljspeech_mapper.json"))

        # NOTE(review): presumably a warm-up call -- confirm. Its results
        # were never read, so they are no longer bound to names.
        do_synthesis(
            "Hello, how can I help you today?",
            self.tacotron2, self.melgan_stft,
            "TACOTRON", "MELGAN-STFT",
            self.processor)
def __init__(self):
    """Load the text-to-mel model, the vocoder and the matching processor.

    Sets ``self.tts_model``, ``self.mb_melgan`` and ``self.processor`` from
    the pretrained identifiers below.
    """
    # The same identifier serves both the text-to-mel model and its processor.
    text2mel_id = "tensorspeech/tts-fastspeech2-kss-ko"
    vocoder_id = "tensorspeech/tts-mb_melgan-kss-ko"

    self.tts_model = TFAutoModel.from_pretrained(text2mel_id)
    self.mb_melgan = TFAutoModel.from_pretrained(vocoder_id)
    self.processor = AutoProcessor.from_pretrained(text2mel_id)
def __init_model(self):
    """Load Tacotron2, the MB-MelGAN vocoder and the text processor.

    All paths come from the module-level ``config`` object. Sets
    ``self.tacotron2``, ``self.mb_melgan`` and ``self.processor``.
    """
    text2mel_config = AutoConfig.from_pretrained(config.tacotron2_baker)
    text2mel = TFAutoModel.from_pretrained(
        config=text2mel_config,
        pretrained_path=config.tacotron2_pretrained_path,
        training=False,
        name="tacotron2",
    )
    text2mel.setup_window(win_front=5, win_back=5)
    self.tacotron2 = text2mel

    vocoder_config = AutoConfig.from_pretrained(config.multiband_melgan_baker)
    self.mb_melgan = TFAutoModel.from_pretrained(
        config=vocoder_config,
        pretrained_path=config.multiband_melgan_pretrained_path,
        name="mb_melgan",
    )

    mapper_path = config.baker_mapper_pretrained_path
    self.processor = AutoProcessor.from_pretrained(pretrained_path=mapper_path)
def __init__(self):
    """Create a GPU-limited session and load the song44k models.

    Loads Tacotron2, FastSpeech2, MB-MelGAN and the text processor from
    paths relative to this module's directory. Sets ``self.tacotron2``,
    ``self.fastspeech2``, ``self.mb_melgan`` and ``self.processor``.
    """
    # Cap the per-process GPU memory fraction at 0.8. (The original comment
    # said 1/3, which does not match the value passed.)
    gpu_limits = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8)
    session_config = tf.compat.v1.ConfigProto(gpu_options=gpu_limits)
    # Elastic memory allocation (left disabled):
    # session_config.gpu_options.allow_growth = True
    # The session is created but not used again in this method.
    session = tf.compat.v1.Session(config=session_config)

    module_path = os.path.dirname(os.path.abspath(__file__))

    def in_module(relative_path):
        # Resolve a path relative to this module's directory.
        return os.path.join(module_path, relative_path)

    # Tacotron2: config, then the trained checkpoint.
    tacotron2_config = AutoConfig.from_pretrained(
        in_module('examples/tacotron2/conf/tacotron2.song44k.v5.yaml'))
    self.tacotron2 = TFAutoModel.from_pretrained(
        config=tacotron2_config,
        pretrained_path=in_module(
            "examples/tacotron2/exp/train.tacotron2.song44k.v5/checkpoints/model-68000.h5"
        ),
        name="tacotron2")

    # FastSpeech2: config, then the trained checkpoint.
    fastspeech2_config = AutoConfig.from_pretrained(
        in_module('examples/fastspeech2/conf/fastspeech2.song44k.v5.1.yaml'))
    self.fastspeech2 = TFAutoModel.from_pretrained(
        config=fastspeech2_config,
        pretrained_path=in_module(
            "examples/fastspeech2/exp/train.fastspeech2.song44k.v5.1/checkpoints/model-600000.h5"
        ),
        name="fastspeech2")

    # MB-MelGAN: config, then the trained generator checkpoint.
    mb_melgan_config = AutoConfig.from_pretrained(
        in_module(
            'examples/multiband_melgan/conf/multiband_melgan.ko.song44k.v5.1.yaml'
        ))
    self.mb_melgan = TFAutoModel.from_pretrained(
        config=mb_melgan_config,
        pretrained_path=in_module(
            "examples/multiband_melgan/exp/train.multiband_melgan.ko.song44k.v5.1/checkpoints/generator-1740000.h5"
        ),
        name="mb_melgan")

    # Processor: load the mapper that assigns a number to each character.
    self.processor = AutoProcessor.from_pretrained(
        pretrained_path=in_module("test/files/kss_mapper.json"))
def __init__(self):
    """Create a GPU-limited session and load the song8k models.

    Loads Tacotron2, FastSpeech2, MB-MelGAN and the text processor from
    paths relative to this module's directory. Sets ``self.tacotron2``,
    ``self.fastspeech2``, ``self.mb_melgan`` and ``self.processor``.
    """
    # Cap the per-process GPU memory fraction at 0.8. (The original comment
    # said 1/3, which does not match the value passed.)
    gpu_limits = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8)
    session_config = tf.compat.v1.ConfigProto(gpu_options=gpu_limits)
    # Elastic memory allocation (left disabled):
    # session_config.gpu_options.allow_growth = True
    # The session is created but not used again in this method.
    session = tf.compat.v1.Session(config=session_config)

    module_path = os.path.dirname(os.path.abspath(__file__))

    def in_module(relative_path):
        # Resolve a path relative to this module's directory.
        return os.path.join(module_path, relative_path)

    # Tacotron2: config, then the trained checkpoint.
    tacotron2_config = AutoConfig.from_pretrained(
        in_module('./examples/tacotron2/conf/tacotron2.song8k.v3.yaml'))
    self.tacotron2 = TFAutoModel.from_pretrained(
        config=tacotron2_config,
        pretrained_path=in_module(
            "./examples/tacotron2/exp/train.tacotron2.song8k.v3/checkpoints/model-68000.h5"
        ),
        name="tacotron2",
    )

    # FastSpeech2: config, then the trained checkpoint.
    fastspeech2_config = AutoConfig.from_pretrained(
        in_module('./examples/fastspeech2/conf/fastspeech2.song8k.v3.yaml'))
    self.fastspeech2 = TFAutoModel.from_pretrained(
        config=fastspeech2_config,
        pretrained_path=in_module(
            "./examples/fastspeech2/exp/train.fastspeech2.song8k.v3/checkpoints/model-200000.h5"
        ),
        name="fastspeech2",
    )

    # NOTE: the original also carried a disabled FastSpeech (v1) loader with
    # a positional-embedding resize step, plus an older FastSpeech2
    # checkpoint path (song8k.v1.1). None of it was executed.

    # MB-MelGAN: config, then the trained generator checkpoint.
    mb_melgan_config = AutoConfig.from_pretrained(
        in_module(
            './examples/multiband_melgan/conf/multiband_melgan.ko.8k.v3.yaml'))
    self.mb_melgan = TFAutoModel.from_pretrained(
        config=mb_melgan_config,
        pretrained_path=in_module(
            "./examples/multiband_melgan/exp/train.multiband_melgan.ko.song8k.v3/checkpoints/generator-1000000.h5"
        ),
        name="mb_melgan",
    )

    # Processor: load the mapper that assigns a number to each character.
    self.processor = AutoProcessor.from_pretrained(
        pretrained_path=in_module("test/files/kss_mapper.json"))
def infer(input_text):
    """Run the module-level Tacotron2 TFLite interpreter on ``input_text``.

    The text goes through ``tts_pause.add_pause``, is converted to IDs by a
    freshly loaded processor, is packed by ``prepare_input`` and is fed to
    ``interpreter_tacotron`` one input tensor at a time.

    Args:
        input_text: Text to synthesize.

    Returns:
        A 2-tuple of copies of the interpreter's first two output tensors.
        The original comments label them decoder output and mel output.
    """
    processor = AutoProcessor.from_pretrained(
        pretrained_path=config_lp.baker_mapper_pretrained_path)
    input_text = tts_pause.add_pause(input_text)
    input_ids = processor.text_to_sequence(input_text, inference=True)

    # NOTE(review): this resize step reads as disabled in the original and
    # is kept disabled here -- confirm that was the intent.
    # interpreter_tacotron.resize_tensor_input(input_details_tacotron[0]['index'], [1, len(input_ids)])
    interpreter_tacotron.allocate_tensors()

    input_data = prepare_input(input_ids)
    # input_data[i] is fed to the i-th interpreter input. The unused
    # `input_shape` local from the original has been dropped.
    for i, detail in enumerate(input_details_tacotron):
        print(detail)
        interpreter_tacotron.set_tensor(detail['index'], input_data[i])

    interpreter_tacotron.invoke()

    # The function `get_tensor()` returns a copy of the tensor data.
    # Use `tensor()` in order to get a pointer to the tensor.
    decoder_output_tflite = interpreter_tacotron.get_tensor(
        output_details_tacotron[0]['index'])
    mel_output_tflite = interpreter_tacotron.get_tensor(
        output_details_tacotron[1]['index'])
    return (decoder_output_tflite, mel_output_tflite)
# fastspeech2_config = AutoConfig.from_pretrained('examples/fastspeech2/conf/fastspeech2.baker.v2.yaml') # fastspeech2 = TFAutoModel.from_pretrained( # config=fastspeech2_config, # pretrained_path="trained/fastspeech2-200k.h5", # name="fastspeech2" # ) # MB-MelGAN mb_melgan_config = AutoConfig.from_pretrained( 'examples/multiband_melgan/conf/multiband_melgan.baker.v1.yaml') mb_melgan = TFAutoModel.from_pretrained( config=mb_melgan_config, pretrained_path="trained/mb.melgan_word-480k.h5", name="mb_melgan") processor = AutoProcessor.from_pretrained( pretrained_path="trained/baker_mapper_word.json") # BakerProcessor def do_synthesis(input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name): input_ids = processor.text_to_sequence(input_text, inference=True) # text2mel part if text2mel_name == "TACOTRON": _, mel_outputs, stop_token_prediction, alignment_history = text2mel_model.inference( tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0), tf.convert_to_tensor([len(input_ids)], tf.int32), tf.convert_to_tensor([0], dtype=tf.int32)) elif text2mel_name == "FASTSPEECH2": mel_before, mel_outputs, duration_outputs, _, _ = text2mel_model.inference( tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# The Original Code is Copyright (C) 2020 Voxell Technologies.
# All rights reserved.

import soundfile as sf
import tensorflow as tf
from tensorflow_tts.inference import AutoConfig, AutoProcessor, TFAutoModel

# Text processor, loaded from the mapper file one directory up.
processor = AutoProcessor.from_pretrained("../ljspeech_mapper.json")

# Tacotron2 is created without a checkpoint (pretrained_path=None) and
# configured below; the weights are loaded explicitly afterwards.
config = AutoConfig.from_pretrained("../tacotron2/conf/tacotron2.v1.yaml")
tacotron2 = TFAutoModel.from_pretrained(
    config=config,
    pretrained_path=None,
    is_build=True,
    name="tacotron2",
)
tacotron2.setup_window(win_front=6, win_back=6)
tacotron2.setup_maximum_iterations(3000)
tacotron2.load_weights("../tacotron2/checkpoints/model-120000.h5")

# SavedModel export, left disabled:
# tf.saved_model.save(tacotron2, "../tacotron2/inference", signatures=tacotron2.inference)
'--path_fs', default= "examples/fastspeech2_libritts/outdir_libri/checkpoints/model-855000.h5" ) parser.add_argument('--path_mb', default="checks/mb_melgan_or/mb.melgan-940k.h5") args = parser.parse_args() fastspeech2_config = AutoConfig.from_pretrained( 'examples/fastspeech2/conf/fastspeech2.v1.yaml') fastspeech2 = TFAutoModel.from_pretrained( config=fastspeech2_config, pretrained_path=args. path_fs, #"examples/fastspeech2_libritts/outdir_libri/checkpoints/model-855000.h5", #training=False, name="fastspeech2") mb_melgan_config = AutoConfig.from_pretrained( 'examples/multiband_melgan/conf/multiband_melgan.v1.yaml') mb_melgan = TFAutoModel.from_pretrained( config=mb_melgan_config, pretrained_path=args. path_mb, #"checks/mb_melgan_or/mb.melgan-940k.h5", name="mb_melgan") processor = AutoProcessor.from_pretrained( pretrained_path="dump_ljspeech/ljspeech_mapper.json") app.run(host='0.0.0.0', port=5454)
def __init_model(self):
    """Load the text processor from the mapper path in ``config``.

    Sets ``self.processor``.
    """
    mapper_path = config.baker_mapper_pretrained_path
    self.processor = AutoProcessor.from_pretrained(pretrained_path=mapper_path)
import sys


def main(argv):
    """Print the ID sequence the processor produces for a piece of text.

    Args:
        argv: Full argument vector: program name, mapper JSON path, text.

    Returns:
        ``0`` after printing the space-separated IDs to stdout, or ``2``
        when the argument count is wrong. In that case the usage line goes
        to stderr, so stdout carries nothing but IDs.
    """
    if len(argv) != 3:
        prog = argv[0] if argv else "<script>"
        print("usage: python3 {} mapper.json text(hanzi)".format(prog),
              file=sys.stderr)
        return 2

    # Imported late so a usage error does not pay for the TensorFlow import.
    from tensorflow_tts.inference import AutoProcessor

    mapper_json, input_text = argv[1], argv[2]
    processor = AutoProcessor.from_pretrained(pretrained_path=mapper_json)
    input_ids = processor.text_to_sequence(input_text, inference=True)
    print(" ".join(str(i) for i in input_ids))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
""" tacotron2_config = AutoConfig.from_pretrained( 'TensorFlowTTS/examples/tacotron2/conf/tacotron2.baker.v1.yaml') tacotron2 = TFAutoModel.from_pretrained(config=tacotron2_config, pretrained_path="tacotron2-100k.h5", training=False, name="tacotron2") mb_melgan_config = AutoConfig.from_pretrained( 'TensorFlowTTS/examples/multiband_melgan/conf/multiband_melgan.baker.v1.yaml' ) mb_melgan = TFAutoModel.from_pretrained(config=mb_melgan_config, pretrained_path="mb.melgan-920k.h5", name="mb_melgan") processor = AutoProcessor.from_pretrained( pretrained_path="./baker_mapper.json") def do_synthesis(input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name): input_ids = processor.text_to_sequence(input_text, inference=True) # text2mel part if text2mel_name == "TACOTRON": _, mel_outputs, stop_token_prediction, alignment_history = text2mel_model.inference( tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0), tf.convert_to_tensor([len(input_ids)], tf.int32), tf.convert_to_tensor([0], dtype=tf.int32)) elif text2mel_name == "FASTSPEECH2": mel_before, mel_outputs, duration_outputs, _, _ = text2mel_model.inference( tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
def test_auto_processor(mapper_path):
    """Check that a processor can be loaded from ``mapper_path``.

    The test passes as long as ``AutoProcessor.from_pretrained`` does not
    raise; the loaded object is not inspected.
    """
    loaded = AutoProcessor.from_pretrained(pretrained_path=mapper_path)
name="fastspeech2" ) # MB-MelGAN mb_melgan_config = AutoConfig.from_pretrained('examples/multiband_melgan/conf/multiband_melgan.baker.v1.yaml') mb_melgan = TFAutoModel.from_pretrained( config=mb_melgan_config, # pretrained_path="trained/mb.melgan.char-800k.h5", # "trained/mb.melgan-1M.h5" pretrained_path="trained/mb.melgan_word_428k.h5", # "trained/mb.melgan-1M.h5" # is_build=False, # don't build model if you want to save it to pb. (TF related bug) name="mb_melgan" ) # LJSpeechProcessor # processor = AutoProcessor.from_pretrained("trained/baker_mapper_char.json") processor = AutoProcessor.from_pretrained("trained/baker_mapper_word.json") # save tacotron2 to pb def save_tacotron2_pb(): input_text = "i love you so much." input_ids = processor.text_to_sequence(input_text) tacotron2.setup_window(win_front=3, win_back=3) tacotron2.setup_maximum_iterations(3000) decoder_output, mel_outputs, stop_token_prediction, alignment_history = tacotron2.inference( input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0), input_lengths=tf.convert_to_tensor([len(input_ids)], tf.int32), speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32), ) tacotron2.load_weights("examples/tacotron2/exp/baker.mix.ali/checkpoints/model-22000.h5")
def test_auto_processor(mapper_path):
    """Check that a processor survives a save/reload round trip.

    Loads from ``mapper_path``, saves into ``./test_saved`` and loads again
    from the ``processor.json`` expected there. The test passes as long as
    none of the calls raise.
    """
    save_dir = "./test_saved"

    original = AutoProcessor.from_pretrained(pretrained_path=mapper_path)
    original.save_pretrained(save_dir)

    reloaded = AutoProcessor.from_pretrained("./test_saved/processor.json")