def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features "
        "(see details in tensorflow_tts/bin/preprocess.py)."
    )
    parser.add_argument("--rootdir", default=None, type=str, required=True,
                        help="root path.")
    parser.add_argument("--outdir", default=None, type=str, required=True,
                        help="output dir.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--n_cpus",
        type=int,
        default=4,
        required=False,
        help="number of CPUs to use for multi-processing.",
    )
    parser.add_argument(
        "--test_size",
        type=float,
        default=0.05,
        required=False,
        help="the proportion of the dataset to include in the test split. (default=0.05)",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # Set logger.
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skipping DEBUG/INFO messages")

    # Load config.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    processor = LJSpeechProcessor(root_path=args.rootdir,
                                  cleaner_names="english_cleaners")

    # Check directory existence.
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir, exist_ok=True)
    for subdir in ["train", "valid"]:
        for feat_dir in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]:
            os.makedirs(os.path.join(args.outdir, subdir, feat_dir),
                        exist_ok=True)

    # Train / test split.
    idx_train, idx_valid = train_test_split(
        range(len(processor.items)),
        shuffle=True,
        test_size=args.test_size,
        random_state=42,
    )

    # Collect train/valid utt_ids.
    train_utt_ids = []
    valid_utt_ids = []
    for idx in range(len(processor.items)):
        utt_id = processor.get_one_sample(idx)["utt_id"]
        if idx in idx_train:
            train_utt_ids.append(utt_id)
        elif idx in idx_valid:
            valid_utt_ids.append(utt_id)

    # Save train and valid utt_ids to track later.
    np.save(os.path.join(args.outdir, "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(args.outdir, "valid_utt_ids.npy"), valid_utt_ids)

    pbar = tqdm(initial=0, total=len(processor.items), desc="[Preprocessing]")

    # Process each data item.
    def save_to_file(idx):
        sample = processor.get_one_sample(idx)

        # Get info from sample.
        audio = sample["audio"]
        text_ids = sample["text_ids"]
        utt_id = sample["utt_id"]
        rate = sample["rate"]

        # Sanity checks.
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be a multi-channel signal."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        assert rate == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."

        # Trim silence.
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"],
            )

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config["sampling_rate"]
            hop_size = config["hop_size"]
        else:
            x = librosa.resample(audio, rate, config["sampling_rate_for_feats"])
            sampling_rate = config["sampling_rate_for_feats"]
            assert (
                config["hop_size"] * config["sampling_rate_for_feats"] % rate == 0
            ), "hop_size must be an integer. Please check that sampling_rate_for_feats is correct."
            hop_size = config["hop_size"] * config["sampling_rate_for_feats"] // rate

        # Extract mel spectrogram.
        mel, x_stft = logmelfilterbank(
            x,
            sampling_rate=sampling_rate,
            hop_size=hop_size,
            fft_size=config["fft_size"],
            win_length=config["win_length"],
            window=config["window"],
            num_mels=config["num_mels"],
            fmin=config["fmin"],
            fmax=config["fmax"],
        )

        # Make sure the audio length matches the feature length.
        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        audio = audio[:len(mel) * config["hop_size"]]

        # Extract raw pitch. Use the local (possibly resampled) rate and hop
        # size so fs and frame_period match the signal actually passed in.
        f0, _ = pw.dio(
            x.astype(np.double),
            fs=sampling_rate,
            f0_ceil=config["fmax"],
            frame_period=1000 * hop_size / sampling_rate,
        )
        if len(f0) >= len(mel):
            f0 = f0[:len(mel)]
        else:
            f0 = np.pad(f0, (0, len(mel) - len(f0)))

        # Extract energy.
        S = librosa.magphase(x_stft)[0]
        energy = np.sqrt(np.sum(S ** 2, axis=0))

        assert len(mel) * config["hop_size"] == len(audio)
        assert len(mel) == len(f0) == len(energy)

        # Apply global gain.
        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]
        if np.abs(audio).max() >= 1.0:
            logging.warning(f"{utt_id} causes clipping. "
                            f"It is better to reconsider the global gain scale.")

        # Save features.
        if config["format"] == "npy":
            if idx in idx_train:
                subdir = "train"
            elif idx in idx_valid:
                subdir = "valid"
            np.save(
                os.path.join(args.outdir, subdir, "wavs", f"{utt_id}-wave.npy"),
                audio.astype(np.float32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, subdir, "raw-feats",
                             f"{utt_id}-raw-feats.npy"),
                mel.astype(np.float32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, subdir, "ids", f"{utt_id}-ids.npy"),
                text_ids.astype(np.int32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, subdir, "raw-f0",
                             f"{utt_id}-raw-f0.npy"),
                f0.astype(np.float32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, subdir, "raw-energies",
                             f"{utt_id}-raw-energy.npy"),
                energy.astype(np.float32),
                allow_pickle=False,
            )
        else:
            raise ValueError("Only the npy format is supported.")

        pbar.update(1)

    # Apply multi-processing Pool.
    p = Pool(nodes=args.n_cpus)
    p.map(save_to_file, range(len(processor.items)))
    pbar.close()
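
# Example invocation (the script name and dataset/config paths below are
# illustrative assumptions, not taken from this repo):
#
#   python preprocess.py \
#       --rootdir ./datasets/LJSpeech-1.1 \
#       --outdir ./dump \
#       --config ./preprocess_config.yaml \
#       --n_cpus 4 --test_size 0.05

# Entry point, assuming this module is run as a script (the original file
# footer is not shown in this excerpt):
if __name__ == "__main__":
    main()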
class TTS(object):
    """Initializes a TTS model and vocoder to consume strings and return .wav files."""

    def __init__(self, tts="fastspeech2", generator="multiband_melgan_generator"):
        # Map model names to (loader, inference fn, tflite path) for TTS
        # models and (loader, tflite dir) for vocoders.
        CONFIG_MAPPING = OrderedDict([
            ("fastspeech2",
             (self._load_fastspeech2, self._infer_fastspeech2,
              FASTSPEECH2_TFLITE_PATH)),
            ("multiband_melgan_generator",
             (self._load_mb_melgan, OUT_MB_MELGAN_TFLITE_DIR)),
            ("multiband_melgan2_generator",
             (self._load_mb_melgan2, OUT_MB_MELGAN2_TFLITE_DIR)),
            ("melgan_generator",
             (self._load_melgan, OUT_MELGAN_TFLITE_DIR)),
            ("tacotron2",
             (self._load_tacotron, self._infer_tacotron2,
              TACOTRON_TFLITE_PATH)),
        ])
        try:
            _tts, _inference, _tflite_path = CONFIG_MAPPING[tts]
            _generator, _mel_tflite_path = CONFIG_MAPPING[generator]
        except (KeyError, ValueError):
            # KeyError: unknown model name; ValueError: a vocoder name was
            # passed as tts (or vice versa), so the tuple arity is wrong.
            raise ValueError("Unrecognized tts ({}) or generator ({}). "
                             "Supported models are: {}".format(
                                 tts, generator,
                                 ", ".join(CONFIG_MAPPING.keys())))

        # self._tts = _tts()  # TTS model, unused when the tflite model is used.
        self._generator = _generator()  # MelGAN vocoder.
        self._inference = _inference  # TTS inference function.
        self._processor = LJSpeechProcessor(None, symbols=LJSPEECH_SYMBOLS)

        self._interpreter = tf.lite.Interpreter(model_path=_tflite_path)
        self._interpreter.allocate_tensors()

        # Get input and output tensors.
        self._input_details = self._interpreter.get_input_details()
        self._output_details = self._interpreter.get_output_details()

        config = os.path.join(_mel_tflite_path, 'config.yml')
        with open(config) as f:
            melgan_config = yaml.load(f, Loader=yaml.Loader)
        self._sampling_rate = melgan_config["sampling_rate"]

    def _load_melgan(self, path='./model_files/melgan'):
        # Initialize MelGAN model for vocoding.
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            melgan_config = yaml.load(f, Loader=yaml.Loader)
        melgan_config = MelGANGeneratorConfig(
            **melgan_config["generator_params"])
        melgan = TFMelGANGenerator(config=melgan_config,
                                   name='melgan_generator')
        melgan._build()
        weights = os.path.join(path, 'generator-1670000.h5')
        melgan.load_weights(weights)
        return melgan

    def _load_mb_melgan(self, path='./model_files/multiband_melgan'):
        # Initialize multi-band MelGAN model for vocoding.
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            melgan_config = yaml.load(f, Loader=yaml.Loader)
        melgan_config = MultiBandMelGANGeneratorConfig(
            **melgan_config["generator_params"])
        melgan = TFMBMelGANGenerator(config=melgan_config,
                                     name='melgan_generator')
        melgan._build()
        weights = os.path.join(path, 'generator-940000.h5')
        melgan.load_weights(weights)
        return melgan

    def _load_mb_melgan2(self, path='./model_files/multiband_melgan2'):
        # Initialize multi-band MelGAN (LibriTTS 24 kHz) model for vocoding.
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            melgan_config = yaml.load(f, Loader=yaml.Loader)
        melgan_config = MultiBandMelGANGeneratorConfig(
            **melgan_config["multiband_melgan_generator_params"])
        melgan = TFMBMelGANGenerator(config=melgan_config,
                                     name='melgan_generator')
        melgan._build()
        weights = os.path.join(path, 'libritts_24k.h5')
        melgan.load_weights(weights)
        return melgan

    def _load_fastspeech2(self, path='./model_files/fastspeech2'):
        # Initialize FastSpeech2 model.
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            config = yaml.load(f, Loader=yaml.Loader)
        config = FastSpeech2Config(**config["fastspeech_params"])
        fastspeech2 = TFFastSpeech2(config=config,
                                    name="fastspeech2v1",
                                    enable_tflite_convertible=True)
        fastspeech2._build()
        weights = os.path.join(path, 'model-150000.h5')
        fastspeech2.load_weights(weights)
        print(fastspeech2.summary())
        return fastspeech2
    def _load_tacotron(self, path=OUT_TACOTRON_TFLITE_DIR):
        # Initialize Tacotron2 model.
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            config = yaml.load(f, Loader=yaml.Loader)
        config = Tacotron2Config(**config["tacotron2_params"])
        tacotron2 = TFTacotron2(config=config,
                                training=False,
                                name="tacotron2v1",
                                enable_tflite_convertible=True)

        # Newly added:
        tacotron2.setup_window(win_front=6, win_back=6)
        tacotron2.setup_maximum_iterations(3000)

        tacotron2._build()
        weights = os.path.join(path, 'model-120000.h5')
        tacotron2.load_weights(weights)
        print(tacotron2.summary())
        return tacotron2

    def _generate_tflite(self, model, out_file, out_dir):
        # Get the concrete function for TF Lite conversion.
        model_concrete_function = model.inference_tflite.get_concrete_function()
        converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [model_concrete_function])
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
        ]
        tflite_model = converter.convert()

        # Save the TF Lite model.
        out = os.path.join(out_dir, out_file)
        with open(out, 'wb') as f:
            f.write(tflite_model)
        print('Model size is %f MBs.' % (len(tflite_model) / 1024 / 1024.0))

    def generate_fastspeech2_tflite(self):
        model = self._load_fastspeech2()
        self._generate_tflite(model, OUT_FASTSPEECH2_TFLITE_FILE,
                              OUT_FASTSPEECH2_TFLITE_DIR)

    def generate_tacotron_tflite(self):
        model = self._load_tacotron()
        self._generate_tflite(model, OUT_TACOTRON_TFLITE_FILE,
                              OUT_TACOTRON_TFLITE_DIR)

    def generate_melgan_tflite(self):
        # Was self._load_tacotron(), a copy-paste bug.
        model = self._load_melgan()
        self._generate_tflite(model, OUT_MELGAN_TFLITE_FILE,
                              OUT_MELGAN_TFLITE_DIR)

    def generate_multiband_melgan_tflite(self):
        # Was self._load_tacotron(), a copy-paste bug.
        model = self._load_mb_melgan()
        self._generate_tflite(model, OUT_MB_MELGAN_TFLITE_FILE,
                              OUT_MB_MELGAN_TFLITE_DIR)

    # Prepare input data.
    def _prepare_input_tacotron2(self, input_ids):
        return (tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
                tf.convert_to_tensor([len(input_ids)], tf.int32),
                tf.convert_to_tensor([0], dtype=tf.int32))

    def _prepare_input_fastspeech2(self, input_ids):
        input_ids = tf.expand_dims(
            tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
        return (input_ids,
                tf.convert_to_tensor([0], tf.int32),
                tf.convert_to_tensor([1.0], dtype=tf.float32),
                tf.convert_to_tensor([1.0], dtype=tf.float32),
                tf.convert_to_tensor([1.0], dtype=tf.float32))

    def _infer_fastspeech2(self, input_text, interpreter, input_details,
                           output_details):
        # NOTE: FOR DEBUGGING
        # for x in input_details:
        #     print(x)
        # for x in output_details:
        #     print(x)
        input_ids = self._processor.text_to_sequence(input_text.lower())
        interpreter.resize_tensor_input(input_details[0]['index'],
                                        [1, len(input_ids)])
        interpreter.resize_tensor_input(input_details[1]['index'], [1])
        interpreter.resize_tensor_input(input_details[2]['index'], [1])
        interpreter.resize_tensor_input(input_details[3]['index'], [1])
        interpreter.resize_tensor_input(input_details[4]['index'], [1])
        interpreter.allocate_tensors()
        input_data = self._prepare_input_fastspeech2(input_ids)
        for i, detail in enumerate(input_details):
            interpreter.set_tensor(detail['index'], input_data[i])
        interpreter.invoke()

        # `get_tensor()` returns a copy of the tensor data;
        # use `tensor()` to get a pointer to the tensor instead.
        return (interpreter.get_tensor(output_details[0]['index']),
                interpreter.get_tensor(output_details[1]['index']))

    def _infer_tacotron2(self, input_text, interpreter, input_details,
                         output_details):
        input_ids = self._processor.text_to_sequence(input_text.lower())
        # Append the eos symbol.
        input_ids = np.concatenate([input_ids, [len(LJSPEECH_SYMBOLS) - 1]], -1)
        interpreter.resize_tensor_input(input_details[0]['index'],
                                        [1, len(input_ids)])
        interpreter.allocate_tensors()
        input_data = self._prepare_input_tacotron2(input_ids)
        for i, detail in enumerate(input_details):
            # NOTE: FOR DEBUGGING
            # print(detail)
            interpreter.set_tensor(detail['index'], input_data[i])
        interpreter.invoke()

        # `get_tensor()` returns a copy of the tensor data;
        # use `tensor()` to get a pointer to the tensor instead.
        return (interpreter.get_tensor(output_details[0]['index']),
                interpreter.get_tensor(output_details[1]['index']))

    def run_inference(self, input_text, out_file_name):
        _, mel_output_tflite = self._inference(input_text, self._interpreter,
                                               self._input_details,
                                               self._output_details)
        audio_after_tflite = self._generator(mel_output_tflite)[0, :, 0]
        sf.write('{}.wav'.format(out_file_name), audio_after_tflite,
                 self._sampling_rate)

# NOTE: COLAB EXAMPLE FOR INFERENCE FROM TFLITE MODELS:
# https://colab.research.google.com/drive/1HudLLpT9CQdh2k04c06bHUwLubhGTWxA?usp=sharing
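
# A minimal usage sketch, assuming the model_files/ directories and the
# TFLite artifacts referenced above are already in place; the input text
# and output basename below are illustrative:
#
#   tts = TTS(tts="fastspeech2", generator="multiband_melgan_generator")
#   tts.run_inference("Hello world.", "hello_world")  # writes hello_world.wav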