Example #1
import argparse
import logging
import os

import librosa
import numpy as np
import pyworld as pw
import yaml
from pathos.multiprocessing import ProcessingPool as Pool
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# NOTE: this import path follows the TensorFlowTTS package layout and may need
# adjusting for your version; logmelfilterbank is defined alongside main() in
# tensorflow_tts/bin/preprocess.py and is therefore not imported here.
from tensorflow_tts.processor import LJSpeechProcessor


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description=
        "Preprocess audio and then extract features (See detail in tensorflow_tts/bin/preprocess.py)."
    )
    parser.add_argument("--rootdir",
                        default=None,
                        type=str,
                        required=True,
                        help="root path.")
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        required=True,
                        help="output dir.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--n_cpus",
        type=int,
        default=4,
        required=False,
        help="number of CPUs to use for multi-processing.",
    )
    parser.add_argument(
        "--test_size",
        type=float,
        default=0.05,
        required=False,
        help=
        "the proportion of the dataset to include in the test split. (default=0.05)",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    processor = LJSpeechProcessor(root_path=args.rootdir,
                                  cleaner_names="english_cleaners")

    # check directory existence: build the output tree
    # {train,valid}/{raw-feats,wavs,ids,raw-f0,raw-energies} up front.
    # exist_ok=True makes this safe even when args.outdir already exists,
    # whereas guarding on os.path.exists(args.outdir) would skip creating
    # missing subdirectories inside a pre-existing output directory.
    for split in ["train", "valid"]:
        for feat in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]:
            os.makedirs(os.path.join(args.outdir, split, feat), exist_ok=True)

    # train test split
    idx_train, idx_valid = train_test_split(
        range(len(processor.items)),
        shuffle=True,
        test_size=args.test_size,
        random_state=42,
    )

    # train/valid utt_ids
    train_utt_ids = []
    valid_utt_ids = []

    for idx in range(len(processor.items)):
        utt_ids = processor.get_one_sample(idx)["utt_id"]
        if idx in idx_train:
            train_utt_ids.append(utt_ids)
        elif idx in idx_valid:
            valid_utt_ids.append(utt_ids)

    # save train and valid utt_ids to track later.
    np.save(os.path.join(args.outdir, "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(args.outdir, "valid_utt_ids.npy"), valid_utt_ids)

    pbar = tqdm(initial=0, total=len(processor.items), desc="[Preprocessing]")

    # process each sample: extract features and save them to disk
    def save_to_file(idx):
        sample = processor.get_one_sample(idx)

        # get info from sample.
        audio = sample["audio"]
        text_ids = sample["text_ids"]
        utt_id = sample["utt_id"]
        rate = sample["rate"]

        # check
        assert len(
            audio.shape) == 1, f"{utt_id} seems to be multi-channel signal."
        assert (np.abs(audio).max() <=
                1.0), f"{utt_id} seems to be different from 16 bit PCM."
        assert (rate == config["sampling_rate"]
                ), f"{utt_id} seems to have a different sampling rate."

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"],
            )

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config["sampling_rate"]
            hop_size = config["hop_size"]
        else:
            x = librosa.resample(audio, rate,
                                 config["sampling_rate_for_feats"])
            sampling_rate = config["sampling_rate_for_feats"]
            assert (
                config["hop_size"] * config["sampling_rate_for_feats"] %
                rate == 0
            ), "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config["hop_size"] * config[
                "sampling_rate_for_feats"] // rate

        # extract feature
        mel, x_stft = logmelfilterbank(
            x,
            sampling_rate=sampling_rate,
            hop_size=hop_size,
            fft_size=config["fft_size"],
            win_length=config["win_length"],
            window=config["window"],
            num_mels=config["num_mels"],
            fmin=config["fmin"],
            fmax=config["fmax"],
        )

        # make sure the audio length matches the feature length
        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        audio = audio[:len(mel) * config["hop_size"]]

        # extract raw pitch
        f0, _ = pw.dio(
            x.astype(np.double),
            fs=config["sampling_rate"],
            f0_ceil=config["fmax"],
            frame_period=1000 * config["hop_size"] / config["sampling_rate"],
        )

        if len(f0) >= len(mel):
            f0 = f0[:len(mel)]
        else:
            f0 = np.pad(f0, ((0, len(mel) - len(f0))))

        # extract energy
        S = librosa.magphase(x_stft)[0]
        energy = np.sqrt(np.sum(S**2, axis=0))

        assert len(mel) * config["hop_size"] == len(audio)
        assert len(mel) == len(f0) == len(energy)

        # apply global gain
        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]
        if np.abs(audio).max() >= 1.0:
            logging.warning(f"{utt_id} causes clipping. "
                            f"It is better to reconsider the global gain scale.")

        # save
        if config["format"] == "npy":
            if idx in idx_train:
                subdir = "train"
            elif idx in idx_valid:
                subdir = "valid"

            np.save(
                os.path.join(args.outdir, subdir, "wavs",
                             f"{utt_id}-wave.npy"),
                audio.astype(np.float32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, subdir, "raw-feats",
                             f"{utt_id}-raw-feats.npy"),
                mel.astype(np.float32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, subdir, "ids", f"{utt_id}-ids.npy"),
                text_ids.astype(np.int32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, subdir, "raw-f0",
                             f"{utt_id}-raw-f0.npy"),
                f0.astype(np.float32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, subdir, "raw-energies",
                             f"{utt_id}-raw-energy.npy"),
                energy.astype(np.float32),
                allow_pickle=False,
            )
        else:
            raise ValueError("Only npy format is supported.")

        pbar.update(1)

    # run save_to_file over all samples with a multiprocessing pool
    p = Pool(nodes=args.n_cpus)
    p.map(save_to_file, range(len(processor.items)))
    pbar.close()
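
Since everything in main() is driven by argparse flags, a typical invocation might look like the following sketch (paths and file names are illustrative, not taken from the source):

# Hypothetical command line, assuming the script above is saved as
# preprocess.py and the YAML config defines sampling_rate, hop_size,
# fft_size, num_mels, trim_silence, format, etc.:
#
#   python preprocess.py \
#       --rootdir ./datasets/LJSpeech-1.1 \
#       --outdir ./dump \
#       --config ./conf/preprocess.yaml \
#       --n_cpus 4 --test_size 0.05 --verbose 1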
Example #2
import os
from collections import OrderedDict

import numpy as np
import soundfile as sf
import tensorflow as tf
import yaml

# NOTE: these import paths follow the TensorFlowTTS package layout and may
# need adjusting for your version; the *_TFLITE_PATH / *_TFLITE_DIR constants,
# LJSPEECH_SYMBOLS, and the weight file names are defined elsewhere in the
# original module and are kept as-is here.
from tensorflow_tts.configs import (FastSpeech2Config, MelGANGeneratorConfig,
                                    MultiBandMelGANGeneratorConfig,
                                    Tacotron2Config)
from tensorflow_tts.models import (TFFastSpeech2, TFMBMelGANGenerator,
                                   TFMelGANGenerator, TFTacotron2)
from tensorflow_tts.processor import LJSpeechProcessor


class TTS(object):
    """Initializes a TTS and Vocoder model to consume strings and return .wav files."""
    def __init__(self,
                 tts="fastspeech2",
                 generator="multiband_melgan_generator"):
        CONFIG_MAPPING = OrderedDict([
            ("fastspeech2", (self._load_fastspeech2, self._infer_fastspeech2,
                             FASTSPEECH2_TFLITE_PATH)),
            ("multiband_melgan_generator", (self._load_mb_melgan,
                                            OUT_MB_MELGAN_TFLITE_DIR)),
            ("multiband_melgan2_generator", (self._load_mb_melgan2,
                                             OUT_MB_MELGAN2_TFLITE_DIR)),
            ("melgan_generator", (self._load_melgan, OUT_MELGAN_TFLITE_DIR)),
            ("tacotron2", (self._load_tacotron, self._infer_tacotron2,
                           TACOTRON_TFLITE_PATH)),
        ])
        try:
            _tts, _inference, _tflite_path = CONFIG_MAPPING[tts]
            _generator, _mel_tflite_path = CONFIG_MAPPING[generator]
        except (KeyError, ValueError):
            raise ValueError("Unrecognized tts ({}) or generator ({}). "
                             "Supported models are: {}".format(
                                 tts, generator,
                                 ", ".join(CONFIG_MAPPING.keys())))
        # self._tts = _tts() # TTS Model, unused if we use tflite model
        self._generator = _generator()  # MelGan Vocoder
        self._inference = _inference  # TTS Inference function call

        self._processor = LJSpeechProcessor(None, symbols=LJSPEECH_SYMBOLS)
        self._interpreter = tf.lite.Interpreter(model_path=_tflite_path)
        self._interpreter.allocate_tensors()

        # Get input and output tensors.
        self._input_details = self._interpreter.get_input_details()
        self._output_details = self._interpreter.get_output_details()

        config = os.path.join(_mel_tflite_path, 'config.yml')

        with open(config) as f:
            melgan_config = yaml.load(f, Loader=yaml.Loader)
        self._sampling_rate = melgan_config["sampling_rate"]

    def _load_melgan(self, path='./model_files/melgan'):
        # initialize melgan model for vocoding
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            melgan_config = yaml.load(f, Loader=yaml.Loader)
        melgan_config = MelGANGeneratorConfig(
            **melgan_config["generator_params"])
        melgan = TFMelGANGenerator(config=melgan_config,
                                   name='melgan_generator')
        melgan._build()
        weights = os.path.join(path, 'generator-1670000.h5')
        melgan.load_weights(weights)
        return melgan

    def _load_mb_melgan(self, path='./model_files/multiband_melgan'):
        # initialize melgan model for vocoding
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            melgan_config = yaml.load(f, Loader=yaml.Loader)
        melgan_config = MultiBandMelGANGeneratorConfig(
            **melgan_config["generator_params"])
        melgan = TFMBMelGANGenerator(config=melgan_config,
                                     name='melgan_generator')
        melgan._build()
        weights = os.path.join(path, 'generator-940000.h5')
        melgan.load_weights(weights)
        return melgan

    def _load_mb_melgan2(self, path='./model_files/multiband_melgan2'):
        # initialize melgan model for vocoding
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            melgan_config = yaml.load(f, Loader=yaml.Loader)
        melgan_config = MultiBandMelGANGeneratorConfig(
            **melgan_config["multiband_melgan_generator_params"])
        melgan = TFMBMelGANGenerator(config=melgan_config,
                                     name='melgan_generator')
        melgan._build()
        weights = os.path.join(path, 'libritts_24k.h5')
        melgan.load_weights(weights)
        return melgan

    def _load_fastspeech2(self, path='./model_files/fastspeech2'):
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            config = yaml.load(f, Loader=yaml.Loader)
        config = FastSpeech2Config(**config["fastspeech_params"])
        fastspeech2 = TFFastSpeech2(config=config,
                                    name="fastspeech2v1",
                                    enable_tflite_convertible=True)

        fastspeech2._build()
        weights = os.path.join(path, 'model-150000.h5')
        fastspeech2.load_weights(weights)
        fastspeech2.summary()  # prints the model summary (returns None)
        return fastspeech2

    def _load_tacotron(self, path=OUT_TACOTRON_TFLITE_DIR):
        # initialize Tacotron2 model.
        config = os.path.join(path, 'config.yml')
        with open(config) as f:
            config = yaml.load(f, Loader=yaml.Loader)
        config = Tacotron2Config(**config["tacotron2_params"])
        tacotron2 = TFTacotron2(config=config,
                                training=False,
                                name="tacotron2v1",
                                enable_tflite_convertible=True)

        # Newly added :
        tacotron2.setup_window(win_front=6, win_back=6)
        tacotron2.setup_maximum_iterations(3000)

        tacotron2._build()
        weights = os.path.join(path, 'model-120000.h5')
        tacotron2.load_weights(weights)
        tacotron2.summary()  # prints the model summary (returns None)
        return tacotron2

    def _generate_tflite(self, model, out_file, out_dir):
        # Concrete Function
        model_concrete_function = model.inference_tflite.get_concrete_function(
        )
        converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [model_concrete_function])
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
        ]
        tflite_model = converter.convert()
        out = os.path.join(out_dir, out_file)
        # Save the TF Lite model.
        with open(out, 'wb') as f:
            f.write(tflite_model)
        print('Model size is %f MBs.' % (len(tflite_model) / 1024 / 1024.0))

    def generate_fastspeech2_tflite(self):
        model = self._load_fastspeech2()
        self._generate_tflite(model, OUT_FASTSPEECH2_TFLITE_FILE,
                              OUT_FASTSPEECH2_TFLITE_DIR)

    def generate_tacotron_tflite(self):
        model = self._load_tacotron()
        self._generate_tflite(model, OUT_TACOTRON_TFLITE_FILE,
                              OUT_TACOTRON_TFLITE_DIR)

    def generate_melgan_tflite(self):
        model = self._load_melgan()
        self._generate_tflite(model, OUT_MELGAN_TFLITE_FILE,
                              OUT_MELGAN_TFLITE_DIR)

    def generate_multiband_melgan_tflite(self):
        model = self._load_mb_melgan()
        self._generate_tflite(model, OUT_MB_MELGAN_TFLITE_FILE,
                              OUT_MB_MELGAN_TFLITE_DIR)

    # Prepare input data.
    def _prepare_input_tacotron2(self, input_ids):
        return (tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32),
                               0),
                tf.convert_to_tensor([len(input_ids)], tf.int32),
                tf.convert_to_tensor([0], dtype=tf.int32))

    def _prepare_input_fastspeech2(self, input_ids):
        input_ids = tf.expand_dims(
            tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
        return (input_ids, tf.convert_to_tensor([0], tf.int32),
                tf.convert_to_tensor([1.0], dtype=tf.float32),
                tf.convert_to_tensor([1.0], dtype=tf.float32),
                tf.convert_to_tensor([1.0], dtype=tf.float32))

    def _infer_fastspeech2(self, input_text, interpreter, input_details,
                           output_details):
        # NOTE: FOR DEBUGGING
        # for x in input_details:
        #     print(x)
        # for x in output_details:
        #     print(x)
        input_ids = self._processor.text_to_sequence(input_text.lower())
        interpreter.resize_tensor_input(input_details[0]['index'],
                                        [1, len(input_ids)])
        interpreter.resize_tensor_input(input_details[1]['index'], [1])
        interpreter.resize_tensor_input(input_details[2]['index'], [1])
        interpreter.resize_tensor_input(input_details[3]['index'], [1])
        interpreter.resize_tensor_input(input_details[4]['index'], [1])
        interpreter.allocate_tensors()
        input_data = self._prepare_input_fastspeech2(input_ids)
        for i, detail in enumerate(input_details):
            interpreter.set_tensor(detail['index'], input_data[i])

        interpreter.invoke()

        # The function `get_tensor()` returns a copy of the tensor data.
        # Use `tensor()` in order to get a pointer to the tensor.
        return (interpreter.get_tensor(output_details[0]['index']),
                interpreter.get_tensor(output_details[1]['index']))

    def _infer_tacotron2(self, input_text, interpreter, input_details,
                         output_details):
        input_ids = self._processor.text_to_sequence(input_text.lower())
        # eos.
        input_ids = np.concatenate([input_ids, [len(LJSPEECH_SYMBOLS) - 1]],
                                   -1)
        interpreter.resize_tensor_input(input_details[0]['index'],
                                        [1, len(input_ids)])
        interpreter.allocate_tensors()
        input_data = self._prepare_input_tacotron2(input_ids)
        for i, detail in enumerate(input_details):
            # NOTE: FOR DEBUGGING
            # print(detail)
            interpreter.set_tensor(detail['index'], input_data[i])

        interpreter.invoke()

        # The function `get_tensor()` returns a copy of the tensor data.
        # Use `tensor()` in order to get a pointer to the tensor.
        return (interpreter.get_tensor(output_details[0]['index']),
                interpreter.get_tensor(output_details[1]['index']))

    def run_inference(self, input_text, out_file_name):
        _, mel_output_tflite = self._inference(input_text, self._interpreter,
                                               self._input_details,
                                               self._output_details)
        audio_after_tflite = self._generator(mel_output_tflite)[0, :, 0]
        sf.write('{}.wav'.format(out_file_name), audio_after_tflite,
                 self._sampling_rate)


# NOTE: COLAB EXAMPLE FOR INFERENCE FROM TFLITE MODELS:
# https://colab.research.google.com/drive/1HudLLpT9CQdh2k04c06bHUwLubhGTWxA?usp=sharing
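
A minimal usage sketch for the class above (assuming the TFLite model files and
the model_files directories referenced by the path constants exist locally):

if __name__ == '__main__':
    # Build the pipeline with the default FastSpeech2 + Multi-band MelGAN
    # pair, then synthesize one sentence; run_inference writes out_demo.wav.
    tts = TTS(tts="fastspeech2", generator="multiband_melgan_generator")
    tts.run_inference("Hello world, synthesized from a TFLite model.",
                      "out_demo")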