コード例 #1
0
    def __init__(self, args):
        """Read the model configurations named by ``args`` and build each model.

        ``args`` must provide ``am_config``/``am_model``, ``iam_config``/
        ``iam_model``, ``sv_config``/``sv_model`` and ``vc_config``/
        ``vc_model``. The speech configuration referenced by the
        ``speech_config`` entry of the AM config is merged into the IAM, SV
        and VC configurations before the corresponding models are built.
        """
        super().__init__()

        def read_yaml(path):
            # Every configuration file is parsed with the full PyYAML loader.
            with open(path) as stream:
                return yaml.load(stream, Loader=yaml.Loader)

        # The AM config is only consulted here for the speech config path;
        # build_am() re-reads the AM config from its path.
        am_settings = read_yaml(args.am_config)
        self.speech_config = read_yaml(am_settings['speech_config'])
        self.speech_featurizer = TFSpeechFeaturizer(self.speech_config)
        self.am = self.build_am(args.am_config, args.am_model)

        # IAM: speech settings override the file's own values, then the
        # derived 'n_mels' and 'hop_size' entries are filled in.
        iam_settings = read_yaml(args.iam_config)
        iam_settings.update(self.speech_config)
        iam_settings['n_mels'] = iam_settings['asr_features']
        iam_settings['hop_size'] = (
            iam_settings['asr_downsample'] * iam_settings['sample_rate'] *
            iam_settings['stride_ms'] // 1000)
        self.iam, self.pqmf = self.build_iam(iam_settings, args.iam_model)

        # SV: only the speech settings are merged in.
        sv_settings = read_yaml(args.sv_config)
        sv_settings.update(self.speech_config)
        self.sv = self.build_sv(sv_settings, args.sv_model)

        # VC: hop size is the stride expressed in samples; 'sampling_rate'
        # mirrors 'sample_rate' under a second key.
        vc_settings = read_yaml(args.vc_config)
        vc_settings.update(self.speech_config)
        vc_settings['hop_size'] = (
            vc_settings['sample_rate'] * vc_settings['stride_ms'] // 1000)
        vc_settings['sampling_rate'] = vc_settings['sample_rate']
        self.vc = self.build_vc(vc_settings, args.vc_model)
コード例 #2
0
def main(argv):
    """Compare the NumPy and TensorFlow featurizers on one audio file.

    ``argv[1]`` is the audio file path and ``argv[2]`` the feature type put
    into the featurizer configuration. The shape and mean of each
    featurizer's STFT are printed, then the features extracted by the
    TensorFlow featurizer are drawn as a spectrogram figure.
    """
    audio_path, feature_kind = argv[1], argv[2]

    featurizer_conf = dict(
        sample_rate=16000,
        frame_ms=25,
        stride_ms=10,
        feature_type=feature_kind,
        preemphasis=0.97,
        normalize_signal=True,
        normalize_feature=True,
        normalize_per_feature=False,
        num_feature_bins=80,
    )
    samples = read_raw_audio(audio_path, featurizer_conf["sample_rate"])

    numpy_featurizer = NumpySpeechFeaturizer(featurizer_conf)
    tf_featurizer = TFSpeechFeaturizer(featurizer_conf)

    # Print the same summary for both implementations so they can be compared.
    numpy_stft = numpy_featurizer.stft(samples)
    print(numpy_stft.shape, np.mean(numpy_stft))
    tf_stft = tf_featurizer.stft(samples).numpy()
    print(tf_stft.shape, np.mean(tf_stft))

    features = tf_featurizer.extract(samples)

    plt.figure(figsize=(16, 2.5))
    axes = plt.gca()
    axes.set_title(f"{feature_kind}", fontweight="bold")
    librosa.display.specshow(features.T, cmap="magma")
    # Eight evenly spaced colour-bar ticks spanning the feature value range.
    tick_values = np.linspace(features.min(), features.max(), 8, endpoint=True)
    plt.colorbar(pad=0.01,
                 fraction=0.02,
                 ax=axes,
                 format="%.2f",
                 ticks=tick_values)
    plt.tight_layout()
コード例 #3
0
    def __init__(self, path='ConformerS.h5'):
        """Build the Conformer model and load its weights from ``path``.

        When ``path`` does not exist, the weights file is first downloaded
        from Google Drive using the ``file_id`` stored in the configuration.
        The built model, with both featurizers attached, is kept in
        ``self.model``.
        """
        model_cfg = Config('tamil_tech/configs/conformer_new_config.yml', learning=True)

        # Featurizers for the audio input and the character-level output.
        audio_featurizer = TFSpeechFeaturizer(model_cfg.speech_config)
        char_featurizer = CharFeaturizer(model_cfg.decoder_config)

        # Fetch the weights only when they are not already on disk.
        if not os.path.exists(path):
            print("Downloading Model...")
            download_file_from_google_drive(model_cfg.file_id, path)
            print("Downloaded Model Successfully...")

        self.model = Conformer(
            **model_cfg.model_config,
            vocabulary_size=char_featurizer.num_classes,
        )
        # The model has to be built for the featurizer's shape before the
        # weights can be loaded by layer name.
        self.model._build(audio_featurizer.shape)
        self.model.load_weights(path, by_name=True)
        self.model.summary(line_length=120)
        self.model.add_featurizers(audio_featurizer, char_featurizer)

        print("Loaded Model...!")
コード例 #4
0
ファイル: test_ds2.py プロジェクト: wxqwinner/TensorFlowASR
def test_ds2():
    """Build DeepSpeech2 from the default config and convert it to TFLite.

    The conversion is run twice, once for the beam-search TFLite function
    (``greedy=False``) and once for the greedy one (``greedy=True``); the
    test passes when neither conversion raises.
    """
    cfg = Config(DEFAULT_YAML)
    chars = CharFeaturizer(cfg.decoder_config)
    speech = TFSpeechFeaturizer(cfg.speech_config)

    ds2 = DeepSpeech2(vocabulary_size=chars.num_classes, **cfg.model_config)
    ds2._build(speech.shape)
    ds2.summary(line_length=150)
    ds2.add_featurizers(speech_featurizer=speech, text_featurizer=chars)

    # (greedy flag, message printed once that conversion has succeeded)
    conversions = (
        (False, "Converted successfully with beam search"),
        (True, "Converted successfully with greedy"),
    )
    for use_greedy, done_message in conversions:
        tflite_fn = ds2.make_tflite_function(
            greedy=use_greedy).get_concrete_function()
        tflite_converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [tflite_fn])
        tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
        tflite_converter.experimental_new_converter = True
        tflite_converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS,
            tf.lite.OpsSet.SELECT_TF_OPS,
        ]
        tflite_converter.convert()

        print(done_message)
コード例 #5
0
def test_featurizer():
    """Check that SentencePiece and subword featurizers yield compatible batches.

    The same transcript file is read through ``ASRSliceDataset`` once with a
    ``SentencePieceFeaturizer`` and once with a ``SubwordFeaturizer``. The
    first batch of each must have the same number of elements, and the first
    element of each batch must agree in shape and dtype.
    """
    config = {
        "output_path_prefix":
        "/data/models/asr/conformer_sentencepiece_subword",
        "model_type":
        "unigram",
        "target_vocab_size":
        8000,
        "blank_at_zero":
        True,
        "beam_width":
        5,
        "norm_score":
        True,
        # One entry per corpus. The commas matter: adjacent string literals
        # without them are concatenated into a single bogus path.
        "corpus_files": [
            "/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-clean-360/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-other-500/transcripts.tsv",
        ]
    }

    config_speech = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        'feature_type': "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False
    }

    text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(
        config, None)
    # The subword vocabulary lives two directories above this test file.
    subwords_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                 os.pardir, os.pardir, "vocabularies",
                                 "librispeech_train_4_1030.subwords")
    text_featurizer_subwords = SubwordFeaturizer.load_from_file(
        config, subwords_path)
    speech_featurizer = TFSpeechFeaturizer(config_speech)
    data_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             "transcripts_librispeech_train_clean_100.tsv")

    def get_data(featurizer: TextFeaturizer):
        """Return the first batch (batch size 1) produced with ``featurizer``."""
        train_dataset = ASRSliceDataset(data_paths=[data_path],
                                        speech_featurizer=speech_featurizer,
                                        text_featurizer=featurizer,
                                        stage="train",
                                        shuffle=False)
        train_data = train_dataset.create(1)
        return next(iter(train_data))

    data_sentencepiece = get_data(text_featurizer_sentencepiece)
    data_subwords = get_data(text_featurizer_subwords)

    assert len(data_sentencepiece) == len(data_subwords)
    assert data_sentencepiece[0].shape == data_subwords[0].shape
    assert data_sentencepiece[0].dtype == data_subwords[0].dtype
コード例 #6
0
def test_iextract():
    """Check that ``iextract`` turns encoded labels back into the transcript.

    The first batch (batch size 1) of the test dataset is decoded with the
    SentencePiece featurizer, and the result is compared with the lower-cased
    text of the first line of the ``.trans.txt`` file derived from the
    batch's audio file path.
    """
    config = {
        "output_path_prefix": "/data/models/asr/conformer_sentencepiece_subword",
        "model_type": "unigram",
        "target_vocab_size": 8000,
        "blank_at_zero": True,
        "beam_width": 5,
        "norm_score": True,
        # One entry per corpus. The commas matter: adjacent string literals
        # without them are concatenated into a single bogus path.
        "corpus_files": [
            "/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-clean-360/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-other-500/transcripts.tsv",
        ]}

    config_speech = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        'feature_type': "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_frame": False}

    text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(config, None)
    speech_featurizer = TFSpeechFeaturizer(config_speech)
    data_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "transcripts_librispeech_train_clean_100.tsv")

    train_dataset = ASRSliceTestDataset(
        data_paths=[data_path],
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer_sentencepiece,
        stage="train",
        shuffle=False
    )
    train_data = train_dataset.create(1)
    batch = next(iter(train_data))
    file_paths, features, input_length, labels = batch
    labels = text_featurizer_sentencepiece.iextract(labels)
    labels = labels.numpy()[0].decode("utf-8")

    # Map the audio file "<...>-NNNN.flac" to its "<...>.trans.txt" transcript
    # file. The dot is escaped so only a literal ".flac" suffix is rewritten.
    file_path = file_paths[0].numpy().decode("utf-8")
    file_path = re.sub(r"(?<!\s)-[0-9]{4}\.flac", ".trans.txt", file_path)
    print(file_path)
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    # The first line is "<id>-<id>-<id> <TEXT>"; capture only the text.
    m = re.search(r"[0-9]+-[0-9]+-[0-9]+\s+([\w\s]+)", lines[0])
    assert m is not None, f"unexpected transcript line format: {lines[0]!r}"
    transcript = m.group(1).lower()

    assert labels == transcript
コード例 #7
0
def main():
    """Run ``process_file`` over every dataset file that matches the suffix.

    The parsed arguments supply the YAML featurizer config (``config``), the
    dataset location (``dataset``) and the file pattern (``suffix``). Files
    are processed one after another in the current process; nothing is done
    when no file matches.
    """
    args = parse_args()

    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)

    model = TFSpeechFeaturizer(config)

    all_filenames = find_files(args.dataset, args.suffix)
    num_files = len(all_filenames)

    # Guard the empty case: indexing the first file name would raise.
    if num_files == 0:
        print('num files total: %d' % num_files)
        return

    print('num files total: %d' % num_files, all_filenames[0])

    # The suffix pattern may contain glob wildcards; process_file receives it
    # with the '*' characters removed.
    suffix = args.suffix.replace('*', '')

    # A process-pool variant was sketched here earlier but left disabled, so
    # the pool it required is no longer created (it was never shut down).
    for file in tqdm(all_filenames):
        process_file(file, model, suffix)
コード例 #8
0
# Enable or disable TensorFlow's auto mixed precision option according to
# the parsed `mxp` argument.
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})

# Configure the single device selected on the command line.
setup_devices([args.device])

# NOTE(review): these project imports sit after the device setup, presumably
# on purpose so devices are configured first — confirm before hoisting them.
from tensorflow_asr.configs.user_config import UserConfig
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
from model import DeepSpeech2

# Fixed seed for reproducible runs.
tf.random.set_seed(0)
# NOTE(review): this asserts `args.export`, while the weights below are read
# from `args.saved` — confirm which argument is meant to be required here.
assert args.export

# Default YAML overlaid with the user-supplied config file.
config = UserConfig(DEFAULT_YAML, args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
text_featurizer = CharFeaturizer(config["decoder_config"])
# Build DS2 model
ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape,
                        arch_config=config["model_config"],
                        num_classes=text_featurizer.num_classes,
                        name="deepspeech2")
# The model is built for the featurizer's shape first, then the saved
# weights are loaded by layer name.
ds2_model._build(speech_featurizer.shape)
ds2_model.load_weights(args.saved, by_name=True)
ds2_model.summary(line_length=150)
ds2_model.add_featurizers(speech_featurizer, text_featurizer)

if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        data_paths=config["learning_config"]["dataset_config"]["test_paths"],
        tfrecords_dir=config["learning_config"]["dataset_config"]["tfrecords_dir"],
コード例 #9
0
def test_contextnet():
    """Convert ContextNet to TFLite in both timestamp modes and run it once.

    The model built from the default config is converted with
    ``timestamp=False`` and with ``timestamp=True``. The flatbuffer from the
    first (no-timestamp) conversion is then loaded into a TFLite interpreter
    and invoked on a random 4000-sample signal; the first output is logged.
    """
    cfg = Config(DEFAULT_YAML)
    chars = CharFeaturizer(cfg.decoder_config)
    speech = TFSpeechFeaturizer(cfg.speech_config)

    net = ContextNet(vocabulary_size=chars.num_classes, **cfg.model_config)
    net.make(speech.shape)
    net.summary(line_length=150)
    net.add_featurizers(speech_featurizer=speech, text_featurizer=chars)

    def to_tflite(with_timestamp):
        # Convert the model's TFLite function for the given timestamp mode.
        tflite_fn = net.make_tflite_function(
            timestamp=with_timestamp).get_concrete_function()
        tflite_converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [tflite_fn])
        tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
        tflite_converter.experimental_new_converter = True
        tflite_converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS,
            tf.lite.OpsSet.SELECT_TF_OPS,
        ]
        return tflite_converter.convert()

    flatbuffer = to_tflite(False)
    logger.info("Converted successfully with no timestamp")

    # The timestamp variant only has to convert; its result is not kept.
    to_tflite(True)
    logger.info("Converted successfully with timestamp")

    interpreter = tf.lite.Interpreter(model_content=flatbuffer)
    waveform = tf.random.normal([4000])

    inputs = interpreter.get_input_details()
    outputs = interpreter.get_output_details()

    # The signal input is resized to the signal length before allocation.
    interpreter.resize_tensor_input(inputs[0]["index"], [4000])
    interpreter.allocate_tensors()

    # Input 0: signal, input 1: the blank token, input 2: zeroed state tensor
    # sized from the prediction-network settings in the model config.
    interpreter.set_tensor(inputs[0]["index"], waveform)
    interpreter.set_tensor(inputs[1]["index"],
                           tf.constant(chars.blank, dtype=tf.int32))
    interpreter.set_tensor(
        inputs[2]["index"],
        tf.zeros(
            [
                cfg.model_config["prediction_num_rnns"],
                2,
                1,
                cfg.model_config["prediction_rnn_units"],
            ],
            dtype=tf.float32,
        ),
    )
    interpreter.invoke()

    logger.info(interpreter.get_tensor(outputs[0]["index"]))
コード例 #10
0
                    default=False,
                    action="store_true",
                    help="Whether to use `SentencePiece` model")

args = parser.parse_args()

# Configure the selected device (or CPU when `args.cpu` is set).
env_util.setup_devices([args.device], cpu=args.cpu)

# NOTE(review): these project imports sit after the device setup, presumably
# on purpose so devices are configured first — confirm before hoisting them.
from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer, SentencePieceFeaturizer
from tensorflow_asr.models.transducer.conformer import Conformer

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
# Text featurizer selection, in priority order: SentencePiece when requested,
# otherwise a subword file when one is given and exists, otherwise characters.
if args.sentence_piece:
    print("Loading SentencePiece model ...")
    text_featurizer = SentencePieceFeaturizer.load_from_file(
        config.decoder_config, args.subwords)
elif args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config,
                                                       args.subwords)
else:
    text_featurizer = CharFeaturizer(config.decoder_config)
# The command-line beam width overrides the value from the decoder config.
text_featurizer.decoder_config.beam_width = args.beam_width

# build model
conformer = Conformer(**config.model_config,
                      vocabulary_size=text_featurizer.num_classes)
def main():
    """Train a Conformer transducer on a main corpus plus a regularising one.

    Command-line options select the YAML configuration, checkpoint limit,
    batch sizes, accumulation steps, devices, mixed precision, the subword
    vocabulary, and the TSV lists for the train/dev splits and their
    regularising counterparts.
    """
    cli = argparse.ArgumentParser(prog="Conformer Training")

    cli.add_argument("--config", type=str, default=DEFAULT_YAML,
                     help="The file path of model configuration file")
    cli.add_argument("--max_ckpts", type=int, default=10,
                     help="Max number of checkpoints to keep")
    cli.add_argument("--tbs", type=int, default=None,
                     help="Train batch size per replica")
    cli.add_argument("--ebs", type=int, default=None,
                     help="Evaluation batch size per replica")
    cli.add_argument("--acs", type=int, default=None,
                     help="Train accumulation steps")
    cli.add_argument("--devices", type=int, nargs="*", default=[0],
                     help="Devices' ids to apply distributed training")
    cli.add_argument("--mxp", default=False, action="store_true",
                     help="Enable mixed precision")
    cli.add_argument("--subwords", type=str, default=None,
                     help="Path to file that stores generated subwords")
    cli.add_argument("--subwords_corpus", nargs="*", type=str, default=[],
                     help="Transcript files for generating subwords")

    # Transcript lists for the main and the regularising corpora.
    cli.add_argument("--train-dir", '-td', nargs='*',
                     default=["en_ng_male_train.tsv",
                              "en_ng_female_train.tsv"])
    cli.add_argument("--train-reg-dir", '-trd', nargs='*',
                     default=["libritts_train-clean-100.tsv",
                              "libritts_train-clean-360.tsv",
                              "libritts_train-other-500.tsv"])
    cli.add_argument("--dev-dir", '-dd', nargs='*',
                     default=["en_ng_male_eval.tsv", "en_ng_female_eval.tsv"])
    cli.add_argument("--dev-reg-dir", '-drd', nargs='*',
                     default=["libritts_test-other.tsv"])

    args = cli.parse_args()

    tf.config.optimizer.set_experimental_options(
        {"auto_mixed_precision": args.mxp})

    strategy = setup_strategy(args.devices)

    # The dataset lists from the command line are attached to the config.
    config = Config(args.config, learning=True)
    config.train_dir = args.train_dir
    config.dev_dir = args.dev_dir
    config.train_reg_dir = args.train_reg_dir
    config.dev_reg_dir = args.dev_reg_dir

    # Here `config.speech_config` is used as the path of a separate YAML file.
    with open(config.speech_config) as stream:
        speech_settings = yaml.load(stream, Loader=yaml.Loader)
    audio_featurizer = TFSpeechFeaturizer(speech_settings)

    if args.subwords and os.path.exists(args.subwords):
        print("Loading subwords ...")
        subword_featurizer = SubwordFeaturizer.load_from_file(
            config.decoder_config, args.subwords)
    else:
        # NOTE(review): this branch also runs when --subwords is unset, in
        # which case save_to_file receives None — confirm that is supported.
        print("Generating subwords ...")
        subword_featurizer = SubwordFeaturizer.build_from_corpus(
            config.decoder_config, corpus_files=args.subwords_corpus)
        subword_featurizer.save_to_file(args.subwords)

    # Keyword arguments shared by all four datasets.
    shared = dict(speech_featurizer=audio_featurizer,
                  text_featurizer=subword_featurizer,
                  cache=False,
                  shuffle=False)

    train_set = Dataset(data_paths=config.train_dir,
                        augmentations=config.learning_config.augmentations,
                        stage="train",
                        **shared)
    train_reg_set = DatasetInf(
        data_paths=config.train_reg_dir,
        augmentations=config.learning_config.augmentations,
        stage="train",
        **shared)
    # The main evaluation set is the only one created without augmentations.
    eval_set = Dataset(data_paths=config.dev_dir, stage="eval", **shared)
    eval_reg_set = DatasetInf(
        data_paths=config.dev_reg_dir,
        augmentations=config.learning_config.augmentations,
        stage="eval",
        **shared)

    trainer = MultiReaderTransducerTrainer(
        config=config.learning_config.running_config,
        text_featurizer=subword_featurizer,
        strategy=strategy)

    # Model and optimizer are created inside the trainer's strategy scope.
    with trainer.strategy.scope():
        network = Conformer(**config.model_config,
                            vocabulary_size=subword_featurizer.num_classes)
        network._build(audio_featurizer.shape)
        network.summary(line_length=120)

        optimizer_settings = config.learning_config.optimizer_config
        learning_rate = TransformerSchedule(
            d_model=network.dmodel,
            warmup_steps=optimizer_settings["warmup_steps"],
            max_lr=(0.05 / math.sqrt(network.dmodel)))
        adam = tf.keras.optimizers.Adam(
            learning_rate,
            beta_1=optimizer_settings["beta1"],
            beta_2=optimizer_settings["beta2"],
            epsilon=optimizer_settings["epsilon"])

    trainer.compile(model=network,
                    optimizer=adam,
                    max_to_keep=args.max_ckpts)
    trainer.fit(
        train_set,
        train_reg_set,
        # alpha for regularising dataset; alpha = 1 for training dataset
        1.,
        eval_set,
        eval_reg_set,
        train_bs=args.tbs,
        eval_bs=args.ebs,
        train_acs=args.acs)
コード例 #12
0
args = parser.parse_args()

# Enable or disable TensorFlow's auto mixed precision option according to
# the parsed `mxp` argument.
tf.config.optimizer.set_experimental_options(
    {"auto_mixed_precision": args.mxp})

# Distribution strategy for the requested device ids.
strategy = setup_strategy(args.devices)

# NOTE(review): these project imports sit after the strategy setup,
# presumably on purpose — confirm before hoisting them.
from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
from tensorflow_asr.models.keras.contextnet import ContextNet
from tensorflow_asr.optimizers.schedules import TransformerSchedule

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

# Reuse an existing subword file when one is given and present; otherwise
# build the subwords from the corpus files and save them.
if args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config,
                                                       args.subwords)
else:
    # NOTE(review): this branch also runs when `args.subwords` is unset —
    # confirm save_to_file handles that value.
    print("Generating subwords ...")
    text_featurizer = SubwordFeaturizer.build_from_corpus(
        config.decoder_config, corpus_files=args.subwords_corpus)
    text_featurizer.save_to_file(args.subwords)

if args.tfrecords:
    train_dataset = ASRTFRecordDatasetKeras(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
コード例 #13
0
args = parser.parse_args()

# Configure the selected device (or CPU when `args.cpu` is set).
setup_devices([args.device], cpu=args.cpu)

# NOTE(review): these project imports sit after the device setup, presumably
# on purpose so devices are configured first — confirm before hoisting them.
from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer
from tensorflow_asr.models.conformer import Conformer

config = Config(args.config, learning=False)
# In this script `config.speech_config` is the path of a separate YAML file.
with open(config.speech_config) as f:
  speech_config = yaml.load(f, Loader=yaml.Loader)

speech_featurizer = TFSpeechFeaturizer(speech_config)
# Use the subword file when one is given and exists, otherwise characters.
if args.subwords and os.path.exists(args.subwords):
  print("Loading subwords ...")
  text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
else:
  text_featurizer = CharFeaturizer(config.decoder_config)
# The command-line beam width overrides the value from the decoder config.
text_featurizer.decoder_config.beam_width = args.beam_width

# build model
conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
# The model is built for the featurizer's shape first, then the saved
# weights are loaded by layer name.
conformer._build(speech_featurizer.shape)
conformer.load_weights(args.saved, by_name=True)
conformer.summary(line_length=120)
conformer.add_featurizers(speech_featurizer, text_featurizer)

import numpy as np
コード例 #14
0
class Inference(tf.keras.Model):
    """Keras model that chains the ``am``, ``iam``, ``sv`` and ``vc`` models.

    Every sub-model is built from its own YAML configuration and weights
    file; all of them share the speech configuration referenced by the
    ``speech_config`` entry of the AM configuration.
    """

    def __init__(self, args):
        """Read the configurations named by ``args`` and build the sub-models.

        ``args`` must provide ``am_config``/``am_model``, ``iam_config``/
        ``iam_model``, ``sv_config``/``sv_model`` and ``vc_config``/
        ``vc_model``.
        """
        super().__init__()

        def read_yaml(path):
            # Every configuration file is parsed with the full PyYAML loader.
            with open(path) as stream:
                return yaml.load(stream, Loader=yaml.Loader)

        am_settings = read_yaml(args.am_config)
        self.speech_config = read_yaml(am_settings['speech_config'])
        self.speech_featurizer = TFSpeechFeaturizer(self.speech_config)
        self.am = self.build_am(args.am_config, args.am_model)

        # IAM: speech settings override the file's own values, then the
        # derived 'n_mels' and 'hop_size' entries are filled in.
        iam_settings = read_yaml(args.iam_config)
        iam_settings.update(self.speech_config)
        iam_settings['n_mels'] = iam_settings['asr_features']
        iam_settings['hop_size'] = (
            iam_settings['asr_downsample'] * iam_settings['sample_rate'] *
            iam_settings['stride_ms'] // 1000)
        self.iam, self.pqmf = self.build_iam(iam_settings, args.iam_model)

        sv_settings = read_yaml(args.sv_config)
        sv_settings.update(self.speech_config)
        self.sv = self.build_sv(sv_settings, args.sv_model)

        # VC: hop size is the stride expressed in samples; 'sampling_rate'
        # mirrors 'sample_rate' under a second key.
        vc_settings = read_yaml(args.vc_config)
        vc_settings.update(self.speech_config)
        vc_settings['hop_size'] = (
            vc_settings['sample_rate'] * vc_settings['stride_ms'] // 1000)
        vc_settings['sampling_rate'] = vc_settings['sample_rate']
        self.vc = self.build_vc(vc_settings, args.vc_model)

    # The tf.function decorator is intentionally left disabled here.
    def call(self, x):
        """Run the whole chain on ``x`` and return two synthesised signals.

        Returns a tuple ``(x, y)``: ``x`` is the PQMF synthesis of the
        ``iam`` output computed from the AM encoder features, and ``y`` is
        the PQMF synthesis of the ``vc`` output computed from features
        re-extracted from that ``x``.
        """
        feats = self.speech_featurizer.tf_extract(x)
        # The sv model output is later passed to the vc model as `gc`.
        gc_vector = self.sv(
            tf.reshape(feats, [1, -1, self.speech_config['n_mels']]))

        # The AM encoder is explicitly placed on the CPU.
        with tf.device('/cpu:0'):
            encoded = self.am.encoder_inference(feats)
            encoded = tf.expand_dims(encoded, 0)

        first_pass = self.iam(mels=encoded, training=False)
        first_pass = tf.squeeze(self.pqmf.synthesis(first_pass))

        # Features are re-extracted from the synthesised signal for vc.
        second_feats = self.speech_featurizer.tf_extract(first_pass)
        second_feats = tf.reshape(second_feats,
                                  [1, -1, self.speech_config['n_mels']])

        second_pass = self.vc(mels=second_feats, gc=gc_vector,
                              training=False)['y_mb_hat']
        second_pass = tf.squeeze(self.pqmf.synthesis(second_pass))

        return first_pass, second_pass

    def build_am(self, config_path, model_path):
        """Build the Conformer from ``config_path`` and load ``model_path``."""
        am_config = Config(config_path, learning=False)
        # The vocabulary size is fixed to 1031 for this checkpoint layout.
        model = Conformer(**am_config.model_config, vocabulary_size=1031)
        model._build(self.speech_featurizer.shape)
        print('loading am...')
        model.load_weights(model_path, by_name=True)
        return model

    def build_iam(self, config, model_path):
        """Build the multi-band MelGAN generator and PQMF; load the generator.

        Returns the tuple ``(generator, pqmf)``.
        """
        melgan = MelGANGenerator(
            config=MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            name="multi_band_melgan_generator",
        )
        melgan.set_shape(config['n_mels'])
        pqmf_layer = TFPQMF(
            MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            dtype=tf.float32,
            name="pqmf",
        )
        # One forward pass on random input before the weights are loaded.
        dummy_mels = tf.random.uniform(shape=[1, 100, config['n_mels']],
                                       dtype=tf.float32)
        dummy_out = melgan(mels=dummy_mels, training=False)
        pqmf_layer.synthesis(dummy_out)
        print('loading iam...')
        melgan.load_weights(model_path)
        return melgan, pqmf_layer

    def build_sv(self, config, model_path):
        """Build the GE2E model from ``config['model']`` and load its weights."""
        ge2e = GE2E(name='ge2e', **config['model'])
        # One forward pass on random input before the weights are loaded.
        dummy_mels = tf.random.uniform(shape=[1, 100, config['n_mels']],
                                       dtype=tf.float32)
        ge2e(dummy_mels)
        print('loading sv...')
        ge2e.load_weights(model_path)
        return ge2e

    def build_vc(self, config, model_path):
        """Build the VQ MelGAN generator with its encoder and load its weights."""
        vq_encoder = Encoder(**config['encoder'])
        vq_generator = MelGANGeneratorVQ(
            encoder=vq_encoder,
            config=MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            name="multi_band_melgan_generator",
        )
        vq_generator.set_shape(config['n_mels'], config['gc_channels'])

        # One forward pass on random inputs before the weights are loaded.
        dummy_mels = tf.random.uniform(shape=[1, 100, config['n_mels']],
                                       dtype=tf.float32)
        dummy_gc = tf.random.uniform(shape=[1, config['gc_channels']],
                                     dtype=tf.float32)
        _ = vq_generator(mels=dummy_mels, gc=dummy_gc,
                         training=False)['y_mb_hat']
        print('loading vc...')
        vq_generator.load_weights(model_path)
        return vq_generator
コード例 #15
0
ファイル: test.py プロジェクト: kojhliang/TensorFlowASR
# A saved weights path is required for this script.
assert args.saved

# Enable or disable TensorFlow's auto mixed precision option according to
# the parsed `mxp` argument.
tf.config.optimizer.set_experimental_options(
    {"auto_mixed_precision": args.mxp})

# Configure the selected device (or CPU when `args.cpu` is set).
env_util.setup_devices([args.device], cpu=args.cpu)

# NOTE(review): these project imports sit after the device setup, presumably
# on purpose so devices are configured first — confirm before hoisting them.
from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
from tensorflow_asr.models.ctc.jasper import Jasper
from tensorflow_asr.utils import app_util

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

# Text featurizer selection, in priority order: SentencePiece, subwords,
# characters. Each is constructed directly from the decoder config.
if args.sentence_piece:
    print("Use SentencePiece ...")
    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
elif args.subwords:
    print("Use subwords ...")
    text_featurizer = SubwordFeaturizer(config.decoder_config)
else:
    print("Use characters ...")
    text_featurizer = CharFeaturizer(config.decoder_config)

# Fixed seed for reproducible runs.
tf.random.set_seed(0)

if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
コード例 #16
0
# Transcript TSV used as the only data path of the dataset below.
data = "/mnt/Data/ML/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv"

# Character featurizer configured inline; no vocabulary file is given.
text_featurizer = CharFeaturizer({
    "vocabulary": None,
    "blank_at_zero": True,
    "beam_width": 5,
    "norm_score": True
})

# Speech featurizer configured inline for log-mel spectrogram features.
speech_featurizer = TFSpeechFeaturizer({
    "sample_rate": 16000,
    "frame_ms": 25,
    "stride_ms": 10,
    "num_feature_bins": 80,
    "feature_type": "log_mel_spectrogram",
    "preemphasis": 0.97,
    "normalize_signal": True,
    "normalize_feature": True,
    "normalize_per_feature": False
})

# Shuffled training dataset created with batch size 4 and limited to the
# first 100 batches. `augments` is defined outside this excerpt.
dataset = ASRSliceDataset(stage="train",
                          speech_featurizer=speech_featurizer,
                          text_featurizer=text_featurizer,
                          data_paths=[data],
                          augmentations=augments,
                          shuffle=True).create(4).take(100)

while True:
    print("--------------------------------------------")
コード例 #17
0
def main():
    """Run voice conversion with a trained multi-band MelGAN VQ generator.

    Command-line arguments (all required):
        --feature/-f: input features; either a ``*_mel.npy`` array or an
            audio file that is loaded with librosa and featurized here.
        --speaker/-s: ``.npy`` file reshaped to ``[1, gc_channels]`` and
            passed to the generator as ``gc``.
        --config/-c: YAML model config; its ``speech_config`` entry names a
            second YAML file that is merged into it.
        --resume/-r: generator weights to load.

    Two WAV files named after the input's base name are written using a
    relative path: ``<name>_gen_vc.wav`` with the generated signal, and
    ``<name>_gen_vc_depre.wav`` with the same signal after de-emphasis.
    """
    parser = argparse.ArgumentParser(
        description=
        "Train MultiBand MelGAN (See detail in examples/multiband_melgan/train_multiband_melgan.py)"
    )
    parser.add_argument("--feature", '-f', required=True)
    parser.add_argument("--speaker", '-s', required=True)
    parser.add_argument("--config", '-c', required=True)
    parser.add_argument("--resume", '-r', required=True)
    args = parser.parse_args()

    # return strategy
    STRATEGY = return_strategy()

    # load and save config
    # NOTE: yaml.Loader can construct arbitrary Python objects, so these
    # config files must come from a trusted source.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    with open(config['speech_config']) as f:
        speech_config = yaml.load(f, Loader=yaml.Loader)
    config.update(speech_config)
    # Hop size is the stride expressed in samples; 'sampling_rate' mirrors
    # 'sample_rate' under a second key.
    config['hop_size'] = config['sample_rate'] * config['stride_ms'] // 1000
    config['sampling_rate'] = config['sample_rate']

    config.update(vars(args))
    config["version"] = tensorflow_tts.__version__
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    with STRATEGY.scope():
        encoder = Encoder(**config['encoder'])

        generator = MelGANGeneratorVQ(
            encoder=encoder,
            config=MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            name="multi_band_melgan_generator",
        )
        generator.set_shape(config['n_mels'], config['gc_channels'])

        pqmf = TFPQMF(
            MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            dtype=tf.float32,
            name="pqmf",
        )

        # dummy input to build model.
        fake_mels = tf.random.uniform(shape=[1, 100, config['n_mels']],
                                      dtype=tf.float32)
        fake_gc = tf.random.uniform(shape=[1, config['gc_channels']],
                                    dtype=tf.float32)
        y_mb_hat = generator(mels=fake_mels, gc=fake_gc,
                             training=False)['y_mb_hat']
        y_hat = pqmf.synthesis(y_mb_hat)

        generator.load_weights(args.resume)
        generator.summary()

    speech_featurizer = TFSpeechFeaturizer(speech_config)
    # Precomputed features are used as-is; anything else is treated as audio.
    is_mel_file = args.feature.endswith('_mel.npy')
    if is_mel_file:
        mels = tf.constant(np.load(args.feature), tf.float32)
    else:
        signal, _ = librosa.load(args.feature, sr=config['sample_rate'])
        mels = speech_featurizer.tf_extract(signal)
    mels = tf.reshape(mels, [1, -1, config['n_mels']])

    gc = tf.constant(
        np.load(args.speaker).reshape([1, config['gc_channels']]), tf.float32)
    output = generator(mels=mels, gc=gc, training=False)['y_mb_hat']
    y_hat = pqmf.synthesis(output).numpy().reshape([-1])
    print('output:', y_hat.shape)

    # Derive the output name from the input's own suffix. Previously the
    # '.wav' substitution was overwritten by the '_mel.npy' one, so a WAV
    # input kept its original file name for the generated audio.
    if is_mel_file:
        save_name = args.feature.replace('_mel.npy', '_gen_vc.wav')
    else:
        save_name = args.feature.replace('.wav', '_gen_vc.wav')
    # Keep only the base name so the result is written with a relative path.
    save_name = save_name.split('/')[-1]
    wavfile.write(save_name, config['sample_rate'], y_hat)

    def depreemphasis(signal: np.ndarray, coeff=0.97):
        """Undo pre-emphasis: ``x[n] = coeff * x[n - 1] + signal[n]``."""
        if not coeff or coeff <= 0.0: return signal
        x = np.zeros(signal.shape[0], dtype=np.float32)
        x[0] = signal[0]
        # Each sample depends on the previous output, hence the explicit loop.
        for n in range(1, signal.shape[0], 1):
            x[n] = coeff * x[n - 1] + signal[n]
        return x

    y_hat = depreemphasis(y_hat)
    wavfile.write(save_name.replace('.wav', '_depre.wav'),
                  config['sample_rate'], y_hat)