args = parser.parse_args()

tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})

strategy = setup_strategy(args.devices)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
from trainer import TrainerWithMasking
from tensorflow_asr.models.conformer import Conformer
from tensorflow_asr.optimizers.schedules import TransformerSchedule

config = Config(args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

if args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
else:
    print("Generating subwords ...")
    text_featurizer = SubwordFeaturizer.build_from_corpus(
        config.decoder_config,
        corpus_files=args.subwords_corpus
    )
    text_featurizer.save_to_file(args.subwords)

if args.tfrecords:
    train_dataset = ASRTFRecordDataset(
        # assumed completion: the source snippet is truncated here; these
        # arguments follow the pattern of TensorFlowASR's train_conformer script
        data_paths=config.learning_config.dataset_config.train_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train",
        shuffle=True,
    )
Example #2
# assumed reconstruction: the snippet is truncated above and `args.cache`
# is consumed below when building the dataset
parser.add_argument("--cache",
                    default=False,
                    action="store_true",
                    help="Enable caching for dataset")

args = parser.parse_args()

tf.config.optimizer.set_experimental_options(
    {"auto_mixed_precision": args.mxp})

strategy = setup_strategy(args.devices)

from sasegan.runners.trainer import SeganTrainer
from sasegan.datasets.train_dataset import SeganAugTrainDataset
from tensorflow_asr.configs.config import Config
from sasegan.models.segan import Generator, Discriminator
from sasegan.featurizers.speech_featurizer import NumpySpeechFeaturizer, TFSpeechFeaturizer

config = Config(args.config)

speech_featurizer = (
    NumpySpeechFeaturizer(config.speech_config) if args.nfx
    else TFSpeechFeaturizer(config.speech_config)
)

dataset = SeganAugTrainDataset(
    stage="train",
    speech_featurizer=speech_featurizer,
    clean_dir=config.learning_config.dataset_config.clean_train_paths,
    noises_config=config.learning_config.dataset_config.noise_config,
    cache=args.cache,
    shuffle=True)

segan_trainer = SeganTrainer(config.learning_config.running_config)

with segan_trainer.strategy.scope():
    # truncated in the source; presumably the Generator and Discriminator
    # imported above are built here, inside the distribution strategy scope
    ...
Example #3
def main():
    parser = argparse.ArgumentParser(prog="Conformer Training")

    parser.add_argument("--config",
                        type=str,
                        default=DEFAULT_YAML,
                        help="The file path of model configuration file")

    parser.add_argument("--max_ckpts",
                        type=int,
                        default=10,
                        help="Max number of checkpoints to keep")

    parser.add_argument("--tbs",
                        type=int,
                        default=None,
                        help="Train batch size per replica")

    parser.add_argument("--ebs",
                        type=int,
                        default=None,
                        help="Evaluation batch size per replica")

    parser.add_argument("--acs",
                        type=int,
                        default=None,
                        help="Train accumulation steps")

    parser.add_argument("--devices",
                        type=int,
                        nargs="*",
                        default=[0],
                        help="Devices' ids to apply distributed training")

    parser.add_argument("--mxp",
                        default=False,
                        action="store_true",
                        help="Enable mixed precision")

    parser.add_argument("--subwords",
                        type=str,
                        default=None,
                        help="Path to file that stores generated subwords")

    parser.add_argument("--subwords_corpus",
                        nargs="*",
                        type=str,
                        default=[],
                        help="Transcript files for generating subwords")

    parser.add_argument("--train-dir", '-td', nargs='*', required=True)
    parser.add_argument("--dev-dir", '-dd', nargs='*', required=True)

    args = parser.parse_args()

    tf.config.optimizer.set_experimental_options(
        {"auto_mixed_precision": args.mxp})

    strategy = setup_strategy(args.devices)

    config = Config(args.config, learning=True)
    with open(config.speech_config) as f:
        speech_config = yaml.load(f, Loader=yaml.Loader)
    speech_featurizer = TFSpeechFeaturizer(speech_config)

    if args.subwords and os.path.exists(args.subwords):
        print("Loading subwords ...")
        text_featurizer = SubwordFeaturizer.load_from_file(
            config.decoder_config, args.subwords)
    else:
        print("Generating subwords ...")
        text_featurizer = SubwordFeaturizer.build_from_corpus(
            config.decoder_config, corpus_files=args.subwords_corpus)
        text_featurizer.save_to_file(args.subwords)

    train_dataset = Dataset(data_paths=args.train_dir,
                            speech_featurizer=speech_featurizer,
                            text_featurizer=text_featurizer,
                            augmentations=config.learning_config.augmentations,
                            stage="train",
                            cache=False,
                            shuffle=False)
    eval_dataset = Dataset(data_paths=args.dev_dir,
                           speech_featurizer=speech_featurizer,
                           text_featurizer=text_featurizer,
                           stage="eval",
                           cache=False,
                           shuffle=False)

    conformer_trainer = TransducerTrainer(
        config=config.learning_config.running_config,
        text_featurizer=text_featurizer,
        strategy=strategy)

    with conformer_trainer.strategy.scope():
        # build model
        conformer = Conformer(**config.model_config,
                              vocabulary_size=text_featurizer.num_classes)
        conformer._build(speech_featurizer.shape)
        conformer.summary(line_length=120)

        optimizer = tf.keras.optimizers.Adam(
            TransformerSchedule(
                d_model=conformer.dmodel,
                warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
                max_lr=(0.05 / math.sqrt(conformer.dmodel))),
            beta_1=config.learning_config.optimizer_config["beta1"],
            beta_2=config.learning_config.optimizer_config["beta2"],
            epsilon=config.learning_config.optimizer_config["epsilon"])

    conformer_trainer.compile(model=conformer,
                              optimizer=optimizer,
                              max_to_keep=args.max_ckpts)

    conformer_trainer.fit(train_dataset,
                          eval_dataset,
                          train_bs=args.tbs,
                          eval_bs=args.ebs,
                          train_acs=args.acs)
Example #4
# assumed reconstruction: the snippet is truncated above and `args.subwords`
# is consumed below when loading the text featurizer
parser.add_argument("--subwords", type=str, default=None,
                    help="Path to file that stores generated subwords")

parser.add_argument("--output_name", type=str, default="test",
                    help="Result filename name prefix")

args = parser.parse_args()

setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer
from tensorflow_asr.models.conformer import Conformer

config = Config(args.config, learning=False)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
if args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
else:
    text_featurizer = CharFeaturizer(config.decoder_config)
text_featurizer.decoder_config.beam_width = args.beam_width

# build model
conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
conformer._build(speech_featurizer.shape)
conformer.load_weights(args.saved, by_name=True)
conformer.summary(line_length=120)
conformer.add_featurizers(speech_featurizer, text_featurizer)
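# The snippet ends once the featurizers are attached. Below is a minimal
# inference sketch; it assumes read_raw_audio, tf_extract and recognize behave
# as in TensorFlowASR's demonstration scripts, and the audio path is a placeholder.
signal = read_raw_audio("/path/to/audio.wav", speech_featurizer.sample_rate)
features = speech_featurizer.tf_extract(signal)
transcript = conformer.recognize(features[None, ...])
print(transcript.numpy())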
Example #5
def main(
    config: str = DEFAULT_YAML,
    tfrecords: bool = False,
    sentence_piece: bool = False,
    subwords: bool = True,
    bs: int = None,
    spx: int = 1,
    metadata: str = None,
    static_length: bool = False,
    devices: list = [0],
    mxp: bool = False,
    pretrained: str = None,
):
    tf.keras.backend.clear_session()
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": mxp})
    strategy = env_util.setup_strategy(devices)

    config = Config(config)

    speech_featurizer, text_featurizer = featurizer_helpers.prepare_featurizers(
        config=config,
        subwords=subwords,
        sentence_piece=sentence_piece,
    )

    train_dataset, eval_dataset = dataset_helpers.prepare_training_datasets(
        config=config,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        tfrecords=tfrecords,
        metadata=metadata,
    )

    if not static_length:
        speech_featurizer.reset_length()
        text_featurizer.reset_length()

    train_data_loader, eval_data_loader, global_batch_size = dataset_helpers.prepare_training_data_loaders(
        config=config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        strategy=strategy,
        batch_size=bs,
    )

    with strategy.scope():
        contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
        contextnet.make(speech_featurizer.shape, prediction_shape=text_featurizer.prepand_shape, batch_size=global_batch_size)
        if pretrained:
            contextnet.load_weights(pretrained, by_name=True, skip_mismatch=True)
        contextnet.summary(line_length=100)
        optimizer = tf.keras.optimizers.Adam(
            TransformerSchedule(
                d_model=contextnet.dmodel,
                warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000),
                max_lr=(0.05 / math.sqrt(contextnet.dmodel)),
            ),
            **config.learning_config.optimizer_config
        )
        contextnet.compile(
            optimizer=optimizer,
            experimental_steps_per_execution=spx,
            global_batch_size=global_batch_size,
            blank=text_featurizer.blank,
        )

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),
        tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),
        tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard),
    ]

    contextnet.fit(
        train_data_loader,
        epochs=config.learning_config.running_config.num_epochs,
        validation_data=eval_data_loader,
        callbacks=callbacks,
        steps_per_epoch=train_dataset.total_steps,
        validation_steps=eval_dataset.total_steps if eval_data_loader else None,
    )
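    # A hypothetical final step: persist the trained weights. `save_weights`
    # comes from tf.keras.Model, which ContextNet extends; the path is a placeholder.
    contextnet.save_weights("contextnet-final-weights")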
Example #6
def test_streaming_transducer():
    config = Config(DEFAULT_YAML)

    text_featurizer = CharFeaturizer(config.decoder_config)

    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = RnnTransducer(vocabulary_size=text_featurizer.num_classes,
                          **config.model_config)

    model.make(speech_featurizer.shape)
    model.summary(line_length=150)

    model.add_featurizers(speech_featurizer=speech_featurizer,
                          text_featurizer=text_featurizer)

    concrete_func = model.make_tflite_function(
        timestamp=False).get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions(
        [concrete_func])
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.experimental_new_converter = True
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
    ]
    tflite_model = converter.convert()

    print("Converted successfully with no timestamp")

    concrete_func = model.make_tflite_function(
        timestamp=True).get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions(
        [concrete_func])
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.experimental_new_converter = True
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
    ]
    converter.convert()  # conversion check only; the result is not used below

    print("Converted successfully with timestamp")

    # Run inference with the non-timestamp model: the inputs are the raw
    # signal, the blank token, and zero-initialized encoder / prediction RNN states.
    tflitemodel = tf.lite.Interpreter(model_content=tflite_model)
    signal = tf.random.normal([4000])

    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
    tflitemodel.resize_tensor_input(input_details[0]["index"], signal.shape)
    tflitemodel.allocate_tensors()
    tflitemodel.set_tensor(input_details[0]["index"], signal)
    tflitemodel.set_tensor(input_details[1]["index"],
                           tf.constant(text_featurizer.blank, dtype=tf.int32))
    tflitemodel.set_tensor(
        input_details[2]["index"],
        tf.zeros([config.model_config["encoder_nlayers"], 2, 1,
                  config.model_config["encoder_rnn_units"]], dtype=tf.float32))
    tflitemodel.set_tensor(
        input_details[3]["index"],
        tf.zeros([config.model_config["prediction_num_rnns"], 2, 1,
                  config.model_config["prediction_rnn_units"]], dtype=tf.float32))
    tflitemodel.invoke()
    hyp = tflitemodel.get_tensor(output_details[0]["index"])

    print(hyp)
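    # `hyp` is printed raw above; assuming it holds Unicode code points, as in
    # TensorFlowASR's TFLite demos, a minimal decode sketch:
    transcript = "".join(chr(u) for u in hyp)
    print(transcript)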
Example #7
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio


def visual(title, audio, sample_rate):
    """Plot a waveform and save it to disk."""
    plt.figure(figsize=(8, 4))
    # librosa >= 0.10 renames this to librosa.display.waveshow
    librosa.display.waveplot(audio, sr=sample_rate)
    plt.title(title)
    plt.tight_layout()
    # plt.show()
    plt.savefig("./audio_b.jpg")


config_dir = "tests/config_aishell.yml"
config = Config(config_dir, learning=True)

aug = config.learning_config.augmentations

sampling_rate = 16000
audio = '/tsdata/ASR/aishell-1/wav/train/S0002/BAC009S0002W0123.wav'
signal = read_raw_audio(audio, sampling_rate)

# visual('Original', signal, sampling_rate)

# apply the waveform-domain ("before") augmentations, then inspect the result
signal = aug.before.augment(signal)
visual('Augmented', signal, sampling_rate)
soundfile.write('./test_d.wav', signal, sampling_rate)
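# A hypothetical companion check for the feature-domain pipeline. This assumes
# the config also defines "after" augmentations (applied to extracted features)
# and that TFSpeechFeaturizer.extract works as in the training examples above.
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer

features = TFSpeechFeaturizer(config.speech_config).extract(signal)
features = aug.after.augment(features)
print(features.shape)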