# Excerpt from a ContextNet training script; the argparse parser that produces
# `args` appears earlier in the original file.
import os

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
from tensorflow_asr.models.keras.contextnet import ContextNet
from tensorflow_asr.optimizers.schedules import TransformerSchedule

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

if args.sentence_piece:
    print("Loading SentencePiece model ...")
    text_featurizer = SentencePieceFeaturizer.load_from_file(
        config.decoder_config, args.subwords)
elif args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config,
                                                       args.subwords)
else:
    print("Generating subwords ...")
    text_featurizer = SubwordFeaturizer.build_from_corpus(
        config.decoder_config, corpus_files=args.subwords_corpus)
    text_featurizer.save_to_file(args.subwords)

train_dataset = ASRTFRecordDatasetKeras(
    speech_featurizer=speech_featurizer,
    text_featurizer=text_featurizer,
    **vars(config.learning_config.train_dataset_config),
    indefinite=True)
eval_dataset = ASRTFRecordDatasetKeras(
    speech_featurizer=speech_featurizer,
    text_featurizer=text_featurizer,
    **vars(config.learning_config.eval_dataset_config))
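
For context, a minimal sketch of how these Keras-style datasets are typically turned into batched pipelines; create(batch_size) returning a batched tf.data.Dataset is an assumption about this TensorFlowASR version, and the batch size is a hypothetical value:

# Sketch only: materialise batched tf.data pipelines from the featurized
# datasets (create(batch_size) assumed, batch size hypothetical).
batch_size = 4
train_data = train_dataset.create(batch_size)
eval_data = eval_dataset.create(batch_size)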
Example #2

# Excerpt from a Conformer test script; argparse handling and the
# setup_devices import appear earlier in the original file. Devices are pinned
# before the TensorFlow-heavy imports below.
setup_devices([args.device], cpu=args.cpu)

import os

import tensorflow as tf

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
from tensorflow_asr.models.conformer import Conformer

config = Config(args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

if args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config,
                                                       args.subwords)
else:
    raise ValueError("subwords must be set")

tf.random.set_seed(0)
assert args.saved

if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.test_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="test",
        shuffle=False)
else:
    # The excerpt breaks off at this branch; a slice dataset mirroring the
    # TFRecord branch above is the natural completion (ASRSliceDataset is
    # imported above for exactly this case).
    test_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.test_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="test",
        shuffle=False)

Example #3

import argparse
import os

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer

DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                            "config.yml")

parser = argparse.ArgumentParser(prog="Vocab Training with Subwords")

parser.add_argument("corpus",
                    nargs="*",
                    type=str,
                    default=[],
                    help="Transcript files for generating subwords")

parser.add_argument("--config",
                    type=str,
                    default=DEFAULT_YAML,
                    help="The file path of model configuration file")

parser.add_argument("--output_file",
                    type=str,
                    default=None,
                    help="Path to file that stores generated subwords")

args = parser.parse_args()

config = Config(args.config)

print("Generating subwords ...")

text_featurizer = SubwordFeaturizer.build_from_corpus(config.decoder_config,
                                                      args.corpus)
text_featurizer.save_to_file(args.output_file)
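
Once written, the vocabulary can be reloaded on later runs instead of being rebuilt, using the same load_from_file call as in the training examples above:

# Reload the saved subword vocabulary (args.output_file is the file written by
# save_to_file above).
text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config,
                                                   args.output_file)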
Example #4
parser.add_argument("--subwords", type=str, default=None, help="Use subwords")

parser.add_argument("output",
                    type=str,
                    default=None,
                    help="TFLite file path to be exported")

args = parser.parse_args()

assert args.saved and args.output

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

if args.subwords:
    text_featurizer = SubwordFeaturizer(config.decoder_config)
else:
    text_featurizer = CharFeaturizer(config.decoder_config)

# build model
jasper = Jasper(**config.model_config,
                vocabulary_size=text_featurizer.num_classes)
jasper.make(speech_featurizer.shape)
jasper.load_weights(args.saved, by_name=True)
jasper.summary(line_length=100)
jasper.add_featurizers(speech_featurizer, text_featurizer)

concrete_func = jasper.make_tflite_function().get_concrete_function()
converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
converter.experimental_new_converter = True
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Convert and write the model to the requested output path.
tflite_model = converter.convert()
with open(args.output, "wb") as f:
    f.write(tflite_model)
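
A minimal sketch of loading the exported file back with the stock TFLite interpreter to confirm it converts and allocates; the model path here is hypothetical, and the exact input/output layout produced by make_tflite_function is model-specific:

interpreter = tf.lite.Interpreter(model_path="jasper.tflite")  # hypothetical path
interpreter.allocate_tensors()
print(interpreter.get_input_details())
print(interpreter.get_output_details())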
Example #5
import argparse

import tensorflow as tf

from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer

parser = argparse.ArgumentParser(prog="test subword")

parser.add_argument("transcripts", nargs="+", type=str, default=[None])

args = parser.parse_args()

config = {
    "vocabulary": None,
    "target_vocab_size": 1024,
    "max_subword_length": 4,
    "blank_at_zero": True,
    "beam_width": 5,
    "norm_score": True
}

text_featurizer = SubwordFeaturizer.build_from_corpus(config, args.transcripts)

print(len(text_featurizer.subwords.subwords))
print(text_featurizer.upoints)
print(text_featurizer.num_classes)

a = text_featurizer.extract("hello world")

print(a)

b = text_featurizer.indices2upoints(a)

tf.print(tf.strings.unicode_encode(b, "UTF-8"))
Example #6

# Excerpt from a multi-reader Conformer training script. The module imports
# (argparse, math, os, yaml, tensorflow as tf, the tensorflow_asr modules) and
# the Dataset/DatasetInf classes used below appear earlier in the original file.
def main():
    parser = argparse.ArgumentParser(prog="Conformer Training")

    parser.add_argument("--config",
                        type=str,
                        default=DEFAULT_YAML,
                        help="The file path of model configuration file")

    parser.add_argument("--max_ckpts",
                        type=int,
                        default=10,
                        help="Max number of checkpoints to keep")

    parser.add_argument("--tbs",
                        type=int,
                        default=None,
                        help="Train batch size per replica")

    parser.add_argument("--ebs",
                        type=int,
                        default=None,
                        help="Evaluation batch size per replica")

    parser.add_argument("--acs",
                        type=int,
                        default=None,
                        help="Train accumulation steps")

    parser.add_argument("--devices",
                        type=int,
                        nargs="*",
                        default=[0],
                        help="Devices' ids to apply distributed training")

    parser.add_argument("--mxp",
                        default=False,
                        action="store_true",
                        help="Enable mixed precision")

    parser.add_argument("--subwords",
                        type=str,
                        default=None,
                        help="Path to file that stores generated subwords")

    parser.add_argument("--subwords_corpus",
                        nargs="*",
                        type=str,
                        default=[],
                        help="Transcript files for generating subwords")

    parser.add_argument(
        "--train-dir",
        '-td',
        nargs='*',
        default=["en_ng_male_train.tsv", "en_ng_female_train.tsv"])
    parser.add_argument("--train-reg-dir",
                        '-trd',
                        nargs='*',
                        default=[
                            "libritts_train-clean-100.tsv",
                            "libritts_train-clean-360.tsv",
                            "libritts_train-other-500.tsv"
                        ])
    parser.add_argument(
        "--dev-dir",
        '-dd',
        nargs='*',
        default=["en_ng_male_eval.tsv", "en_ng_female_eval.tsv"])
    parser.add_argument("--dev-reg-dir",
                        '-drd',
                        nargs='*',
                        default=["libritts_test-other.tsv"])

    args = parser.parse_args()

    tf.config.optimizer.set_experimental_options(
        {"auto_mixed_precision": args.mxp})

    strategy = setup_strategy(args.devices)

    config = Config(args.config, learning=True)
    config.train_dir = args.train_dir
    config.dev_dir = args.dev_dir
    config.train_reg_dir = args.train_reg_dir
    config.dev_reg_dir = args.dev_reg_dir
    with open(config.speech_config) as f:
        speech_config = yaml.load(f, Loader=yaml.Loader)
    speech_featurizer = TFSpeechFeaturizer(speech_config)

    if args.subwords and os.path.exists(args.subwords):
        print("Loading subwords ...")
        text_featurizer = SubwordFeaturizer.load_from_file(
            config.decoder_config, args.subwords)
    else:
        print("Generating subwords ...")
        text_featurizer = SubwordFeaturizer.build_from_corpus(
            config.decoder_config, corpus_files=args.subwords_corpus)
        text_featurizer.save_to_file(args.subwords)

    train_dataset = Dataset(data_paths=config.train_dir,
                            speech_featurizer=speech_featurizer,
                            text_featurizer=text_featurizer,
                            augmentations=config.learning_config.augmentations,
                            stage="train",
                            cache=False,
                            shuffle=False)
    train_reg_dataset = DatasetInf(
        data_paths=config.train_reg_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train",
        cache=False,
        shuffle=False)
    eval_dataset = Dataset(data_paths=config.dev_dir,
                           speech_featurizer=speech_featurizer,
                           text_featurizer=text_featurizer,
                           stage="eval",
                           cache=False,
                           shuffle=False)
    eval_reg_dataset = DatasetInf(
        data_paths=config.dev_reg_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="eval",
        cache=False,
        shuffle=False)

    conformer_trainer = MultiReaderTransducerTrainer(
        config=config.learning_config.running_config,
        text_featurizer=text_featurizer,
        strategy=strategy)

    with conformer_trainer.strategy.scope():
        # build model
        conformer = Conformer(**config.model_config,
                              vocabulary_size=text_featurizer.num_classes)
        conformer._build(speech_featurizer.shape)
        conformer.summary(line_length=120)

        optimizer = tf.keras.optimizers.Adam(
            TransformerSchedule(
                d_model=conformer.dmodel,
                warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
                max_lr=(0.05 / math.sqrt(conformer.dmodel))),
            beta_1=config.learning_config.optimizer_config["beta1"],
            beta_2=config.learning_config.optimizer_config["beta2"],
            epsilon=config.learning_config.optimizer_config["epsilon"])

    conformer_trainer.compile(model=conformer,
                              optimizer=optimizer,
                              max_to_keep=args.max_ckpts)
    conformer_trainer.fit(
        train_dataset,
        train_reg_dataset,
        # alpha for regularising dataset; alpha = 1 for training dataset
        1.,
        eval_dataset,
        eval_reg_dataset,
        train_bs=args.tbs,
        eval_bs=args.ebs,
        train_acs=args.acs)
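
The excerpt stops at the fit call; the original script presumably finishes with the standard entry-point guard:

if __name__ == "__main__":
    main()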
Example #7
import tensorflow as tf

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer


def process(text):
    # extract() maps text to subword indices; prepand_blank() (the library's
    # spelling) prepends the blank token so input/output pairs line up for
    # next-token prediction.
    encoded_output = subword.extract(text.decode('utf-8'))
    encoded_input = subword.prepand_blank(encoded_output)
    encoded_output = tf.concat([encoded_output, [subword.blank]], axis=0)
    assert encoded_input.shape == encoded_output.shape
    return encoded_input, encoded_output


# Wrap the Python-side tokenisation in a graph op; note that tensors returned
# by tf.numpy_function carry no static shape information.
@tf.function
def parse(record):
    return tf.numpy_function(process, inp=[record], Tout=[tf.int32, tf.int32])


config = Config('config.yml', learning=True)
subword = SubwordFeaturizer.load_from_file(
    config.decoder_config,
    '/home/joaoalvarenga/datasets/conformer_subwords.subwords')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    'checkpoint/lm.ckpt', save_weights_only=True, verbose=1)

print(subword.num_classes)
batch_size = 32
dataset = tf.data.TextLineDataset(
    '/media/work/joaoalvarenga/ptwiki-20181125.txt')
dataset = dataset.map(parse)
dataset = dataset.cache()
# dataset = dataset.batch(batch_size, drop_remainder=True)
# Pad every batch to its longest sequence, using the blank token as padding.
dataset = dataset.padded_batch(batch_size=batch_size,
                               padded_shapes=(tf.TensorShape([None]),
                                              tf.TensorShape([None])),
                               padding_values=(subword.blank, subword.blank))
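
To sanity-check the pipeline end to end, one batch can be pulled eagerly (a sketch; the second dimension varies with the longest line in each batch):

for encoded_input, encoded_output in dataset.take(1):
    print(encoded_input.shape, encoded_output.shape)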