def prepare_featurizers(
    config: Config,
    subwords: bool = True,
    sentence_piece: bool = False,
):
    """Create the speech featurizer and the selected text featurizer.

    The text featurizer is chosen by the two flags, with ``sentence_piece``
    taking precedence over ``subwords``; when both are false a
    character-level featurizer is used.

    Args:
        config: Configuration object; its ``speech_config`` and
            ``decoder_config`` attributes are passed to the featurizers.
        subwords: Use ``SubwordFeaturizer`` (only consulted when
            ``sentence_piece`` is false).
        sentence_piece: Use ``SentencePieceFeaturizer``.

    Returns:
        A ``(speech_featurizer, text_featurizer)`` tuple.
    """
    audio_featurizer = speech_featurizers.TFSpeechFeaturizer(
        config.speech_config)

    # Guard clauses, highest-priority option first; each branch logs before
    # constructing its featurizer.
    if sentence_piece:
        logger.info("Loading SentencePiece model ...")
        return audio_featurizer, text_featurizers.SentencePieceFeaturizer(
            config.decoder_config)

    if subwords:
        logger.info("Loading subwords ...")
        return audio_featurizer, text_featurizers.SubwordFeaturizer(
            config.decoder_config)

    logger.info("Use characters ...")
    return audio_featurizer, text_featurizers.CharFeaturizer(
        config.decoder_config)
from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets import asr_dataset
from tensorflow_asr.featurizers import speech_featurizers, text_featurizers
from tensorflow_asr.models.transducer.conformer import Conformer
from tensorflow_asr.optimizers.schedules import TransformerSchedule

# Script setup: build the config, the featurizers and (for TFRecords) the
# train/eval datasets. `args` and `logger` are not defined in this excerpt.

# Build the Config from args.config
# (presumably a path to a config file -- TODO confirm).
config = Config(args.config)

speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)

# Choose the text featurizer: args.sentence_piece takes precedence over
# args.subwords; the character-level featurizer is the fallback.
if args.sentence_piece:
    logger.info("Loading SentencePiece model ...")
    text_featurizer = text_featurizers.SentencePieceFeaturizer(
        config.decoder_config)
elif args.subwords:
    logger.info("Loading subwords ...")
    text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config)
else:
    logger.info("Use characters ...")
    text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)

# TFRecord-backed datasets. Both share the same featurizers; the remaining
# keyword arguments are the attributes of the respective dataset config,
# expanded through vars().
# NOTE(review): indefinite=True is passed to the eval dataset as well as the
# train dataset -- confirm against ASRTFRecordDataset that this is intended.
if args.tfrecords:
    train_dataset = asr_dataset.ASRTFRecordDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.train_dataset_config),
        indefinite=True)
    eval_dataset = asr_dataset.ASRTFRecordDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.eval_dataset_config),
        indefinite=True)