def __init__(self, path='ConformerS.h5'):
    # fetch and load the config of the model
    config = Config('tamil_tech/configs/subword_conformer_new.yml', learning=True)

    # load speech and text featurizers
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config,
                                                       config.decoder_config['subwords'])

    # download the model to the given path if it is not already present
    if not os.path.exists(path):
        print("Downloading Model...")
        file_id = config.file_id
        download_file_from_google_drive(file_id, path)
        print("Downloaded Model Successfully...")

    # load model using config
    self.model = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)

    # set shape of the featurizer and build the model
    self.model._build(speech_featurizer.shape)

    # load weights of the model
    self.model.load_weights(path, by_name=True)

    # display model summary
    self.model.summary(line_length=120)

    # set featurizers for the model
    self.model.add_featurizers(speech_featurizer, text_featurizer)

    print("Loaded Model...!")
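# Usage sketch (assumption): the __init__ above presumably belongs to an ASR
# wrapper class -- called ConformerTamilASR here purely for illustration -- and
# read_raw_audio()/recognize() follow TensorFlowASR conventions. Treat this as
# a hedged sketch, not the library's confirmed API.
import tensorflow as tf
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio  # assumed import

asr = ConformerTamilASR(path='ConformerS.h5')  # hypothetical class name; downloads weights on first use
signal = read_raw_audio('sample.wav', sample_rate=16000)
features = asr.model.speech_featurizer.tf_extract(signal)
print(asr.model.recognize(features[tf.newaxis, ...]))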
def test_featurizer():
    config = {
        "output_path_prefix": "/data/models/asr/conformer_sentencepiece_subword",
        "model_type": "unigram",
        "target_vocab_size": 8000,
        "blank_at_zero": True,
        "beam_width": 5,
        "norm_score": True,
        # note: the original list was missing commas, which would have silently
        # concatenated the three paths into a single string
        "corpus_files": [
            "/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-clean-360/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-other-500/transcripts.tsv"
        ]
    }
    config_speech = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        "feature_type": "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False
    }

    text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(config, None)
    subwords_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                 os.pardir, os.pardir, "vocabularies",
                                 "librispeech_train_4_1030.subwords")
    text_featurizer_subwords = SubwordFeaturizer.load_from_file(config, subwords_path)
    speech_featurizer = TFSpeechFeaturizer(config_speech)
    data_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             "transcripts_librispeech_train_clean_100.tsv")

    def get_data(featurizer: TextFeaturizer):
        train_dataset = ASRSliceDataset(data_paths=[data_path],
                                        speech_featurizer=speech_featurizer,
                                        text_featurizer=featurizer,
                                        stage="train",
                                        shuffle=False)
        train_data = train_dataset.create(1)
        return next(iter(train_data))

    data_sentencepiece = get_data(text_featurizer_sentencepiece)
    data_subwords = get_data(text_featurizer_subwords)

    assert len(data_sentencepiece) == len(data_subwords)
    assert data_sentencepiece[0].shape == data_subwords[0].shape
    assert data_sentencepiece[0].dtype == data_subwords[0].dtype
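# A natural follow-up check, sketched here as an assumption rather than taken
# from the source: round-trip a sample sentence through a featurizer, assuming
# TensorFlowASR's usual extract()/iextract() pair.
def test_round_trip(featurizer: TextFeaturizer, sample: str = "hello world"):
    ids = featurizer.extract(sample)
    decoded = featurizer.iextract(tf.expand_dims(ids, 0))
    assert decoded[0].numpy().decode("utf-8") == sample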
parser.add_argument("transcripts", nargs="+", type=str, default=None, help="Paths to transcript files") args = parser.parse_args() transcripts = preprocess_paths(args.transcripts) tfrecords_dir = preprocess_paths(args.tfrecords_dir) config = Config(args.config) if args.sentence_piece: print("Loading SentencePiece model ...") text_featurizer = SentencePieceFeaturizer.load_from_file( config.decoder_config, args.subwords) elif args.subwords and os.path.exists(args.subwords): print("Loading subwords ...") text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) ASRTFRecordDataset(data_paths=transcripts, tfrecords_dir=tfrecords_dir, speech_featurizer=None, text_featurizer=text_featurizer, stage=args.mode, shuffle=args.shuffle, tfrecords_shards=args.tfrecords_shards).create_tfrecords()
def main():
    parser = argparse.ArgumentParser(prog="Conformer Training")

    parser.add_argument("--config", type=str, default=DEFAULT_YAML,
                        help="The file path of model configuration file")
    parser.add_argument("--max_ckpts", type=int, default=10,
                        help="Max number of checkpoints to keep")
    parser.add_argument("--tbs", type=int, default=None,
                        help="Train batch size per replica")
    parser.add_argument("--ebs", type=int, default=None,
                        help="Evaluation batch size per replica")
    parser.add_argument("--acs", type=int, default=None,
                        help="Train accumulation steps")
    parser.add_argument("--devices", type=int, nargs="*", default=[0],
                        help="Devices' ids to apply distributed training")
    parser.add_argument("--mxp", default=False, action="store_true",
                        help="Enable mixed precision")
    parser.add_argument("--subwords", type=str, default=None,
                        help="Path to file that stores generated subwords")
    parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[],
                        help="Transcript files for generating subwords")
    parser.add_argument("--train-dir", '-td', nargs='*',
                        default=["en_ng_male_train.tsv", "en_ng_female_train.tsv"])
    parser.add_argument("--train-reg-dir", '-trd', nargs='*',
                        default=["libritts_train-clean-100.tsv",
                                 "libritts_train-clean-360.tsv",
                                 "libritts_train-other-500.tsv"])
    parser.add_argument("--dev-dir", '-dd', nargs='*',
                        default=["en_ng_male_eval.tsv", "en_ng_female_eval.tsv"])
    parser.add_argument("--dev-reg-dir", '-drd', nargs='*',
                        default=["libritts_test-other.tsv"])

    args = parser.parse_args()

    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})

    strategy = setup_strategy(args.devices)

    config = Config(args.config, learning=True)
    config.train_dir = args.train_dir
    config.dev_dir = args.dev_dir
    config.train_reg_dir = args.train_reg_dir
    config.dev_reg_dir = args.dev_reg_dir

    with open(config.speech_config) as f:
        speech_config = yaml.load(f, Loader=yaml.Loader)
    speech_featurizer = TFSpeechFeaturizer(speech_config)

    if args.subwords and os.path.exists(args.subwords):
        print("Loading subwords ...")
        text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
    else:
        print("Generating subwords ...")
        text_featurizer = SubwordFeaturizer.build_from_corpus(config.decoder_config,
                                                              corpus_files=args.subwords_corpus)
        text_featurizer.save_to_file(args.subwords)

    train_dataset = Dataset(data_paths=config.train_dir,
                            speech_featurizer=speech_featurizer,
                            text_featurizer=text_featurizer,
                            augmentations=config.learning_config.augmentations,
                            stage="train", cache=False, shuffle=False)
    train_reg_dataset = DatasetInf(data_paths=config.train_reg_dir,
                                   speech_featurizer=speech_featurizer,
                                   text_featurizer=text_featurizer,
                                   augmentations=config.learning_config.augmentations,
                                   stage="train", cache=False, shuffle=False)
    eval_dataset = Dataset(data_paths=config.dev_dir,
                           speech_featurizer=speech_featurizer,
                           text_featurizer=text_featurizer,
                           stage="eval", cache=False, shuffle=False)
    eval_reg_dataset = DatasetInf(data_paths=config.dev_reg_dir,
                                  speech_featurizer=speech_featurizer,
                                  text_featurizer=text_featurizer,
                                  augmentations=config.learning_config.augmentations,
                                  stage="eval", cache=False, shuffle=False)

    conformer_trainer = MultiReaderTransducerTrainer(
        config=config.learning_config.running_config,
        text_featurizer=text_featurizer,
        strategy=strategy)

    with conformer_trainer.strategy.scope():
        # build model
        conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
        conformer._build(speech_featurizer.shape)
        conformer.summary(line_length=120)

        optimizer = tf.keras.optimizers.Adam(
            TransformerSchedule(d_model=conformer.dmodel,
                                warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
                                max_lr=(0.05 / math.sqrt(conformer.dmodel))),
            beta_1=config.learning_config.optimizer_config["beta1"],
            beta_2=config.learning_config.optimizer_config["beta2"],
            epsilon=config.learning_config.optimizer_config["epsilon"])

    conformer_trainer.compile(model=conformer, optimizer=optimizer,
                              max_to_keep=args.max_ckpts)

    conformer_trainer.fit(
        train_dataset,
        train_reg_dataset,
        # alpha for the regularising dataset; alpha = 1 for the training dataset
        1.,
        eval_dataset,
        eval_reg_dataset,
        train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
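# TransformerSchedule is not defined in this snippet. Below is a minimal
# sketch of the standard transformer learning-rate schedule it presumably
# implements; the max_lr cap is an assumption inferred from the constructor
# call above, not a confirmed implementation.
import tensorflow as tf

class TransformerScheduleSketch(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000, max_lr=None):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)
        self.max_lr = max_lr

    def __call__(self, step):
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5), capped at max_lr
        step = tf.cast(step, tf.float32)
        lr = tf.math.rsqrt(self.d_model) * tf.math.minimum(
            tf.math.rsqrt(step), step * (self.warmup_steps ** -1.5))
        return lr if self.max_lr is None else tf.math.minimum(lr, self.max_lr)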
def process(text):
    # encode the raw text line into subword ids
    encoded_output = subword.extract(text.decode('utf-8'))
    # LM input is the blank-prepended sequence, target is the blank-appended one
    # (prepand_blank is the method's spelling in the library)
    encoded_input = subword.prepand_blank(encoded_output)
    encoded_output = tf.concat([encoded_output, [subword.blank]], axis=0)
    assert encoded_input.shape == encoded_output.shape
    return encoded_input, encoded_output


@tf.function
def parse(record):
    return tf.numpy_function(process, inp=[record], Tout=[tf.int32, tf.int32])


config = Config('config.yml', learning=True)
subword = SubwordFeaturizer.load_from_file(
    config.decoder_config,
    '/home/joaoalvarenga/datasets/conformer_subwords.subwords')

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    'checkpoint/lm.ckpt', save_weights_only=True, verbose=1)

print(subword.num_classes)

batch_size = 32
dataset = tf.data.TextLineDataset('/media/work/joaoalvarenga/ptwiki-20181125.txt')
dataset = dataset.map(parse)
dataset = dataset.cache()
# dataset = dataset.batch(batch_size, drop_remainder=True)
dataset = dataset.padded_batch(
    batch_size=batch_size,
    padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None])),
    padding_values=(subword.blank, subword.blank))
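# The snippet stops at the input pipeline. A hedged sketch of how the batches
# and checkpoint_callback might drive a simple Keras language model follows;
# the architecture here is illustrative only and not taken from the source.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(subword.num_classes, 256),
    tf.keras.layers.LSTM(512, return_sequences=True),
    tf.keras.layers.Dense(subword.num_classes),
])
model.compile(optimizer="adam",
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
# inputs are blank-prepended id sequences, targets are blank-appended ones
model.fit(dataset, epochs=1, callbacks=[checkpoint_callback])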