args = parser.parse_args() tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) strategy = setup_strategy(args.devices) from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer from trainer import TrainerWithMasking from tensorflow_asr.models.conformer import Conformer from tensorflow_asr.optimizers.schedules import TransformerSchedule config = Config(args.config, learning=True) speech_featurizer = TFSpeechFeaturizer(config.speech_config) if args.subwords and os.path.exists(args.subwords): print("Loading subwords ...") text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) else: print("Generating subwords ...") text_featurizer = SubwordFeaturizer.build_from_corpus( config.decoder_config, corpus_files=args.subwords_corpus ) text_featurizer.save_to_file(args.subwords) if args.tfrecords: train_dataset = ASRTFRecordDataset(
help="Enable caching for dataset") args = parser.parse_args() tf.config.optimizer.set_experimental_options( {"auto_mixed_precision": args.mxp}) strategy = setup_strategy(args.devices) from sasegan.runners.trainer import SeganTrainer from sasegan.datasets.train_dataset import SeganAugTrainDataset from tensorflow_asr.configs.config import Config from sasegan.models.segan import Generator, Discriminator from sasegan.featurizers.speech_featurizer import NumpySpeechFeaturizer, TFSpeechFeaturizer config = Config(args.config) speech_featurizer = NumpySpeechFeaturizer(config.speech_config) if args.nfx \ else TFSpeechFeaturizer(config.speech_config) dataset = SeganAugTrainDataset( stage="train", speech_featurizer=speech_featurizer, clean_dir=config.learning_config.dataset_config.clean_train_paths, noises_config=config.learning_config.dataset_config.noise_config, cache=args.cache, shuffle=True) segan_trainer = SeganTrainer(config.learning_config.running_config) with segan_trainer.strategy.scope():
def main():
    """CLI entry point: parse arguments, build featurizers and datasets, then train a Conformer transducer."""
    cli = argparse.ArgumentParser(prog="Conformer Training")
    cli.add_argument("--config", type=str, default=DEFAULT_YAML,
                     help="The file path of model configuration file")
    cli.add_argument("--max_ckpts", type=int, default=10,
                     help="Max number of checkpoints to keep")
    cli.add_argument("--tbs", type=int, default=None,
                     help="Train batch size per replica")
    cli.add_argument("--ebs", type=int, default=None,
                     help="Evaluation batch size per replica")
    cli.add_argument("--acs", type=int, default=None,
                     help="Train accumulation steps")
    cli.add_argument("--devices", type=int, nargs="*", default=[0],
                     help="Devices' ids to apply distributed training")
    cli.add_argument("--mxp", default=False, action="store_true",
                     help="Enable mixed precision")
    cli.add_argument("--subwords", type=str, default=None,
                     help="Path to file that stores generated subwords")
    cli.add_argument("--subwords_corpus", nargs="*", type=str, default=[],
                     help="Transcript files for generating subwords")
    cli.add_argument("--train-dir", '-td', nargs='*', required=True)
    cli.add_argument("--dev-dir", '-dd', nargs='*', required=True)
    opts = cli.parse_args()

    # Mixed precision and the distribution strategy must be configured
    # before any model/dataset code runs.
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": opts.mxp})
    strategy = setup_strategy(opts.devices)

    config = Config(opts.config, learning=True)

    # NOTE(review): config.speech_config is treated here as a YAML file path,
    # while sibling scripts in this file pass it directly to the featurizer —
    # confirm which convention this project version uses.
    with open(config.speech_config) as cfg_file:
        speech_config = yaml.load(cfg_file, Loader=yaml.Loader)
    speech_featurizer = TFSpeechFeaturizer(speech_config)

    # Reuse an existing subword vocabulary when present; otherwise build one
    # from the transcript corpus and persist it.
    if opts.subwords and os.path.exists(opts.subwords):
        print("Loading subwords ...")
        text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, opts.subwords)
    else:
        print("Generating subwords ...")
        text_featurizer = SubwordFeaturizer.build_from_corpus(
            config.decoder_config, corpus_files=opts.subwords_corpus)
        text_featurizer.save_to_file(opts.subwords)

    train_dataset = Dataset(
        data_paths=opts.train_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train",
        cache=False,
        shuffle=False)
    eval_dataset = Dataset(
        data_paths=opts.dev_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="eval",
        cache=False,
        shuffle=False)

    conformer_trainer = TransducerTrainer(
        config=config.learning_config.running_config,
        text_featurizer=text_featurizer,
        strategy=strategy)

    with conformer_trainer.strategy.scope():
        # Variables must be created inside the strategy scope so they are
        # mirrored across the selected devices.
        conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
        conformer._build(speech_featurizer.shape)
        conformer.summary(line_length=120)

        opt_cfg = config.learning_config.optimizer_config
        lr_schedule = TransformerSchedule(
            d_model=conformer.dmodel,
            warmup_steps=opt_cfg["warmup_steps"],
            max_lr=(0.05 / math.sqrt(conformer.dmodel)))
        optimizer = tf.keras.optimizers.Adam(
            lr_schedule,
            beta_1=opt_cfg["beta1"],
            beta_2=opt_cfg["beta2"],
            epsilon=opt_cfg["epsilon"])

        conformer_trainer.compile(model=conformer, optimizer=optimizer,
                                  max_to_keep=opts.max_ckpts)

    conformer_trainer.fit(train_dataset, eval_dataset,
                          train_bs=opts.tbs, eval_bs=opts.ebs, train_acs=opts.acs)
help="Path to file that stores generated subwords") parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix") args = parser.parse_args() setup_devices([args.device], cpu=args.cpu) from tensorflow_asr.configs.config import Config from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer from tensorflow_asr.models.conformer import Conformer config = Config(args.config, learning=False) speech_featurizer = TFSpeechFeaturizer(config.speech_config) if args.subwords and os.path.exists(args.subwords): print("Loading subwords ...") text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) else: text_featurizer = CharFeaturizer(config.decoder_config) text_featurizer.decoder_config.beam_width = args.beam_width # build model conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) conformer._build(speech_featurizer.shape) conformer.load_weights(args.saved, by_name=True) conformer.summary(line_length=120) conformer.add_featurizers(speech_featurizer, text_featurizer)
def main(
    config: str = DEFAULT_YAML,
    tfrecords: bool = False,
    sentence_piece: bool = False,
    subwords: bool = True,
    bs: int = None,
    spx: int = 1,
    metadata: str = None,
    static_length: bool = False,
    devices: list = None,
    mxp: bool = False,
    pretrained: str = None,
):
    """Train a ContextNet transducer model.

    Args:
        config: Path to the YAML configuration file.
        tfrecords: Use TFRecord-based datasets instead of slice datasets.
        sentence_piece: Use a SentencePiece text featurizer.
        subwords: Use a subword text featurizer.
        bs: Batch size per replica (None = take it from the config).
        spx: Steps per execution (experimental_steps_per_execution).
        metadata: Optional dataset metadata file.
        static_length: If True, keep the featurizers' fixed padded lengths.
        devices: Device ids for the distribution strategy; defaults to [0].
        mxp: Enable auto mixed precision.
        pretrained: Optional weights path to warm-start from.
    """
    # FIX: the original signature used the mutable default `devices: list = [0]`,
    # which is evaluated once and shared across calls. Use a None sentinel and
    # materialize the default inside the function (behavior unchanged).
    devices = [0] if devices is None else devices
    tf.keras.backend.clear_session()
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": mxp})
    strategy = env_util.setup_strategy(devices)
    # Rebinds the `config` path argument to the parsed Config object.
    config = Config(config)
    speech_featurizer, text_featurizer = featurizer_helpers.prepare_featurizers(
        config=config,
        subwords=subwords,
        sentence_piece=sentence_piece,
    )
    train_dataset, eval_dataset = dataset_helpers.prepare_training_datasets(
        config=config,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        tfrecords=tfrecords,
        metadata=metadata,
    )
    if not static_length:
        # Let padded lengths be inferred dynamically per batch.
        speech_featurizer.reset_length()
        text_featurizer.reset_length()
    train_data_loader, eval_data_loader, global_batch_size = dataset_helpers.prepare_training_data_loaders(
        config=config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        strategy=strategy,
        batch_size=bs,
    )
    with strategy.scope():
        # Build/compile inside the strategy scope so variables are mirrored.
        contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
        contextnet.make(speech_featurizer.shape, prediction_shape=text_featurizer.prepand_shape, batch_size=global_batch_size)
        if pretrained:
            contextnet.load_weights(pretrained, by_name=True, skip_mismatch=True)
        contextnet.summary(line_length=100)
        optimizer = tf.keras.optimizers.Adam(
            TransformerSchedule(
                d_model=contextnet.dmodel,
                # NOTE: pop() deliberately mutates optimizer_config so the
                # remaining keys can be forwarded to Adam without duplication.
                warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000),
                max_lr=(0.05 / math.sqrt(contextnet.dmodel)),
            ),
            **config.learning_config.optimizer_config
        )
        contextnet.compile(
            optimizer=optimizer,
            experimental_steps_per_execution=spx,
            global_batch_size=global_batch_size,
            blank=text_featurizer.blank,
        )
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),
        # BackupAndRestore makes interrupted training resumable from states_dir.
        tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),
        tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard),
    ]
    contextnet.fit(
        train_data_loader,
        epochs=config.learning_config.running_config.num_epochs,
        validation_data=eval_data_loader,
        callbacks=callbacks,
        steps_per_epoch=train_dataset.total_steps,
        validation_steps=eval_dataset.total_steps if eval_data_loader else None,
    )
def test_streaming_transducer():
    """Smoke-test RnnTransducer TFLite conversion (with and without timestamps),
    then run the no-timestamp model once through the TFLite interpreter."""
    config = Config(DEFAULT_YAML)
    text_featurizer = CharFeaturizer(config.decoder_config)
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = RnnTransducer(vocabulary_size=text_featurizer.num_classes, **config.model_config)
    model.make(speech_featurizer.shape)
    model.summary(line_length=150)
    model.add_featurizers(speech_featurizer=speech_featurizer, text_featurizer=text_featurizer)

    def _convert(timestamp: bool) -> bytes:
        # Shared converter setup — this was duplicated verbatim for both
        # timestamp modes in the original.
        concrete_func = model.make_tflite_function(timestamp=timestamp).get_concrete_function()
        converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.experimental_new_converter = True
        converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS,
            # Fall back to select TF ops for anything TFLite builtins lack.
            tf.lite.OpsSet.SELECT_TF_OPS
        ]
        return converter.convert()

    tflite_model = _convert(timestamp=False)
    print("Converted successfully with no timestamp")
    _convert(timestamp=True)  # result unused: conversion succeeding is the check
    print("Converted successfully with timestamp")

    # Run the no-timestamp model on random audio to verify it executes.
    tflitemodel = tf.lite.Interpreter(model_content=tflite_model)
    signal = tf.random.normal([4000])
    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
    tflitemodel.resize_tensor_input(input_details[0]["index"], signal.shape)
    tflitemodel.allocate_tensors()
    tflitemodel.set_tensor(input_details[0]["index"], signal)
    # Previous predicted token starts at the blank index.
    tflitemodel.set_tensor(input_details[1]["index"],
                           tf.constant(text_featurizer.blank, dtype=tf.int32))
    # Zero-initialized encoder states: [encoder_nlayers, 2, 1, encoder_rnn_units].
    tflitemodel.set_tensor(
        input_details[2]["index"],
        tf.zeros([
            config.model_config["encoder_nlayers"], 2, 1,
            config.model_config["encoder_rnn_units"]
        ], dtype=tf.float32))
    # Zero-initialized prediction-network states: [prediction_num_rnns, 2, 1, prediction_rnn_units].
    tflitemodel.set_tensor(
        input_details[3]["index"],
        tf.zeros([
            config.model_config["prediction_num_rnns"], 2, 1,
            config.model_config["prediction_rnn_units"]
        ], dtype=tf.float32))
    tflitemodel.invoke()
    hyp = tflitemodel.get_tensor(output_details[0]["index"])
    print(hyp)
import librosa
import matplotlib.pyplot as plt
import librosa.display
import numpy as np
import soundfile

# FIX: `Config` and `read_raw_audio` were used below but never imported,
# so this script raised NameError at runtime. Module paths follow the
# project's other scripts.
from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio


def visual(title, audio, sample_rate):
    """Plot the waveform of `audio` and save the figure to ./audio_b.jpg."""
    plt.figure(figsize=(8, 4))
    librosa.display.waveplot(audio, sr=sample_rate)
    plt.title(title)
    plt.tight_layout()
    # plt.show()  # kept disabled: this script saves to disk instead
    plt.savefig("./audio_b.jpg")


# Load the augmentation pipeline from the test config, apply the
# "before" (signal-domain) augmentations to one utterance, then dump
# both a waveform plot and the augmented audio for manual inspection.
config_dir = "tests/config_aishell.yml"
config = Config(config_dir, learning=True)
aug = config.learning_config.augmentations
sampling_rate = 16000
audio = '/tsdata/ASR/aishell-1//wav/train/S0002/BAC009S0002W0123.wav'
signal = read_raw_audio(audio, sampling_rate)
#visual('Original', signal, sampling_rate)
signal = aug.before.augment(signal)
visual('Original', signal, sampling_rate)
soundfile.write('./test_d.wav', signal, 16000)