def main(
    config: str = DEFAULT_YAML,
    h5: str = None,
    subwords: bool = False,
    sentence_piece: bool = False,
    output: str = None,
):
    """Load DeepSpeech2 weights from ``h5`` and hand the model to the TFLite converter helper.

    Args:
        config: Value passed to ``Config`` to load the configuration.
        h5: Weights file given to ``load_weights``; must be truthy.
        subwords: Forwarded to ``featurizer_helpers.prepare_featurizers``.
        sentence_piece: Forwarded to ``featurizer_helpers.prepare_featurizers``.
        output: Forwarded to ``exec_helpers.convert_tflite``; must be truthy.

    Raises:
        AssertionError: If ``h5`` or ``output`` is falsy.
    """
    assert h5 and output

    tf.keras.backend.clear_session()
    tf.compat.v1.enable_control_flow_v2()

    cfg = Config(config)
    featurizers = featurizer_helpers.prepare_featurizers(
        config=cfg,
        subwords=subwords,
        sentence_piece=sentence_piece,
    )
    speech_featurizer, text_featurizer = featurizers

    # Build the network, then restore the trained weights by layer name.
    model = DeepSpeech2(
        **cfg.model_config,
        vocabulary_size=text_featurizer.num_classes,
    )
    model.make(speech_featurizer.shape)
    model.load_weights(h5, by_name=True)
    model.summary(line_length=100)
    model.add_featurizers(speech_featurizer, text_featurizer)

    exec_helpers.convert_tflite(model=model, output=output)
def __init__(self, path='ConformerS.h5'):
    """Build the Conformer model and load its weights from ``path``.

    When ``path`` does not exist, the weights are first downloaded to it
    using the ``file_id`` stored in the configuration.

    Args:
        path: Location of the weights file.
    """
    # fetch and load the config of the model
    config = Config('tamil_tech/configs/conformer_new_config.yml', learning=True)

    # load speech and text featurizers
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)
    text_featurizer = CharFeaturizer(config.decoder_config)

    # download the model into the given path only when it is not already there
    if not os.path.exists(path):
        print("Downloading Model...")
        download_file_from_google_drive(config.file_id, path)
        print("Downloaded Model Successfully...")

    # load model using config
    self.model = Conformer(
        **config.model_config,
        vocabulary_size=text_featurizer.num_classes,
    )

    # build the model with the featurizer shape, then restore the weights
    self.model._build(speech_featurizer.shape)
    self.model.load_weights(path, by_name=True)

    # display model summary
    self.model.summary(line_length=120)

    # set featurizers for the model
    self.model.add_featurizers(speech_featurizer, text_featurizer)

    print("Loaded Model...!")
def test_ds2():
    """Check that DeepSpeech2 converts to TFLite with beam search and with greedy decoding."""
    config = Config(DEFAULT_YAML)
    text_featurizer = CharFeaturizer(config.decoder_config)
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = DeepSpeech2(vocabulary_size=text_featurizer.num_classes, **config.model_config)
    model._build(speech_featurizer.shape)
    model.summary(line_length=150)
    model.add_featurizers(speech_featurizer=speech_featurizer, text_featurizer=text_featurizer)

    # Same conversion recipe for both decoding modes; beam search runs first.
    passes = (
        (False, "Converted successfully with beam search"),
        (True, "Converted successfully with greedy"),
    )
    for use_greedy, success_message in passes:
        tflite_fn = model.make_tflite_function(greedy=use_greedy)
        converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [tflite_fn.get_concrete_function()]
        )
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.experimental_new_converter = True
        converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS,
            tf.lite.OpsSet.SELECT_TF_OPS,
        ]
        converter.convert()
        print(success_message)
def build_am(self, config_path, model_path):
    """Create a Conformer acoustic model and load its weights.

    Args:
        config_path: Passed to ``Config`` (with ``learning=False``).
        model_path: Weights file passed to ``load_weights``.

    Returns:
        The Conformer instance with weights loaded by name.
    """
    am_config = Config(config_path, learning=False)

    # The vocabulary size is fixed at 1031 here rather than read from a featurizer.
    acoustic_model = Conformer(**am_config.model_config, vocabulary_size=1031)
    acoustic_model._build(self.speech_featurizer.shape)

    print('loading am...')
    acoustic_model.load_weights(model_path, by_name=True)

    return acoustic_model
def main(
    config: str = DEFAULT_YAML,
    h5: str = None,
    sentence_piece: bool = False,
    subwords: bool = False,
    output_dir: str = None,
):
    """Load Conformer weights from ``h5`` and export a SavedModel to ``output_dir``.

    The exported signature is ``ConformerModule.pred``, which takes a 1-D
    float32 signal and returns the result of ``indices2upoints`` on the
    greedy hypothesis.

    Args:
        config: Value passed to ``Config`` to load the configuration.
        h5: Weights file given to ``load_weights``; must be truthy.
        sentence_piece: Forwarded to ``featurizer_helpers.prepare_featurizers``.
        subwords: Forwarded to ``featurizer_helpers.prepare_featurizers``.
        output_dir: Export directory for ``tf.saved_model.save``; must be truthy.

    Raises:
        AssertionError: If ``h5`` or ``output_dir`` is falsy.
    """
    assert h5 and output_dir

    cfg = Config(config)
    tf.random.set_seed(0)
    tf.keras.backend.clear_session()

    featurizers = featurizer_helpers.prepare_featurizers(
        config=cfg,
        subwords=subwords,
        sentence_piece=sentence_piece,
    )
    speech_featurizer, text_featurizer = featurizers

    # build model
    conformer = Conformer(
        **cfg.model_config,
        vocabulary_size=text_featurizer.num_classes,
    )
    conformer.make(speech_featurizer.shape)
    conformer.load_weights(h5, by_name=True)
    conformer.summary(line_length=100)
    conformer.add_featurizers(speech_featurizer, text_featurizer)

    class ConformerModule(tf.Module):
        """Wraps the model so that ``pred`` can be exported as the serving signature."""

        def __init__(self, model: Conformer, name=None):
            super().__init__(name=name)
            self.model = model
            self.num_rnns = cfg.model_config["prediction_num_rnns"]
            self.rnn_units = cfg.model_config["prediction_rnn_units"]
            # Two state tensors per layer for "lstm", one for any other rnn type.
            if cfg.model_config["prediction_rnn_type"] == "lstm":
                self.rnn_nstates = 2
            else:
                self.rnn_nstates = 1

        @tf.function(
            input_signature=[tf.TensorSpec(shape=[None], dtype=tf.float32)])
        def pred(self, signal):
            # Initial decoder inputs: token 0 and all-zero prediction-network states.
            predicted = tf.constant(0, dtype=tf.int32)
            state_shape = [self.num_rnns, self.rnn_nstates, 1, self.rnn_units]
            states = tf.zeros(state_shape, dtype=tf.float32)

            features = self.model.speech_featurizer.tf_extract(signal)
            encoded = self.model.encoder_inference(features)
            hypothesis = self.model._perform_greedy(
                encoded,
                tf.shape(encoded)[0],
                predicted,
                states,
                tflite=False,
            )
            return self.model.text_featurizer.indices2upoints(
                hypothesis.prediction)

    module = ConformerModule(model=conformer)
    tf.saved_model.save(
        module,
        export_dir=output_dir,
        signatures=module.pred.get_concrete_function(),
    )
def main(
    config: str = DEFAULT_YAML,
    saved: str = None,
    mxp: bool = False,
    bs: int = None,
    sentence_piece: bool = False,
    subwords: bool = False,
    device: int = 0,
    cpu: bool = False,
    output: str = "test.tsv",
):
    """Load DeepSpeech2 weights from ``saved`` and run the testing helper on the test dataset.

    Args:
        config: Value passed to ``Config`` to load the configuration.
        saved: Weights file given to ``load_weights``; must be truthy.
        mxp: Value for the ``auto_mixed_precision`` optimizer option.
        bs: Batch size; when falsy the configured running batch size is used.
        sentence_piece: Forwarded to ``featurizer_helpers.prepare_featurizers``.
        subwords: Forwarded to ``featurizer_helpers.prepare_featurizers``.
        device: Device id passed (as a one-element list) to ``env_util.setup_devices``.
        cpu: Forwarded to ``env_util.setup_devices``.
        output: Forwarded to ``exec_helpers.run_testing``; must be truthy.

    Raises:
        AssertionError: If ``saved`` or ``output`` is falsy.
    """
    assert saved and output

    tf.random.set_seed(0)
    tf.keras.backend.clear_session()
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": mxp})
    env_util.setup_devices([device], cpu=cpu)

    cfg = Config(config)
    featurizers = featurizer_helpers.prepare_featurizers(
        config=cfg,
        subwords=subwords,
        sentence_piece=sentence_piece,
    )
    speech_featurizer, text_featurizer = featurizers

    model = DeepSpeech2(
        **cfg.model_config,
        vocabulary_size=text_featurizer.num_classes,
    )
    model.make(speech_featurizer.shape)
    model.load_weights(saved, by_name=True)
    model.summary(line_length=100)
    model.add_featurizers(speech_featurizer, text_featurizer)

    test_dataset = dataset_helpers.prepare_testing_datasets(
        config=cfg,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
    )

    # An explicit (truthy) batch size wins over the configured one.
    if bs:
        batch_size = bs
    else:
        batch_size = cfg.learning_config.running_config.batch_size
    loader = test_dataset.create(batch_size)

    exec_helpers.run_testing(
        model=model,
        test_dataset=test_dataset,
        test_data_loader=loader,
        output=output,
    )
type=str, default=None, help="Path to file that stores generated subwords") parser.add_argument("transcripts", nargs="+", type=str, default=None, help="Paths to transcript files") args = parser.parse_args() transcripts = preprocess_paths(args.transcripts) tfrecords_dir = preprocess_paths(args.tfrecords_dir) config = Config(args.config) if args.sentence_piece: print("Loading SentencePiece model ...") text_featurizer = SentencePieceFeaturizer.load_from_file( config.decoder_config, args.subwords) elif args.subwords and os.path.exists(args.subwords): print("Loading subwords ...") text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) ASRTFRecordDataset(data_paths=transcripts, tfrecords_dir=tfrecords_dir, speech_featurizer=None, text_featurizer=text_featurizer, stage=args.mode,
args = parser.parse_args() tf.config.optimizer.set_experimental_options( {"auto_mixed_precision": args.mxp}) setup_devices([args.device], cpu=args.cpu) from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer from tensorflow_asr.runners.base_runners import BaseTester from tensorflow_asr.models.conformer import Conformer config = Config(args.config, learning=True) speech_featurizer = TFSpeechFeaturizer(config.speech_config) if args.subwords and os.path.exists(args.subwords): print("Loading subwords ...") text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) else: raise ValueError("subwords must be set") tf.random.set_seed(0) assert args.saved if args.tfrecords: test_dataset = ASRTFRecordDataset( data_paths=config.learning_config.dataset_config.test_paths,
def test_contextnet():
    """Convert ContextNet to TFLite with and without timestamps, then invoke the first result.

    The flatbuffer produced with ``timestamp=False`` is loaded into a TFLite
    interpreter and run once on a random 4000-sample signal.
    """
    config = Config(DEFAULT_YAML)
    text_featurizer = CharFeaturizer(config.decoder_config)
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = ContextNet(vocabulary_size=text_featurizer.num_classes, **config.model_config)
    model.make(speech_featurizer.shape)
    model.summary(line_length=150)
    model.add_featurizers(speech_featurizer=speech_featurizer, text_featurizer=text_featurizer)

    def convert(timestamp):
        # One conversion pass; returns whatever the converter produces.
        tflite_fn = model.make_tflite_function(timestamp=timestamp)
        converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [tflite_fn.get_concrete_function()]
        )
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.experimental_new_converter = True
        converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS,
            tf.lite.OpsSet.SELECT_TF_OPS,
        ]
        return converter.convert()

    tflite = convert(timestamp=False)
    logger.info("Converted successfully with no timestamp")

    # The timestamp variant only has to convert; its output is not used below.
    convert(timestamp=True)
    logger.info("Converted successfully with timestamp")

    tflitemodel = tf.lite.Interpreter(model_content=tflite)
    signal = tf.random.normal([4000])

    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
    signal_index = input_details[0]["index"]

    # Resize the signal input to the sample length before allocating tensors.
    tflitemodel.resize_tensor_input(signal_index, [4000])
    tflitemodel.allocate_tensors()

    tflitemodel.set_tensor(signal_index, signal)
    tflitemodel.set_tensor(
        input_details[1]["index"],
        tf.constant(text_featurizer.blank, dtype=tf.int32),
    )
    state_shape = [
        config.model_config["prediction_num_rnns"],
        2,
        1,
        config.model_config["prediction_rnn_units"],
    ]
    tflitemodel.set_tensor(
        input_details[2]["index"],
        tf.zeros(state_shape, dtype=tf.float32),
    )

    tflitemodel.invoke()
    hyp = tflitemodel.get_tensor(output_details[0]["index"])
    logger.info(hyp)
# Copyright 2020 TalentedSoft ( Author: Shipeng XIA )
#
# Applies the configured "before" augmentation to every audio file listed in
# the first column of a transcript TSV and writes each result, under the same
# file name, into the output directory.
import os

import soundfile

# Kept ahead of the project imports, as in the original ordering.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

from tensorflow_asr.configs.config import Config
from scripts.visual import load_signal

sample_rate = 16000
config_dir = "scripts/augment/config_augment.yml"
file_path = "/work/kaldi/egs/XSP/TensorFlowASR/data/Aishell_1/test_transcripts.tsv"
output_path = "./testaugmenta/"

if not os.path.exists(output_path):
    os.makedirs(output_path)

config = Config(config_dir, learning=True)
aug = config.learning_config.augmentations

# First tab-separated field of every row is the audio path.
with open(file_path, "r", encoding="utf-8") as tsv:
    wav = [row.partition("\t")[0] for row in tsv]

for wav_path in wav:
    # 'PATH' is the header cell, not an audio file.
    if wav_path != 'PATH':
        name = wav_path.rsplit('/', 1)[-1]
        signal, sample_rate = load_signal(wav_path, sample_rate)
        signal = aug.before.augment(signal)
        soundfile.write(output_path + name, signal, 16000)
def main():
    """Parse command-line options and train a Conformer with two dataset pairs.

    Builds a primary train/eval dataset pair (``Dataset``) and a
    "regularising" pair (``DatasetInf``), creates the Conformer and an Adam
    optimizer driven by ``TransformerSchedule`` inside the trainer's strategy
    scope, then compiles and fits a ``MultiReaderTransducerTrainer``.
    """
    parser = argparse.ArgumentParser(prog="Conformer Training")
    parser.add_argument("--config",
                        type=str,
                        default=DEFAULT_YAML,
                        help="The file path of model configuration file")
    parser.add_argument("--max_ckpts",
                        type=int,
                        default=10,
                        help="Max number of checkpoints to keep")
    parser.add_argument("--tbs",
                        type=int,
                        default=None,
                        help="Train batch size per replica")
    parser.add_argument("--ebs",
                        type=int,
                        default=None,
                        help="Evaluation batch size per replica")
    parser.add_argument("--acs",
                        type=int,
                        default=None,
                        help="Train accumulation steps")
    parser.add_argument("--devices",
                        type=int,
                        nargs="*",
                        default=[0],
                        help="Devices' ids to apply distributed training")
    parser.add_argument("--mxp",
                        default=False,
                        action="store_true",
                        help="Enable mixed precision")
    parser.add_argument("--subwords",
                        type=str,
                        default=None,
                        help="Path to file that stores generated subwords")
    parser.add_argument("--subwords_corpus",
                        nargs="*",
                        type=str,
                        default=[],
                        help="Transcript files for generating subwords")
    # Transcript lists for the primary ("dir") and regularising ("reg-dir") datasets.
    parser.add_argument(
        "--train-dir",
        '-td',
        nargs='*',
        default=["en_ng_male_train.tsv", "en_ng_female_train.tsv"])
    parser.add_argument("--train-reg-dir",
                        '-trd',
                        nargs='*',
                        default=[
                            "libritts_train-clean-100.tsv",
                            "libritts_train-clean-360.tsv",
                            "libritts_train-other-500.tsv"
                        ])
    parser.add_argument(
        "--dev-dir",
        '-dd',
        nargs='*',
        default=["en_ng_male_eval.tsv", "en_ng_female_eval.tsv"])
    parser.add_argument("--dev-reg-dir",
                        '-drd',
                        nargs='*',
                        default=["libritts_test-other.tsv"])
    args = parser.parse_args()

    tf.config.optimizer.set_experimental_options(
        {"auto_mixed_precision": args.mxp})
    strategy = setup_strategy(args.devices)

    config = Config(args.config, learning=True)
    # The dataset paths from the command line are attached to the config object.
    config.train_dir = args.train_dir
    config.dev_dir = args.dev_dir
    config.train_reg_dir = args.train_reg_dir
    config.dev_reg_dir = args.dev_reg_dir

    # Here config.speech_config is opened as a file and parsed as YAML.
    # NOTE(review): yaml.Loader is PyYAML's full loader and can construct
    # arbitrary Python objects; only point this at trusted config files.
    with open(config.speech_config) as f:
        speech_config = yaml.load(f, Loader=yaml.Loader)
    speech_featurizer = TFSpeechFeaturizer(speech_config)

    if args.subwords and os.path.exists(args.subwords):
        print("Loading subwords ...")
        text_featurizer = SubwordFeaturizer.load_from_file(
            config.decoder_config, args.subwords)
    else:
        print("Generating subwords ...")
        text_featurizer = SubwordFeaturizer.build_from_corpus(
            config.decoder_config, corpus_files=args.subwords_corpus)
        # NOTE(review): args.subwords is None when --subwords was omitted;
        # confirm save_to_file accepts None.
        text_featurizer.save_to_file(args.subwords)

    train_dataset = Dataset(data_paths=config.train_dir,
                            speech_featurizer=speech_featurizer,
                            text_featurizer=text_featurizer,
                            augmentations=config.learning_config.augmentations,
                            stage="train",
                            cache=False,
                            shuffle=False)
    train_reg_dataset = DatasetInf(
        data_paths=config.train_reg_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train",
        cache=False,
        shuffle=False)
    eval_dataset = Dataset(data_paths=config.dev_dir,
                           speech_featurizer=speech_featurizer,
                           text_featurizer=text_featurizer,
                           stage="eval",
                           cache=False,
                           shuffle=False)
    # NOTE(review): unlike eval_dataset above, this eval-stage dataset is given
    # augmentations; confirm that is intended.
    eval_reg_dataset = DatasetInf(
        data_paths=config.dev_reg_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="eval",
        cache=False,
        shuffle=False)

    conformer_trainer = MultiReaderTransducerTrainer(
        config=config.learning_config.running_config,
        text_featurizer=text_featurizer,
        strategy=strategy)

    # The model and optimizer are created under the trainer's strategy scope.
    with conformer_trainer.strategy.scope():
        # build model
        conformer = Conformer(**config.model_config,
                              vocabulary_size=text_featurizer.num_classes)
        conformer._build(speech_featurizer.shape)
        conformer.summary(line_length=120)

        # The schedule's max_lr is 0.05 / sqrt(dmodel).
        optimizer = tf.keras.optimizers.Adam(
            TransformerSchedule(d_model=conformer.dmodel,
                                warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
                                max_lr=(0.05 / math.sqrt(conformer.dmodel))),
            beta_1=config.learning_config.optimizer_config["beta1"],
            beta_2=config.learning_config.optimizer_config["beta2"],
            epsilon=config.learning_config.optimizer_config["epsilon"])

    conformer_trainer.compile(model=conformer,
                              optimizer=optimizer,
                              max_to_keep=args.max_ckpts)

    conformer_trainer.fit(
        train_dataset,
        train_reg_dataset,
        # alpha for regularising dataset; alpha = 1 for training dataset
        1.,
        eval_dataset,
        eval_reg_dataset,
        train_bs=args.tbs,
        eval_bs=args.ebs,
        train_acs=args.acs)
def main(
    config: str = DEFAULT_YAML,
    tfrecords: bool = False,
    sentence_piece: bool = False,
    subwords: bool = False,
    bs: int = None,
    spx: int = 1,
    metadata: str = None,
    static_length: bool = False,
    devices: list = None,
    mxp: bool = False,
    pretrained: str = None,
):
    """Train DeepSpeech2 with ``fit`` under a distribution strategy.

    Args:
        config: Value passed to ``Config`` to load the configuration.
        tfrecords: Forwarded to ``dataset_helpers.prepare_training_datasets``.
        sentence_piece: Forwarded to ``featurizer_helpers.prepare_featurizers``.
        subwords: Forwarded to ``featurizer_helpers.prepare_featurizers``.
        bs: Batch size forwarded to ``prepare_training_data_loaders``.
        spx: Passed to ``compile`` as ``experimental_steps_per_execution``.
        metadata: Forwarded to ``dataset_helpers.prepare_training_datasets``.
        static_length: When falsy, both featurizers get ``reset_length()``
            called after the datasets are prepared.
        devices: Device ids passed to ``env_util.setup_strategy``. ``None``
            (the default) means ``[0]``.
        mxp: Value for the ``auto_mixed_precision`` optimizer option.
        pretrained: Optional weights file loaded by name (mismatches skipped)
            before training.
    """
    # A fresh list per call: the previous ``devices=[0]`` default was a single
    # list object shared by every call, so any in-place change leaked onward.
    if devices is None:
        devices = [0]

    tf.keras.backend.clear_session()
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": mxp})
    strategy = env_util.setup_strategy(devices)

    config = Config(config)

    speech_featurizer, text_featurizer = featurizer_helpers.prepare_featurizers(
        config=config,
        subwords=subwords,
        sentence_piece=sentence_piece,
    )

    train_dataset, eval_dataset = dataset_helpers.prepare_training_datasets(
        config=config,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        tfrecords=tfrecords,
        metadata=metadata,
    )

    if not static_length:
        speech_featurizer.reset_length()
        text_featurizer.reset_length()

    train_data_loader, eval_data_loader, global_batch_size = dataset_helpers.prepare_training_data_loaders(
        config=config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        strategy=strategy,
        batch_size=bs,
    )

    # Model creation, weight loading and compile all happen under the strategy scope.
    with strategy.scope():
        deepspeech2 = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes)
        deepspeech2.make(speech_featurizer.shape, batch_size=global_batch_size)
        if pretrained:
            deepspeech2.load_weights(pretrained, by_name=True, skip_mismatch=True)
        deepspeech2.summary(line_length=100)
        deepspeech2.compile(
            optimizer=config.learning_config.optimizer_config,
            experimental_steps_per_execution=spx,
            global_batch_size=global_batch_size,
            blank=text_featurizer.blank,
        )

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            **config.learning_config.running_config.checkpoint),
        tf.keras.callbacks.experimental.BackupAndRestore(
            config.learning_config.running_config.states_dir),
        tf.keras.callbacks.TensorBoard(
            **config.learning_config.running_config.tensorboard),
    ]

    deepspeech2.fit(
        train_data_loader,
        epochs=config.learning_config.running_config.num_epochs,
        validation_data=eval_data_loader,
        callbacks=callbacks,
        steps_per_epoch=train_dataset.total_steps,
        # Validation steps are only supplied when there is an eval loader.
        validation_steps=eval_dataset.total_steps if eval_data_loader else None,
    )
# Remaining command-line options; `parser` is created earlier in the script.
parser.add_argument("--cpu",
                    '-cpu',
                    default=False,
                    action="store_true",
                    help="Whether to only use cpu")
parser.add_argument("--subwords",
                    '-sub',
                    type=str,
                    default=None,
                    help="Path to file that stores generated subwords")
args = parser.parse_args()

# Device setup runs before the tensorflow_asr imports below.
setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer
from tensorflow_asr.models.conformer import Conformer

config = Config(args.config, learning=False)

# Here config.speech_config is opened as a file and parsed as YAML.
# NOTE(review): yaml.Loader is PyYAML's full loader and can construct
# arbitrary Python objects; only point this at trusted config files.
with open(config.speech_config) as f:
    speech_config = yaml.load(f, Loader=yaml.Loader)
speech_featurizer = TFSpeechFeaturizer(speech_config)

# Use the subword featurizer when a subwords file exists, else fall back to characters.
if args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config,
                                                       args.subwords)
else:
    text_featurizer = CharFeaturizer(config.decoder_config)
# Override the decoder's beam width with the command-line value.
text_featurizer.decoder_config.beam_width = args.beam_width

# build model
conformer = Conformer(**config.model_config,
                      vocabulary_size=text_featurizer.num_classes)
conformer._build(speech_featurizer.shape)
conformer.load_weights(args.saved, by_name=True)
def process(text): encoded_output = subword.extract(text.decode('utf-8')) encoded_input = subword.prepand_blank(encoded_output) encoded_output = tf.concat([encoded_output, [subword.blank]], axis=0) assert encoded_input.shape == encoded_output.shape return encoded_input, encoded_output @tf.function def parse(record): return tf.numpy_function(process, inp=[record], Tout=[tf.int32, tf.int32]) config = Config('config.yml', learning=True) subword = SubwordFeaturizer.load_from_file( config.decoder_config, '/home/joaoalvarenga/datasets/conformer_subwords.subwords') checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( 'checkpoint/lm.ckpt', save_weights_only=True, verbose=1) print(subword.num_classes) batch_size = 32 dataset = tf.data.TextLineDataset( '/media/work/joaoalvarenga/ptwiki-20181125.txt') dataset = dataset.map(parse) dataset = dataset.cache() # dataset = dataset.batch(batch_size, drop_remainder=True) dataset = dataset.padded_batch(batch_size=batch_size, padded_shapes=(tf.TensorShape([None]),