# NOTE(review): this chunk was flattened onto a single source line; the
# formatting below is reconstructed (tokens unchanged). It references `signal`,
# `conformer`, `text_featurizer`, `args` and `get_reduced_length`, none of which
# are defined in this chunk — presumably set up earlier in the full script;
# verify statement ordering against the original file.

# Load a pre-extracted feature matrix from disk and reshape to
# (frames, 80, 1) — assumes 80 features per frame with a single channel.
features = np.load(args.filename).reshape([-1, 80, 1])
features = tf.constant(features)
# Length of the encoder output after time reduction
# (frame count scaled by the model's time_reduction_factor).
input_length = get_reduced_length(tf.shape(features)[0], conformer.time_reduction_factor)
if args.beam_width:
    # Beam-search decoding; features/input_length are batched with [None, ...].
    transcript = conformer.recognize_beam(features[None, ...], input_length[None, ...])
    print("Transcript:", transcript[0].numpy().decode("UTF-8"))
elif args.timestamp:
    # Greedy tflite-style decode that also returns per-token start/end times.
    # NOTE(review): this branch decodes `signal` (raw audio), not the `features`
    # loaded above — confirm `signal` is populated when --timestamp is used.
    transcript, stime, etime, _, _ = conformer.recognize_tflite_with_timestamp(
        signal,
        tf.constant(text_featurizer.blank, dtype=tf.int32),
        conformer.predict_net.get_initial_state())
    print("Transcript:", transcript)
    print("Start time:", stime)
    print("End time:", etime)
else:
    if args.filename.endswith('.wav'):
        # Raw-audio path: greedy decode straight from the waveform, starting
        # from the blank token and the prediction network's initial state.
        transcript, _, _ = conformer.recognize_tflite(
            signal,
            tf.constant(text_featurizer.blank, dtype=tf.int32),
            conformer.predict_net.get_initial_state())
        # `transcript` here is a sequence of unicode code points; encode to a
        # UTF-8 string tensor before printing.
        print("Transcript:", tf.strings.unicode_encode(transcript, "UTF-8").numpy().decode("UTF-8"))
    else:
        # Pre-computed-features path: run the encoder, then greedy decode the
        # encoded sequence with the internal _perform_greedy helper.
        encoded = conformer.encoder_inference(features)
        hypothesis = conformer._perform_greedy(
            encoded,
            tf.shape(encoded)[0],
            tf.constant(text_featurizer.blank, dtype=tf.int32),
            conformer.predict_net.get_initial_state())
        # Map predicted token indices back to unicode code points.
        transcript = conformer.text_featurizer.indices2upoints(hypothesis.prediction)
        print("Transcript:", tf.strings.unicode_encode(transcript, "UTF-8").numpy().decode("UTF-8"))
# NOTE(review): this chunk begins mid-expression — the leading `help=...)` line
# is the tail of a parser.add_argument(...) call whose start lies outside this
# chunk; it is kept verbatim. Formatting below is reconstructed (tokens
# unchanged) from a whitespace-mangled single line.
help="Whether to only use cpu")
args = parser.parse_args()
# Configure visible devices (and optional CPU-only mode) before any model code
# touches the TensorFlow runtime.
setup_devices([args.device], cpu=args.cpu)

# Imports are deliberately placed after setup_devices so device configuration
# takes effect before tensorflow_asr modules are loaded.
from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.models.conformer import Conformer

# Load configuration in inference mode (learning=False) and build the
# speech/text featurizers from it.
config = Config(args.config, learning=False)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
text_featurizer = CharFeaturizer(config.decoder_config)

# build model: instantiate the Conformer, trace it once with the featurizer's
# input shape, then restore pretrained weights by layer name.
conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
conformer._build(speech_featurizer.shape)
conformer.load_weights(args.saved, by_name=True)
conformer.summary(line_length=120)
# Attach featurizers so the model can featurize/decode internally.
conformer.add_featurizers(speech_featurizer, text_featurizer)

# Read the raw waveform and run a greedy tflite-style decode starting from the
# blank token and zeroed prediction-network states.
signal = read_raw_audio(args.filename)
predicted = tf.constant(args.blank, dtype=tf.int32)
# NOTE(review): states shape is presumably
# (num_rnns, nstates, batch=1, statesize) for the prediction network's RNN
# stack — confirm against recognize_tflite's expected state layout.
states = tf.zeros([args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32)
hyp, _, _ = conformer.recognize_tflite(signal, predicted, states)
# `hyp` is a sequence of unicode code points; convert to characters to print.
print("".join([chr(u) for u in hyp]))