Example #1
  # Load precomputed acoustic features from a .npy file and shape them as
  # [time, 80, 1] — assumes 80 feature bins (e.g. log-mel) per frame;
  # TODO confirm against the speech featurizer config.
  features = np.load(args.filename).reshape([-1, 80, 1])
  features = tf.constant(features)
# Encoder output length after time reduction (input frames / time_reduction_factor).
input_length = get_reduced_length(tf.shape(features)[0], conformer.time_reduction_factor)

# Branch 1: beam-search decoding when a beam width was requested.
if args.beam_width:
  # [None, ...] adds a leading batch dimension of 1.
  transcript = conformer.recognize_beam(features[None, ...], input_length[None, ...])
  print("Transcript:", transcript[0].numpy().decode("UTF-8"))
elif args.timestamp:
  # Branch 2: TFLite-style greedy decoding that also returns per-token
  # start/end times. NOTE(review): this branch reads `signal` (raw audio),
  # not the `features` computed above — `signal` must be defined earlier in
  # the full script; verify before relying on this path.
  transcript, stime, etime, _, _ = conformer.recognize_tflite_with_timestamp(
    signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
  print("Transcript:", transcript)
  print("Start time:", stime)
  print("End time:", etime)
else:
  # Branch 3: greedy decoding, dispatched on input type.
  if args.filename.endswith('.wav'):
    # Raw-audio input: run the TFLite-compatible greedy recognizer on `signal`.
    transcript, _, _ = conformer.recognize_tflite(
      signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
    # `transcript` holds unicode code points; encode them back into a string.
    print("Transcript:", tf.strings.unicode_encode(transcript, "UTF-8").numpy().decode("UTF-8"))
  else:
    # Precomputed-feature input: encode once, then greedy-decode the encoder output.
    encoded = conformer.encoder_inference(features)
    # NOTE(review): `_perform_greedy` is a private model method — this bypasses
    # the public recognize API; confirm it is stable across library versions.
    hypothesis = conformer._perform_greedy(
        encoded, 
        tf.shape(encoded)[0], 
        tf.constant(text_featurizer.blank, dtype=tf.int32), 
        conformer.predict_net.get_initial_state())
    # Map predicted token indices to unicode code points, then to a string.
    transcript = conformer.text_featurizer.indices2upoints(hypothesis.prediction)
    print("Transcript:", tf.strings.unicode_encode(transcript, "UTF-8").numpy().decode("UTF-8"))




                    help="Whether to only use cpu")

# Parse the CLI flags declared above (the argparse setup is outside this excerpt).
args = parser.parse_args()

# Restrict visible devices BEFORE importing the tensorflow_asr modules below —
# the statement order is deliberate so TensorFlow initializes against the
# selected device list; do not hoist these imports above this call.
setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.models.conformer import Conformer

# Inference-only configuration (learning=False skips training-specific setup).
config = Config(args.config, learning=False)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
text_featurizer = CharFeaturizer(config.decoder_config)

# build model
conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
# NOTE(review): `_build` is a private method — builds the model's weights for
# the featurizer's input shape so load_weights can match layers by name.
conformer._build(speech_featurizer.shape)
conformer.load_weights(args.saved, by_name=True)
conformer.summary(line_length=120)
conformer.add_featurizers(speech_featurizer, text_featurizer)

# Read raw audio samples from the input file.
signal = read_raw_audio(args.filename)
# Initial "previous token" for greedy decoding is the blank index;
# dtype must be int32 to match the model's expectations.
predicted = tf.constant(args.blank, dtype=tf.int32)
# Zero-initialized prediction-network RNN states, shaped
# [num_rnns, nstates, batch=1, statesize] — TODO confirm this layout against
# the signature of recognize_tflite / the prediction network.
states = tf.zeros([args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32)

# TFLite-compatible greedy recognition; the trailing returns (last predicted
# token and final states, per the 3-tuple unpack) are discarded here.
hyp, _, _ = conformer.recognize_tflite(signal, predicted, states)

# `hyp` holds unicode code points; convert each to a character and print.
print("".join([chr(u) for u in hyp]))