Exemple #1
0
class ConformerTamilASR(object):
    """
    Conformer S based ASR model.

    Builds the Conformer from the bundled YAML config, loads its weights
    (downloading them first when they are not on disk) and exposes helpers
    to read audio and transcribe it.
    """

    def __init__(self, path='ConformerS.h5'):
        """Build the model and load its weights.

        Args:
            path: Location of the weights file. When nothing exists at this
                path, the weights are downloaded to it from Google Drive
                using the ``file_id`` read from the config.
        """
        # fetch and load the config of the model
        config = Config('tamil_tech/configs/conformer_new_config.yml', learning=True)

        # load speech and text featurizers
        speech_featurizer = TFSpeechFeaturizer(config.speech_config)
        text_featurizer = CharFeaturizer(config.decoder_config)

        # download the weights only when they are not already at the given path
        if not os.path.exists(path):
            print("Downloading Model...")
            download_file_from_google_drive(config.file_id, path)
            print("Downloaded Model Successfully...")

        # load model using config
        self.model = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
        # set shape of the featurizer and build the model
        self.model._build(speech_featurizer.shape)
        # load weights of the model
        self.model.load_weights(path, by_name=True)
        # display model summary
        self.model.summary(line_length=120)
        # set featurizers for the model
        self.model.add_featurizers(speech_featurizer, text_featurizer)

        print("Loaded Model...!")

    def read_raw_audio(self, audio, sample_rate=16000):
        """Return the waveform for ``audio``.

        Args:
            audio: A file path (``str``), the contents of an audio file
                (``bytes``), or an already decoded ``np.ndarray``.
            sample_rate: Sample rate requested when loading from a path and
                targeted when resampling decoded bytes. An ``np.ndarray``
                input is returned as-is and is NOT resampled.

        Returns:
            The waveform as a numpy array.

        Raises:
            ValueError: If ``audio`` is not a ``str``, ``bytes`` or
                ``np.ndarray``.
        """
        # if numpy array, return audio untouched
        if isinstance(audio, np.ndarray):
            return audio

        # if audio path is given, load audio using librosa
        if isinstance(audio, str):
            wave, _ = librosa.load(os.path.expanduser(audio), sr=sample_rate)
            return wave

        # if audio file is in bytes, use soundfile to read audio
        if isinstance(audio, bytes):
            wave, sr = sf.read(io.BytesIO(audio))

            # if audio is multi-channel (2-D with 2+ columns), keep only the
            # first channel; a 1-D array is already mono
            if wave.ndim > 1 and wave.shape[1] >= 2:
                wave = wave[:, 0]

            # get loaded audio as a contiguous numpy array
            wave = np.asfortranarray(wave)

            # resample to the requested rate; the rates are passed by keyword
            # because newer librosa releases no longer accept them positionally
            if sr != sample_rate:
                wave = librosa.resample(wave, orig_sr=sr, target_sr=sample_rate)
            return wave

        raise ValueError("input audio must be either a path or bytes")

    def bytes_to_string(self, array: np.ndarray, encoding: str = "utf-8"):
        """Decode every bytes transcript in ``array`` using ``encoding``.

        Returns:
            A list of decoded strings, one per element of ``array``.
        """
        return [transcript.decode(encoding) for transcript in array]

    def infer(self, path, greedy=True, return_text=False):
        """Transcribe a single audio input.

        Args:
            path: Anything accepted by ``read_raw_audio``.
            greedy: When true use ``model.recognize``; otherwise use
                ``model.recognize_beam`` with ``lm=True``.
            return_text: When true return the transcription; otherwise print
                it (followed by a space instead of a newline).

        Returns:
            The transcription string when ``return_text`` is true, else
            ``None``.
        """
        # read the audio
        signal = self.read_raw_audio(path)
        # extract features and add a leading axis for a single prediction
        features = tf.expand_dims(self.model.speech_featurizer.tf_extract(signal), axis=0)

        if greedy:
            # predict greedy
            pred = self.model.recognize(features=features)
        else:
            # predict using beam search and language model
            pred = self.model.recognize_beam(features=features, lm=True)

        # decode once; the first entry is the transcription of the single input
        text = self.bytes_to_string(pred.numpy())[0]

        if return_text:
            return text

        print(text, end=' ')
        return None
Exemple #2
0
import numpy as np
# Pin the NumPy and TensorFlow random seeds.
np.random.seed(0)
tf.random.set_seed(0)

# Obtain the input features: a ".wav" file is read as raw audio and run
# through the speech featurizer; any other file is loaded with np.load and
# reshaped to [-1, 80, 1].
if not args.filename.endswith('.wav'):
  features = np.load(args.filename).reshape([-1, 80, 1])
else:
  signal = read_raw_audio(args.filename)
  features = speech_featurizer.extract(signal)
# Either way the features end up as a TensorFlow constant.
features = tf.constant(features)

# Length of the first axis after applying the model's time reduction factor.
input_length = get_reduced_length(tf.shape(features)[0], conformer.time_reduction_factor)

if args.beam_width:
  transcript = conformer.recognize_beam(features[None, ...], input_length[None, ...])
  print("Transcript:", transcript[0].numpy().decode("UTF-8"))
elif args.timestamp:
  transcript, stime, etime, _, _ = conformer.recognize_tflite_with_timestamp(
    signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
  print("Transcript:", transcript)
  print("Start time:", stime)
  print("End time:", etime)
else:
  if args.filename.endswith('.wav'):
    transcript, _, _ = conformer.recognize_tflite(
      signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
    print("Transcript:", tf.strings.unicode_encode(transcript, "UTF-8").numpy().decode("UTF-8"))
  else:
    encoded = conformer.encoder_inference(features)
    hypothesis = conformer._perform_greedy(
from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer
from tensorflow_asr.models.conformer import Conformer

# Load the run configuration and create the speech featurizer from it.
config = Config(args.config, learning=False)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

# Choose the text featurizer: subword-based when a subwords file was given
# and exists on disk, character-based otherwise.
if not (args.subwords and os.path.exists(args.subwords)):
    text_featurizer = CharFeaturizer(config.decoder_config)
else:
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
# Propagate the command-line beam width into the decoder config.
text_featurizer.decoder_config.beam_width = args.beam_width

# Instantiate the Conformer, build it for the featurizer's shape, restore
# the saved weights, print a summary, then attach both featurizers.
conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
conformer._build(speech_featurizer.shape)
conformer.load_weights(args.saved, by_name=True)
conformer.summary(line_length=120)
conformer.add_featurizers(speech_featurizer, text_featurizer)

# Read the audio file named on the command line.
signal = read_raw_audio(args.filename)

# Recognize a batch of one: beam search when a beam width was given,
# the plain recognizer otherwise.
transcript = (
    conformer.recognize_beam(signal[None, ...])
    if args.beam_width
    else conformer.recognize(signal[None, ...])
)

# Print the transcript of the single batch entry.
tf.print("Transcript:", transcript[0])