def test_simple_s2t(self):
    """Exercise the S2T en->it ASR pipeline on three input kinds: a raw
    numpy waveform, a filename, and raw audio bytes."""
    checkpoint = "facebook/s2t-small-mustc-en-it-st"
    asr = AutomaticSpeechRecognitionPipeline(
        model=Speech2TextForConditionalGeneration.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        feature_extractor=AutoFeatureExtractor.from_pretrained(checkpoint),
    )

    # Raw numpy waveform input.
    waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
    self.assertEqual(asr(waveform), {"text": "(Applausi)"})

    ds = load_dataset(
        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
    ).sort("id")
    audio_path = ds[40]["file"]

    # Filename input.
    self.assertEqual(
        asr(audio_path),
        {"text": "Un uomo disse all'universo: \"Signore, io esisto."},
    )

    # Raw bytes input — same sample, read from disk.
    with open(audio_path, "rb") as f:
        raw_bytes = f.read()
    self.assertEqual(
        asr(raw_bytes),
        {"text": "Un uomo disse all'universo: \"Signore, io esisto."},
    )
def test_simple_wav2vec2(self):
    """Exercise the wav2vec2 CTC ASR pipeline on three input kinds: a raw
    numpy waveform, a filename, and raw audio bytes."""
    checkpoint = "facebook/wav2vec2-base-960h"
    asr = AutomaticSpeechRecognitionPipeline(
        model=Wav2Vec2ForCTC.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        feature_extractor=AutoFeatureExtractor.from_pretrained(checkpoint),
    )

    # Raw numpy waveform input — a synthetic ramp decodes to an empty transcript.
    waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
    self.assertEqual(asr(waveform), {"text": ""})

    ds = load_dataset(
        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
    ).sort("id")
    audio_path = ds[40]["file"]

    # Filename input.
    self.assertEqual(
        asr(audio_path),
        {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"},
    )

    # Raw bytes input — same sample, read from disk.
    with open(audio_path, "rb") as f:
        raw_bytes = f.read()
    self.assertEqual(
        asr(raw_bytes),
        {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"},
    )
def get_test_pipeline(self, model, tokenizer, feature_extractor):
    """Build an ASR pipeline plus two dummy waveform inputs for the shared
    pipeline test harness; skip when no tokenizer is available."""
    if tokenizer is None:
        # These models ship no fast tokenizer class, so this combination is
        # skipped; the slow-tokenizer runs still cover them and stay small.
        self.skipTest("No tokenizer available")
        return

    speech_recognizer = AutomaticSpeechRecognitionPipeline(
        model=model,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
    )

    # Two silent waveforms of different lengths serve as raw-array test inputs.
    return speech_recognizer, [np.zeros((34000,)), np.zeros((14000,))]
def test_simple_s2t(self):
    """Exercise the S2T en->it ASR pipeline on three input kinds: a silent
    numpy waveform, a filename, and raw audio bytes."""
    import numpy as np
    from datasets import load_dataset

    checkpoint = "facebook/s2t-small-mustc-en-it-st"
    asr = AutomaticSpeechRecognitionPipeline(
        model=Speech2TextForConditionalGeneration.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        feature_extractor=AutoFeatureExtractor.from_pretrained(checkpoint),
    )

    # Silent waveform input.
    # NOTE(review): "รจ" below looks like mojibake for "è"; kept verbatim
    # since the assertion must match the pipeline's current output — verify.
    self.assertEqual(
        asr(np.zeros((34000,))),
        {"text": "E questo รจ il motivo per cui non ci siamo mai incontrati."},
    )

    ds = load_dataset(
        "patrickvonplaten/librispeech_asr_dummy", "clean", split="validation"
    )
    audio_path = ds[0]["file"]

    # Filename input.
    self.assertEqual(
        asr(audio_path),
        {"text": "Un uomo disse all'universo: \"Signore, io esisto."},
    )

    # Raw bytes input — same sample, read from disk.
    with open(audio_path, "rb") as f:
        raw_bytes = f.read()
    self.assertEqual(
        asr(raw_bytes),
        {"text": "Un uomo disse all'universo: \"Signore, io esisto."},
    )