def check_training(self, config, input_values, *args): model = TFHubertForCTC(config) # freeze feature encoder model.freeze_feature_encoder() input_values = input_values[:3] input_lengths = tf.constant( [input_values.shape[-1] // i for i in [4, 2, 1]]) max_length_labels = model.hubert._get_feat_extract_output_lengths( input_lengths) labels = ids_tensor( (input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32) input_values = input_values * length_mask pad_size = max(max_length_labels) - labels.shape[1] labels = tf.pad(labels, ((0, 0), (0, pad_size)), constant_values=-100) loss = model(input_values, labels=labels, training=True).loss self.parent.assertFalse(tf.math.is_inf(loss))
def test_inference_ctc_robust_batched(self): model = TFHubertForCTC.from_pretrained( "facebook/hubert-large-ls960-ft") processor = Wav2Vec2Processor.from_pretrained( "facebook/hubert-large-ls960-ft", do_lower_case=True) input_speech = self._load_datasamples(4) inputs = processor(input_speech, return_tensors="tf", padding=True, sampling_rate=16000) input_values = inputs.input_values attention_mask = inputs.attention_mask logits = model(input_values, attention_mask=attention_mask).logits predicted_ids = tf.argmax(logits, axis=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ "a man said to the universe sir i exist", "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around" " him with the thousands of spectators were trivialities not worth thinking about", "his instant of panic was followed by a small sharp blow high on his chest", ] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
def check_ctc_loss(self, config, input_values, *args): model = TFHubertForCTC(config) input_values = input_values[:3] attention_mask = tf.ones_like(input_values) input_lengths = tf.constant( [input_values.shape[-1] // i for i in [4, 2, 1]]) max_length_labels = model.hubert._get_feat_extract_output_lengths( input_lengths) labels = ids_tensor( (input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32) # convert values that are over input_lengths to padding input_values = input_values * length_mask attention_mask = attention_mask * length_mask model.config.ctc_loss_reduction = "sum" sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss model.config.ctc_loss_reduction = "mean" mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss self.parent.assertTrue( abs(labels.shape[0] * mean_loss - sum_loss) < 1e-2)
def check_labels_out_of_vocab(self, config, input_values, *args): model = TFHubertForCTC(config) input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]]) max_length_labels = model.hubert._get_feat_extract_output_lengths(input_lengths) labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size + 100) with pytest.raises(ValueError): model(input_values, labels=labels)
def test_inference_ctc_normal(self): model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft") processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True) input_speech = self._load_datasamples(1) input_values = processor(input_speech, return_tensors="tf", sampling_rate=16000).input_values logits = model(input_values).logits predicted_ids = tf.argmax(logits, axis=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
def test_inference_ctc_normal_batched(self): model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft") processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True) input_speech = self._load_datasamples(2) input_values = processor(input_speech, return_tensors="tf", padding=True, sampling_rate=16000).input_values logits = model(input_values).logits predicted_ids = tf.argmax(logits, axis=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ "a man said to the universe sir i exist", "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", ] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)