def check_ctc_training(self, config, input_values, *args): config.ctc_zero_infinity = True model = WavLMForCTC(config=config) model.to(torch_device) model.train() # freeze feature encoder model.freeze_feature_encoder() input_values = input_values[:3] input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) # pad input for i in range(len(input_lengths)): input_values[i, input_lengths[i] :] = 0.0 if max_length_labels[i] < labels.shape[-1]: # it's important that we make sure that target lenghts are at least # one shorter than logit lenghts to prevent -inf labels[i, max_length_labels[i] - 1 :] = -100 loss = model(input_values, labels=labels).loss self.parent.assertFalse(torch.isinf(loss).item()) loss.backward()
def check_ctc_loss(self, config, input_values, *args): model = WavLMForCTC(config=config) model.to(torch_device) # make sure that dropout is disabled model.eval() input_values = input_values[:3] attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) # pad input for i in range(len(input_lengths)): input_values[i, input_lengths[i] :] = 0.0 attention_mask[i, input_lengths[i] :] = 0 model.config.ctc_loss_reduction = "sum" sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() model.config.ctc_loss_reduction = "mean" mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() self.parent.assertTrue(isinstance(sum_loss, float)) self.parent.assertTrue(isinstance(mean_loss, float))
def check_labels_out_of_vocab(self, config, input_values, *args): model = WavLMForCTC(config) model.to(torch_device) model.train() input_values = input_values[:3] input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) with pytest.raises(ValueError): model(input_values, labels=labels)