def test_inference_emotion_recognition(self):
    model = HubertForSequenceClassification.from_pretrained(
        "superb/hubert-base-superb-er", torch_dtype=torch.float16
    ).to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-er")
    input_data = self._load_superb("er", 4)
    inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

    input_values = inputs.input_values.half().to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)
    with torch.no_grad():
        outputs = model(input_values, attention_mask=attention_mask)
    predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

    expected_labels = [1, 1, 2, 2]
    # s3prl logits for the same batch
    expected_logits = torch.tensor([2.8384, 2.3389, 3.8564, 4.5558], dtype=torch.float16, device=torch_device)

    self.assertListEqual(predicted_ids.tolist(), expected_labels)
    # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
    self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-1))
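# Standalone usage sketch (comments only, since this sits inside the test class):
# the same emotion-recognition checkpoint applied outside the test harness.
# `audio_array` is a placeholder for a 16 kHz mono waveform; the rest uses the
# public transformers API exactly as the test above does.
#
#     model = HubertForSequenceClassification.from_pretrained("superb/hubert-base-superb-er")
#     processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-er")
#     inputs = processor(audio_array, sampling_rate=16_000, return_tensors="pt", padding=True)
#     with torch.no_grad():
#         logits = model(**inputs).logits
#     predicted_label = model.config.id2label[logits.argmax(dim=-1).item()]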
def test_inference_speaker_identification(self):
    model = HubertForSequenceClassification.from_pretrained(
        "superb/hubert-base-superb-sid", torch_dtype=torch.float16
    ).to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-sid")
    input_data = self._load_superb("si", 4)

    output_logits = []
    with torch.no_grad():
        # The si examples have different lengths, so run them one at a time
        # instead of padding them into a single batch.
        for example in input_data["speech"]:
            inputs = processor(example, return_tensors="pt", padding=True)
            output = model(inputs.input_values.half().to(torch_device), attention_mask=None)
            output_logits.append(output.logits[0])
    output_logits = torch.stack(output_logits)
    predicted_logits, predicted_ids = torch.max(output_logits, dim=-1)

    expected_labels = [5, 1, 1, 3]
    # s3prl logits for the same batch
    expected_logits = torch.tensor(
        [78231.5547, 123166.6094, 122785.4141, 84851.2969], dtype=torch.float16, device=torch_device
    )

    self.assertListEqual(predicted_ids.tolist(), expected_labels)
    # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
    self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=10))
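# The tests in this section rely on a `_load_superb` helper that is not shown
# here. A minimal sketch of what it needs to provide, assuming a trimmed SUPERB
# test set published on the Hugging Face Hub (the dataset name
# "anton-l/superb_dummy" is an assumption, not confirmed by this section):
def _load_superb(self, task, num_samples):
    from datasets import load_dataset

    # Each task config exposes a "speech" column of raw 16 kHz waveforms;
    # slicing a Dataset returns a dict of columns for the first `num_samples` rows.
    ds = load_dataset("anton-l/superb_dummy", task, split="test")
    return ds[:num_samples]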
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    if checkpoint["Config"]["downstream_expert"]["modelrc"]["select"] not in SUPPORTED_MODELS:
        raise NotImplementedError(f"The supported s3prl models are {SUPPORTED_MODELS}")
    downstream_dict = checkpoint["Downstream"]

    hf_config = HubertConfig.from_pretrained(config_path)
    hf_model = HubertForSequenceClassification.from_pretrained(base_model_name, config=hf_config)
    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        base_model_name, return_attention_mask=True, do_normalize=False
    )

    if hf_config.use_weighted_layer_sum:
        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]
    hf_model.projector.weight.data = downstream_dict["projector.weight"]
    hf_model.projector.bias.data = downstream_dict["projector.bias"]
    hf_model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"]
    hf_model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"]

    hf_feature_extractor.save_pretrained(model_dump_path)
    hf_model.save_pretrained(model_dump_path)
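# The converter above expects a module-level SUPPORTED_MODELS list; the single
# entry below is an assumption about the s3prl downstream head it can map, not
# something confirmed by this section.
SUPPORTED_MODELS = ["UtteranceLevel"]

# A minimal command-line entry point for convert_s3prl_checkpoint. The flag
# names are an illustrative sketch, not a documented interface.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_name", type=str, help="Name of the pretrained base model on the hub.")
    parser.add_argument("--config_path", type=str, help="Path to the classifier HubertConfig.")
    parser.add_argument("--checkpoint_path", type=str, help="Path to the s3prl checkpoint to convert.")
    parser.add_argument("--model_dump_path", type=str, help="Directory to save the converted model and feature extractor.")
    args = parser.parse_args()
    convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path)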
def test_inference_keyword_spotting(self):
    model = HubertForSequenceClassification.from_pretrained(
        "superb/hubert-base-superb-ks", torch_dtype=torch.float16
    ).to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ks")
    input_data = self._load_superb("ks", 4)
    inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

    input_values = inputs.input_values.half().to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)
    with torch.no_grad():
        outputs = model(input_values, attention_mask=attention_mask)
    predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

    expected_labels = [2, 6, 10, 9]
    # s3prl logits for the same batch
    expected_logits = torch.tensor([7.6692, 17.7795, 11.1562, 11.8232], dtype=torch.float16, device=torch_device)

    self.assertListEqual(predicted_ids.tolist(), expected_labels)
    self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=2e-2))
def test_inference_intent_classification(self):
    model = HubertForSequenceClassification.from_pretrained("superb/hubert-base-superb-ic").to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ic")
    input_data = self._load_superb("ic", 4)
    inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

    input_values = inputs.input_values.to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)
    with torch.no_grad():
        outputs = model(input_values, attention_mask=attention_mask)

    # The 24 logits are split into three sub-tasks: action (6 classes),
    # object (14 classes) and location (4 classes).
    predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1)
    predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1)
    predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1)

    expected_labels_action = [1, 0, 4, 3]
    expected_logits_action = torch.tensor([5.9052, 12.5865, 4.4840, 10.0240], device=torch_device)
    expected_labels_object = [1, 10, 3, 4]
    expected_logits_object = torch.tensor([5.5316, 11.7946, 8.1672, 23.2415], device=torch_device)
    expected_labels_location = [0, 0, 0, 1]
    expected_logits_location = torch.tensor([5.2053, 8.9577, 10.0447, 8.1481], device=torch_device)

    self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action)
    self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object)
    self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location)

    # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
    self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=3e-1))
    self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=3e-1))
    self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=3e-1))