Code example #1
    def test_inference_emotion_recognition(self):
        model = HubertForSequenceClassification.from_pretrained(
            "superb/hubert-base-superb-er",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/hubert-base-superb-er")
        input_data = self._load_superb("er", 4)
        inputs = processor(input_data["speech"],
                           return_tensors="pt",
                           padding=True)

        input_values = inputs.input_values.half().to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
        predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

        expected_labels = [1, 1, 2, 2]
        # s3prl logits for the same batch
        expected_logits = torch.tensor([2.8384, 2.3389, 3.8564, 4.5558],
                                       dtype=torch.float16,
                                       device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=1e-1))
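
These methods are excerpted from a test class, so `self`, `torch_device`, and `_load_superb` come from the surrounding harness. A minimal sketch of what the `_load_superb` helper might look like inside that class, assuming a SUPERB dummy dataset hosted on the Hub (the repo name `anton-l/superb_dummy` is an assumption, not confirmed by this excerpt):

    from datasets import load_dataset

    def _load_superb(self, task, num_samples):
        # Each task key used in these tests ("er", "si", "ks", "ic") selects a
        # dataset config; only the first num_samples examples are needed.
        ds = load_dataset("anton-l/superb_dummy", task, split="test")
        return ds[:num_samples]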
Code example #2
    def test_inference_speaker_identification(self):
        model = HubertForSequenceClassification.from_pretrained(
            "superb/hubert-base-superb-sid",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/hubert-base-superb-sid")
        input_data = self._load_superb("si", 4)

        output_logits = []
        with torch.no_grad():
            for example in input_data["speech"]:
                inputs = processor(example, return_tensors="pt", padding=True)
                output = model(inputs.input_values.half().to(torch_device),
                               attention_mask=None)
                output_logits.append(output.logits[0])
        output_logits = torch.stack(output_logits)
        predicted_logits, predicted_ids = torch.max(output_logits, dim=-1)

        expected_labels = [5, 1, 1, 3]
        # s3prl logits for the same batch
        expected_logits = torch.tensor(
            [78231.5547, 123166.6094, 122785.4141, 84851.2969],
            dtype=torch.float16,
            device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=10))
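
Note that this test runs each example through the model individually with `attention_mask=None`: every forward pass sees a single unpadded sequence, so no mask is needed and the padding issue from the TODO above is sidestepped. The loose `atol=10` reflects the large raw magnitude of the SID logits (on the order of 1e5), which makes it roughly a 1e-4 relative tolerance.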
Code example #3
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path,
                             model_dump_path):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    # Load the s3prl checkpoint (training config plus downstream weights) on CPU.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    if checkpoint["Config"]["downstream_expert"]["modelrc"][
            "select"] not in SUPPORTED_MODELS:
        raise NotImplementedError(
            f"The supported s3prl models are {SUPPORTED_MODELS}")

    downstream_dict = checkpoint["Downstream"]

    hf_config = HubertConfig.from_pretrained(config_path)
    hf_model = HubertForSequenceClassification.from_pretrained(
        base_model_name, config=hf_config)
    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        base_model_name, return_attention_mask=True, do_normalize=False)

    # s3prl's featurizer learns a weighted sum over hidden layers; carry the
    # learned weights over when the HF config enables the same mechanism.
    if hf_config.use_weighted_layer_sum:
        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]

    # Copy the downstream head: projector plus final linear classifier.
    hf_model.projector.weight.data = downstream_dict["projector.weight"]
    hf_model.projector.bias.data = downstream_dict["projector.bias"]
    hf_model.classifier.weight.data = downstream_dict[
        "model.post_net.linear.weight"]
    hf_model.classifier.bias.data = downstream_dict[
        "model.post_net.linear.bias"]

    hf_feature_extractor.save_pretrained(model_dump_path)
    hf_model.save_pretrained(model_dump_path)
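
In the transformers conversion scripts, a function like this is normally driven from the command line. A sketch of such an entry point, with flags that simply mirror the function signature above (the exact CLI is an assumption, not shown in this excerpt):

import argparse

if __name__ == "__main__":
    # Hypothetical CLI wrapper; flags mirror convert_s3prl_checkpoint's parameters.
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_name", type=str, help="Pretrained Hubert base model on the Hub.")
    parser.add_argument("--config_path", type=str, help="Path to the classifier's HubertConfig.")
    parser.add_argument("--checkpoint_path", type=str, help="Path to the s3prl downstream checkpoint.")
    parser.add_argument("--model_dump_path", type=str, help="Directory to save the converted model.")
    args = parser.parse_args()
    convert_s3prl_checkpoint(args.base_model_name, args.config_path,
                             args.checkpoint_path, args.model_dump_path)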
Code example #4
    def test_inference_keyword_spotting(self):
        model = HubertForSequenceClassification.from_pretrained(
            "superb/hubert-base-superb-ks",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/hubert-base-superb-ks")
        input_data = self._load_superb("ks", 4)
        inputs = processor(input_data["speech"],
                           return_tensors="pt",
                           padding=True)

        input_values = inputs.input_values.half().to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
        predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

        expected_labels = [2, 6, 10, 9]
        # s3prl logits for the same batch
        expected_logits = torch.tensor([7.6692, 17.7795, 11.1562, 11.8232],
                                       dtype=torch.float16,
                                       device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=2e-2))
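
Outside the unittest harness, the same keyword-spotting checkpoint can be exercised through the high-level pipeline API. A quick sanity check might look like the sketch below ("sample.wav" is a placeholder for a 16 kHz mono recording, not a file shipped with the tests):

from transformers import pipeline

classifier = pipeline("audio-classification",
                      model="superb/hubert-base-superb-ks")
# Prints the three highest-scoring keyword labels for the clip.
print(classifier("sample.wav", top_k=3))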
Code example #5
    def test_inference_intent_classification(self):
        model = HubertForSequenceClassification.from_pretrained("superb/hubert-base-superb-ic").to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ic")
        input_data = self._load_superb("ic", 4)
        inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)

        predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1)
        predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1)
        predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1)

        expected_labels_action = [1, 0, 4, 3]
        expected_logits_action = torch.tensor([5.9052, 12.5865, 4.4840, 10.0240], device=torch_device)
        expected_labels_object = [1, 10, 3, 4]
        expected_logits_object = torch.tensor([5.5316, 11.7946, 8.1672, 23.2415], device=torch_device)
        expected_labels_location = [0, 0, 0, 1]
        expected_logits_location = torch.tensor([5.2053, 8.9577, 10.0447, 8.1481], device=torch_device)

        self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action)
        self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object)
        self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location)

        # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
        self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=3e-1))
        self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=3e-1))
        self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=3e-1))
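
The three slices above carve the 24 intent logits into the action, object, and location slots of the SUPERB intent-classification head. A small hypothetical helper (not part of the test) that applies the same grouping to any batch of logits:

import torch

# Slot layout inferred from the slicing above: 6 actions, 14 objects, 4 locations.
SLOT_SLICES = {"action": slice(0, 6), "object": slice(6, 20), "location": slice(20, 24)}

def split_intent_predictions(logits):
    # Maps a (batch, 24) logit tensor to {slot: (max_logit, predicted_id)}.
    return {name: torch.max(logits[:, sl], dim=-1) for name, sl in SLOT_SLICES.items()}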