    def test_loading_adapter_weights_without_prefix(self):
        model_base, model_with_head_base = create_twin_models(
            AutoModel, self.config)

        model_with_head = AutoModelWithHeads.from_config(
            model_with_head_base.config)
        setattr(model_with_head, model_with_head.base_model_prefix,
                model_with_head_base)

        model_base.add_adapter("dummy")

        with tempfile.TemporaryDirectory() as temp_dir:
            model_base.save_adapter(temp_dir, "dummy")

            loading_info = {}
            model_with_head.load_adapter(temp_dir, loading_info=loading_info)

        self.assertEqual(0, len(loading_info["missing_keys"]))
        self.assertEqual(0, len(loading_info["unexpected_keys"]))

        # check equal output
        input_ids = self.get_input_samples((1, 128),
                                           config=model_with_head.config)
        output1 = model_with_head(input_ids)
        output2 = model_base(input_ids)
        self.assertEqual(len(output1), len(output2))
        self.assertTrue(torch.equal(output1[0], output2[0]))
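These snippets depend on a few helpers from the surrounding test suite that are not shown here. create_twin_models, used above, builds two models that start from identical weights; a minimal sketch inferred from how it is called (the real utility may differ):

import copy


def create_twin_models(model_class, config_creator):
    # build one model from the given config; Auto* classes go through
    # from_config, concrete model classes are instantiated directly
    config = config_creator()
    if model_class.__name__.startswith("Auto"):
        model1 = model_class.from_config(config)
    else:
        model1 = model_class(config)
    model1.eval()
    # the twin is a deep copy, so both models share the same random weights
    model2 = copy.deepcopy(model1)
    model2.eval()
    return model1, model2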
    def test_train_single_adapter(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name,
                                                  use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelWithHeads.from_config(self.config())

        # add two adapters: one will be trained and the other should be frozen
        model.add_adapter("mrpc")
        model.add_adapter("dummy")
        model.add_classification_head("mrpc")

        self.assertIn("mrpc", model.config.adapters.adapters)
        self.assertIn("dummy", model.config.adapters.adapters)

        # train the mrpc adapter -> should be activated & unfrozen
        model.train_adapter("mrpc")
        self.assertEqual(set(["mrpc"]), model.active_adapters.flatten())

        # all weights of the adapter should be activated
        for k, v in filter_parameters(model, "adapters.mrpc.").items():
            self.assertTrue(v.requires_grad, k)
        # all weights of the adapter not used for training should be frozen
        for k, v in filter_parameters(model, "adapters.dummy.").items():
            self.assertFalse(v.requires_grad, k)
        # weights of the base model should be frozen (check a few examples)
        for k, v in filter_parameters(model,
                                      "encoder.layer.0.attention").items():
            self.assertFalse(v.requires_grad, k)

        state_dict_pre = copy.deepcopy(model.state_dict())

        # setup dataset
        data_args = GlueDataTrainingArguments(
            task_name="mrpc",
            data_dir="./tests/fixtures/tests_samples/MRPC",
            overwrite_cache=True)
        train_dataset = GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="train")
        training_args = TrainingArguments(output_dir="./examples",
                                          do_train=True,
                                          learning_rate=0.1,
                                          max_steps=7,
                                          no_cuda=True)

        # train
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(),
                                        model.state_dict().items()):
            if "mrpc" in k1:
                self.assertFalse(torch.equal(v1, v2))
            else:
                self.assertTrue(torch.equal(v1, v2))
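filter_parameters, used in the requires_grad checks above, is another external helper. Judging from its usage it simply selects named parameters by a substring of their name; a sketch under that assumption (the actual test utility may differ):

def filter_parameters(model, filter_string):
    # return the named parameters whose names contain the given substring
    return {k: v for k, v in model.named_parameters() if filter_string in k}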
Example #3
    def test_parallel_inference_with_heads(self):
        model = AutoModelWithHeads.from_config(self.config())

        model.add_adapter("a")
        model.add_adapter("b")
        model.add_classification_head("a", num_labels=2)
        model.add_classification_head("b", num_labels=3)
        model.eval()

        inputs = {}
        inputs["attention_mask"] = torch.randint(0, 2, size=(2, 128))
        inputs["input_ids"] = self.get_input_samples((2, 128),
                                                     config=model.config)

        # for reference, pass through single adapters
        model.active_adapters = "a"
        model.active_head = "a"
        outputs_a = model(**inputs)
        model.active_adapters = "b"
        model.active_head = "b"
        outputs_b = model(**inputs)

        model.active_adapters = Parallel("a", "b")
        # active_adapters should set parallel heads too
        self.assertEqual(model.active_head, ["a", "b"])
        outputs = model(**inputs)

        self.assertEqual(len(outputs), 2)
        self.assertEqual(outputs[0][0].shape, (2, 2))
        self.assertEqual(outputs[1][0].shape, (2, 3))
        self.assertTrue(torch.allclose(outputs[0][0], outputs_a[0]))
        self.assertTrue(torch.allclose(outputs[1][0], outputs_b[0]))
    def test_loading_adapter_weights_without_prefix(self):
        if self.config_class not in MODEL_WITH_HEADS_MAPPING:
            self.skipTest("Does not support flex heads.")

        model_base, model_with_head_base = create_twin_models(
            self.model_class, self.config)

        model_with_head = AutoModelWithHeads.from_config(
            model_with_head_base.config)
        setattr(model_with_head, model_with_head.base_model_prefix,
                model_with_head_base)

        model_base.add_adapter("dummy")

        with tempfile.TemporaryDirectory() as temp_dir:
            model_base.save_adapter(temp_dir, "dummy")

            loading_info = {}
            model_with_head.load_adapter(temp_dir, loading_info=loading_info)

        self.assertEqual(0, len(loading_info["missing_keys"]))
        self.assertEqual(0, len(loading_info["unexpected_keys"]))

        # check equal output
        input_data = self.get_input_samples((1, 128),
                                            config=model_with_head.config)
        output1 = model_with_head(**input_data)
        output2 = model_base(**input_data)
        self.assertEqual(len(output1), len(output2))
        self.assertTrue(torch.equal(output1[0], output2[0]))
    def test_train_adapter_fusion(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name,
                                                  use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelWithHeads.from_config(self.config())
        self.add_head(model, "head")

        # add the adapters to be fused
        model.add_adapter("a")
        model.add_adapter("b")
        model.add_adapter("c")

        self.assertIn("a", model.config.adapters.adapters)
        self.assertIn("b", model.config.adapters.adapters)
        self.assertIn("c", model.config.adapters.adapters)

        # setup fusion
        adapter_setup = Fuse("a", "b", "c")
        model.add_adapter_fusion(adapter_setup)
        model.train_adapter_fusion(adapter_setup)
        model.set_active_adapters(adapter_setup)
        self.assertEqual(adapter_setup, model.active_adapters)

        # all weights of the adapters should be frozen (test for one)
        for k, v in filter_parameters(model, "adapters.a.").items():
            self.assertFalse(v.requires_grad, k)
        # all weights of the fusion layer should be activated
        for k, v in filter_parameters(model, "adapter_fusion_layer").items():
            self.assertTrue(v.requires_grad, k)
        # weights of the base model should be frozen (check a few examples)
        for k, v in filter_parameters(model,
                                      "encoder.layer.0.attention").items():
            self.assertFalse(v.requires_grad, k)

        state_dict_pre = copy.deepcopy(model.state_dict())

        # Since our config has a value matrix, make sure it is regularized.
        # We do this by patching the fusion regularization function.
        regularization_called = False
        orig_fusion_regularization_loss = model.base_model.get_fusion_regularization_loss

        def patched_fusion_reg_loss():
            nonlocal regularization_called
            regularization_called = True
            return orig_fusion_regularization_loss()

        model.base_model.get_fusion_regularization_loss = patched_fusion_reg_loss

        self.trainings_run(model, tokenizer)

        for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(),
                                        model.state_dict().items()):
            if ("adapter_fusion_layer" in k1 or "classifier" in k1
                    or "classification_head" in k1 or "score" in k1
                    or "heads" in k1):
                self.assertFalse(torch.equal(v1, v2), k1)
            else:
                self.assertTrue(torch.equal(v1, v2), k1)
        self.assertTrue(regularization_called)
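self.trainings_run(model, tokenizer), called above, replaces the inline Trainer setup shown in the first test_train_single_adapter snippet. A plausible sketch of such a helper; the self.dataset(tokenizer) hook mirrors the parallel-training snippet further below, and the exact arguments are assumptions:

    def trainings_run(self, model, tokenizer):
        # small training run, analogous to the explicit Trainer setup above
        train_dataset = self.dataset(tokenizer)
        training_args = TrainingArguments(
            output_dir="./examples",
            do_train=True,
            learning_rate=0.1,
            max_steps=7,
            no_cuda=True,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()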
Example #6
    def test_load_full_model(self):
        model = AutoModelWithHeads.from_config(self.config())
        model.add_classification_head("dummy", layers=1)

        true_config = model.get_prediction_heads_config()
        with tempfile.TemporaryDirectory() as temp_dir:
            # save
            model.save_pretrained(temp_dir)
            # reload
            model = AutoModelWithHeads.from_pretrained(temp_dir)
        self.assertIn("dummy", model.heads)
        self.assertDictEqual(true_config, model.get_prediction_heads_config())
Example #7
    def test_batch_split_head(self):
        if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class],
                       "add_classification_head"):
            self.skipTest("No classification head available")
        model = AutoModelWithHeads.from_config(self.config())
        model.add_classification_head("a")
        model.add_classification_head("b")
        model.active_head = BatchSplit("a", "b", batch_sizes=[1, 2])
        in_data = self.get_input_samples((3, 128), config=model.config)

        out = model(**in_data)
        self.assertEqual(2, len(out))
        self.assertEqual((1, 2), out[0][0].shape)
        self.assertEqual((2, 2), out[1][0].shape)
Example #8
    def test_delete_head(self):
        model = AutoModelWithHeads.from_config(self.config())
        model.eval()

        name = "test_head"
        self.add_head(model, name)
        self.assertTrue(name in model.heads)
        self.assertTrue(name in model.config.prediction_heads)
        self.assertEqual(name, model.active_head)

        model.delete_head(name)
        self.assertFalse(name in model.heads)
        self.assertFalse(name in model.config.prediction_heads)
        self.assertNotEqual(name, model.active_head)
Example #9
    def test_batch_split_adapter_head(self):
        model = AutoModelWithHeads.from_config(self.config())
        self.add_head(model, "a")
        self.add_head(model, "b")
        model.add_adapter("a")
        model.add_adapter("b")
        model.add_adapter("c")
        model.set_active_adapters(
            BatchSplit(Stack("c", "a"), "b", batch_sizes=[2, 1]))

        in_data = self.get_input_samples((3, 128), config=model.config)
        out = model(**in_data)

        self.assertEqual(2, len(out))
        self.assertTrue(isinstance(model.active_head, BatchSplit))
Example #10
    def test_invertible_adapter_with_head(self):
        if hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class],
                   "add_masked_lm_head"):
            lm_head = "masked_lm"
        elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class],
                     "add_causal_lm_head"):
            lm_head = "causal_lm"
        elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class],
                     "add_seq2seq_lm_head"):
            lm_head = "seq2seq_lm"
        else:
            self.skipTest("No masked, causal, or seq2seq language model head")

        model = AutoModelWithHeads.from_config(self.config())
        model.add_adapter("test", config="pfeiffer+inv")
        if lm_head == "causal_lm":
            model.add_causal_lm_head("test")
        elif lm_head == "masked_lm":
            model.add_masked_lm_head("test")
        elif lm_head == "seq2seq_lm":
            model.add_seq2seq_lm_head("test")
        else:
            raise RuntimeError("{} is not a valid lm head".format(lm_head))
        model.set_active_adapters("test")

        # Set a hook before the invertible adapter to make sure it's actually called twice:
        # Once after the embedding layer and once in the prediction head.
        calls = 0

        def forward_pre_hook(module, input):
            nonlocal calls
            calls += 1

        inv_adapter = model.base_model.get_invertible_adapter()
        self.assertIsNotNone(inv_adapter)
        inv_adapter.register_forward_pre_hook(forward_pre_hook)

        in_data = self.get_input_samples((self.batch_size, self.seq_length),
                                         config=model.config)
        out = model(**in_data)

        self.assertEqual(
            (self.batch_size, self.seq_length, model.config.vocab_size),
            out[0].shape)
        self.assertEqual(2, calls)
Example #11
    def test_parallel_inference_with_wrong_number_of_heads(self):
        model = AutoModelWithHeads.from_config(self.config())
        model.eval()

        model.add_adapter("a")
        model.add_adapter("b")
        self.add_head(model, "a", num_labels=2)

        inputs = self.get_input_samples((2, 128), config=model.config)

        model.active_adapters = Parallel("a", "b")
        model.active_head = ["a"]
        with self.assertRaises(ValueError):
            model(**inputs)

        model.active_head = "a"
        with self.assertRaises(ValueError):
            model(**inputs)
Example #12
    def test_parallel_training(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelWithHeads.from_config(self.config())

        model.add_adapter("mrpc1")
        model.add_adapter("mrpc2")
        self.add_head(model, "mrpc1", num_labels=2)
        self.add_head(model, "mrpc2", num_labels=3)
        model.active_adapters = Parallel("mrpc1", "mrpc2")
        model.train_adapter(Parallel("mrpc1", "mrpc2"))
        # model.eval()

        # all weights of the first adapter should be activated
        for k, v in filter_parameters(model, "adapters.mrpc1.").items():
            self.assertTrue(v.requires_grad, k)
        # all weights of the second adapter should also be activated (both are trained in parallel)
        for k, v in filter_parameters(model, "adapters.mrpc2.").items():
            self.assertTrue(v.requires_grad, k)
        # weights of the base model should be frozen (check a few examples)
        for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
            self.assertFalse(v.requires_grad, k)

        state_dict_pre = copy.deepcopy(model.state_dict())

        train_dataset = self.dataset(tokenizer)
        training_args = TrainingArguments(
            output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=10, no_cuda=True
        )

        # train
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
            if "mrpc" in k1:
                self.assertFalse(torch.equal(v1, v2), k1)
            else:
                self.assertTrue(torch.equal(v1, v2))
    def test_train_single_adapter(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name,
                                                  use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelWithHeads.from_config(self.config())

        # add two adapters: one will be trained and the other should be frozen
        model.add_adapter("mrpc")
        model.add_adapter("dummy")
        self.add_head(model, "mrpc")

        self.assertIn("mrpc", model.config.adapters.adapters)
        self.assertIn("dummy", model.config.adapters.adapters)

        # train the mrpc adapter -> should be activated & unfrozen
        model.train_adapter("mrpc")
        self.assertEqual(set(["mrpc"]), model.active_adapters.flatten())

        # all weights of the adapter should be activated
        for k, v in filter_parameters(model, "adapters.mrpc.").items():
            self.assertTrue(v.requires_grad, k)
        # all weights of the adapter not used for training should be frozen
        for k, v in filter_parameters(model, "adapters.dummy.").items():
            self.assertFalse(v.requires_grad, k)
        # weights of the base model should be frozen (check a few examples)
        for k, v in filter_parameters(model,
                                      "encoder.layer.0.attention").items():
            self.assertFalse(v.requires_grad, k)

        state_dict_pre = copy.deepcopy(model.state_dict())

        self.trainings_run(model, tokenizer)

        for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(),
                                        model.state_dict().items()):
            if "mrpc" in k1:
                self.assertFalse(torch.equal(v1, v2))
            else:
                self.assertTrue(torch.equal(v1, v2))
Example #14
    def test_batch_split_with_heads(self):
        model = AutoModelWithHeads.from_config(self.config())
        model.add_adapter("a")
        model.add_adapter("b")
        self.add_head(model, "a", num_labels=2)
        self.add_head(model, "b", num_labels=3)
        model.eval()

        inputs = {"input_ids": self.get_input_samples((2, 128), config=model.config)["input_ids"]}
        if isinstance(model, T5ModelWithHeads):
            inputs["decoder_input_ids"] = inputs["input_ids"]

        # for reference, pass through single adapters
        model.active_adapters = "a"
        model.active_head = "a"
        outputs_a = model(**{k: v[:1] for k, v in inputs.items()})
        model.active_adapters = "b"
        model.active_head = "b"
        outputs_b = model(**{k: v[1:] for k, v in inputs.items()})

        model.set_active_adapters(BatchSplit("a", "b", batch_sizes=[1, 1]))
        output = model(**inputs)

        self.assertEqual(2, len(output))
        self.assertTrue(
            torch.allclose(
                output[0]["logits"],
                outputs_a["logits"],
                atol=1e-05,
            )
        )
        self.assertTrue(
            torch.allclose(
                output[1]["logits"],
                outputs_b["logits"],
                atol=1e-05,
            )
        )
Example #15
    def test_parallel_training_single_forward_pass(self):
        model = AutoModelWithHeads.from_config(self.config())
        model.eval()

        a1, a2 = self.create_twin_adapters(model, "a")
        b1, b2 = self.create_twin_adapters(model, "b")

        state_dict = model.state_dict()
        for k, v in state_dict.items():
            if a1 in k:
                self.assertTrue(torch.equal(v, state_dict[k.replace(a1, a2)]))
            if b1 in k:
                self.assertTrue(torch.equal(v, state_dict[k.replace(b1, b2)]))

        input_data = self.get_input_samples((3, 128), config=model.config)
        if isinstance(model, T5ModelWithHeads):
            input_data["labels"] = torch.randint(0, 2, (3, 128))
        else:
            input_data["labels"] = torch.randint(0, 2, (3, 1))

        outputs = []
        for adapter in [a1, b1]:
            model.active_head = adapter
            model.set_active_adapters(adapter)
            model.train_adapter(adapter)
            model.eval()
            outputs.append(model(**input_data))

        model.set_active_adapters(Parallel(a2, b2))
        model.train_adapter(Parallel(a2, b2))
        model.eval()

        parallel_outputs = model(**input_data)

        for out1, out2 in zip(outputs, parallel_outputs.head_outputs):
            self.assertTrue(torch.allclose(out1["loss"], out2["loss"]))
            self.assertTrue(torch.allclose(out1["logits"], out2["logits"], atol=1e-5))
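create_twin_adapters is also not defined in these snippets. The equality checks on the state dict above imply that it adds two adapters (and matching heads) whose weights start out identical; a hedged sketch under that assumption, with the names and the copying strategy guessed:

    def create_twin_adapters(self, model, name):
        # two adapters and heads initialized with identical weights
        adapter1, adapter2 = name + "_1", name + "_2"
        model.add_adapter(adapter1)
        model.add_adapter(adapter2)
        self.add_head(model, adapter1)
        self.add_head(model, adapter2)
        # copy the first adapter's weights into its twin
        state_dict = model.state_dict()
        for k, v in list(state_dict.items()):
            if adapter1 in k:
                state_dict[k.replace(adapter1, adapter2)] = v.clone()
        model.load_state_dict(state_dict)
        return adapter1, adapter2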
    def test_batch_split_training(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name,
                                                  use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelWithHeads.from_config(self.config())

        model.add_adapter("mrpc1")
        model.add_adapter("mrpc2")
        self.add_head(model, "mrpc1")
        self.add_head(model, "mrpc2")
        adapter_setup = BatchSplit("mrpc1", "mrpc2", batch_sizes=[1, 1])
        model.active_adapters = adapter_setup
        model.train_adapter(adapter_setup)

        # all weights of the first adapter should be activated
        for k, v in filter_parameters(model, "adapters.mrpc1.").items():
            self.assertTrue(v.requires_grad, k)
        # all weights of the second adapter should also be activated (both are trained via the batch split)
        for k, v in filter_parameters(model, "adapters.mrpc2.").items():
            self.assertTrue(v.requires_grad, k)
        # weights of the base model should be frozen (check a few examples)
        for k, v in filter_parameters(model,
                                      "encoder.layer.0.attention").items():
            self.assertFalse(v.requires_grad, k)

        state_dict_pre = copy.deepcopy(model.state_dict())

        self.trainings_run(model, tokenizer)

        for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(),
                                        model.state_dict().items()):
            if "mrpc" in k1:
                self.assertFalse(torch.equal(v1, v2))
            else:
                self.assertTrue(torch.equal(v1, v2))
Example #17
    def test_parallel_training_equivalent_to_single_adapters(self):
        model = AutoModelWithHeads.from_config(self.config())
        model.eval()

        a1, a2 = self.create_twin_adapters(model, "a")
        b1, b2 = self.create_twin_adapters(model, "b")

        dataset = []
        for i in range(3):
            input_data = self.get_input_samples((3, 128), config=model.config)
            if isinstance(model, T5ModelWithHeads):
                input_data["labels"] = torch.randint(0, 2, (3, 128))
            else:
                input_data["labels"] = torch.randint(0, 2, (3, 1))
            dataset.append(input_data)

        for adapter in [a1, b1]:
            model.active_head = adapter
            model.set_active_adapters(adapter)
            model.train_adapter(adapter)
            model.eval()

            model = self.train_model(model, dataset)

        model.set_active_adapters(Parallel(a2, b2))
        model.train_adapter(Parallel(a2, b2))
        model.eval()

        model = self.train_model(model, dataset)

        state_dict = model.state_dict()
        for k, v in state_dict.items():
            if a1 in k:
                self.assertTrue(torch.allclose(v, state_dict[k.replace(a1, a2)], atol=1e-5))
            if b1 in k:
                self.assertTrue(torch.allclose(v, state_dict[k.replace(b1, b2)], atol=1e-5))
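The train_model helper used in the last snippet is likewise external. A minimal sketch consistent with its usage, assuming a plain optimization loop over the pre-built batches (the optimizer choice and epoch count are guesses) and the same torch import as the tests above:

    def train_model(self, model, dataset):
        # straightforward manual training loop over the in-memory dataset
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        for _ in range(2):  # assumed number of epochs
            for batch in dataset:
                optimizer.zero_grad()
                loss = model(**batch)["loss"]
                loss.backward()
                optimizer.step()
        return model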