def test_training_load_best_model_at_end_adapter(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        data_args = GlueDataTrainingArguments(
            task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
        )
        train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
        eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

        model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
        model.add_adapter("adapter")
        model.train_adapter("adapter")

        training_args = TrainingArguments(
            output_dir="./examples",
            do_train=True,
            learning_rate=0.001,
            max_steps=1,
            save_steps=1,
            remove_unused_columns=False,
            load_best_model_at_end=True,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            num_train_epochs=2,
        )
        trainer = AdapterTrainer(
            model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
        )
        with self.assertLogs(logger) as cm:
            trainer.train()
            self.assertTrue(any("Loading best adapter(s) from" in line for line in cm.output))
        self.assertEqual(Stack("adapter"), trainer.model.active_adapters)
Example #2
def create_datasets(task_name: str,
                    tokenizer: BertTokenizer,
                    data_dir: Optional[str] = None
                    ) -> Tuple[CustomGlueDataset, CustomGlueDataset]:
    if task_name not in ["mnli", "mnli-2", "hans"]:
        raise ValueError(f"Unrecognized task {task_name}")

    if data_dir is None:
        if task_name in ["mnli", "mnli-2"]:
            data_dir = constants.GLUE_DATA_DIR
        if task_name in ["hans"]:
            data_dir = constants.HANS_DATA_DIR

    data_args = GlueDataTrainingArguments(task_name=task_name,
                                          data_dir=data_dir,
                                          max_seq_length=128)

    train_dataset = CustomGlueDataset(args=data_args,
                                      tokenizer=tokenizer,
                                      mode="train")

    eval_dataset = CustomGlueDataset(args=data_args,
                                     tokenizer=tokenizer,
                                     mode="dev")

    return train_dataset, eval_dataset
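
A minimal, hypothetical call site for this helper (the tokenizer checkpoint is illustrative, and `create_datasets` plus its `constants` module are assumed to be importable from the surrounding project):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Builds matching train/dev CustomGlueDataset instances for MNLI.
train_dataset, eval_dataset = create_datasets("mnli", tokenizer)
print(len(train_dataset), len(eval_dataset))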
Example #3
    def test_train_single_adapter(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name,
                                                  use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelWithHeads.from_config(self.config())

        # add two adapters: one will be trained and the other should be frozen
        model.add_adapter("mrpc")
        model.add_adapter("dummy")
        model.add_classification_head("mrpc")

        self.assertIn("mrpc", model.config.adapters.adapters)
        self.assertIn("dummy", model.config.adapters.adapters)

        # train the mrpc adapter -> should be activated & unfrozen
        model.train_adapter("mrpc")
        self.assertEqual(set(["mrpc"]), model.active_adapters.flatten())

        # all weights of the adapter should be activated
        for k, v in filter_parameters(model, "adapters.mrpc.").items():
            self.assertTrue(v.requires_grad, k)
        # all weights of the adapter not used for training should be frozen
        for k, v in filter_parameters(model, "adapters.dummy.").items():
            self.assertFalse(v.requires_grad, k)
        # weights of the base model should be frozen (check on some examples)
        for k, v in filter_parameters(model,
                                      "encoder.layer.0.attention").items():
            self.assertFalse(v.requires_grad, k)

        state_dict_pre = copy.deepcopy(model.state_dict())

        # setup dataset
        data_args = GlueDataTrainingArguments(
            task_name="mrpc",
            data_dir="./tests/fixtures/tests_samples/MRPC",
            overwrite_cache=True)
        train_dataset = GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="train")
        training_args = TrainingArguments(output_dir="./examples",
                                          do_train=True,
                                          learning_rate=0.1,
                                          max_steps=7,
                                          no_cuda=True)

        # train
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(),
                                        model.state_dict().items()):
            if "mrpc" in k1:
                self.assertFalse(torch.equal(v1, v2))
            else:
                self.assertTrue(torch.equal(v1, v2))
Example #4
    def test_resume_training(self):

        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        data_args = GlueDataTrainingArguments(
            task_name="mrpc",
            data_dir="./tests/fixtures/tests_samples/MRPC",
            overwrite_cache=True)
        train_dataset = GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="train")

        model = AutoModelForSequenceClassification.from_pretrained(
            "bert-base-uncased")
        model.add_adapter("adapter")
        model.add_adapter("additional_adapter")
        model.set_active_adapters("adapter")

        training_args = TrainingArguments(
            output_dir="./examples",
            do_train=True,
            learning_rate=0.1,
            logging_steps=1,
            max_steps=1,
            save_steps=1,
            remove_unused_columns=False,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            do_save_adapters=True,
            do_save_full_model=False,
        )

        trainer.train()
        # create second model that should resume the training of the first
        model_resume = AutoModelForSequenceClassification.from_pretrained(
            "bert-base-uncased")
        model_resume.add_adapter("adapter")
        model_resume.add_adapter("additional_adapter")
        model_resume.set_active_adapters("adapter")
        trainer_resume = Trainer(
            model=model_resume,
            args=TrainingArguments(do_train=True,
                                   max_steps=1,
                                   output_dir="./examples"),
            train_dataset=train_dataset,
        )
        trainer_resume.train(resume_from_checkpoint=True)

        self.assertEqual(model.config.adapters.adapters,
                         model_resume.config.adapters.adapters)

        for (k1, v1), (k2, v2) in zip(trainer.model.state_dict().items(),
                                      trainer_resume.model.state_dict().items()):
            self.assertEqual(k1, k2)
            if "adapter" in k1:
                self.assertTrue(torch.equal(v1, v2), k1)
Example #5
    def test_default_classification(self):
        MODEL_ID = "bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        data_args = GlueDataTrainingArguments(
            task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
        )
        dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True)
        data_collator = DefaultDataCollator()
        batch = data_collator.collate_batch(dataset.features)
        self.assertEqual(batch["labels"].dtype, torch.long)
Example #6
    def test_default_regression(self):
        MODEL_ID = "distilroberta-base"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        data_args = GlueDataTrainingArguments(
            task_name="sts-b", data_dir="./tests/fixtures/tests_samples/STS-B", overwrite_cache=True
        )
        dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True)
        data_collator = DefaultDataCollator()
        batch = data_collator.collate_batch(dataset.features)
        self.assertEqual(batch["labels"].dtype, torch.float)
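
Note that these two examples use the older data-collator API (`DefaultDataCollator.collate_batch`). In later transformers releases the same batching is handled by the `default_data_collator` function; a hedged, version-dependent sketch of the equivalent call:

from transformers import default_data_collator

# Accepts the list of InputFeatures and returns the same batch mapping,
# including the "labels" tensor checked above.
batch = default_data_collator(dataset.features)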
Example #7
    def test_load_task_adapter_from_hub(self):
        """This test checks if an adapter is loaded from the Hub correctly by evaluating it on some MRPC samples
        and comparing with the expected result.
        """
        for config in ["pfeiffer", "houlsby"]:
            with self.subTest(config=config):
                tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
                model = BertForSequenceClassification.from_pretrained(
                    "bert-base-uncased")

                loading_info = {}
                adapter_name = model.load_adapter("sts/mrpc@ukp",
                                                  config=config,
                                                  version="1",
                                                  loading_info=loading_info)
                model.train_adapter(adapter_name)

                self.assertEqual(0, len(loading_info["missing_keys"]))
                self.assertEqual(0, len(loading_info["unexpected_keys"]))

                self.assertIn(adapter_name, model.config.adapters.adapters)
                self.assertNotIn(adapter_name,
                                 model.base_model.invertible_adapters)

                # check if config is valid
                expected_hash = get_adapter_config_hash(
                    AdapterConfig.load(config))
                real_hash = get_adapter_config_hash(
                    model.config.adapters.get(adapter_name))
                self.assertEqual(expected_hash, real_hash)

                # setup dataset
                data_args = GlueDataTrainingArguments(
                    task_name="mrpc",
                    data_dir="./tests/fixtures/tests_samples/MRPC",
                    overwrite_cache=True)
                eval_dataset = GlueDataset(data_args,
                                           tokenizer=tokenizer,
                                           mode="dev")
                training_args = TrainingArguments(output_dir="./examples",
                                                  no_cuda=True)

                # evaluate
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    eval_dataset=eval_dataset,
                    compute_metrics=self._compute_glue_metrics("mrpc"),
                    adapter_names=["mrpc"],
                )
                result = trainer.evaluate()
                self.assertGreater(result["eval_acc"], 0.9)
Example #8
    def test_train_adapter_fusion(self):
        for model_name in self.model_names:
            with self.subTest(model_name=model_name):
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForSequenceClassification.from_pretrained(model_name)

                # load the adapters to be fused
                model.load_adapter("sts/mrpc@ukp", with_head=False)
                model.load_adapter("sts/qqp@ukp", with_head=False)
                model.load_adapter("sts/sts-b@ukp", with_head=False)

                self.assertIn("mrpc", model.config.adapters.adapters)
                self.assertIn("qqp", model.config.adapters.adapters)
                self.assertIn("sts-b", model.config.adapters.adapters)

                # setup fusion
                adapter_setup = [["mrpc", "qqp", "sts-b"]]
                model.add_fusion(adapter_setup[0])
                model.train_fusion(adapter_setup[0])
                model.set_active_adapters(adapter_setup)
                self.assertEqual(adapter_setup, model.active_adapters)

                # all weights of the adapters should be frozen (test for one)
                for k, v in filter_parameters(model, "text_task_adapters.mrpc").items():
                    self.assertFalse(v.requires_grad, k)
                # all weights of the fusion layer should be activated
                for k, v in filter_parameters(model, "adapter_fusion_layer").items():
                    self.assertTrue(v.requires_grad, k)
                # weights of the base model should be frozen (check on some examples)
                for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
                    self.assertFalse(v.requires_grad, k)

                state_dict_pre = copy.deepcopy(model.state_dict())

                # setup dataset
                data_args = GlueDataTrainingArguments(
                    task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
                )
                train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
                training_args = TrainingArguments(
                    output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=5, no_cuda=True
                )

                # train
                trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
                trainer.train()

                for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
                    if "adapter_fusion_layer" in k1 or "classifier" in k1:
                        self.assertFalse(torch.equal(v1, v2), k1)
                    else:
                        self.assertTrue(torch.equal(v1, v2), k1)
Example #9
    def test_trainer_eval_mrpc(self):
        MODEL_ID = "bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
        data_args = GlueDataTrainingArguments(
            task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
        )
        eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

        training_args = TrainingArguments(output_dir="./examples", no_cuda=True)
        trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset)
        result = trainer.evaluate()
        self.assertLess(result["eval_loss"], 0.2)
Example #10
    def test_general(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        data_args = GlueDataTrainingArguments(
            task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
        )
        train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")

        model = AutoModelWithHeads.from_pretrained("bert-base-uncased")

        model.add_classification_head("task", num_labels=3)

        # add two adapters; only "task" will be trained
        model.add_adapter("task")
        model.add_adapter("additional_adapter")

        model.train_adapter("task")
        self.assertEqual("task", model.active_head)
        self.assertEqual(Stack("task"), model.active_adapters)
        with TemporaryDirectory() as tempdir:
            training_args = TrainingArguments(
                output_dir=tempdir,
                do_train=True,
                learning_rate=0.1,
                logging_steps=1,
                max_steps=1,
                save_steps=1,
                remove_unused_columns=False,
            )
            trainer = AdapterTrainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
            )

            trainer.train()

            # Check that the adapters were saved but the full model was not
            files_dir_checkpoint = os.listdir(os.path.join(tempdir, "checkpoint-1"))
            self.assertIn("task", files_dir_checkpoint)
            self.assertIn("additional_adapter", files_dir_checkpoint)
            # Check that full model weights are not stored
            self.assertNotIn("pytorch_model.bin", files_dir_checkpoint)

            # this should always be false in the adapter trainer
            self.assertFalse(trainer.args.remove_unused_columns)
            self.assertEqual("task", model.active_head)
            self.assertEqual(Stack("task"), model.active_adapters)
Example #11
    def run_glue(self, model_name, task_name, fp16):
        model_args = ModelArguments(model_name_or_path=model_name,
                                    cache_dir=self.cache_dir)
        data_args = GlueDataTrainingArguments(
            task_name=task_name,
            data_dir=self.data_dir + "/" + task_name,
            max_seq_length=self.max_seq_length)

        training_args = TrainingArguments(
            output_dir=self.output_dir + "/" + task_name,
            do_train=True,
            do_eval=True,
            per_gpu_train_batch_size=self.train_batch_size,
            learning_rate=self.learning_rate,
            num_train_epochs=self.num_train_epochs,
            local_rank=self.local_rank,
            overwrite_output_dir=self.overwrite_output_dir,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            fp16=fp16,
            logging_steps=self.logging_steps)

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=logging.INFO
            if training_args.local_rank in [-1, 0] else logging.WARN,
        )
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            training_args.local_rank,
            training_args.device,
            training_args.n_gpu,
            bool(training_args.local_rank != -1),
            training_args.fp16,
        )
        logger.info("Training/evaluation parameters %s", training_args)

        set_seed(training_args.seed)
        onnxruntime.set_seed(training_args.seed)

        try:
            num_labels = glue_tasks_num_labels[data_args.task_name]
            output_mode = glue_output_modes[data_args.task_name]
        except KeyError:
            raise ValueError("Task not found: %s" % (data_args.task_name))

        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        model = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

        train_dataset = (GlueDataset(data_args, tokenizer=tokenizer)
                         if training_args.do_train else None)

        eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="dev")
                        if training_args.do_eval else None)

        def compute_metrics(p: EvalPrediction) -> Dict:
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(data_args.task_name, preds,
                                        p.label_ids)

        model_desc = self.model_to_desc(model_name, model)
        # Initialize the ORTTrainer within ORTTransformerTrainer
        trainer = ORTTransformerTrainer(
            model=model,
            model_desc=model_desc,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

        # Training
        if training_args.do_train:
            trainer.train()
            trainer.save_model()

        # Evaluation
        results = {}
        if training_args.do_eval and training_args.local_rank in [-1, 0]:
            logger.info("*** Evaluate ***")

            result = trainer.evaluate()

            logger.info("***** Eval results {} *****".format(
                data_args.task_name))
            for key, value in result.items():
                logger.info("  %s = %s", key, value)

            results.update(result)

        return results
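
A hedged sketch of how this method might be driven from the enclosing test class; it assumes the `self.*` attributes read above (cache_dir, data_dir, output_dir, batch size, and so on) are already set:

    def test_run_glue_mrpc(self):
        # Illustrative invocation; the model and task names are assumptions.
        results = self.run_glue(model_name="bert-base-uncased", task_name="mrpc", fp16=False)
        self.assertTrue(results)  # do_eval=True, so eval metrics should come back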
Example #12
def tune_transformer(num_samples=8, gpus_per_trial=0, smoke_test=False):
    data_dir_name = "./data" if not smoke_test else "./test_data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    if not os.path.exists(data_dir):
        os.mkdir(data_dir, 0o755)

    # Change these as needed.
    model_name = "bert-base-uncased" if not smoke_test \
        else "sshleifer/tiny-distilroberta-base"
    task_name = "rte"

    task_data_dir = os.path.join(data_dir, task_name.upper())

    num_labels = glue_tasks_num_labels[task_name]

    config = AutoConfig.from_pretrained(model_name,
                                        num_labels=num_labels,
                                        finetuning_task=task_name)

    # Download and cache tokenizer, model, and features
    print("Downloading and caching Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Triggers model download to cache
    print("Downloading and caching pre-trained model")
    AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
    )

    def get_model():
        return AutoModelForSequenceClassification.from_pretrained(
            model_name,
            config=config,
        )

    # Download data.
    download_data(task_name, data_dir)

    data_args = GlueDataTrainingArguments(task_name=task_name,
                                          data_dir=task_data_dir)

    train_dataset = GlueDataset(data_args,
                                tokenizer=tokenizer,
                                mode="train",
                                cache_dir=task_data_dir)
    eval_dataset = GlueDataset(data_args,
                               tokenizer=tokenizer,
                               mode="dev",
                               cache_dir=task_data_dir)

    training_args = TrainingArguments(
        output_dir=".",
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        no_cuda=gpus_per_trial <= 0,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2,  # config
        max_steps=-1,
        per_device_train_batch_size=16,  # config
        per_device_eval_batch_size=16,  # config
        warmup_steps=0,
        weight_decay=0.1,  # config
        logging_dir="./logs",
        skip_memory_metrics=True,
        report_to="none")

    trainer = Trainer(model_init=get_model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      compute_metrics=build_compute_metrics_fn(task_name))

    tune_config = {
        "per_device_train_batch_size": 32,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
    }

    scheduler = PopulationBasedTraining(time_attr="training_iteration",
                                        metric="eval_acc",
                                        mode="max",
                                        perturbation_interval=1,
                                        hyperparam_mutations={
                                            "weight_decay":
                                            tune.uniform(0.0, 0.3),
                                            "learning_rate":
                                            tune.uniform(1e-5, 5e-5),
                                            "per_device_train_batch_size":
                                            [16, 32, 64],
                                        })

    reporter = CLIReporter(parameter_columns={
        "weight_decay": "w_decay",
        "learning_rate": "lr",
        "per_device_train_batch_size": "train_bs/gpu",
        "num_train_epochs": "num_epochs"
    },
                           metric_columns=[
                               "eval_acc", "eval_loss", "epoch",
                               "training_iteration"
                           ])

    trainer.hyperparameter_search(
        hp_space=lambda _: tune_config,
        backend="ray",
        n_trials=num_samples,
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        scheduler=scheduler,
        keep_checkpoints_num=1,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt",
        log_to_file=True)
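
A hypothetical entry point for this Ray Tune script (argument values are illustrative):

if __name__ == "__main__":
    # A CPU-only smoke test keeps the search short and avoids GPU scheduling.
    tune_transformer(num_samples=2, gpus_per_trial=0, smoke_test=True)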
Example #13
    def dataset(self, tokenizer):
        data_args = GlueDataTrainingArguments(
            task_name="mrpc",
            data_dir="./tests/fixtures/tests_samples/MRPC",
            overwrite_cache=True)
        return GlueDataset(data_args, tokenizer=tokenizer, mode="train")
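
A hypothetical companion test consuming this helper (the tokenizer checkpoint is an assumption; AutoTokenizer is imported as in the other examples):

    def test_dataset_loads(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        dataset = self.dataset(tokenizer)
        self.assertGreater(len(dataset), 0)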
Example #14
def train(EXP: str, MODEL_NAME: str, TASK_NAME: str, N_LABELS: int, DELTA: float, WEIGHT_DECAY: float, DEVICE: str) -> float:
    EPOCHS         = 5
    BATCH_SIZE     = 8
    SAMPLES        = 10
    FREEZE         = True
    LOGS           = "logs"
    MAX_SEQ_LENGTH = 128
    LOADER_OPTIONS = { "num_workers": 6, "pin_memory": True }
    LR             = 2e-5
    ADAM_EPSILON   = 1e-8
    N_WARMUP_STEPS = 0
    MAX_GRAD_NORM  = 1
    DATA_DIR       = os.path.join("./dataset/glue/data", TASK_NAME)

    os.makedirs(LOGS, exist_ok=True)
    writer_path = os.path.join(LOGS, f"bayeformers_bert_glue.{EXP}")
    writer_suff = f".DELTA_{DELTA}.WEIGHT_DECAY_{WEIGHT_DECAY}"
    writer      = SummaryWriter(writer_path + writer_suff)
    
    o_model, tokenizer = setup_model(MODEL_NAME, TASK_NAME, N_LABELS)
    o_model            = o_model.to(DEVICE)

    glue          = GlueDataTrainingArguments(TASK_NAME, data_dir=DATA_DIR, max_seq_length=MAX_SEQ_LENGTH)
    train_dataset = GlueDataset(glue, tokenizer=tokenizer)
    test_dataset  = GlueDataset(glue, tokenizer=tokenizer, mode="dev")
    train_loader  = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate, **LOADER_OPTIONS)
    test_loader   = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate, **LOADER_OPTIONS)
    
    # Exempt biases and LayerNorm weights from weight decay via substring
    # matching on the parameter name (the standard BERT fine-tuning recipe).
    no_decay_keys   = ["bias", "LayerNorm.weight"]
    decay           = [param for name, param in o_model.named_parameters() if not any(key in name for key in no_decay_keys)]
    no_decay        = [param for name, param in o_model.named_parameters() if any(key in name for key in no_decay_keys)]
    params_decay    = { "params": decay,    "weight_decay": WEIGHT_DECAY }
    params_no_decay = { "params": no_decay, "weight_decay": 0.0 }
    parameters      = [params_decay, params_no_decay]

    criterion = nn.CrossEntropyLoss().to(DEVICE)
    optim     = AdamW(parameters, lr=LR, eps=ADAM_EPSILON)
    scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS)

    report = Report()
    for epoch in tqdm(range(EPOCHS), desc="Epoch"):

        # ============================ TRAIN ======================================
        o_model.train()
        report.reset()
        
        pbar = tqdm(train_loader, desc="Train")
        for inputs in pbar:
            inputs = dic2cuda(inputs, DEVICE)
            labels = inputs["labels"]

            optim.zero_grad()
            logits = o_model(**inputs)[1]
            loss   = criterion(logits.view(-1, N_LABELS), labels.view(-1))
            acc    = (torch.argmax(logits, dim=1) == labels).float().sum()

            loss.backward()
            nn.utils.clip_grad_norm_(o_model.parameters(), MAX_GRAD_NORM)
            optim.step()

            report.total += loss.item()      / len(train_loader)
            report.acc   += acc.item() * 100 / len(train_dataset)

            pbar.set_postfix(total=report.total, acc=report.acc)

        scheduler.step()
        writer.add_scalar("train_nll", report.total, epoch)
        writer.add_scalar("train_acc", report.acc,   epoch)

        # ============================ TEST =======================================
        o_model.eval()
        report.reset()
        
        with torch.no_grad():
            pbar = tqdm(test_loader, desc="Test")
            for inputs in pbar:
                inputs = dic2cuda(inputs, DEVICE)
                labels = inputs["labels"]

                logits = o_model(**inputs)[1]
                loss   = criterion(logits.view(-1, N_LABELS), labels.view(-1))
                acc    = (torch.argmax(logits, dim=1) == labels).float().sum()

                report.total += loss.item()       / len(test_loader)
                report.acc   += acc.item() * 100  / len(test_dataset)

                pbar.set_postfix(total=report.total, acc=report.acc)

        writer.add_scalar("test_nll", report.total, epoch)
        writer.add_scalar("test_acc", report.acc,   epoch)

    # ============================ EVALUATION =====================================
    b_model                  = to_bayesian(o_model, delta=DELTA, freeze=FREEZE)
    b_model                  = b_model.to(DEVICE)

    b_model.eval()
    report.reset()

    with torch.no_grad():
        pbar = tqdm(test_loader, desc="Bayesian Eval")
        for inputs in pbar:
            inputs = dic2cuda(inputs, DEVICE)
            labels = inputs["labels"]
            B      = inputs["input_ids"].size(0)

            samples = sample_bayesian(b_model, inputs, SAMPLES, B, N_LABELS, DEVICE)
            raw_logits, logits, log_prior, log_variational_posterior = samples

            nll     = criterion(logits, labels.view(-1))            
            loss    = (log_variational_posterior - log_prior) / len(test_loader) + nll
            acc     = (torch.argmax(logits, dim=1) == labels).float().sum()
            acc_std = np.std([(torch.argmax(logits, dim=1) == labels).float().sum().item() for logits in raw_logits])

            report.total                     += loss.item()                      / len(test_loader)
            report.nll                       += nll.item()                       / len(test_loader)
            report.log_prior                 += log_prior.item()                 / len(test_loader)
            report.log_variational_posterior += log_variational_posterior.item() / len(test_loader)
            report.acc                       += acc.item() * 100                 / len(test_dataset)
            report.acc_std                   += acc_std                          / len(test_loader)

            pbar.set_postfix(
                total=report.total,
                nll=report.nll,
                log_prior=report.log_prior,
                log_variational_posterior=report.log_variational_posterior,
                acc=report.acc,
                acc_std=report.acc_std,
            )

    writer.add_scalar("bayesian_eval_nll",     report.nll,     epoch)
    writer.add_scalar("bayesian_eval_acc",     report.acc,     epoch)
    writer.add_scalar("bayesian_eval_acc_std", report.acc_std, epoch)

    # Same decay/no-decay grouping as above, now over the Bayesian model.
    decay           = [param for name, param in b_model.named_parameters() if not any(key in name for key in no_decay_keys)]
    no_decay        = [param for name, param in b_model.named_parameters() if any(key in name for key in no_decay_keys)]
    params_decay    = { "params": decay,    "weight_decay": WEIGHT_DECAY }
    params_no_decay = { "params": no_decay, "weight_decay": 0.0 }
    parameters      = [params_decay, params_no_decay]

    criterion = nn.CrossEntropyLoss().to(DEVICE)
    optim     = AdamW(parameters, lr=LR, eps=ADAM_EPSILON)
    scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS)

    for epoch in tqdm(range(EPOCHS), desc="Bayesian Epoch"):

        # ============================ TRAIN ======================================
        b_model.train()
        report.reset()
        
        pbar = tqdm(train_loader, desc="Bayesian Train")
        for inputs in pbar:
            inputs = dic2cuda(inputs, DEVICE)
            labels = inputs["labels"]
            B      = inputs["input_ids"].size(0)

            optim.zero_grad()
            samples = sample_bayesian(b_model, inputs, SAMPLES, B, N_LABELS, DEVICE)
            raw_logits, logits, log_prior, log_variational_posterior = samples

            nll     = criterion(logits, labels.view(-1))            
            loss    = (log_variational_posterior - log_prior) / len(train_loader) + nll
            acc     = (torch.argmax(logits, dim=1) == labels).float().sum()
            acc_std = np.std([(torch.argmax(logits, dim=1) == labels).float().sum().item() for logits in raw_logits])

            loss.backward()
            nn.utils.clip_grad_norm_(b_model.parameters(), MAX_GRAD_NORM)
            optim.step()

            report.total                     += loss.item()                      / len(train_loader)
            report.nll                       += nll.item()                       / len(train_loader)
            report.log_prior                 += log_prior.item()                 / len(train_loader)
            report.log_variational_posterior += log_variational_posterior.item() / len(train_loader)
            report.acc                       += acc.item() * 100                 / len(train_dataset)
            report.acc_std                   += acc_std                          / len(train_loader)

            pbar.set_postfix(
                total=report.total,
                nll=report.nll,
                log_prior=report.log_prior,
                log_variational_posterior=report.log_variational_posterior,
                acc=report.acc,
                acc_std=report.acc_std,
            )

        scheduler.step()
        writer.add_scalar("bayesian_train_nll",     report.nll,     epoch)
        writer.add_scalar("bayesian_train_acc",     report.acc,     epoch)
        writer.add_scalar("bayesian_train_acc_std", report.acc_std, epoch)

        # ============================ TEST =======================================
        b_model.eval()
        report.reset()
        
        with torch.no_grad():
            pbar = tqdm(test_loader, desc="Bayesian Test")
            for inputs in pbar:
                inputs = dic2cuda(inputs, DEVICE)
                labels = inputs["labels"]
                B      = inputs["input_ids"].size(0)

                samples = sample_bayesian(b_model, inputs, SAMPLES, B, N_LABELS, DEVICE)
                raw_logits, logits, log_prior, log_variational_posterior = samples

                nll     = criterion(logits, labels.view(-1))
                loss    = (log_variational_posterior - log_prior) / len(test_loader) + nll
                acc     = (torch.argmax(logits, dim=1) == labels).float().sum()
                acc_std = np.std([(torch.argmax(logits, dim=1) == labels).float().sum().item() for logits in raw_logits])

                report.total                     += loss.item()                      / len(test_loader)
                report.nll                       += nll.item()                       / len(test_loader)
                report.log_prior                 += log_prior.item()                 / len(test_loader)
                report.log_variational_posterior += log_variational_posterior.item() / len(test_loader)
                report.acc                       += acc.item() * 100                 / len(test_dataset)
                report.acc_std                   += acc_std                          / len(test_loader)

                pbar.set_postfix(
                    total=report.total,
                    nll=report.nll,
                    log_prior=report.log_prior,
                    log_variational_posterior=report.log_variational_posterior,
                    acc=report.acc,
                    acc_std=report.acc_std,
                )

        writer.add_scalar("bayesian_test_nll",     report.nll,     epoch)
        writer.add_scalar("bayesian_test_acc",     report.acc,     epoch)
        writer.add_scalar("bayesian_test_acc_std", report.acc_std, epoch)

    torch.save({
        "weight_decay": WEIGHT_DECAY,
        "delta"       : DELTA,
        "acc"         : report.acc,
        "acc_std"     : report.acc_std,
        "model"       : b_model.state_dict()
    }, f"{writer_path + writer_suff}.pth")

    return report.acc
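
A hypothetical call site for this training routine (the DELTA and WEIGHT_DECAY values are illustrative; setup_model, to_bayesian, and the GLUE data layout are assumed to come from the surrounding project):

if __name__ == "__main__":
    final_acc = train(
        EXP="mrpc-baseline",
        MODEL_NAME="bert-base-uncased",
        TASK_NAME="mrpc",
        N_LABELS=2,
        DELTA=0.05,
        WEIGHT_DECAY=0.01,
        DEVICE="cuda:0" if torch.cuda.is_available() else "cpu",
    )
    print(f"Final Bayesian test accuracy: {final_acc:.2f}%")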
Example #15
    def test_reloading_prediction_head(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        data_args = GlueDataTrainingArguments(
            task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
        )
        train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")

        model = AutoModelWithHeads.from_pretrained("bert-base-uncased")

        model.add_classification_head("adapter", num_labels=3)
        model.add_classification_head("dummy", num_labels=2)

        # add the adapters to be fused
        model.add_adapter("adapter")
        model.add_adapter("additional_adapter")

        # setup fusion
        adapter_setup = Fuse("adapter", "additional_adapter")
        model.add_adapter_fusion(adapter_setup)
        model.train_adapter_fusion(adapter_setup)
        model.set_active_adapters(adapter_setup)
        self.assertEqual(adapter_setup, model.active_adapters)
        self.assertEqual("dummy", model.active_head)
        with TemporaryDirectory() as tempdir:
            training_args = TrainingArguments(
                output_dir=tempdir,
                do_train=True,
                learning_rate=0.1,
                logging_steps=1,
                max_steps=1,
                save_steps=1,
                remove_unused_columns=False,
            )
            trainer = AdapterTrainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
            )

            trainer.train()
            # create second model that should resume the training of the first
            model_resume = AutoModelWithHeads.from_pretrained("bert-base-uncased")

            model_resume.add_classification_head("adapter", num_labels=3)
            model_resume.add_classification_head("dummy", num_labels=2)
            model_resume.add_adapter("adapter")
            model_resume.add_adapter("additional_adapter")
            # setup fusion
            adapter_setup = Fuse("adapter", "additional_adapter")
            model_resume.add_adapter_fusion(adapter_setup)
            model_resume.train_adapter_fusion(adapter_setup)
            model_resume.set_active_adapters(adapter_setup)
            trainer_resume = AdapterTrainer(
                model=model_resume,
                args=TrainingArguments(do_train=True, max_steps=1, output_dir=tempdir),
                train_dataset=train_dataset,
            )
            trainer_resume.train(resume_from_checkpoint=True)

            self.assertEqual("dummy", model.active_head)
            self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters)

            for ((k1, v1), (k2, v2)) in zip(
                trainer.model.state_dict().items(), trainer_resume.model.state_dict().items()
            ):
                self.assertEqual(k1, k2)
                if "adapter" in k1 or "dummy" in k1:
                    self.assertTrue(torch.equal(v1, v2), k1)
Example #16
    def test_resume_training_with_fusion(self):
        def encode_batch(batch):
            """Encodes a batch of input data using the model tokenizer."""
            return tokenizer(batch["sentence1"],
                             batch["sentence2"],
                             max_length=80,
                             truncation=True,
                             padding="max_length")

        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        data_args = GlueDataTrainingArguments(
            task_name="mrpc",
            data_dir="./tests/fixtures/tests_samples/MRPC",
            overwrite_cache=True)
        train_dataset = GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="train")

        model = AutoModelForSequenceClassification.from_pretrained(
            "bert-base-uncased")
        model.add_adapter("adapter")
        model.add_adapter("additional_adapter")
        model.add_fusion(Fuse("adapter", "additional_adapter"))
        model.set_active_adapters(Fuse("adapter", "additional_adapter"))

        training_args = TrainingArguments(
            output_dir="./examples",
            do_train=True,
            learning_rate=0.1,
            logging_steps=1,
            max_steps=1,
            save_steps=1,
            remove_unused_columns=False,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            do_save_adapters=True,
            do_save_full_model=False,
            do_save_adapter_fusion=True,
        )

        trainer.train()
        model_resume = AutoModelForSequenceClassification.from_pretrained(
            "bert-base-uncased")
        model_resume.add_adapter("adapter")
        model_resume.add_adapter("additional_adapter")
        model_resume.add_fusion(Fuse("adapter", "additional_adapter"))
        model_resume.set_active_adapters(Fuse("adapter", "additional_adapter"))
        trainer_resume = Trainer(
            model=model_resume,
            args=TrainingArguments(do_train=True,
                                   max_steps=1,
                                   output_dir="./examples"),
            train_dataset=train_dataset,
        )
        trainer_resume.train(resume_from_checkpoint=True)

        self.assertEqual(model.config.adapters.adapters,
                         model_resume.config.adapters.adapters)

        for (k1, v1), (k2, v2) in zip(trainer.model.state_dict().items(),
                                      trainer_resume.model.state_dict().items()):
            self.assertEqual(k1, k2)
            if "adapter" in k1:
                self.assertTrue(torch.equal(v1, v2), k1)
Example #17
    def test_train_adapter_fusion(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name,
                                                  use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForSequenceClassification.from_config(self.config())

        # add the adapters to be fused
        model.add_adapter("a")
        model.add_adapter("b")
        model.add_adapter("c")

        self.assertIn("a", model.config.adapters.adapters)
        self.assertIn("b", model.config.adapters.adapters)
        self.assertIn("c", model.config.adapters.adapters)

        # setup fusion
        adapter_setup = Fuse("a", "b", "c")
        model.add_fusion(adapter_setup)
        model.train_fusion(adapter_setup)
        model.set_active_adapters(adapter_setup)
        self.assertEqual(adapter_setup, model.active_adapters)

        # all weights of the adapters should be frozen (test for one)
        for k, v in filter_parameters(model, "adapters.a.").items():
            self.assertFalse(v.requires_grad, k)
        # all weights of the fusion layer should be activated
        for k, v in filter_parameters(model, "adapter_fusion_layer").items():
            self.assertTrue(v.requires_grad, k)
        # weights of the base model should be frozen (check on some examples)
        for k, v in filter_parameters(model,
                                      "encoder.layer.0.attention").items():
            self.assertFalse(v.requires_grad, k)

        state_dict_pre = copy.deepcopy(model.state_dict())

        # setup dataset
        data_args = GlueDataTrainingArguments(
            task_name="mrpc",
            data_dir="./tests/fixtures/tests_samples/MRPC",
            overwrite_cache=True)
        train_dataset = GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="train")
        training_args = TrainingArguments(output_dir="./examples",
                                          do_train=True,
                                          learning_rate=0.1,
                                          max_steps=7,
                                          no_cuda=True)

        # train
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(),
                                        model.state_dict().items()):
            if "adapter_fusion_layer" in k1 or "classifier" in k1 or "classification_head" in k1 or "score" in k1:
                self.assertFalse(torch.equal(v1, v2), k1)
            else:
                self.assertTrue(torch.equal(v1, v2), k1)
Example #18
checkpoint_dir = config["model_checkpoints"]
glue_dataset_folder = config["glue_dataset_folder"]

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")

print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Datasets ###########################

tokenizer = utils.get_tokenizer(model_config["max_seq_len"])
model_config["vocab_size"] = len(tokenizer.get_vocab())

data_args = GlueDataTrainingArguments(
    task_name=args.task,
    data_dir=os.path.join(glue_dataset_folder, args.task),
    max_seq_length=model_config["max_seq_len"],
    overwrite_cache=True)
train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
data_loader = DataLoader(train_dataset,
                         batch_size=args.batch_size,
                         shuffle=True,
                         collate_fn=default_data_collator)
num_steps_per_epoch = len(data_loader)
print(f"num_steps_per_epoch: {num_steps_per_epoch}", flush=True)

dev_datasets = {"dev": GlueDataset(data_args, tokenizer=tokenizer, mode="dev")}
if args.task.lower() == "mnli":
    data_args = GlueDataTrainingArguments(
        task_name="mnli-mm",
        data_dir=os.path.join(glue_dataset_folder, args.task),