def test_training_load_best_model_at_end_adapter(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    model.add_adapter("adapter")
    model.train_adapter("adapter")

    training_args = TrainingArguments(
        output_dir="./examples",
        do_train=True,
        learning_rate=0.001,
        max_steps=1,
        save_steps=1,
        remove_unused_columns=False,
        load_best_model_at_end=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=2,
    )
    trainer = AdapterTrainer(
        model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
    )

    with self.assertLogs(logger) as cm:
        trainer.train()
        self.assertTrue(any("Loading best adapter(s) from" in line for line in cm.output))
    self.assertEqual(Stack("adapter"), trainer.model.active_adapters)
def create_datasets(
    task_name: str, tokenizer: BertTokenizer, data_dir: Optional[str] = None
) -> Tuple[CustomGlueDataset, CustomGlueDataset]:
    if task_name not in ["mnli", "mnli-2", "hans"]:
        raise ValueError(f"Unrecognized task {task_name}")

    if data_dir is None:
        if task_name in ["mnli", "mnli-2"]:
            data_dir = constants.GLUE_DATA_DIR
        if task_name in ["hans"]:
            data_dir = constants.HANS_DATA_DIR

    data_args = GlueDataTrainingArguments(
        task_name=task_name, data_dir=data_dir, max_seq_length=128
    )
    train_dataset = CustomGlueDataset(args=data_args, tokenizer=tokenizer, mode="train")
    eval_dataset = CustomGlueDataset(args=data_args, tokenizer=tokenizer, mode="dev")
    return train_dataset, eval_dataset
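# A minimal usage sketch for create_datasets. Assumptions: `constants` supplies
# the default data directories and CustomGlueDataset behaves like transformers'
# GlueDataset; the tokenizer checkpoint below is only an example.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset, eval_dataset = create_datasets("mnli", tokenizer)
print(len(train_dataset), len(eval_dataset))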
def test_train_single_adapter(self):
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())

    # add two adapters: one will be trained and the other should be frozen
    model.add_adapter("mrpc")
    model.add_adapter("dummy")
    model.add_classification_head("mrpc")

    self.assertIn("mrpc", model.config.adapters.adapters)
    self.assertIn("dummy", model.config.adapters.adapters)

    # train the mrpc adapter -> should be activated & unfrozen
    model.train_adapter("mrpc")
    self.assertEqual(set(["mrpc"]), model.active_adapters.flatten())

    # all weights of the trained adapter should be activated
    for k, v in filter_parameters(model, "adapters.mrpc.").items():
        self.assertTrue(v.requires_grad, k)
    # all weights of the adapter not used for training should be frozen
    for k, v in filter_parameters(model, "adapters.dummy.").items():
        self.assertFalse(v.requires_grad, k)
    # weights of the base model should be frozen (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    state_dict_pre = copy.deepcopy(model.state_dict())

    # setup dataset
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
    training_args = TrainingArguments(
        output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=7, no_cuda=True
    )

    # train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()

    # only the weights of the trained adapter should have changed
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "mrpc" in k1:
            self.assertFalse(torch.equal(v1, v2))
        else:
            self.assertTrue(torch.equal(v1, v2))
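# `filter_parameters` is used throughout these tests but not defined in this
# excerpt; a minimal sketch of the helper as it is presumably implemented
# (assumption): return every named parameter whose name contains the substring.
def filter_parameters(model, filter_string):
    return {k: v for (k, v) in model.named_parameters() if filter_string in k}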
def test_resume_training(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")

    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    model.add_adapter("adapter")
    model.add_adapter("additional_adapter")
    model.set_active_adapters("adapter")

    training_args = TrainingArguments(
        output_dir="./examples",
        do_train=True,
        learning_rate=0.1,
        logging_steps=1,
        max_steps=1,
        save_steps=1,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        do_save_adapters=True,
        do_save_full_model=False,
    )
    trainer.train()

    # create second model that should resume the training of the first
    model_resume = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    model_resume.add_adapter("adapter")
    model_resume.add_adapter("additional_adapter")
    model_resume.set_active_adapters("adapter")
    trainer_resume = Trainer(
        model=model_resume,
        args=TrainingArguments(do_train=True, max_steps=1, output_dir="./examples"),
        train_dataset=train_dataset,
    )
    trainer_resume.train(resume_from_checkpoint=True)

    self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters)
    for ((k1, v1), (k2, v2)) in zip(
        trainer.model.state_dict().items(), trainer_resume.model.state_dict().items()
    ):
        self.assertEqual(k1, k2)
        if "adapter" in k1:
            self.assertTrue(torch.equal(v1, v2), k1)
def test_default_classification(self):
    MODEL_ID = "bert-base-cased-finetuned-mrpc"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True)
    data_collator = DefaultDataCollator()
    batch = data_collator.collate_batch(dataset.features)
    self.assertEqual(batch["labels"].dtype, torch.long)
def test_default_regression(self):
    MODEL_ID = "distilroberta-base"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    data_args = GlueDataTrainingArguments(
        task_name="sts-b", data_dir="./tests/fixtures/tests_samples/STS-B", overwrite_cache=True
    )
    dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True)
    data_collator = DefaultDataCollator()
    batch = data_collator.collate_batch(dataset.features)
    self.assertEqual(batch["labels"].dtype, torch.float)
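# Note: `DefaultDataCollator().collate_batch(...)` in the two tests above is the
# legacy collator API from early transformers releases. In current releases the
# equivalent is the `default_data_collator` function; a sketch, assuming a
# recent transformers version:
from transformers import default_data_collator

batch = default_data_collator(list(dataset))  # InputFeatures -> dict of stacked tensors
assert batch["labels"].dtype in (torch.long, torch.float)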
def test_load_task_adapter_from_hub(self):
    """This test checks if an adapter is loaded from the Hub correctly by evaluating it
    on some MRPC samples and comparing with the expected result.
    """
    for config in ["pfeiffer", "houlsby"]:
        with self.subTest(config=config):
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

            loading_info = {}
            adapter_name = model.load_adapter(
                "sts/mrpc@ukp", config=config, version="1", loading_info=loading_info
            )
            model.train_adapter(adapter_name)

            self.assertEqual(0, len(loading_info["missing_keys"]))
            self.assertEqual(0, len(loading_info["unexpected_keys"]))

            self.assertIn(adapter_name, model.config.adapters.adapters)
            self.assertNotIn(adapter_name, model.base_model.invertible_adapters)

            # check if config is valid
            expected_hash = get_adapter_config_hash(AdapterConfig.load(config))
            real_hash = get_adapter_config_hash(model.config.adapters.get(adapter_name))
            self.assertEqual(expected_hash, real_hash)

            # setup dataset
            data_args = GlueDataTrainingArguments(
                task_name="mrpc",
                data_dir="./tests/fixtures/tests_samples/MRPC",
                overwrite_cache=True,
            )
            eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")
            training_args = TrainingArguments(output_dir="./examples", no_cuda=True)

            # evaluate
            trainer = Trainer(
                model=model,
                args=training_args,
                eval_dataset=eval_dataset,
                compute_metrics=self._compute_glue_metrics("mrpc"),
                adapter_names=["mrpc"],
            )
            result = trainer.evaluate()
            self.assertGreater(result["eval_acc"], 0.9)
def test_train_adapter_fusion(self):
    for model_name in self.model_names:
        with self.subTest(model_name=model_name):
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(model_name)

            # load the adapters to be fused
            model.load_adapter("sts/mrpc@ukp", with_head=False)
            model.load_adapter("sts/qqp@ukp", with_head=False)
            model.load_adapter("sts/sts-b@ukp", with_head=False)

            self.assertIn("mrpc", model.config.adapters.adapters)
            self.assertIn("qqp", model.config.adapters.adapters)
            self.assertIn("sts-b", model.config.adapters.adapters)

            # setup fusion
            adapter_setup = [["mrpc", "qqp", "sts-b"]]
            model.add_fusion(adapter_setup[0])
            model.train_fusion(adapter_setup[0])
            model.set_active_adapters(adapter_setup)
            self.assertEqual(adapter_setup, model.active_adapters)

            # all weights of the adapters should be frozen (test for one)
            for k, v in filter_parameters(model, "text_task_adapters.mrpc").items():
                self.assertFalse(v.requires_grad, k)
            # all weights of the fusion layer should be activated
            for k, v in filter_parameters(model, "adapter_fusion_layer").items():
                self.assertTrue(v.requires_grad, k)
            # weights of the base model should be frozen (check on some examples)
            for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
                self.assertFalse(v.requires_grad, k)

            state_dict_pre = copy.deepcopy(model.state_dict())

            # setup dataset
            data_args = GlueDataTrainingArguments(
                task_name="mrpc",
                data_dir="./tests/fixtures/tests_samples/MRPC",
                overwrite_cache=True,
            )
            train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
            training_args = TrainingArguments(
                output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=5, no_cuda=True
            )

            # train
            trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
            trainer.train()

            # only the fusion layer and the classification head should have changed
            for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
                if "adapter_fusion_layer" in k1 or "classifier" in k1:
                    self.assertFalse(torch.equal(v1, v2), k1)
                else:
                    self.assertTrue(torch.equal(v1, v2), k1)
def test_trainer_eval_mrpc(self):
    MODEL_ID = "bert-base-cased-finetuned-mrpc"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

    training_args = TrainingArguments(output_dir="./examples", no_cuda=True)
    trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset)
    result = trainer.evaluate()
    self.assertLess(result["eval_loss"], 0.2)
def test_general(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")

    model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
    model.add_classification_head("task", num_labels=3)

    # add the adapters: "task" will be trained, "additional_adapter" stays untouched
    model.add_adapter("task")
    model.add_adapter("additional_adapter")

    model.train_adapter("task")
    self.assertEqual("task", model.active_head)
    self.assertEqual(Stack("task"), model.active_adapters)

    with TemporaryDirectory() as tempdir:
        training_args = TrainingArguments(
            output_dir=tempdir,
            do_train=True,
            learning_rate=0.1,
            logging_steps=1,
            max_steps=1,
            save_steps=1,
            remove_unused_columns=False,
        )
        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        # check that the adapters are actually saved but the full model is not
        files_dir_checkpoint = os.listdir(os.path.join(tempdir, "checkpoint-1"))
        self.assertTrue("task" in files_dir_checkpoint)
        self.assertTrue("additional_adapter" in files_dir_checkpoint)
        # check that full model weights are not stored
        self.assertFalse("pytorch_model.bin" in files_dir_checkpoint)

        # this should always be false in the adapter trainer
        self.assertFalse(trainer.args.remove_unused_columns)
        self.assertEqual("task", model.active_head)
        self.assertEqual(Stack("task"), model.active_adapters)
def run_glue(self, model_name, task_name, fp16):
    model_args = ModelArguments(model_name_or_path=model_name, cache_dir=self.cache_dir)
    data_args = GlueDataTrainingArguments(
        task_name=task_name,
        data_dir=self.data_dir + "/" + task_name,
        max_seq_length=self.max_seq_length,
    )
    training_args = TrainingArguments(
        output_dir=self.output_dir + "/" + task_name,
        do_train=True,
        do_eval=True,
        per_gpu_train_batch_size=self.train_batch_size,
        learning_rate=self.learning_rate,
        num_train_epochs=self.num_train_epochs,
        local_rank=self.local_rank,
        overwrite_output_dir=self.overwrite_output_dir,
        gradient_accumulation_steps=self.gradient_accumulation_steps,
        fp16=fp16,
        logging_steps=self.logging_steps,
    )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)
    onnxruntime.set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    train_dataset = GlueDataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="dev") if training_args.do_eval else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(p.predictions)
        return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

    model_desc = self.model_to_desc(model_name, model)

    # Initialize the ORTTrainer within ORTTransformerTrainer
    trainer = ORTTransformerTrainer(
        model=model,
        model_desc=model_desc,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        logger.info("***** Eval results {} *****".format(data_args.task_name))
        for key, value in result.items():
            logger.info("  %s = %s", key, value)
        results.update(result)

    return results
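# `ModelArguments` mirrors the dataclass used by the transformers example
# scripts; a minimal sketch with just the fields referenced above (an
# assumption, not the exact definition from that repository):
from dataclasses import dataclass
from typing import Optional

@dataclass
class ModelArguments:
    model_name_or_path: str
    config_name: Optional[str] = None
    tokenizer_name: Optional[str] = None
    cache_dir: Optional[str] = None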
def tune_transformer(num_samples=8, gpus_per_trial=0, smoke_test=False):
    data_dir_name = "./data" if not smoke_test else "./test_data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    if not os.path.exists(data_dir):
        os.mkdir(data_dir, 0o755)

    # Change these as needed.
    model_name = "bert-base-uncased" if not smoke_test else "sshleifer/tiny-distilroberta-base"
    task_name = "rte"

    task_data_dir = os.path.join(data_dir, task_name.upper())
    num_labels = glue_tasks_num_labels[task_name]

    config = AutoConfig.from_pretrained(
        model_name, num_labels=num_labels, finetuning_task=task_name
    )

    # Download and cache tokenizer, model, and features
    print("Downloading and caching Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_name)  # Triggers tokenizer download to cache

    print("Downloading and caching pre-trained model")
    AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    def get_model():
        return AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    # Download data.
    download_data(task_name, data_dir)

    data_args = GlueDataTrainingArguments(task_name=task_name, data_dir=task_data_dir)
    train_dataset = GlueDataset(
        data_args, tokenizer=tokenizer, mode="train", cache_dir=task_data_dir
    )
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=task_data_dir)

    training_args = TrainingArguments(
        output_dir=".",
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        no_cuda=gpus_per_trial <= 0,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2,  # config
        max_steps=-1,
        per_device_train_batch_size=16,  # config
        per_device_eval_batch_size=16,  # config
        warmup_steps=0,
        weight_decay=0.1,  # config
        logging_dir="./logs",
        skip_memory_metrics=True,
        report_to="none",
    )

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(task_name),
    )

    tune_config = {
        "per_device_train_batch_size": 32,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
    }

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="eval_acc",
        mode="max",
        perturbation_interval=1,
        hyperparam_mutations={
            "weight_decay": tune.uniform(0.0, 0.3),
            "learning_rate": tune.uniform(1e-5, 5e-5),
            "per_device_train_batch_size": [16, 32, 64],
        },
    )

    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_device_train_batch_size": "train_bs/gpu",
            "num_train_epochs": "num_epochs",
        },
        metric_columns=["eval_acc", "eval_loss", "epoch", "training_iteration"],
    )

    trainer.hyperparameter_search(
        hp_space=lambda _: tune_config,
        backend="ray",
        n_trials=num_samples,
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        scheduler=scheduler,
        keep_checkpoints_num=1,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt",
        log_to_file=True,
    )
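# `build_compute_metrics_fn` comes from the surrounding example code (in the Ray
# PBT example it lives in a utils module); a plausible minimal sketch matching
# its use above (an assumption, covering classification tasks like RTE only):
import numpy as np
from transformers import EvalPrediction, glue_compute_metrics

def build_compute_metrics_fn(task_name):
    def compute_metrics(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return glue_compute_metrics(task_name, preds, p.label_ids)
    return compute_metrics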
def dataset(self, tokenizer):
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    return GlueDataset(data_args, tokenizer=tokenizer, mode="train")
def train(
    EXP: str,
    MODEL_NAME: str,
    TASK_NAME: str,
    N_LABELS: int,
    DELTA: float,
    WEIGHT_DECAY: float,
    DEVICE: str,
) -> float:
    EPOCHS = 5
    BATCH_SIZE = 8
    SAMPLES = 10
    FREEZE = True
    LOGS = "logs"
    MAX_SEQ_LENGTH = 128
    LOADER_OPTIONS = {"num_workers": 6, "pin_memory": True}
    LR = 2e-5
    ADAM_EPSILON = 1e-8
    N_WARMUP_STEPS = 0
    MAX_GRAD_NORM = 1
    DATA_DIR = os.path.join("./dataset/glue/data", TASK_NAME)

    os.makedirs(LOGS, exist_ok=True)
    writer_path = os.path.join(LOGS, f"bayeformers_bert_glue.{EXP}")
    writer_suff = f".DELTA_{DELTA}.WEIGHT_DECAY_{WEIGHT_DECAY}"
    writer = SummaryWriter(writer_path + writer_suff)

    o_model, tokenizer = setup_model(MODEL_NAME, TASK_NAME, N_LABELS)
    o_model = o_model.to(DEVICE)

    glue = GlueDataTrainingArguments(TASK_NAME, data_dir=DATA_DIR, max_seq_length=MAX_SEQ_LENGTH)
    train_dataset = GlueDataset(glue, tokenizer=tokenizer)
    test_dataset = GlueDataset(glue, tokenizer=tokenizer, mode="dev")

    train_loader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate, **LOADER_OPTIONS
    )
    test_loader = DataLoader(
        test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate, **LOADER_OPTIONS
    )

    decay = [
        param for name, param in o_model.named_parameters()
        if name in ["bias", "LayerNorm.weight"]
    ]
    no_decay = [
        param for name, param in o_model.named_parameters()
        if name not in ["bias", "LayerNorm.weight"]
    ]
    params_decay = {"params": decay, "weight_decay": WEIGHT_DECAY}
    params_no_decay = {"params": no_decay, "weight_decay": 0.0}
    parameters = [params_decay, params_no_decay]

    criterion = nn.CrossEntropyLoss().to(DEVICE)
    optim = AdamW(parameters, lr=LR, eps=ADAM_EPSILON)
    scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS)
    report = Report()

    for epoch in tqdm(range(EPOCHS), desc="Epoch"):

        # ============================ TRAIN ======================================
        o_model.train()
        report.reset()

        pbar = tqdm(train_loader, desc="Train")
        for inputs in pbar:
            inputs = dic2cuda(inputs, DEVICE)
            labels = inputs["labels"]

            optim.zero_grad()
            logits = o_model(**inputs)[1]
            loss = criterion(logits.view(-1, N_LABELS), labels.view(-1))
            acc = (torch.argmax(logits, dim=1) == labels).float().sum()

            loss.backward()
            nn.utils.clip_grad_norm_(o_model.parameters(), MAX_GRAD_NORM)
            optim.step()

            report.total += loss.item() / len(train_loader)
            report.acc += acc.item() * 100 / len(train_dataset)
            pbar.set_postfix(total=report.total, acc=report.acc)

        scheduler.step()
        writer.add_scalar("train_nll", report.total, epoch)
        writer.add_scalar("train_acc", report.acc, epoch)

        # ============================ TEST =======================================
        o_model.eval()
        report.reset()

        with torch.no_grad():
            pbar = tqdm(test_loader, desc="Test")
            for inputs in pbar:
                inputs = dic2cuda(inputs, DEVICE)
                labels = inputs["labels"]

                logits = o_model(**inputs)[1]
                loss = criterion(logits.view(-1, N_LABELS), labels.view(-1))
                acc = (torch.argmax(logits, dim=1) == labels).float().sum()

                report.total += loss.item() / len(test_loader)
                report.acc += acc.item() * 100 / len(test_dataset)
                pbar.set_postfix(total=report.total, acc=report.acc)

        writer.add_scalar("test_nll", report.total, epoch)
        writer.add_scalar("test_acc", report.acc, epoch)

    # ============================ EVALUATION =====================================
    b_model = to_bayesian(o_model, delta=DELTA, freeze=FREEZE)
    b_model = b_model.to(DEVICE)

    b_model.eval()
    report.reset()

    with torch.no_grad():
        pbar = tqdm(test_loader, desc="Bayesian Eval")
        for inputs in pbar:
            inputs = dic2cuda(inputs, DEVICE)
            labels = inputs["labels"]
            B = inputs["input_ids"].size(0)
            samples = sample_bayesian(b_model, inputs, SAMPLES, B, N_LABELS, DEVICE)
            raw_logits, logits, log_prior, log_variational_posterior = samples

            nll = criterion(logits, labels.view(-1))
            loss = (log_variational_posterior - log_prior) / len(test_loader) + nll
            acc = (torch.argmax(logits, dim=1) == labels).float().sum()
            acc_std = np.std([
                (torch.argmax(logits, dim=1) == labels).float().sum().item()
                for logits in raw_logits
            ])

            report.total += loss.item() / len(test_loader)
            report.nll += nll.item() / len(test_loader)
            report.log_prior += log_prior.item() / len(test_loader)
            report.log_variational_posterior += log_variational_posterior.item() / len(test_loader)
            report.acc += acc.item() * 100 / len(test_dataset)
            report.acc_std += acc_std / len(test_loader)

            pbar.set_postfix(
                total=report.total,
                nll=report.nll,
                log_prior=report.log_prior,
                log_variational_posterior=report.log_variational_posterior,
                acc=report.acc,
                acc_std=report.acc_std,
            )

    writer.add_scalar("bayesian_eval_nll", report.nll, epoch)
    writer.add_scalar("bayesian_eval_acc", report.acc, epoch)
    writer.add_scalar("bayesian_eval_acc_std", report.acc_std, epoch)

    decay = [
        param for name, param in b_model.named_parameters()
        if name in ["bias", "LayerNorm.weight"]
    ]
    no_decay = [
        param for name, param in b_model.named_parameters()
        if name not in ["bias", "LayerNorm.weight"]
    ]
    params_decay = {"params": decay, "weight_decay": WEIGHT_DECAY}
    params_no_decay = {"params": no_decay, "weight_decay": 0.0}
    parameters = [params_decay, params_no_decay]

    criterion = nn.CrossEntropyLoss().to(DEVICE)
    optim = AdamW(parameters, lr=LR, eps=ADAM_EPSILON)
    scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS)

    for epoch in tqdm(range(EPOCHS), desc="Bayesian Epoch"):

        # ============================ TRAIN ======================================
        b_model.train()
        report.reset()

        pbar = tqdm(train_loader, desc="Bayesian Train")
        for inputs in pbar:
            inputs = dic2cuda(inputs, DEVICE)
            labels = inputs["labels"]
            B = inputs["input_ids"].size(0)

            optim.zero_grad()
            samples = sample_bayesian(b_model, inputs, SAMPLES, B, N_LABELS, DEVICE)
            raw_logits, logits, log_prior, log_variational_posterior = samples

            nll = criterion(logits, labels.view(-1))
            loss = (log_variational_posterior - log_prior) / len(train_loader) + nll
            acc = (torch.argmax(logits, dim=1) == labels).float().sum()
            acc_std = np.std([
                (torch.argmax(logits, dim=1) == labels).float().sum().item()
                for logits in raw_logits
            ])

            loss.backward()
            nn.utils.clip_grad_norm_(b_model.parameters(), MAX_GRAD_NORM)
            optim.step()

            report.total += loss.item() / len(train_loader)
            report.nll += nll.item() / len(train_loader)
            report.log_prior += log_prior.item() / len(train_loader)
            report.log_variational_posterior += log_variational_posterior.item() / len(train_loader)
            report.acc += acc.item() * 100 / len(train_dataset)
            report.acc_std += acc_std / len(train_loader)

            pbar.set_postfix(
                total=report.total,
                nll=report.nll,
                log_prior=report.log_prior,
                log_variational_posterior=report.log_variational_posterior,
                acc=report.acc,
                acc_std=acc_std,
            )

        scheduler.step()
        writer.add_scalar("bayesian_train_nll", report.nll, epoch)
        writer.add_scalar("bayesian_train_acc", report.acc, epoch)
        writer.add_scalar("bayesian_train_acc_std", report.acc_std, epoch)

        # ============================ TEST =======================================
        b_model.eval()
        report.reset()

        with torch.no_grad():
            pbar = tqdm(test_loader, desc="Bayesian Test")
            for inputs in pbar:
                inputs = dic2cuda(inputs, DEVICE)
                labels = inputs["labels"]
                B = inputs["input_ids"].size(0)

                samples = sample_bayesian(b_model, inputs, SAMPLES, B, N_LABELS, DEVICE)
                raw_logits, logits, log_prior, log_variational_posterior = samples

                nll = criterion(logits, labels.view(-1))
                loss = (log_variational_posterior - log_prior) / len(test_loader) + nll
                acc = (torch.argmax(logits, dim=1) == labels).float().sum()
                acc_std = np.std([
                    (torch.argmax(logits, dim=1) == labels).float().sum().item()
                    for logits in raw_logits
                ])

                report.total += loss.item() / len(test_loader)
                report.nll += nll.item() / len(test_loader)
                report.log_prior += log_prior.item() / len(test_loader)
                report.log_variational_posterior += log_variational_posterior.item() / len(test_loader)
                report.acc += acc.item() * 100 / len(test_dataset)
                report.acc_std += acc_std / len(test_loader)

                pbar.set_postfix(
                    total=report.total,
                    nll=report.nll,
                    log_prior=report.log_prior,
                    log_variational_posterior=report.log_variational_posterior,
                    acc=report.acc,
                    acc_std=report.acc_std,
                )

        writer.add_scalar("bayesian_test_nll", report.nll, epoch)
        writer.add_scalar("bayesian_test_acc", report.acc, epoch)
        writer.add_scalar("bayesian_test_acc_std", report.acc_std, epoch)

    torch.save(
        {
            "weight_decay": WEIGHT_DECAY,
            "delta": DELTA,
            "acc": report.acc,
            "acc_std": report.acc_std,
            "model": b_model.state_dict(),
        },
        f"{writer_path + writer_suff}.pth",
    )

    return report.acc
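# `sample_bayesian` is called above but not defined in this excerpt. A minimal
# sketch of what it plausibly does, assuming bayeformers models expose
# log_prior()/log_variational_posterior(); only the signature is taken from the
# code above, the body is an assumption:
def sample_bayesian(model, inputs, samples, batch_size, n_labels, device):
    raw_logits = torch.zeros(samples, batch_size, n_labels).to(device)
    log_prior = torch.zeros(samples).to(device)
    log_variational_posterior = torch.zeros(samples).to(device)

    for s in range(samples):
        # each forward pass draws a fresh set of weights from the variational posterior
        raw_logits[s] = model(**inputs)[1]
        log_prior[s] = model.log_prior()
        log_variational_posterior[s] = model.log_variational_posterior()

    # per-sample logits, Monte Carlo averaged logits, and averaged log terms
    return raw_logits, raw_logits.mean(0), log_prior.mean(), log_variational_posterior.mean()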
def test_reloading_prediction_head(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")

    model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
    model.add_classification_head("adapter", num_labels=3)
    model.add_classification_head("dummy", num_labels=2)

    # add the adapters to be fused
    model.add_adapter("adapter")
    model.add_adapter("additional_adapter")

    # setup fusion
    adapter_setup = Fuse("adapter", "additional_adapter")
    model.add_adapter_fusion(adapter_setup)
    model.train_adapter_fusion(adapter_setup)
    model.set_active_adapters(adapter_setup)
    self.assertEqual(adapter_setup, model.active_adapters)
    self.assertEqual("dummy", model.active_head)

    with TemporaryDirectory() as tempdir:
        training_args = TrainingArguments(
            output_dir=tempdir,
            do_train=True,
            learning_rate=0.1,
            logging_steps=1,
            max_steps=1,
            save_steps=1,
            remove_unused_columns=False,
        )
        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        # create second model that should resume the training of the first
        model_resume = AutoModelWithHeads.from_pretrained("bert-base-uncased")
        model_resume.add_classification_head("adapter", num_labels=3)
        model_resume.add_classification_head("dummy", num_labels=2)
        model_resume.add_adapter("adapter")
        model_resume.add_adapter("additional_adapter")

        # setup fusion
        adapter_setup = Fuse("adapter", "additional_adapter")
        model_resume.add_adapter_fusion(adapter_setup)
        model_resume.train_adapter_fusion(adapter_setup)
        model_resume.set_active_adapters(adapter_setup)
        trainer_resume = AdapterTrainer(
            model=model_resume,
            args=TrainingArguments(do_train=True, max_steps=1, output_dir=tempdir),
            train_dataset=train_dataset,
        )
        trainer_resume.train(resume_from_checkpoint=True)

        self.assertEqual("dummy", model.active_head)
        self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters)
        for ((k1, v1), (k2, v2)) in zip(
            trainer.model.state_dict().items(), trainer_resume.model.state_dict().items()
        ):
            self.assertEqual(k1, k2)
            if "adapter" in k1 or "dummy" in k1:
                self.assertTrue(torch.equal(v1, v2), k1)
def test_resume_training_with_fusion(self):
    def encode_batch(batch):
        """Encodes a batch of input data using the model tokenizer."""
        return tokenizer(
            batch["sentence1"],
            batch["sentence2"],
            max_length=80,
            truncation=True,
            padding="max_length",
        )

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")

    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    model.add_adapter("adapter")
    model.add_adapter("additional_adapter")
    model.add_fusion(Fuse("adapter", "additional_adapter"))
    model.set_active_adapters(Fuse("adapter", "additional_adapter"))

    training_args = TrainingArguments(
        output_dir="./examples",
        do_train=True,
        learning_rate=0.1,
        logging_steps=1,
        max_steps=1,
        save_steps=1,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        do_save_adapters=True,
        do_save_full_model=False,
        do_save_adapter_fusion=True,
    )
    trainer.train()

    model_resume = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    model_resume.add_adapter("adapter")
    model_resume.add_adapter("additional_adapter")
    model_resume.add_fusion(Fuse("adapter", "additional_adapter"))
    model_resume.set_active_adapters(Fuse("adapter", "additional_adapter"))
    trainer_resume = Trainer(
        model=model_resume,
        args=TrainingArguments(do_train=True, max_steps=1, output_dir="./examples"),
        train_dataset=train_dataset,
    )
    trainer_resume.train(resume_from_checkpoint=True)

    self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters)
    for ((k1, v1), (k2, v2)) in zip(
        trainer.model.state_dict().items(), trainer_resume.model.state_dict().items()
    ):
        self.assertEqual(k1, k2)
        if "adapter" in k1:
            self.assertTrue(torch.equal(v1, v2), k1)
def test_train_adapter_fusion(self):
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_config(self.config())

    # add the adapters to be fused
    model.add_adapter("a")
    model.add_adapter("b")
    model.add_adapter("c")

    self.assertIn("a", model.config.adapters.adapters)
    self.assertIn("b", model.config.adapters.adapters)
    self.assertIn("c", model.config.adapters.adapters)

    # setup fusion
    adapter_setup = Fuse("a", "b", "c")
    model.add_fusion(adapter_setup)
    model.train_fusion(adapter_setup)
    model.set_active_adapters(adapter_setup)
    self.assertEqual(adapter_setup, model.active_adapters)

    # all weights of the adapters should be frozen (test for one)
    for k, v in filter_parameters(model, "adapters.a.").items():
        self.assertFalse(v.requires_grad, k)
    # all weights of the fusion layer should be activated
    for k, v in filter_parameters(model, "adapter_fusion_layer").items():
        self.assertTrue(v.requires_grad, k)
    # weights of the base model should be frozen (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    state_dict_pre = copy.deepcopy(model.state_dict())

    # setup dataset
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
    training_args = TrainingArguments(
        output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=7, no_cuda=True
    )

    # train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()

    # only the fusion layer and the prediction head should have changed
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "adapter_fusion_layer" in k1 or "classifier" in k1 or "classification_head" in k1 or "score" in k1:
            self.assertFalse(torch.equal(v1, v2), k1)
        else:
            self.assertTrue(torch.equal(v1, v2), k1)
checkpoint_dir = config["model_checkpoints"]
glue_dataset_folder = config["glue_dataset_folder"]

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")

print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Datasets ###########################

tokenizer = utils.get_tokenizer(model_config["max_seq_len"])
model_config["vocab_size"] = len(tokenizer.get_vocab())

data_args = GlueDataTrainingArguments(
    task_name=args.task,
    data_dir=os.path.join(glue_dataset_folder, args.task),
    max_seq_length=model_config["max_seq_len"],
    overwrite_cache=True,
)
train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
data_loader = DataLoader(
    train_dataset, batch_size=args.batch_size, shuffle=True, collate_fn=default_data_collator
)
num_steps_per_epoch = len(data_loader)
print(f"num_steps_per_epoch: {num_steps_per_epoch}", flush=True)

dev_datasets = {"dev": GlueDataset(data_args, tokenizer=tokenizer, mode="dev")}
if args.task.lower() == "mnli":
    data_args = GlueDataTrainingArguments(
        task_name="mnli-mm",
        data_dir=os.path.join(glue_dataset_folder, args.task),