import torch

from transformers import BartConfig, BartForConditionalGeneration

# IGNORE_KEYS, rename_state_dict_key, and rename_layernorm_keys are module-level
# helpers defined elsewhere in the conversion script.


def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path):
    """
    Copy/paste/tweak the ParlAI model's weights into our BART structure.
    """
    model = torch.load(checkpoint_path, map_location="cpu")
    sd = model["model"]
    cfg = BartConfig.from_json_file(config_json_path)
    m = BartForConditionalGeneration(cfg)
    valid_keys = m.model.state_dict().keys()
    failures = []
    mapping = {}
    for k, v in sd.items():
        if k in IGNORE_KEYS:
            continue
        new_k = rename_state_dict_key(k)
        if new_k not in valid_keys:
            failures.append([k, new_k])
        else:
            mapping[new_k] = v
    if cfg.normalize_before:  # Blenderbot-3B checkpoints: rename layernorm_embedding -> layer_norm
        rename_layernorm_keys(sd)
    m.model.load_state_dict(mapping, strict=True)
    m.half()
    m.save_pretrained(pytorch_dump_folder_path)
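
# --------------------------------------------------------------------------
# Usage sketch (assumption): a small argparse entry point like the one below
# could drive convert_parlai_checkpoint from the command line. The flag names
# and paths are placeholders, not taken from the original script.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--src_path", type=str, help="ParlAI checkpoint (.pt) to convert")
    parser.add_argument("--save_dir", type=str, help="Directory for the converted Hugging Face model")
    parser.add_argument("--hf_config_json", type=str, help="BartConfig JSON describing the target architecture")
    args = parser.parse_args()
    convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json)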
from pathlib import Path
from typing import List

from transformers import BartConfig, BartForConditionalGeneration

# get_layers_to_copy, init_student, and copy_to_student are helpers defined
# elsewhere in the distillation module; pre_init is a method of its trainer class.


def pre_init(self, hparams):
    self.output_dir = Path(hparams.output_dir)
    self.output_dir.mkdir(exist_ok=True)
    teacher = BartForConditionalGeneration.from_pretrained(hparams.teacher).eval()
    student_updates = {
        "decoder_layers": hparams.student_decoder_layers,
        "encoder_layers": hparams.student_encoder_layers,
    }
    if hparams.length_penalty != -1:
        student_updates["length_penalty"] = hparams.length_penalty
    d_layers_to_copy = get_layers_to_copy(student_updates["decoder_layers"], teacher.config.decoder_layers)
    e_layers_to_copy: List = get_layers_to_copy(student_updates["encoder_layers"], teacher.config.encoder_layers)
    hparams.d_layer_to_copy = d_layers_to_copy
    hparams.e_layer_to_copy = e_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)

    # Copy weights from the teacher into a freshly built student
    student_cfg = BartConfig(**kw)
    student = BartForConditionalGeneration(student_cfg)
    student, _ = init_student(student, teacher)
    save_dir = self.output_dir.joinpath("student")
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student, teacher)
    student.save_pretrained(save_dir)
    hparams.model_name_or_path = str(save_dir)
    return student, student_cfg, teacher
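
# --------------------------------------------------------------------------
# Sketch (assumption): the project's real get_layers_to_copy is not shown here.
# A plausible stand-in spreads the student's layers evenly across the teacher's
# depth, always keeping the first and last teacher layer; the actual helper may
# instead use a fixed lookup table for common (student, teacher) sizes.
def get_layers_to_copy(n_student: int, n_teacher: int) -> list:
    if n_student >= n_teacher:
        return list(range(n_teacher))
    if n_student == 1:
        return [0]
    # Evenly spaced indices between layer 0 and the last teacher layer.
    return sorted({round(i * (n_teacher - 1) / (n_student - 1)) for i in range(n_student)})

# Example: a 3-layer student distilled from a 12-layer teacher copies teacher layers 0, 6, 11.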
import logging
import math

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader

from transformers import BartForConditionalGeneration

# TrainConfig, _change_device, and _validate are project-specific helpers
# defined elsewhere in this codebase.


def train(
    config: TrainConfig,
    model: BartForConditionalGeneration,
    train_dataloader: DataLoader,
    dev_dataloader: DataLoader,
    optimizer: Adam,
    logger: logging.Logger,
    device: torch.device,
):
    """
    Trains the model for the specified number of epochs.
    """
    model.to(device)
    global_step = 0
    for epoch in range(1, config.num_epochs + 1):
        model.train()
        loss_sum = 0.0
        for data in train_dataloader:
            global_step += 1
            data = _change_device(data, device)

            optimizer.zero_grad()
            output = model(
                input_ids=data[0],
                attention_mask=data[1],
                decoder_input_ids=data[2],
                labels=data[3],
                decoder_attention_mask=data[4],
                return_dict=True,
            )
            loss = output["loss"]
            loss.backward()
            loss_sum += loss.item()

            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            if global_step % config.train_log_interval == 0:
                mean_loss = loss_sum / config.train_log_interval
                logger.info(
                    f"Epoch {epoch} Step {global_step} "
                    f"Loss {mean_loss:.4f} Perplexity {math.exp(mean_loss):8.2f}"
                )
                loss_sum = 0.0
            if global_step % config.dev_log_interval == 0:
                _validate(model, dev_dataloader, logger, device)
            if global_step % config.save_interval == 0:
                model.save_pretrained(f"{config.save_model_file_prefix}_{global_step}")
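
# --------------------------------------------------------------------------
# Wiring sketch (assumption): how the pieces above might be put together.
# TrainConfig, _change_device, _validate, and the datasets are project-specific;
# the model name, learning rate, batch size, and config values are placeholders.
# The dataloaders must yield 5-tuples of tensors in the order
# (input_ids, attention_mask, decoder_input_ids, labels, decoder_attention_mask).
import logging

import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import BartForConditionalGeneration

logging.basicConfig(level=logging.INFO)
run_logger = logging.getLogger("train")

config = TrainConfig(
    num_epochs=3,
    train_log_interval=100,
    dev_log_interval=500,
    save_interval=1000,
    save_model_file_prefix="checkpoints/bart",
)
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
optimizer = Adam(model.parameters(), lr=3e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # train_dataset: placeholder
dev_dataloader = DataLoader(dev_dataset, batch_size=8)  # dev_dataset: placeholder

train(config, model, train_dataloader, dev_dataloader, optimizer, run_logger, device)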