def __init__(self, hparams): assert Path(hparams.data_dir).exists() self.output_dir = Path(hparams.output_dir) self.output_dir.mkdir(exist_ok=True) save_dir = self.output_dir.joinpath("student") hparams.model_name_or_path = str(save_dir) # Tell lightning we are training the student teacher = AutoModelForSeq2SeqLM.from_pretrained(hparams.teacher).eval() use_task_specific_params(teacher, hparams.task) # We copy good generation parameters to student by default student, e_layer_ids, d_layer_ids = create_student_by_copying_alternating_layers( teacher, e=hparams.student_encoder_layers, d=hparams.student_decoder_layers, save_path=save_dir ) if hparams.length_penalty != -1: student.config.length_penalty = hparams.length_penalty super().__init__(hparams, model=student, config=student.config) model_type = student.config.model_type self.e_layer_ids, self.d_layer_ids = e_layer_ids, d_layer_ids # type: List[int], List[int] if model_type == "t5": teacher_encoder_layers = len(teacher.get_encoder().block) teacher_decoder_layers = len(teacher.get_decoder().block) else: teacher_encoder_layers = teacher.config.encoder_layers teacher_decoder_layers = teacher.config.decoder_layers self.different_encoder = hparams.student_encoder_layers != teacher_encoder_layers self.different_decoder = hparams.student_decoder_layers != teacher_decoder_layers self.teacher = teacher freeze_params(self.teacher) if not self.different_encoder: # To save RAM, delete teacher encoder and freeze student encoder. try: del self.teacher.model.encoder except AttributeError: # T5 del self.teacher.encoder # Intermediate supervision: Decide which layers to supervise if hparams.supervise_forward: self.e_matches = get_layers_to_supervise(n_student=len(self.e_layer_ids), n_teacher=teacher_encoder_layers) self.d_matches = get_layers_to_supervise(n_student=len(self.d_layer_ids), n_teacher=teacher_decoder_layers) else: # student layer should emulate hidden states of the teacher layer it was copied from self.e_matches = self.e_layer_ids self.d_matches = self.d_layer_ids self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean") self.temperature = 2.0 self.alpha_mlm = hparams.alpha_mlm self.alpha_ce = hparams.alpha_ce self.alpha_hid = hparams.alpha_hid gc.collect() torch.cuda.empty_cache()
def __init__(self, hparams): assert Path(hparams.data_dir).exists() self.output_dir = Path(hparams.output_dir) self.output_dir.mkdir(exist_ok=True) save_dir = self.output_dir.joinpath("student") hparams.model_name_or_path = str( save_dir) # Tell lightning we are training the student teacher = AutoModelForSeq2SeqLM.from_pretrained(hparams.teacher).eval() use_task_specific_params( teacher, hparams.task ) # We copy good generation parameters to student by default if hparams.student is not None: student = AutoModelForSeq2SeqLM.from_pretrained(hparams.student) use_task_specific_params(student, hparams.task) e_layer_ids, d_layer_ids = None, None else: student, e_layer_ids, d_layer_ids = create_student_by_copying_alternating_layers( teacher, e=hparams.student_encoder_layers, d=hparams.student_decoder_layers, save_path=save_dir) if hparams.length_penalty != -1: student.config.length_penalty = hparams.length_penalty hparams.tokenizer_name = hparams.teacher # Use teacher's tokenizer super().__init__(hparams, model=student, config=student.config) assert ( student.config.model_type == teacher.config.model_type ), f"teacher, student model types should be the same, got {student.config.model_type} != {teacher.config.model_type}" if student.config.model_type == "t5": student_encoder_layers = len(student.get_encoder().block) student_decoder_layers = len(student.get_decoder().block) teacher_encoder_layers = len(teacher.get_encoder().block) teacher_decoder_layers = len(teacher.get_decoder().block) else: student_encoder_layers = student.config.encoder_layers student_decoder_layers = student.config.decoder_layers teacher_encoder_layers = teacher.config.encoder_layers teacher_decoder_layers = teacher.config.decoder_layers self.different_base_models = not (hparams.student is None or hparams.teacher == hparams.student) self.do_calc_hidden_loss = ( not self.different_base_models) and hparams.alpha_hid > 0 self.different_encoder = self.different_base_models or ( student_encoder_layers != teacher_encoder_layers) # self.different_encoder determines whether we need to run the teacher encoder self.teacher = teacher freeze_params(self.teacher) if not self.different_encoder: # To save RAM, delete teacher encoder and freeze student encoder. try: del self.teacher.model.encoder except AttributeError: # T5 del self.teacher.encoder if e_layer_ids is None: e_layer_ids = list(range(student_encoder_layers)) if d_layer_ids is None: d_layer_ids = list(range(student_decoder_layers)) self.e_layer_ids, self.d_layer_ids = e_layer_ids, d_layer_ids # type: List[int], List[int] if self.do_calc_hidden_loss: # Intermediate supervision: Decide which layers to supervise if hparams.supervise_forward: self.e_matches = get_layers_to_supervise( n_student=len(self.e_layer_ids), n_teacher=teacher_encoder_layers) self.d_matches = get_layers_to_supervise( n_student=len(self.d_layer_ids), n_teacher=teacher_decoder_layers) else: # student layer should emulate hidden states of the teacher layer it was copied from self.e_matches = self.e_layer_ids self.d_matches = self.d_layer_ids else: self.e_matches = None self.d_matches = None self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean") self.temperature = 2.0 self.alpha_mlm = hparams.alpha_mlm self.alpha_ce = hparams.alpha_ce self.alpha_hid = hparams.alpha_hid gc.collect() torch.cuda.empty_cache()