Esempio n. 1
0
    def __init__(self, hparams):
        assert Path(hparams.data_dir).exists()
        self.output_dir = Path(hparams.output_dir)
        self.output_dir.mkdir(exist_ok=True)

        save_dir = self.output_dir.joinpath("student")

        hparams.model_name_or_path = str(save_dir)  # Tell lightning we are training the student
        teacher = AutoModelForSeq2SeqLM.from_pretrained(hparams.teacher).eval()
        use_task_specific_params(teacher, hparams.task)  # We copy good generation parameters to student by default
        student, e_layer_ids, d_layer_ids = create_student_by_copying_alternating_layers(
            teacher, e=hparams.student_encoder_layers, d=hparams.student_decoder_layers, save_path=save_dir
        )
        if hparams.length_penalty != -1:
            student.config.length_penalty = hparams.length_penalty
        super().__init__(hparams, model=student, config=student.config)
        model_type = student.config.model_type
        self.e_layer_ids, self.d_layer_ids = e_layer_ids, d_layer_ids  # type: List[int], List[int]

        if model_type == "t5":
            teacher_encoder_layers = len(teacher.get_encoder().block)
            teacher_decoder_layers = len(teacher.get_decoder().block)
        else:
            teacher_encoder_layers = teacher.config.encoder_layers
            teacher_decoder_layers = teacher.config.decoder_layers

        self.different_encoder = hparams.student_encoder_layers != teacher_encoder_layers
        self.different_decoder = hparams.student_decoder_layers != teacher_decoder_layers

        self.teacher = teacher
        freeze_params(self.teacher)

        if not self.different_encoder:  # To save RAM, delete teacher encoder and freeze student encoder.
            try:
                del self.teacher.model.encoder
            except AttributeError:  # T5
                del self.teacher.encoder
        # Intermediate supervision: Decide which layers to supervise
        if hparams.supervise_forward:
            self.e_matches = get_layers_to_supervise(n_student=len(self.e_layer_ids), n_teacher=teacher_encoder_layers)
            self.d_matches = get_layers_to_supervise(n_student=len(self.d_layer_ids), n_teacher=teacher_decoder_layers)
        else:  # student layer should emulate hidden states of the teacher layer it was copied from
            self.e_matches = self.e_layer_ids
            self.d_matches = self.d_layer_ids

        self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
        self.temperature = 2.0
        self.alpha_mlm = hparams.alpha_mlm
        self.alpha_ce = hparams.alpha_ce
        self.alpha_hid = hparams.alpha_hid
        gc.collect()
        torch.cuda.empty_cache()
Esempio n. 2
0
    def __init__(self, hparams):
        assert Path(hparams.data_dir).exists()
        self.output_dir = Path(hparams.output_dir)
        self.output_dir.mkdir(exist_ok=True)

        save_dir = self.output_dir.joinpath("student")

        hparams.model_name_or_path = str(
            save_dir)  # Tell lightning we are training the student
        teacher = AutoModelForSeq2SeqLM.from_pretrained(hparams.teacher).eval()
        use_task_specific_params(
            teacher, hparams.task
        )  # We copy good generation parameters to student by default
        if hparams.student is not None:
            student = AutoModelForSeq2SeqLM.from_pretrained(hparams.student)
            use_task_specific_params(student, hparams.task)
            e_layer_ids, d_layer_ids = None, None
        else:
            student, e_layer_ids, d_layer_ids = create_student_by_copying_alternating_layers(
                teacher,
                e=hparams.student_encoder_layers,
                d=hparams.student_decoder_layers,
                save_path=save_dir)

        if hparams.length_penalty != -1:
            student.config.length_penalty = hparams.length_penalty
        hparams.tokenizer_name = hparams.teacher  # Use teacher's tokenizer
        super().__init__(hparams, model=student, config=student.config)
        assert (
            student.config.model_type == teacher.config.model_type
        ), f"teacher, student model types should be the same, got {student.config.model_type} != {teacher.config.model_type}"

        if student.config.model_type == "t5":
            student_encoder_layers = len(student.get_encoder().block)
            student_decoder_layers = len(student.get_decoder().block)
            teacher_encoder_layers = len(teacher.get_encoder().block)
            teacher_decoder_layers = len(teacher.get_decoder().block)
        else:
            student_encoder_layers = student.config.encoder_layers
            student_decoder_layers = student.config.decoder_layers
            teacher_encoder_layers = teacher.config.encoder_layers
            teacher_decoder_layers = teacher.config.decoder_layers

        self.different_base_models = not (hparams.student is None or
                                          hparams.teacher == hparams.student)
        self.do_calc_hidden_loss = (
            not self.different_base_models) and hparams.alpha_hid > 0
        self.different_encoder = self.different_base_models or (
            student_encoder_layers != teacher_encoder_layers)
        # self.different_encoder determines whether we need to run the teacher encoder
        self.teacher = teacher
        freeze_params(self.teacher)

        if not self.different_encoder:  # To save RAM, delete teacher encoder and freeze student encoder.
            try:
                del self.teacher.model.encoder
            except AttributeError:  # T5
                del self.teacher.encoder

        if e_layer_ids is None:
            e_layer_ids = list(range(student_encoder_layers))
        if d_layer_ids is None:
            d_layer_ids = list(range(student_decoder_layers))

        self.e_layer_ids, self.d_layer_ids = e_layer_ids, d_layer_ids  # type: List[int], List[int]

        if self.do_calc_hidden_loss:  # Intermediate supervision: Decide which layers to supervise
            if hparams.supervise_forward:
                self.e_matches = get_layers_to_supervise(
                    n_student=len(self.e_layer_ids),
                    n_teacher=teacher_encoder_layers)
                self.d_matches = get_layers_to_supervise(
                    n_student=len(self.d_layer_ids),
                    n_teacher=teacher_decoder_layers)
            else:  # student layer should emulate hidden states of the teacher layer it was copied from
                self.e_matches = self.e_layer_ids
                self.d_matches = self.d_layer_ids
        else:
            self.e_matches = None
            self.d_matches = None

        self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
        self.temperature = 2.0
        self.alpha_mlm = hparams.alpha_mlm
        self.alpha_ce = hparams.alpha_ce
        self.alpha_hid = hparams.alpha_hid
        gc.collect()
        torch.cuda.empty_cache()