from pathlib import Path
from typing import List

from transformers import (
    AutoModelForSeq2SeqLM,
    BartConfig,
    BartForConditionalGeneration,
    T5Config,
    T5ForConditionalGeneration,
)


def pre_init(self, hparams):
    """BART: build a shrunken student config from the teacher, copy selected layers,
    save the student, and point hparams.model_name_or_path at it."""
    self.output_dir = Path(hparams.output_dir)
    self.output_dir.mkdir(exist_ok=True)
    teacher = BartForConditionalGeneration.from_pretrained(hparams.teacher).eval()
    student_updates = {
        "decoder_layers": hparams.student_decoder_layers,
        "encoder_layers": hparams.student_encoder_layers,
    }
    if hparams.length_penalty != -1:
        student_updates["length_penalty"] = hparams.length_penalty
    d_layers_to_copy = get_layers_to_copy(student_updates["decoder_layers"], teacher.config.decoder_layers)
    e_layers_to_copy: List = get_layers_to_copy(student_updates["encoder_layers"], teacher.config.encoder_layers)
    hparams.d_layer_to_copy = d_layers_to_copy
    hparams.e_layer_to_copy = e_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)
    # Copy weights
    student_cfg = BartConfig(**kw)
    student = BartForConditionalGeneration(student_cfg)
    student, _ = init_student(student, teacher)
    save_dir = self.output_dir.joinpath("student")
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student, teacher)
    student.save_pretrained(save_dir)
    hparams.model_name_or_path = str(save_dir)
    return student, student_cfg, teacher
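# --- Helper sketch -----------------------------------------------------------
# `get_layers_to_copy(n_student, n_teacher)` is not defined in this section; it
# is assumed to return the teacher layer indices the student should inherit.
# A minimal sketch, assuming an even spread that always keeps the first and
# last teacher layers (a real implementation may special-case common sizes):
def get_layers_to_copy(n_student: int, n_teacher: int) -> List[int]:
    """Pick which of the teacher's n_teacher layers to copy into an n_student-layer student."""
    if n_student > n_teacher:
        raise ValueError(f"Student has {n_student} layers but teacher only has {n_teacher}")
    if n_student == n_teacher:
        return list(range(n_teacher))
    if n_student == 1:
        return [0]
    step = (n_teacher - 1) / (n_student - 1)
    return [round(i * step) for i in range(n_student)]  # e.g. (6, 12) -> [0, 2, 4, 7, 9, 11]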
def pre_init(self, hparams):
    """T5 variant: currently disabled; the body below documents the intended recipe."""
    raise NotImplementedError("T5 Distillation does not work yet")
    # NOTE: everything past the raise is unreachable until T5 support lands.
    self.output_dir = Path(hparams.output_dir)
    self.output_dir.mkdir(exist_ok=True)
    teacher = T5ForConditionalGeneration.from_pretrained(hparams.teacher)
    n_layer = hparams.student_decoder_layers
    assert n_layer == hparams.student_encoder_layers  # TODO(SS): relax this constraint so that we can do 12-6.
    d_layers_to_copy = get_layers_to_copy(n_layer, len(teacher.decoder.block))
    e_layers_to_copy: List = get_layers_to_copy(n_layer, len(teacher.encoder.block))
    student_updates = {"num_layers": n_layer}
    hparams.d_layer_to_copy = d_layers_to_copy
    hparams.e_layer_to_copy = e_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)
    # Copy weights
    student_cfg = T5Config(**kw)
    student = T5ForConditionalGeneration(student_cfg)
    student, _ = init_student(student, teacher)
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student, teacher)
    task_specific_params = student.config.task_specific_params
    if task_specific_params is not None:
        student.config.update(task_specific_params.get("summarization", {}))  # TODO: don't hardcode the task
    save_dir = self.output_dir.joinpath("student")
    save_dir.mkdir(exist_ok=True)
    student.save_pretrained(save_dir)
    hparams.model_name_or_path = str(save_dir)
    return student, student_cfg, teacher
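# `init_student(student, teacher)` is also assumed rather than shown. A
# plausible sketch, assuming it warm-starts the student by loading the teacher
# state dict non-strictly (embeddings, layer norms, and the surviving layer
# names line up; the selected layer stacks are overwritten later by
# copy_to_student):
def init_student(student, teacher):
    """Load every teacher weight whose name also exists in the student."""
    info = student.load_state_dict(teacher.state_dict(), strict=False)
    assert info.missing_keys == [], f"Student keys not found in teacher: {info.missing_keys}"
    return student, info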
def pre_init(self, hparams):
    """Model-agnostic variant: load any seq2seq teacher via AutoModelForSeq2SeqLM and
    optionally record which teacher layers to supervise for hidden-state matching."""
    self.output_dir = Path(hparams.output_dir)
    self.output_dir.mkdir(exist_ok=True)
    teacher = AutoModelForSeq2SeqLM.from_pretrained(hparams.teacher).eval()
    student_updates = {
        "decoder_layers": hparams.student_decoder_layers,
        "encoder_layers": hparams.student_encoder_layers,
    }
    if hparams.length_penalty != -1:
        student_updates["length_penalty"] = hparams.length_penalty
    e_layers_to_copy: List = get_layers_to_copy(student_updates["encoder_layers"], teacher.config.encoder_layers)
    hparams.e_layer_to_copy = e_layers_to_copy
    d_layers_to_copy: List = get_layers_to_copy(student_updates["decoder_layers"], teacher.config.decoder_layers)
    if hparams.supervise_forward:
        hparams.d_matches = get_layers_to_supervise(student_updates["decoder_layers"], teacher.config.decoder_layers)
    else:
        hparams.d_matches = d_layers_to_copy
    hparams.d_layer_to_copy = d_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)
    # Copy weights
    student_cfg = teacher.config_class(**kw)
    student = type(teacher)(student_cfg)
    student, _ = init_student(student, teacher)
    save_dir = self.output_dir.joinpath("student")
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student, teacher)
    student.save_pretrained(save_dir)
    hparams.model_name_or_path = str(save_dir)
    return student, student_cfg, teacher
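# `get_layers_to_supervise` (used with supervise_forward above) is assumed to
# choose, per student decoder layer, the teacher layer whose hidden state it
# should be trained to match. A hypothetical sketch: align each student layer
# with the last teacher layer of its block, so the student's final layer is
# always supervised by the teacher's final layer:
def get_layers_to_supervise(n_student: int, n_teacher: int) -> List[int]:
    if n_student > n_teacher:
        raise ValueError(f"Cannot supervise {n_student} layers with a {n_teacher}-layer teacher")
    if n_student == n_teacher:
        return list(range(n_teacher))
    block = n_teacher // n_student
    return [block * (i + 1) - 1 for i in range(n_student)]  # e.g. (3, 12) -> [3, 7, 11]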
def pre_init(self, hparams):
    """T5: earlier variant that returns the copied layer indices instead of saving the student."""
    teacher = T5ForConditionalGeneration.from_pretrained(hparams.teacher)
    n_layer = hparams.student_decoder_layers
    assert n_layer == hparams.student_encoder_layers  # TODO(SS): relax this
    d_layers_to_copy = get_layers_to_copy(n_layer, len(teacher.decoder.block))
    e_layers_to_copy: List = get_layers_to_copy(n_layer, len(teacher.encoder.block))
    student_updates = {"num_layers": n_layer}
    hparams.d_layer_to_copy = d_layers_to_copy
    hparams.e_layer_to_copy = e_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)
    # Copy weights
    student_cfg = T5Config(**kw)
    student = T5ForConditionalGeneration(student_cfg)
    student, _ = init_student(student, teacher)
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student, teacher)
    Path(hparams.output_dir).mkdir(exist_ok=True)
    task_specific_params = student.config.task_specific_params
    if task_specific_params is not None:
        student.config.update(task_specific_params.get("summarization", {}))
    return d_layers_to_copy, student, student_cfg, teacher
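# `copy_to_student` is assumed to overwrite the student's freshly initialized
# encoder/decoder layers with the teacher layers selected above. A minimal
# sketch of the layer-copying primitive such a method would rely on (the
# attribute paths to the layer stacks vary per architecture):
import torch.nn as nn

def copy_layers(src_layers: nn.ModuleList, dest_layers: nn.ModuleList, layers_to_copy: List[int]) -> None:
    """Copy src_layers[i] for each i in layers_to_copy into dest_layers, in order."""
    selected = nn.ModuleList([src_layers[i] for i in layers_to_copy])
    assert len(dest_layers) == len(selected), f"{len(dest_layers)} != {len(selected)}"
    dest_layers.load_state_dict(selected.state_dict())

# For a BART pair this might be invoked as, e.g.:
#   copy_layers(teacher.model.encoder.layers, student.model.encoder.layers, e_layers_to_copy)
#   copy_layers(teacher.model.decoder.layers, student.model.decoder.layers, d_layers_to_copy)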
def pre_init(self, hparams):
    # Dump empty student model at a path, then call from_pretrained on it
    teacher = BartForConditionalGeneration.from_pretrained(hparams.teacher).eval()
    student_updates = {
        "decoder_layers": hparams.student_decoder_layers,
        "encoder_layers": hparams.student_encoder_layers,
    }
    d_layers_to_copy = get_layers_to_copy(student_updates["decoder_layers"], teacher.config.decoder_layers)
    e_layers_to_copy: List = get_layers_to_copy(student_updates["encoder_layers"], teacher.config.encoder_layers)
    hparams.d_layer_to_copy = d_layers_to_copy
    hparams.e_layer_to_copy = e_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)
    # Copy weights
    student_cfg = BartConfig(**kw)
    student = BartForConditionalGeneration(student_cfg)
    student, _ = init_student(student, teacher)
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student, teacher)
    Path(hparams.output_dir).mkdir(exist_ok=True)
    return d_layers_to_copy, student, student_cfg, teacher
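# --- Usage sketch ------------------------------------------------------------
# The hparams objects above are plain attribute namespaces (argparse output in
# practice). A hypothetical call, with field names mirroring the attributes
# these functions read; `distiller` stands in for an instance of whichever
# distiller class defines pre_init:
from argparse import Namespace

hparams = Namespace(
    teacher="facebook/bart-large-cnn",
    output_dir="distilled_student",
    student_encoder_layers=12,
    student_decoder_layers=6,
    length_penalty=-1,        # -1 means: keep the teacher's length_penalty
    supervise_forward=False,  # only read by the AutoModelForSeq2SeqLM variant
)
# student, student_cfg, teacher = distiller.pre_init(hparams)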