def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config_path = os.path.join(args.model_name_or_path, 'model_config.json') cfg_dict = dict(json.loads(open(config_path).read())) num_labels = cfg_dict['num_classes'] model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_labels) origin_model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_labels) sp_config = supernet(expand_ratio=[1.0, args.width_mult]) model = Convert(sp_config).convert(model) ofa_model = OFA(model) sd = paddle.load( os.path.join(args.model_name_or_path, 'model_state.pdparams')) ofa_model.model.set_state_dict(sd) best_config = utils.dynabert_config(ofa_model, args.width_mult) ofa_model.export(best_config, input_shapes=[[1, args.max_seq_length], [1, args.max_seq_length]], input_dtypes=['int64', 'int64'], origin_model=origin_model) for name, sublayer in origin_model.named_sublayers(): if isinstance(sublayer, paddle.nn.MultiHeadAttention): sublayer.num_heads = int(args.width_mult * sublayer.num_heads) output_dir = os.path.join(args.sub_model_output_dir, "model_width_%.5f" % args.width_mult) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = origin_model model_to_save.save_pretrained(output_dir) if args.static_sub_model != None: export_static_model(origin_model, args.static_sub_model, args.max_seq_length)
def _dynabert_export(task_name, ofa_model, dynabert_config, output_dir):
    from paddleslim.nas.ofa import OFA, DistillConfig, utils
    ofa_model.model.base_model_class.forward = auto_model_forward
    ofa_model._add_teacher = False
    _recover_transormer_func()

    # Export one pruned sub-model per width multiplier and save it as a static graph.
    for width_mult in dynabert_config.width_mult_list:
        model_dir = os.path.join(output_dir, str(width_mult))
        state_dict = paddle.load(
            os.path.join(model_dir, "model_state.pdparams"))
        if "cmrc2018" in task_name:
            origin_model = AutoModelForQuestionAnswering.from_pretrained(
                model_dir)
        elif task_name == "msra_ner":
            origin_model = AutoModelForTokenClassification.from_pretrained(
                model_dir)
        else:
            origin_model = AutoModelForSequenceClassification.from_pretrained(
                model_dir)
        ofa_model.model.set_state_dict(state_dict)
        best_config = utils.dynabert_config(ofa_model, width_mult)
        origin_model_new = ofa_model.export(
            best_config,
            input_shapes=[[1, 1], [1, 1]],
            input_dtypes=['int64', 'int64'],
            origin_model=origin_model)

        # Shrink the attention head count to match the pruned width.
        for name, sublayer in origin_model_new.named_sublayers():
            if isinstance(sublayer, paddle.nn.MultiHeadAttention):
                sublayer.num_heads = int(width_mult * sublayer.num_heads)

        # Convert the pruned dygraph model to a static graph for inference.
        input_shape = [
            paddle.static.InputSpec(shape=[None, None], dtype='int64'),
            paddle.static.InputSpec(shape=[None, None], dtype='int64')
        ]
        pruned_infer_model_dir = os.path.join(
            model_dir, dynabert_config.output_filename_prefix)
        net = paddle.jit.to_static(origin_model_new, input_spec=input_shape)
        paddle.jit.save(net, pruned_infer_model_dir)
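# A minimal inference sketch for the pruned static graph exported above. This is
# an illustrative assumption, not part of the original script: the file names
# "pruned_model.pdmodel" / "pruned_model.pdiparams" and the [1, 128] dummy inputs
# are hypothetical placeholders; paddle.jit.save() above writes
# "<prefix>.pdmodel" and "<prefix>.pdiparams" under pruned_infer_model_dir.
import numpy as np
import paddle.inference as paddle_infer

config = paddle_infer.Config("pruned_model.pdmodel", "pruned_model.pdiparams")
predictor = paddle_infer.create_predictor(config)

# The exported graph expects two int64 inputs: input_ids and token_type_ids.
input_handles = [
    predictor.get_input_handle(name) for name in predictor.get_input_names()
]
input_handles[0].copy_from_cpu(np.ones([1, 128], dtype="int64"))
input_handles[1].copy_from_cpu(np.zeros([1, 128], dtype="int64"))

predictor.run()
logits = predictor.get_output_handle(
    predictor.get_output_names()[0]).copy_to_cpu()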
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('glue', args.task_name, splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=train_ds.label_list,
        max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)
    if args.task_name == "mnli":
        dev_ds_matched, dev_ds_mismatched = load_dataset(
            'glue', args.task_name, splits=["dev_matched", "dev_mismatched"])
        dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True)
        dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_ds_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_ds_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_ds_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_ds_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_ds = load_dataset('glue', args.task_name, splits='dev')
        dev_ds = dev_ds.map(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.batch_size, shuffle=False)
        dev_data_loader = DataLoader(
            dataset=dev_ds,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)

    num_labels = 1 if train_ds.label_list is None else len(train_ds.label_list)

    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Step1: Initialize a dictionary to save the weights from the origin BERT model.
    origin_weights = {}
    for name, param in model.named_parameters():
        origin_weights[name] = param

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=args.width_mult_list)
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)

    # Step4: Config about distillation.
    mapping_layers = ['bert.embeddings']
    for idx in range(model.bert.config['num_hidden_layers']):
        mapping_layers.append('bert.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    if args.task_name == "mnli":
        dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched)

    # Step6: Calculate the importance of neurons and head,
    # and then reorder them according to the importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.bert.config['num_hidden_layers'],
        num_heads=model.bert.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_steps)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=ofa_model.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in ofa_model.model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(
                    input_ids, segment_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                if args.task_name == 'sts-b':
                    logit_loss = 0.0
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            ofa_model.model.clear_grad()

            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if args.task_name == "mnli":
                    evaluate(
                        teacher_model,
                        criterion,
                        metric,
                        dev_data_loader_matched,
                        width_mult=100)
                    evaluate(
                        teacher_model,
                        criterion,
                        metric,
                        dev_data_loader_mismatched,
                        width_mult=100)
                else:
                    evaluate(
                        teacher_model,
                        criterion,
                        metric,
                        dev_data_loader,
                        width_mult=100)
                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    if args.task_name == "mnli":
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader_matched, width_mult)
                        evaluate(ofa_model, criterion, metric,
                                 dev_data_loader_mismatched, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))
                    else:
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))

                    if (not args.n_gpu > 1
                        ) or paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
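# NOTE: the training loops in this file call a `soft_cross_entropy` helper that
# is not shown in these snippets. A minimal sketch, assuming the usual
# distillation formulation (cross entropy between the student distribution and
# the detached teacher distribution):
import paddle
import paddle.nn.functional as F


def soft_cross_entropy(inp, target):
    # Student log-probabilities against teacher probabilities, averaged over the batch.
    inp_likelihood = F.log_softmax(inp, axis=-1)
    target_prob = F.softmax(target, axis=-1)
    return -1.0 * paddle.mean(paddle.sum(inp_likelihood * target_prob, axis=-1))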
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    config_path = os.path.join(args.model_name_or_path, 'model_config.json')
    cfg_dict = dict(json.loads(open(config_path).read()))

    # Work out how many encoder layers the sub-model keeps and which teacher
    # layers map onto them, then temporarily rewrite model_config.json so that
    # from_pretrained builds the shallower model.
    kept_layers_index = {}
    if args.depth_mult < 1.0:
        depth = round(cfg_dict["init_args"][0]['num_hidden_layers'] *
                      args.depth_mult)
        cfg_dict["init_args"][0]['num_hidden_layers'] = depth
        for idx, i in enumerate(range(1, depth + 1)):
            kept_layers_index[idx] = math.floor(i / args.depth_mult) - 1
        os.rename(config_path, config_path + '_bak')
        with open(config_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(cfg_dict, ensure_ascii=False))

    num_labels = cfg_dict['num_classes']

    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)
    origin_model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)

    # Restore the original config file.
    os.rename(config_path + '_bak', config_path)

    sp_config = supernet(expand_ratio=[1.0, args.width_mult])
    model = Convert(sp_config).convert(model)

    ofa_model = OFA(model)

    sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))

    if len(kept_layers_index) == 0:
        ofa_model.model.set_state_dict(sd)
    else:
        # Map encoder parameters of the depth-pruned model back to the kept teacher layers.
        for name, params in ofa_model.model.named_parameters():
            if 'encoder' not in name:
                params.set_value(sd[name])
            else:
                idx = int(name.strip().split('.')[3])
                mapping_name = name.replace(
                    '.' + str(idx) + '.',
                    '.' + str(kept_layers_index[idx]) + '.')
                params.set_value(sd[mapping_name])

    best_config = utils.dynabert_config(ofa_model, args.width_mult)
    for name, sublayer in ofa_model.model.named_sublayers():
        if isinstance(sublayer, paddle.nn.MultiHeadAttention):
            sublayer.num_heads = int(args.width_mult * sublayer.num_heads)
    ofa_model.export(
        best_config,
        input_shapes=[[1, args.max_seq_length], [1, args.max_seq_length]],
        input_dtypes=['int64', 'int64'],
        origin_model=origin_model)
    for name, sublayer in origin_model.named_sublayers():
        if isinstance(sublayer, paddle.nn.MultiHeadAttention):
            sublayer.num_heads = int(args.width_mult * sublayer.num_heads)

    output_dir = os.path.join(args.sub_model_output_dir,
                              "model_width_%.5f" % args.width_mult)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    model_to_save = origin_model
    model_to_save.save_pretrained(output_dir)

    if args.static_sub_model is not None:
        export_static_model(origin_model, args.static_sub_model,
                            args.max_seq_length)
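# Worked example of the layer-mapping logic above (derived from the code, not
# taken from the original file): with num_hidden_layers = 12 and
# depth_mult = 0.75, depth = round(12 * 0.75) = 9 and
#     kept_layers_index = {0: 0, 1: 1, 2: 3, 3: 4, 4: 5, 5: 7, 6: 8, 7: 9, 8: 11}
# so the 9 encoder layers of the sub-model are initialized from teacher layers
# 0, 1, 3, 4, 5, 7, 8, 9 and 11.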
def _dynabert_training(self, task_name, ofa_model, model, teacher_model,
                       train_dataloader, eval_dataloader, width_mult_list,
                       criterion, num_train_epochs, output_dir):
    metric = Accuracy()
    if task_name == "msra_ner":
        metric = ChunkEvaluator(label_list=self.train_dataset.label_list)

    @paddle.no_grad()
    def evaluate(model, criterion, data_loader, width_mult=1.0):
        model.eval()
        all_start_logits = []
        all_end_logits = []
        metric.reset()
        for batch in data_loader:
            if "cmrc2018" in task_name:
                input_ids, token_type_ids = batch['input_ids'], batch[
                    'token_type_ids']
                logits = model(
                    input_ids, token_type_ids, attention_mask=[None, None])
                if width_mult == 100:
                    start_logits_tensor, end_logits_tensor = logits
                else:
                    start_logits_tensor, end_logits_tensor = logits[0]
                for idx in range(start_logits_tensor.shape[0]):
                    if len(all_start_logits) % 1000 == 0 and len(
                            all_start_logits):
                        logger.info("Processing example: %d" %
                                    len(all_start_logits))
                    all_start_logits.append(start_logits_tensor.numpy()[idx])
                    all_end_logits.append(end_logits_tensor.numpy()[idx])
            else:
                input_ids, segment_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
                logits = model(
                    input_ids, segment_ids, attention_mask=[None, None])
                if isinstance(logits, tuple):
                    logits = logits[0]
                loss = criterion(logits, labels)
                if task_name == "msra_ner":
                    preds = logits.argmax(axis=2)
                    num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                        batch['seq_len'], preds, batch['labels'])
                    metric.update(num_infer_chunks.numpy(),
                                  num_label_chunks.numpy(),
                                  num_correct_chunks.numpy())
                else:
                    correct = metric.compute(logits, labels)
                    metric.update(correct)
        if "cmrc2018" in task_name:
            n_best_size = 20
            max_answer_length = 50
            all_predictions, _, _ = compute_prediction(
                self.eval_examples, self.eval_dataset,
                (all_start_logits, all_end_logits), False, n_best_size,
                max_answer_length)
            res = squad_evaluate(
                examples=[raw_data for raw_data in self.eval_examples],
                preds=all_predictions,
                is_whitespace_splited=False)
            if width_mult == 100:
                logger.info("teacher model, EM: %f, F1: %f" %
                            (res['exact'], res['f1']))
            else:
                logger.info("width_mult: %s, EM: %f, F1: %f, " %
                            (str(width_mult), res['exact'], res['f1']))
            res = res['exact']
        else:
            res = metric.accumulate()
            # Teacher model's evaluation
            if task_name == "msra_ner":
                if width_mult == 100:
                    logger.info(
                        "teacher model, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                        % (paddle.mean(loss).numpy(), res[0], res[1], res[2]))
                else:
                    logger.info(
                        "width_mult: %s, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                        % (str(width_mult), paddle.mean(loss).numpy(), res[0],
                           res[1], res[2]))
                res = res[2]
            else:
                if width_mult == 100:
                    logger.info("teacher model, eval loss: %f, acc: %s, " %
                                (loss.numpy(), res))
                else:
                    logger.info("width_mult: %s, eval loss: %f, acc: %s, " %
                                (str(width_mult), loss.numpy(), res))
        model.train()
        return res

    from paddleslim.nas.ofa import OFA, DistillConfig, utils

    global_step = 0
    lambda_logit = 1.0
    tic_train = time.time()
    best_acc = 0.0
    acc = 0.0
    logger.info("DynaBERT training starts. This period will cost some time.")
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_dataloader):
            global_step += 1
            if "cmrc2018" in task_name:
                input_ids, token_type_ids, start_positions, end_positions = batch[
                    'input_ids'], batch['token_type_ids'], batch[
                        'start_positions'], batch['end_positions']
            else:
                input_ids, token_type_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']

            for width_mult in width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(
                    input_ids, token_type_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                if "cmrc2018" in task_name:
                    logit_loss = (soft_cross_entropy(
                        logits[0], teacher_logits[0].detach()) +
                                  soft_cross_entropy(
                                      logits[1],
                                      teacher_logits[1].detach())) / 2
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + lambda_logit * logit_loss
                loss.backward()
            self.optimizer.step()
            self.lr_scheduler.step()
            self.optimizer.clear_grad()

            if global_step % self.args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           self.args.logging_steps /
                           (time.time() - tic_train)))
                tic_train = time.time()

            if "cmrc2018" not in task_name and global_step % self.args.save_steps == 0:
                tic_eval = time.time()
                evaluate(
                    teacher_model, criterion, eval_dataloader, width_mult=100)
                logger.info("eval done total : %s s" %
                            (time.time() - tic_eval))
                for idx, width_mult in enumerate(width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    acc = evaluate(ofa_model, criterion, eval_dataloader,
                                   width_mult)
                    if acc > best_acc:
                        best_acc = acc
                    if paddle.distributed.get_rank() == 0:
                        output_dir_width = os.path.join(output_dir,
                                                        str(width_mult))
                        if not os.path.exists(output_dir_width):
                            os.makedirs(output_dir_width)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir_width)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))

            if global_step > self.args.num_training_steps:
                if best_acc == 0.0:
                    output_dir_width = os.path.join(output_dir,
                                                    str(width_mult))
                    if not os.path.exists(output_dir_width):
                        os.makedirs(output_dir_width)
                    # need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir_width)
                logger.info("Best acc: %.4f" % (best_acc))
                return ofa_model

        if "cmrc2018" in task_name:
            tic_eval = time.time()
            evaluate(teacher_model, criterion, eval_dataloader, width_mult=100)
            logger.info("eval done total : %s s" % (time.time() - tic_eval))
            for idx, width_mult in enumerate(width_mult_list):
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                tic_eval = time.time()
                acc = evaluate(ofa_model, criterion, eval_dataloader,
                               width_mult)
                if acc > best_acc:
                    best_acc = acc
                if paddle.distributed.get_rank() == 0:
                    output_dir_width = os.path.join(output_dir,
                                                    str(width_mult))
                    if not os.path.exists(output_dir_width):
                        os.makedirs(output_dir_width)
                    # need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir_width)
                logger.info("eval done total : %s s" %
                            (time.time() - tic_eval))

    logger.info("Best acc: %.4f" % (best_acc))
    return ofa_model
def test_dynabert(self):
    self.model = TestModel()
    sp_net_config = supernet(expand_ratio=[0.5, 1.0])
    self.model = Convert(sp_net_config).convert(self.model)
    ofa_model = OFA(self.model)
    config = dynabert_config(ofa_model, 0.5)
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('clue', args.task_name, splits='train')
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=train_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    num_labels = 1 if train_ds.label_list is None else len(train_ds.label_list)

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)

    # Step1: Initialize a dictionary to save the weights from the origin PPMiniLM model.
    origin_weights = model.state_dict()

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=[1.0])
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    super_sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))
    model.set_state_dict(super_sd)

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(args.model_name_or_path,
                                                num_classes=num_labels)

    # Step4: Config about distillation.
    mapping_layers = ['ppminilm.embeddings']
    for idx in range(model.ppminilm.config['num_hidden_layers']):
        mapping_layers.append('ppminilm.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    # Step6: Calculate the importance of neurons and head,
    # and then reorder them according to the importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.ppminilm.config['num_hidden_layers'],
        num_heads=model.ppminilm.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    if paddle.distributed.get_world_size() > 1:
        ofa_model.model = paddle.DataParallel(ofa_model.model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    global_step = 0
    tic_train = time.time()
    best_res = 0.0
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(
                    input_ids, segment_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                logit_loss = soft_cross_entropy(logits,
                                                teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(teacher_model, metric, dev_data_loader,
                         width_mult=100)
                print("eval done total : %s s" % (time.time() - tic_eval))

                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    res = evaluate(ofa_model, metric, dev_data_loader,
                                   width_mult)
                    print("eval done total : %s s" % (time.time() - tic_eval))

                    if best_res < res:
                        output_dir = args.output_dir
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        best_res = res

            if global_step >= num_training_steps:
                print("best_res: ", best_res)
                return

    print("best_res: ", best_res)
        depth_mult_list = [1.0]
    else:
        ofa_model.set_task('depth')
        depth_mult_list = run_config.elastic_depth
    for step, d in enumerate(
            tqdm(
                train_ds.start(place), desc='training')):
        ids, sids, label = d
        accumulate_gradients = dict()
        for param in opt._parameter_list:
            accumulate_gradients[param.name] = 0.0
        for depth_mult in depth_mult_list:
            for width_mult in args.width_mult_list:
                net_config = utils.dynabert_config(
                    ofa_model, width_mult, depth_mult=depth_mult)
                ofa_model.set_net_config(net_config)

                student_output, teacher_output = ofa_model(
                    ids,
                    sids,
                    labels=label,
                    num_layers=model_cfg['num_hidden_layers'])
                loss, student_logit, student_reps = student_output[
                    0], student_output[1], student_output[2]['hiddens']
                teacher_logit, teacher_reps = teacher_output[
                    1], teacher_output[2]['hiddens']

                if ofa_model.task == 'depth':
                    depth_mult = ofa_model.current_config['depth']
                    depth = round(model_cfg['num_hidden_layers'] *