def build_program(main_program, startup_program, image_shape, archs, args,
                  is_train):
    """Assemble the network, metrics and (for training) the optimizer.

    Everything is created under ``static.program_guard(main_program,
    startup_program)``. The names ``auxiliary``, ``auxiliary_weight``,
    ``trainset_num``, ``lr``, ``momentum`` and ``weight_decay`` are read from
    the enclosing scope.

    Args:
        main_program: Program guarded while the network is built.
        startup_program: Startup program guarded alongside ``main_program``.
        image_shape: Forwarded to ``create_data_loader``.
        archs: Callable invoked as ``archs(data, drop_path_prob,
            drop_path_mask, is_train, 10)``; returns ``(logits, logits_aux)``.
        args: Namespace providing ``batch_size`` and ``retain_epoch``.
        is_train: When true, the auxiliary loss (if enabled) and a Momentum
            optimizer with cosine-annealed learning rate are added.

    Returns:
        Tuple ``([loss, top1, top5], (data, label), data_loader)``.
    """
    with static.program_guard(main_program, startup_program):
        (data_loader, data, label, drop_path_prob,
         drop_path_mask) = create_data_loader(image_shape, is_train, args)
        logits, logits_aux = archs(data, drop_path_prob, drop_path_mask,
                                   is_train, 10)
        top1 = paddle.metric.accuracy(input=logits, label=label, k=1)
        top5 = paddle.metric.accuracy(input=logits, label=label, k=5)
        loss = paddle.mean(F.softmax_with_cross_entropy(logits, label))

        if is_train:
            if auxiliary:
                aux_ce = F.softmax_with_cross_entropy(logits_aux, label)
                loss = loss + auxiliary_weight * paddle.mean(aux_ce)
            steps_in_epoch = int(trainset_num / args.batch_size)
            scheduler = paddle.optimizer.lr.CosineAnnealingDecay(
                lr, T_max=steps_in_epoch * args.retain_epoch)
            momentum_opt = paddle.optimizer.Momentum(
                scheduler,
                momentum,
                weight_decay=paddle.regularizer.L2Decay(weight_decay),
                grad_clip=nn.ClipGradByGlobalNorm(clip_norm=5.0))
            momentum_opt.minimize(loss)

        # The fetch list is the same for training and evaluation.
        outs = [loss, top1, top5]
    return outs, (data, label), data_loader
def train(args, model, train_data_loader, dev_data_loader, metric, rank):
    """Fine-tune ``model`` with AdamW, warmup/decay scheduling and clipping.

    Logging, checkpointing and evaluation happen only when ``rank == 0``.

    Args:
        args: Namespace providing ``batch_size``, ``n_gpu``, ``epochs``,
            ``warmup_proportion``, ``learning_rate``, ``weight_decay``,
            ``max_grad_norm``, ``task_name``, ``logging_steps``,
            ``save_steps``, ``output_dir`` and ``do_eval``.
        model: Network called as ``model(input_ids, segment_ids)``.
        train_data_loader: Sized iterable yielding
            ``(input_ids, segment_ids, labels)`` batches.
        dev_data_loader: Loader handed to ``evaluation`` when
            ``args.do_eval`` is set.
        metric: Metric object handed to ``print_logs`` and ``evaluation``.
        rank: Rank of the current process.
    """
    num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
    max_train_steps = args.epochs * len(train_data_loader)
    if rank == 0:
        print("Num train examples: %d" % num_examples)
        print("Max train steps: %d" % max_train_steps)
        # "%d" would truncate a fractional proportion (e.g. 0.1) to 0.
        print("Warmup proportion: %g" % args.warmup_proportion)
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_train_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test constant-time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
    loss_fn = DGULossFunction(args.task_name)

    load_ckpt(args, model, optimizer)

    step = 0
    best_metric = 0.0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for batch in train_data_loader:
            step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    print_logs(args, step, logits, labels, loss, total_time,
                               metric)
                    total_time = 0.0
                if step % args.save_steps == 0 or step == max_train_steps:
                    save_ckpt(model, optimizer, args.output_dir, step)
                    if args.do_eval:
                        print('\nEval begin...')
                        metric_out = evaluation(args, model, dev_data_loader,
                                                metric)
                        if metric_out > best_metric:
                            best_metric = metric_out
                            save_ckpt(model, optimizer, args.output_dir,
                                      'best')
                            print('Best model, step: %d\n' % step)
            # Restart the timer so logging/saving time is not counted.
            batch_start_time = time.time()
def do_train(args):
    """Train the attention seq2seq model through the ``paddle.Model`` API.

    Args:
        args: Namespace providing ``select_device``, ``hidden_size``,
            ``num_layers``, ``dropout``, ``max_grad_norm``,
            ``learning_rate``, ``init_from_ckpt``, ``max_epoch``,
            ``model_path`` and ``log_freq``.
    """
    # The returned device object was never used, so it is not bound.
    paddle.set_device(args.select_device)

    # Define dataloader
    (train_loader, eval_loader, src_vocab_size, tgt_vocab_size,
     eos_id) = create_train_loader(args)

    model = paddle.Model(
        Seq2SeqAttnModel(src_vocab_size, tgt_vocab_size, args.hidden_size,
                         args.hidden_size, args.num_layers, args.dropout,
                         eos_id))

    grad_clip = nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = paddle.optimizer.Adam(learning_rate=args.learning_rate,
                                      parameters=model.parameters(),
                                      grad_clip=grad_clip)

    ppl_metric = Perplexity()
    model.prepare(optimizer, CrossEntropyCriterion(), ppl_metric)

    print(args)
    if args.init_from_ckpt:
        model.load(args.init_from_ckpt)
        print("Loaded checkpoint from %s" % args.init_from_ckpt)

    # Evaluate and save after every epoch.
    model.fit(train_data=train_loader,
              eval_data=eval_loader,
              epochs=args.max_epoch,
              eval_freq=1,
              save_freq=1,
              save_dir=args.model_path,
              log_freq=args.log_freq)
def __call__(self, learning_rate, model=None):
    """Instantiate the configured optimizer for one or several models.

    Args:
        learning_rate: Learning rate (or scheduler) passed to the optimizer.
        model: A single model or a list/tuple of models. ``None`` entries
            are skipped; the parameters of the rest are concatenated.

    Returns:
        An instance of ``getattr(optimizer, self.optimizer['type'])``.
    """
    models = model if isinstance(model, (list, tuple)) else [model]

    grad_clip = None
    if self.clip_grad_by_norm is not None:
        grad_clip = nn.ClipGradByGlobalNorm(clip_norm=self.clip_grad_by_norm)

    regularization = None
    if self.regularizer and self.regularizer != 'None':
        reg_type = self.regularizer['type'] + 'Decay'
        reg_factor = self.regularizer['factor']
        regularization = getattr(regularizer, reg_type)(reg_factor)

    optim_args = self.optimizer.copy()
    optim_type = optim_args.pop('type')
    # AdamW takes a float ``weight_decay``; replacing it with a regularizer
    # object (or None) would be rejected, so AdamW keeps its configured value.
    if optim_type != 'AdamW':
        optim_args['weight_decay'] = regularization
    op = getattr(optimizer, optim_type)

    params = []
    for m in models:
        if m is not None:
            params.extend(m.parameters())

    return op(learning_rate=learning_rate,
              parameters=params,
              grad_clip=grad_clip,
              **optim_args)
def __call__(self, learning_rate, model=None):
    """Instantiate the configured optimizer, optionally with parameter groups.

    When the optimizer config contains ``param_groups`` (a list of dicts,
    each with a ``params`` list of name substrings), every group receives the
    model parameters whose name contains one of its substrings. Parameters
    matched by no group are collected into a trailing default group.

    Args:
        learning_rate: Learning rate (or scheduler) passed to the optimizer.
        model: Model exposing ``parameters()`` and ``named_parameters()``.

    Returns:
        An instance of ``getattr(optimizer, self.optimizer['type'])``.

    Raises:
        AssertionError: If ``param_groups`` or one of its entries is
            malformed.
        RuntimeError: If more unmatched parameters are found than the model
            reports in total.
    """
    grad_clip = None
    if self.clip_grad_by_norm is not None:
        grad_clip = nn.ClipGradByGlobalNorm(clip_norm=self.clip_grad_by_norm)

    regularization = None
    if self.regularizer and self.regularizer != 'None':
        reg_type = self.regularizer['type'] + 'Decay'
        reg_factor = self.regularizer['factor']
        regularization = getattr(regularizer, reg_type)(reg_factor)

    optim_args = self.optimizer.copy()
    optim_type = optim_args.pop('type')
    # AdamW keeps the ``weight_decay`` given in its own config.
    if optim_type != 'AdamW':
        optim_args['weight_decay'] = regularization
    op = getattr(optimizer, optim_type)

    if 'param_groups' in optim_args:
        assert isinstance(optim_args['param_groups'], list), \
            "'param_groups' must be a list of dicts"
        param_groups = optim_args.pop('param_groups')

        # Walk the model once instead of once per group.
        named_params = list(model.named_parameters())
        # A set keeps the "already assigned" test constant-time.
        params, visited = [], set()
        for group in param_groups:
            assert (isinstance(group, dict) and 'params' in group
                    and isinstance(group['params'], list)), \
                "each param group must be a dict with a list under 'params'"
            matched = {
                n: p
                for n, p in named_params
                if any(k in n for k in group['params'])
            }
            _group = group.copy()
            _group['params'] = list(matched.values())
            params.append(_group)
            visited.update(matched)

        ext_params = [p for n, p in named_params if n not in visited]
        total = len(model.parameters())
        if len(ext_params) < total:
            params.append({'params': ext_params})
        elif len(ext_params) > total:
            raise RuntimeError(
                "found more ungrouped parameters than model.parameters() "
                "reports")
    else:
        params = model.parameters()

    return op(learning_rate=learning_rate,
              parameters=params,
              grad_clip=grad_clip,
              **optim_args)
def create_optimizer(self, dy_model, config):
    """Return an AdamW optimizer over ``dy_model``'s parameters.

    Learning rate and weight decay are looked up in ``config`` (defaults
    0.0001 and 0.01); gradients are clipped to a global norm of 5.0.

    Args:
        dy_model: Model exposing ``parameters()``.
        config: Mapping-like object queried with ``get(key, default)``.

    Returns:
        The configured ``paddle.optimizer.AdamW`` instance.
    """
    optimizer_kwargs = {
        "learning_rate":
        config.get("hyper_parameters.optimizer.learning_rate", 0.0001),
        "weight_decay":
        config.get("hyper_parameters.optimizer.weight_decay", 0.01),
        "grad_clip": nn.ClipGradByGlobalNorm(clip_norm=5.0),
        "parameters": dy_model.parameters(),
    }
    return paddle.optimizer.AdamW(**optimizer_kwargs)
def build_optimizer(self, args, learning_rate, model, **kwargs):
    """Return an Adam optimizer, clipping gradients when configured.

    Args:
        args: Namespace; a non-``None`` ``max_grad_norm`` attribute enables
            global-norm gradient clipping.
        learning_rate: Learning rate (or scheduler) for the optimizer.
        model: Model exposing ``parameters()``.
        **kwargs: Accepted but not used here.

    Returns:
        The configured ``paddle.optimizer.Adam`` instance.
    """
    max_norm = getattr(args, "max_grad_norm", None)
    clipper = None if max_norm is None else nn.ClipGradByGlobalNorm(max_norm)
    return paddle.optimizer.Adam(learning_rate=learning_rate,
                                 parameters=model.parameters(),
                                 grad_clip=clipper)
def _initialize_optimizer(self, args):
    """Create ``self.lr_scheduler`` (NoamDecay) and ``self.optimizer`` (AdamW).

    Args:
        args: Namespace providing ``warmup_steps``, ``lr``, ``weight_decay``
            and ``max_grad_norm``.
    """
    # First NoamDecay argument is derived from the warmup length and args.lr.
    self.lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                                  args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test constant-time.
    decay_params = {
        p.name
        for n, p in self.model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    self.optimizer = AdamW(
        learning_rate=self.lr_scheduler,
        parameters=self.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
def __call__(self, learning_rate, model=None):
    """Instantiate the configured optimizer, exempting some params from decay.

    If the optimizer config has ``without_weight_decay_params`` (name
    substrings), parameters whose name contains any of them go into a group
    with ``weight_decay`` 0; all others go into a second group.

    Args:
        learning_rate: Learning rate (or scheduler) passed to the optimizer.
        model: Model exposing ``parameters()`` and ``named_parameters()``.

    Returns:
        An instance of ``getattr(optimizer, self.optimizer['type'])``.
    """
    clip_norm = self.clip_grad_by_norm
    grad_clip = (nn.ClipGradByGlobalNorm(clip_norm=clip_norm)
                 if clip_norm is not None else None)

    reg_cfg = self.regularizer
    regularization = None
    if reg_cfg and reg_cfg != 'None':
        reg_name = reg_cfg['type'] + 'Decay'
        factor = reg_cfg['factor']
        regularization = getattr(regularizer, reg_name)(factor)

    optim_args = self.optimizer.copy()
    optim_type = optim_args.pop('type')
    if optim_type != 'AdamW':
        optim_args['weight_decay'] = regularization
    op = getattr(optimizer, optim_type)

    if 'without_weight_decay_params' not in optim_args:
        params = model.parameters()
    else:
        keys = optim_args.pop('without_weight_decay_params')
        no_decay, with_decay = [], []
        # Split the parameters by whether the name hits an exempt substring.
        for name, param in model.named_parameters():
            if any(k in name for k in keys):
                no_decay.append(param)
            else:
                with_decay.append(param)
        params = [{
            'params': no_decay,
            'weight_decay': 0.
        }, {
            'params': with_decay
        }]

    return op(learning_rate=learning_rate,
              parameters=params,
              grad_clip=grad_clip,
              **optim_args)
def build_optimizer(self, args, learning_rate, model, **kwargs):
    """Return an AdamW optimizer that skips decay for bias/layer-norm params.

    Args:
        args: Namespace providing ``beta1``, ``beta2``, ``epsilon`` and
            ``weight_decay``; a non-``None`` ``max_grad_norm`` attribute
            enables global-norm gradient clipping.
        learning_rate: Learning rate (or scheduler) for the optimizer.
        model: Model exposing ``parameters()`` and ``named_parameters()``.
        **kwargs: Accepted but not used here.

    Returns:
        The configured ``paddle.optimizer.AdamW`` instance.
    """
    if getattr(args, "max_grad_norm", None) is not None:
        grad_clip = nn.ClipGradByGlobalNorm(args.max_grad_norm)
    else:
        grad_clip = None

    # Weight decay applies to every parameter whose name contains neither
    # "bias" nor "layer_norm". A set keeps the lookup constant-time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "layer_norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=learning_rate,
        beta1=args.beta1,
        beta2=args.beta2,
        epsilon=args.epsilon,
        parameters=model.parameters(),
        grad_clip=grad_clip,
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    return optimizer
def __call__(self, learning_rate, params=None):
    """Instantiate the configured optimizer over ``params``.

    Args:
        learning_rate: Learning rate (or scheduler) passed to the optimizer.
        params: Parameters (or parameter groups) to optimize.

    Returns:
        An instance of ``getattr(optimizer, self.optimizer['type'])``.
    """
    grad_clip = None
    if self.clip_grad_by_norm is not None:
        grad_clip = nn.ClipGradByGlobalNorm(clip_norm=self.clip_grad_by_norm)

    regularization = None
    if self.regularizer and self.regularizer != 'None':
        reg_type = self.regularizer['type'] + 'Decay'
        reg_factor = self.regularizer['factor']
        regularization = getattr(regularizer, reg_type)(reg_factor)

    optim_args = self.optimizer.copy()
    optim_type = optim_args.pop('type')
    # Route weight decay through optim_args: a ``weight_decay`` key in the
    # config no longer collides with an explicit keyword, and AdamW (which
    # takes a float) keeps its configured value instead of a regularizer.
    if optim_type != 'AdamW':
        optim_args['weight_decay'] = regularization
    op = getattr(optimizer, optim_type)

    return op(learning_rate=learning_rate,
              parameters=params,
              grad_clip=grad_clip,
              **optim_args)
def train():
    """Fine-tune ERNIE-GEN on the Poetry dataset.

    All settings are read from the module-level ``args``. Every
    ``args.save_steps`` steps the process that passes the rank check
    evaluates on the dev set and saves the model and tokenizer under
    ``args.output_dir/model_<global_step>``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    # Pick the tokenizer class from the model name.
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif ("roberta" in args.model_name_or_path
          or "rbt" in args.model_name_or_path):
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = Poetry.get_datasets(['train', 'dev'])
    # Fall back to [MASK] when the vocabulary has no [ATTN] token.
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len,
                                 noise_prob=args.noise_prob,
                                 use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    label_num = model.word_emb.weight.shape[0]
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    max_steps = len(train_data_loader) * args.num_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                         args.warmup_proportion)

    # Names of the parameters that receive weight decay (everything except
    # bias and norm parameters). Built once here: rebuilding the collection
    # inside the callback repeated the full model walk on every call.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in decay_params)

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
             attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels, _) = batch

            # Pass 1: encode the source and keep its key/value caches.
            _, __, info = model(src_ids,
                                sent_ids=src_sids,
                                pos_ids=src_pids,
                                attn_bias=mask_src_2_src,
                                encode_only=True)
            cached_k, cached_v = info['caches']
            # Pass 2: encode the target on top of the source caches.
            _, __, info = model(tgt_ids,
                                sent_ids=tgt_sids,
                                pos_ids=tgt_pids,
                                attn_bias=mask_tgt_2_srctgt,
                                past_cache=(cached_k, cached_v),
                                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [
                paddle.concat([k, k2], 1)
                for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                paddle.concat([v, v2], 1)
                for v, v2 in zip(cached_v, cached_v2)
            ]
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            # Pass 3: compute the loss at the attn_id positions.
            loss, _, __ = model(attn_ids,
                                sent_ids=tgt_sids,
                                pos_ids=tgt_pids,
                                attn_bias=mask_attn_2_srctgtattn,
                                past_cache=(past_cache_k, past_cache_v),
                                tgt_labels=tgt_labels,
                                tgt_pos=paddle.nonzero(attn_ids == attn_id))
            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train),
                           lr_scheduler.get_lr()))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0 and (
                (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0):
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                os.makedirs(output_dir, exist_ok=True)
                # Unwrap DataParallel before saving.
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
def train():
    """Fine-tune ERNIE-GEN on the ``poetry`` dataset via ``StackModel``.

    All settings are read from the module-level ``args``. Every
    ``args.save_steps`` steps rank 0 evaluates on the dev set and saves the
    model and tokenizer under ``args.output_dir/model_<global_step>``.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    # Pick the tokenizer class from the model name.
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif ("roberta" in args.model_name_or_path
          or "rbt" in args.model_name_or_path):
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = load_dataset(
        'poetry', splits=('train', 'dev'), lazy=False)
    # Fall back to [MASK] when the vocabulary has no [ATTN] token.
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len,
        noise_prob=args.noise_prob,
        use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.map(trans_func)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # src_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # tgt_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    dev_dataset = dev_dataset.map(trans_func)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    label_num = model.word_emb.weight.shape[0]
    train_model = StackModel(model)
    if paddle.distributed.get_world_size() > 1:
        # All 'forward' outputs derived from the module parameters used in
        # DataParallel must participate in the calculation of losses and
        # subsequent gradient calculations. So we use StackModel here to make
        # the model only output loss in its 'forward' function.
        train_model = paddle.DataParallel(train_model)

    max_steps = len(train_data_loader) * args.num_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test constant-time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in decay_params)

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids,
             attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels, _) = batch
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            tgt_pos = paddle.nonzero(attn_ids == attn_id)
            loss = train_model(src_ids, src_tids, src_pids, tgt_ids, tgt_tids,
                               tgt_pids, attn_ids, mask_src_2_src,
                               mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
                               tgt_labels, tgt_pos)
            if global_step % args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train),
                           lr_scheduler.get_lr()))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if (global_step % args.save_steps == 0
                    and paddle.distributed.get_rank() == 0):
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                os.makedirs(output_dir, exist_ok=True)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
def train(args, model, train_data_loader, dev_data_loader, metric, rank):
    """Fine-tune ``model`` with AdamW, a LambdaDecay schedule and clipping.

    Logging, checkpointing and evaluation happen only when ``rank == 0``.

    Args:
        args: Namespace providing ``batch_size``, ``n_gpu``, ``epochs``,
            ``warmup_proportion``, ``learning_rate``, ``weight_decay``,
            ``max_grad_norm``, ``task_name``, ``logging_steps``,
            ``save_steps``, ``output_dir`` and ``do_eval``.
        model: Network called as ``model(input_ids, segment_ids)``.
        train_data_loader: Sized iterable yielding
            ``(input_ids, segment_ids, labels)`` batches.
        dev_data_loader: Loader handed to ``evaluation`` when
            ``args.do_eval`` is set.
        metric: Metric object handed to ``print_logs`` and ``evaluation``.
        rank: Rank of the current process.
    """
    num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
    max_train_steps = args.epochs * len(train_data_loader)
    warmup_steps = int(max_train_steps * args.warmup_proportion)
    if rank == 0:
        print("Num train examples: %d" % num_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)
    factor_fn = partial(compute_lr_factor,
                        warmup_steps=warmup_steps,
                        max_train_steps=max_train_steps)
    lr_scheduler = LambdaDecay(args.learning_rate, factor_fn)

    # Names of the parameters that receive weight decay: everything whose
    # structured name contains neither "bias" nor "norm". Built once as a
    # set instead of rebuilding a list on every callback invocation.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    # Build the optimizer exactly once. The original built it twice and the
    # second instance replaced the first without ``grad_clip``, so
    # ``args.max_grad_norm`` was silently ignored.
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
    loss_fn = DGULossFunction(args.task_name)

    load_ckpt(args, model, optimizer)

    step = 0
    best_metric = 0.0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for batch in train_data_loader:
            step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    print_logs(args, step, logits, labels, loss, total_time,
                               metric)
                    total_time = 0.0
                if step % args.save_steps == 0 or step == max_train_steps:
                    save_ckpt(model, optimizer, args.output_dir, step)
                    if args.do_eval:
                        print('\nEval begin...')
                        metric_out = evaluation(args, model, dev_data_loader,
                                                metric)
                        if metric_out > best_metric:
                            best_metric = metric_out
                            save_ckpt(model, optimizer, args.output_dir,
                                      'best')
                            print('Best model, step: %d\n' % step)
            # Restart the timer so logging/saving time is not counted.
            batch_start_time = time.time()
def do_train(args):
    """Fine-tune a classifier on a CLUE task and report the best dev score.

    The model is evaluated on the dev split every ``args.save_steps`` steps
    and at the final step; the best value is printed when training stops.

    Args:
        args: Namespace providing ``device``, ``task_name``, ``model_type``,
            ``model_name_or_path``, ``max_seq_length``, ``batch_size``,
            ``max_steps``, ``num_train_epochs``, ``warmup_steps``,
            ``warmup_proportion``, ``learning_rate``, ``adam_epsilon``,
            ``weight_decay``, ``max_grad_norm``, ``logging_steps`` and
            ``save_steps``. ``task_name`` and ``model_type`` are lower-cased
            in place.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('clue', args.task_name, splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    # One output when the dataset has no label list.
    num_classes = 1 if train_ds.label_list is None else len(
        train_ds.label_list)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    # A positive step count takes precedence over the proportion.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test constant-time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()
    best_acc = 0.0
    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                acc = evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if acc > best_acc:
                    best_acc = acc
            if global_step >= num_training_steps:
                print("best_acc: ", best_acc)
                return
    print("best_acc: ", best_acc)
def do_train(args):
    """Fine-tune a CLUE classifier with gradient accumulation.

    ``args.batch_size`` is divided by ``args.gradient_accumulation_steps``
    (in place) and the optimizer steps once every that many batches. The
    model is evaluated on the dev split every ``args.save_steps`` optimizer
    steps; whenever the score improves, model and tokenizer are saved to
    ``args.output_dir``.

    Args:
        args: Namespace providing ``batch_size``,
            ``gradient_accumulation_steps``, ``device``, ``task_name``,
            ``model_name_or_path``, ``max_seq_length``, ``dropout``,
            ``max_steps``, ``num_train_epochs``, ``warmup_steps``,
            ``warmup_proportion``, ``learning_rate``, ``adam_epsilon``,
            ``weight_decay``, ``max_grad_norm``, ``logging_steps``,
            ``save_steps`` and ``output_dir``.

    Raises:
        AssertionError: If ``batch_size`` is not divisible by
            ``gradient_accumulation_steps``.
    """
    assert args.batch_size % args.gradient_accumulation_steps == 0, \
        "Please make sure argmument `batch_size` must be divisible by `gradient_accumulation_steps`."
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]

    # Per-iteration batch size; the effective batch size stays as configured.
    args.batch_size = int(args.batch_size / args.gradient_accumulation_steps)
    train_ds, dev_ds = load_dataset(
        'clue', args.task_name, splits=('train', 'dev'))
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(
        convert_example,
        label_list=train_ds.label_list,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)

    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)

    batchify_fn = DataCollatorWithPadding(tokenizer)

    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)
    dev_data_loader = DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    # One output when the dataset has no label list.
    num_classes = 1 if train_ds.label_list is None else len(
        train_ds.label_list)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)

    if args.dropout != 0.1:
        update_model_dropout(model, args.dropout)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # NOTE(review): true division makes num_training_steps a float, and a
    # fractional value can never satisfy ``global_step == num_training_steps``
    # below, so the final evaluation would be skipped. Left unchanged because
    # rounding would alter the LR schedule; confirm the intended semantics.
    if args.max_steps > 0:
        num_training_steps = args.max_steps / args.gradient_accumulation_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(
            train_data_loader
        ) * args.num_train_epochs / args.gradient_accumulation_steps
        num_train_epochs = args.num_train_epochs

    # A positive step count takes precedence over the proportion.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test constant-time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()
    best_acc = 0.0
    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            labels = batch.pop("labels")
            logits = model(**batch)
            loss = loss_fct(logits, labels)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            # Step the optimizer only once per accumulation window.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                global_step += 1
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                        % (global_step, num_training_steps, epoch, step,
                           paddle.distributed.get_rank(), loss,
                           optimizer.get_lr(),
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    tic_eval = time.time()
                    acc = evaluate(model, loss_fct, metric, dev_data_loader)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                    if acc > best_acc:
                        best_acc = acc
                        output_dir = args.output_dir
                        # exist_ok avoids a check-then-create race when
                        # several ranks reach this point together.
                        os.makedirs(output_dir, exist_ok=True)
                        # Need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                if global_step >= num_training_steps:
                    print("best_acc: ", best_acc)
                    return
    print("best_acc: ", best_acc)
def finetune(
        self,
        train_path,
        dev_path=None,
        save_dir="ernie_gen_result",
        init_ckpt_path=None,
        use_gpu=True,
        max_steps=500,
        batch_size=8,
        max_encode_len=50,
        max_decode_len=50,
        learning_rate=5e-5,
        warmup_proportion=0.1,
        weight_decay=0.1,
        noise_prob=0,
        label_smooth=0,
        beam_width=5,
        length_penalty=1.0,
        log_interval=100,
        save_interval=200,
):
    """
    Finetune with the specified dataset.

    Args:
        train_path(str): the train dataset path.
        dev_path(str): the dev dataset path.
        save_dir(str): the model params and dev dataset predict result save path.
        init_ckpt_path(str): incremental training load path.
        use_gpu(bool): use gpu or not.
        max_steps(int): max training steps.
        batch_size(int): the batch size.
        max_encode_len(int): the max encode length.
        max_decode_len(int): the max decode length.
        learning_rate(float): the learning rate.
        warmup_proportion(float): the warmup proportion.
        weight_decay(float): the weight decay magnitude.
        noise_prob(float): the noise probability. see the ernie gen paper for details.
        label_smooth(float): the label smooth magnitude.
        beam_width(int): the beam size during evaluating the dev dataset.
        length_penalty(float): the length penalty during evaluating the dev dataset.
        log_interval(int): the log interval.
        save_interval(int): the save interval. dev set will be evaluated after saving.

    Return:
        result(dict): A Dictionary of shape::
            {
                last_save_path(str): last model save path, or None when
                    ``save_dir`` is empty and nothing was written.
                last_ppl(float): ppl of the last training step.
            }

    Raises:
        ValueError: if a full pass over the training data loader yields no batch.
    """
    paddle.disable_static()
    paddle.set_device('gpu' if use_gpu else 'cpu')

    if init_ckpt_path is not None:
        logger.info('loading checkpoint from %s' % init_ckpt_path)
        sd = paddle.load(init_ckpt_path)
        self.model.set_state_dict(sd)

    train_dataset = self._load_dataset(train_path)
    attn_id = self.tokenizer.vocab['[MASK]']
    trans_func = convert_example(tokenizer=self.tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=1,
                                 max_encode_len=max_encode_len,
                                 max_decode_len=max_decode_len,
                                 noise_prob=noise_prob)

    train_dataset = train_dataset.map(trans_func)
    train_batch_sampler = paddle.io.BatchSampler(train_dataset, batch_size=batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id),  # src_tids
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id),  # tgt_tids
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    if dev_path:
        dev_dataset = self._load_dataset(dev_path)
        dev_dataset = dev_dataset.map(trans_func)
        dev_data_loader = DataLoader(dataset=dev_dataset,
                                     batch_size=batch_size,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    label_num = self.model.word_emb.weight.shape[0]
    train_model = StackModel(self.model)
    lr_scheduler = LinearDecayWithWarmup(learning_rate, max_steps, warmup_proportion)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test cheap.
    decay_params = {
        p.name
        for n, p in self.model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(learning_rate=lr_scheduler,
                                       parameters=self.model.parameters(),
                                       weight_decay=weight_decay,
                                       grad_clip=nn.ClipGradByGlobalNorm(1.0),
                                       apply_decay_param_fun=lambda x: x in decay_params)

    rouge1 = Rouge1()
    rouge2 = Rouge2()
    # Stays None unless a checkpoint is actually written.
    save_path = None
    global_step = 1
    if save_dir:
        # exist_ok avoids the check-then-create race.
        os.makedirs(save_dir, exist_ok=True)
    while True:
        saw_batch = False
        for batch in train_data_loader:
            saw_batch = True
            (src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids, attn_ids, mask_src_2_src,
             mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels, _) = batch
            if label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(nn.functional.one_hot(tgt_labels, label_num),
                                                        epsilon=label_smooth)
            tgt_pos = paddle.nonzero(attn_ids == attn_id)
            loss = train_model(src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids, attn_ids,
                               mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels,
                               tgt_pos)

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % log_interval == 0 and paddle.distributed.get_rank() == 0:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                logger.info('[step %d / %d]train loss %.5f, ppl %.5f, elr %.3e' %
                            (global_step, max_steps, loss_np, ppl, lr_scheduler.get_lr()))
            # global_step starts at 1, so no extra "> 0" guard is needed.
            if save_dir and global_step % save_interval == 0:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                # NOTE(review): periodic checkpoints use ".params" while the
                # final one below uses ".pdparams"; left unchanged on purpose.
                save_name = "step_%s_ppl_%.5f.params" % (global_step, ppl)
                save_path = os.path.join(save_dir, save_name)
                logger.info("save the model in %s" % save_path)
                paddle.save(self.model.state_dict(), save_path)

                if dev_path:
                    self._evaluate(self.model, dev_data_loader, self.tokenizer, rouge1, rouge2, attn_id,
                                   max_decode_len, max_encode_len, beam_width, length_penalty)

            if global_step >= max_steps:
                break
            global_step += 1

        if not saw_batch:
            # Without this guard an empty loader would spin forever.
            raise ValueError("the train data loader built from %s produced no batches" % train_path)
        if global_step >= max_steps:
            break

    # `loss` still belongs to the last optimisation step; always derive the
    # final ppl from it so the result never reports a stale or missing value.
    loss_np = loss.numpy()
    ppl = np.exp(loss_np)
    if global_step % save_interval != 0:
        logger.info('[final step %d]train loss %.5f, ppl %.5f, elr %.3e' %
                    (global_step, loss_np, ppl, lr_scheduler.get_lr()))
        if save_dir:
            save_name = "step_%s_ppl_%.5f.pdparams" % (global_step, ppl)
            save_path = os.path.join(save_dir, save_name)
            logger.info("save the model in %s" % save_path)
            paddle.save(self.model.state_dict(), save_path)

            if dev_path:
                self._evaluate(self.model, dev_data_loader, self.tokenizer, rouge1, rouge2, attn_id,
                               max_decode_len, max_encode_len, beam_width, length_penalty)

    last_save_path = None if save_path is None else "%s" % save_path
    result = {
        "last_save_path": last_save_path,
        # np.ravel accepts both a 0-d and a 1-element array.
        "last_ppl": np.ravel(ppl)[0],
    }

    return result
def train(args):
    """Fine-tune a UnifiedTransformer LM-head model on the 'duconv' dataset.

    The model and tokenizer are loaded from ``args.model_name_or_path``.
    Training runs for ``args.epochs`` epochs with AdamW and a Noam schedule.
    Every ``args.logging_steps`` steps the loss, perplexity, learning rate and
    average step time are printed. Every ``args.save_steps`` steps the dev set
    is evaluated; rank 0 then saves a step checkpoint and, when the dev
    perplexity improved, a 'best' checkpoint.

    Args:
        args: parsed options; reads device, seed, model_name_or_path,
            warmup_steps, lr, weight_decay, max_grad_norm, epochs,
            logging_steps, save_steps and save_dir.
    """
    paddle.set_device(args.device)
    n_ranks = dist.get_world_size()
    is_distributed = n_ranks > 1
    if is_distributed:
        dist.init_parallel_env()

    set_seed(args.seed)

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)
    if is_distributed:
        model = paddle.DataParallel(model)

    train_ds, dev_ds = load_dataset('duconv', splits=('train', 'dev'))
    train_ds, train_data_loader = create_data_loader(
        train_ds, tokenizer, args, 'train')
    dev_ds, dev_data_loader = create_data_loader(
        dev_ds, tokenizer, args, 'dev')

    noam_scale = 1 / (args.warmup_steps * (args.lr**2))
    lr_scheduler = NoamDecay(noam_scale, args.warmup_steps)

    # Collect the names of parameters that receive weight decay:
    # anything whose name mentions neither "bias" nor "norm".
    excluded_markers = ["bias", "norm"]
    decay_params = [
        param.name for name, param in model.named_parameters()
        if all(marker not in name for marker in excluded_markers)
    ]

    def _uses_decay(param_name):
        return param_name in decay_params

    clip = nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=_uses_decay,
                      grad_clip=clip)

    step = 0
    elapsed = 0.0
    best_ppl = 1e9
    for epoch in range(args.epochs):
        print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        tick = time.time()
        for inputs in train_data_loader:
            step += 1
            # The last element of a batch is the label tensor.
            *features, labels = inputs

            logits = model(*features)
            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            elapsed += (time.time() - tick)
            if step % args.logging_steps == 0:
                ppl = paddle.exp(loss)
                report = (step, loss, ppl, optimizer.get_lr(),
                          elapsed / args.logging_steps)
                print(
                    'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                    % report)
                elapsed = 0.0
            if step % args.save_steps == 0:
                ppl = evaluation(model, dev_data_loader)
                if dist.get_rank() == 0:
                    save_ckpt(model, tokenizer, args.save_dir, step)
                    if ppl < best_ppl:
                        best_ppl = ppl
                        save_ckpt(model, tokenizer, args.save_dir, 'best')
                        print('Saved step {} as best model.\n'.format(step))
            tick = time.time()
    print('\nTraining completed.')
def main(args):
    """Train a UnifiedTransformer LM-head model on dialogue data.

    Train and validation sets are built with ``DialogueDataset`` from
    ``args.train_data_path`` and ``args.valid_data_path``. Optimisation uses
    AdamW with a Noam schedule for ``args.epochs`` epochs. Only rank 0 prints
    progress, and every ``args.save_steps`` steps it runs ``evaluation`` and
    saves a checkpoint.

    Args:
        args: parsed options; reads n_gpus, seed, model_name_or_path,
            train_data_path, valid_data_path, batch_size, sort_pool_size,
            warmup_steps, lr, weight_decay, max_grad_norm, epochs,
            logging_steps, save_steps and save_dir.
    """
    device = 'gpu' if args.n_gpus else 'cpu'
    paddle.set_device(device)
    paddle.seed(args.seed)
    n_ranks = dist.get_world_size()
    rank = dist.get_rank()
    is_distributed = n_ranks > 1
    if is_distributed:
        dist.init_parallel_env()

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)
    if is_distributed:
        model = paddle.DataParallel(model)

    train_dataset = DialogueDataset(args.train_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    args.seed,
                                    mode='train')
    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)
    valid_dataset = DialogueDataset(args.valid_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    mode='valid')
    valid_dataloader = DataLoader(valid_dataset,
                                  return_list=True,
                                  batch_size=None)

    noam_scale = 1 / (args.warmup_steps * (args.lr**2))
    lr_scheduler = NoamDecay(noam_scale, args.warmup_steps)

    # Names of the parameters that receive weight decay: every parameter
    # whose name mentions neither "bias" nor "norm".
    excluded_markers = ["bias", "norm"]
    decay_params = [
        param.name for name, param in model.named_parameters()
        if all(marker not in name for marker in excluded_markers)
    ]

    def _uses_decay(param_name):
        return param_name in decay_params

    clip = nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=_uses_decay,
                      grad_clip=clip)

    is_chief = rank == 0
    step = 0
    elapsed = 0.0
    for epoch in range(args.epochs):
        if is_chief:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        tick = time.time()
        for inputs in train_dataloader:
            step += 1
            (token_ids, type_ids, pos_ids, generation_mask, tgt_label,
             tgt_pos) = inputs

            logits = model(token_ids, type_ids, pos_ids, generation_mask,
                           tgt_pos)
            loss = F.cross_entropy(logits, tgt_label)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            elapsed += (time.time() - tick)
            if is_chief:
                if step % args.logging_steps == 0:
                    ppl = paddle.exp(loss)
                    report = (step, loss, ppl, optimizer.get_lr(),
                              elapsed / args.logging_steps)
                    print(
                        'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                        % report)
                    elapsed = 0.0
                if step % args.save_steps == 0:
                    evaluation(model, valid_dataloader)
                    save_ckpt(model, tokenizer, args.save_dir, step)
            tick = time.time()
def train(args):
    """Train an ELMo model on ``OneBillionWordDataset``.

    Builds the vocabulary, model, Adagrad optimizer and loss, and optionally
    restores model and optimizer state from ``args.init_from_ckpt``. It then
    iterates over the training loader until ``n_steps_total`` steps have run.
    Progress is printed every ``args.log_freq`` steps. Rank 0 saves parameters
    every ``args.save_freq`` steps and once more as 'final' when training
    stops.

    Args:
        args: parsed options; reads device, vocab_file,
            max_characters_per_token, batch_size, char_embed_dim,
            projection_dim, dropout, num_layers, num_highways, max_grad_norm,
            lr, init_from_ckpt, train_data_path, unroll_steps, seed, epochs,
            log_freq, save_freq and save_dir.
    """
    paddle.set_device(args.device)

    n_procs = dist.get_world_size()
    rank = dist.get_rank()
    is_distributed = n_procs > 1
    if is_distributed:
        dist.init_parallel_env()

    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    if is_distributed:
        elmo = paddle.DataParallel(elmo)
    elmo.train()

    grad_clip = nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = paddle.optimizer.Adagrad(learning_rate=args.lr,
                                         parameters=elmo.parameters(),
                                         initial_accumulator_value=1.0,
                                         grad_clip=grad_clip)
    elmo_loss = ELMoLoss()

    # Restore model and optimizer state when a checkpoint prefix is given.
    ckpt_prefix = args.init_from_ckpt
    if ckpt_prefix:
        weight_state_dict = paddle.load(ckpt_prefix + '.pdparams')
        opt_state_dict = paddle.load(ckpt_prefix + '.pdopt')
        elmo.set_state_dict(weight_state_dict)
        optimizer.set_state_dict(opt_state_dict)
        print("Loaded checkpoint from %s" % args.init_from_ckpt)

    train_dataset = OneBillionWordDataset(args.train_data_path,
                                          vocab,
                                          args.batch_size,
                                          args.unroll_steps,
                                          n_procs=n_procs,
                                          rank=rank,
                                          mode='train',
                                          shuffle=True,
                                          seed=args.seed)
    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)

    # Step budget derived from the dataset's token count.
    n_tokens_per_batch = args.batch_size * args.unroll_steps * n_procs
    n_steps_per_epoch = int(train_dataset.number_of_tokens /
                            n_tokens_per_batch)
    n_steps_total = args.epochs * n_steps_per_epoch
    print("Training for %s epochs and %s steps" %
          (args.epochs, n_steps_total))

    is_chief = rank == 0
    elapsed = 0.0
    tick = time.time()
    for step, inputs in enumerate(train_dataloader, start=1):
        ids, next_ids, ids_reverse, next_ids_reverse = inputs
        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        # Perplexity is taken before the loss is scaled by unroll_steps.
        ppl = paddle.exp(loss)
        loss *= args.unroll_steps
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        elapsed += (time.time() - tick)
        if step % args.log_freq == 0:
            report = (step, n_steps_total, loss.numpy()[0], ppl.numpy()[0],
                      elapsed / args.log_freq)
            print("step %d/%d - loss: %.4f - Perplexity: %.4f - %.3fs/step" %
                  report)
            elapsed = 0.0
        if is_chief and step % args.save_freq == 0:
            save_params(elmo, optimizer, args.save_dir, step)
        if step == n_steps_total:
            # training done
            if is_chief:
                save_params(elmo, optimizer, args.save_dir, 'final')
            break
        tick = time.time()
def do_train(args):
    """Width-prune a PP-MiniLM classifier with OFA supernet distillation.

    Steps, as laid out in the body:
      1. keep the original model weights,
      2. convert the model to a supernet and restore the weights,
      3. build a teacher model from the same pretrained path,
      4. configure layer-wise distillation,
      5. wrap everything in an ``OFA`` model with elastic width,
      6. compute head/neuron importance on the dev set and reorder them,
      7-8. train: for every batch, accumulate the distillation loss of each
         width in ``args.width_mult_list``, then take one optimizer step.

    The dev set is evaluated every ``args.save_steps`` steps and at the last
    step. Whenever a width beats the best result so far, the model and
    tokenizer are saved to ``args.output_dir``.

    Args:
        args: parsed options; reads device, task_name, model_type,
            model_name_or_path, max_seq_length, batch_size, max_steps,
            num_train_epochs, warmup_steps, warmup_proportion, learning_rate,
            adam_epsilon, weight_decay, max_grad_norm, width_mult_list,
            lambda_logit, logging_steps, save_steps and output_dir.
            ``task_name`` and ``model_type`` are lower-cased in place.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    train_ds = load_dataset('clue', args.task_name, splits='train')
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=train_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)
    # A dataset without a label list is treated as a single-output task.
    num_labels = 1 if train_ds.label_list is None else len(
        train_ds.label_list)

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)

    # Step1: Initialize a dictionary to save the weights from the origin PPMiniLM model.
    origin_weights = model.state_dict()

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=[1.0])
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    super_sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))
    model.set_state_dict(super_sd)

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(args.model_name_or_path,
                                                num_classes=num_labels)

    # Step4: Config about distillation.
    mapping_layers = ['ppminilm.embeddings']
    mapping_layers.extend(
        'ppminilm.encoder.layers.{}'.format(idx)
        for idx in range(model.ppminilm.config['num_hidden_layers']))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    # Step6: Calculate the importance of neurons and head,
    # and then reorder them according to the importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.ppminilm.config['num_hidden_layers'],
        num_heads=model.ppminilm.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    if paddle.distributed.get_world_size() > 1:
        ofa_model.model = paddle.DataParallel(ofa_model.model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test cheap.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    global_step = 0
    tic_train = time.time()
    best_res = 0.0
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            # Labels are not needed: the loss is distilled from the teacher.
            input_ids, segment_ids, _ = batch

            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(
                    input_ids, segment_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                logit_loss = soft_cross_entropy(logits,
                                                teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            # One update per batch, after gradients of all widths accumulated.
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(teacher_model,
                         metric,
                         dev_data_loader,
                         width_mult=100)
                print("eval done total : %s s" % (time.time() - tic_eval))
                for width_mult in args.width_mult_list:
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    res = evaluate(ofa_model, metric, dev_data_loader,
                                   width_mult)
                    print("eval done total : %s s" % (time.time() - tic_eval))

                    if best_res < res:
                        output_dir = args.output_dir
                        # exist_ok avoids a check-then-create race when
                        # several processes save at the same time.
                        os.makedirs(output_dir, exist_ok=True)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        best_res = res
            if global_step >= num_training_steps:
                print("best_res: ", best_res)
                return
    print("best_res: ", best_res)