def _initialize_optimizer(self, args):
    self.lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                                  args.warmup_steps)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in self.model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    self.optimizer = AdamW(
        learning_rate=self.lr_scheduler,
        parameters=self.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
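

# --- Illustration (not part of the snippet above): how apply_decay_param_fun
# filters parameters by name. A minimal sketch; TinyBlock is a made-up toy
# layer used only to show which names survive the "bias"/"norm" filter.
import paddle
from paddle.optimizer import AdamW


class TinyBlock(paddle.nn.Layer):
    """Toy layer whose structured parameter names contain 'bias' and 'norm'."""

    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(8, 8)
        self.norm = paddle.nn.LayerNorm(8)

    def forward(self, x):
        return self.norm(self.linear(x))


model = TinyBlock()
# keep only parameters whose structured name contains neither "bias" nor "norm"
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = AdamW(
    learning_rate=1e-4,
    parameters=model.parameters(),
    weight_decay=0.01,
    # the hook receives each parameter's framework name; decay only survivors
    apply_decay_param_fun=lambda x: x in decay_params,
)
print(decay_params)  # only the Linear weight survives (e.g. 'linear_0.w_0')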
def _create_optimizer(self, model):
    scheduler = self._create_scheduler()
    clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
    return AdamW(
        parameters=model.parameters(),
        grad_clip=clip,
        learning_rate=scheduler,
        apply_decay_param_fun=lambda x: x in self.wd_params,
        weight_decay=self.args.weight_decay), scheduler
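

# --- Illustration (not part of the helper above): this helper uses
# ClipGradByNorm, which clips each gradient tensor by its own L2 norm, while
# the other snippets use ClipGradByGlobalNorm, which rescales all gradients
# jointly. A hedged sketch of the per-tensor variant; the toy layer, optimizer
# and values below are assumptions.
import paddle

linear = paddle.nn.Linear(4, 4)
clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)  # per-tensor clipping
sgd = paddle.optimizer.SGD(learning_rate=0.1,
                           parameters=linear.parameters(),
                           grad_clip=clip)

loss = (linear(paddle.randn([8, 4]))**2).mean()
loss.backward()
sgd.step()        # each gradient tensor is rescaled to norm <= 1.0 first
sgd.clear_grad()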
def train(args, model, train_data_loader, dev_data_loader, metric, rank):
    num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
    max_train_steps = args.epochs * len(train_data_loader)
    if rank == 0:
        print("Num train examples: %d" % num_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Warmup proportion: %.2f" % args.warmup_proportion)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_train_steps,
                                         args.warmup_proportion)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
    loss_fn = DGULossFunction(args.task_name)

    load_ckpt(args, model, optimizer)

    step = 0
    best_metric = 0.0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for batch in train_data_loader:
            step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    print_logs(args, step, logits, labels, loss, total_time,
                               metric)
                    total_time = 0.0
                if step % args.save_steps == 0 or step == max_train_steps:
                    save_ckpt(model, optimizer, args.output_dir, step)
                    if args.do_eval:
                        print('\nEval begin...')
                        metric_out = evaluation(args, model, dev_data_loader,
                                                metric)
                        if metric_out > best_metric:
                            best_metric = metric_out
                            save_ckpt(model, optimizer, args.output_dir,
                                      'best')
                            print('Best model, step: %d\n' % step)
            batch_start_time = time.time()
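

# --- Illustration (not part of the snippet above): LinearDecayWithWarmup
# accepts either a warmup proportion (float) or a warmup step count (int).
# A minimal sketch of the schedule's shape; the step counts and peak LR are
# illustrative values only.
from paddlenlp.transformers import LinearDecayWithWarmup

# 10% of 1000 steps warm up linearly to 2e-5, then the LR decays linearly to 0
scheduler = LinearDecayWithWarmup(2e-5, 1000, 0.1)

for step in range(1, 1001):
    scheduler.step()
    if step in (1, 100, 1000):
        # LR ramps up to ~2e-5 around step 100, then decays linearly toward 0
        print(step, scheduler.get_lr())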
class ModelOperation(object):
    """Training, evaluation and prediction driver."""

    def __init__(self):
        # value of PADDLE_TRAINERS_NUM, defaults to 1
        self.cur_process_num = paddle.distributed.get_world_size()
        # value of PADDLE_TRAINER_ID, defaults to 0
        self.cur_process_rank = paddle.distributed.get_rank()
        self.model_class = {
            "uniLM": (UnifiedTransformerLMHeadModel,
                      UnifiedTransformerTokenizer),
        }
        self.data_helper = None

    def _initialize_run_env(self, device, seed):
        assert device in ("cpu", "gpu", "xpu"), \
            f"param device({device}) must be in ('cpu', 'gpu', 'xpu')!"
        paddle.set_device(device)
        if self.cur_process_num > 1:
            paddle.distributed.init_parallel_env()
        if seed:
            self.set_seed(seed)

    def _initialize_model(self, model_type, pretrained_model_path):
        assert os.path.exists(pretrained_model_path), \
            f"model path {pretrained_model_path} must exist!"
        logging.info(f"initialize model from {pretrained_model_path}")
        model_class, tokenizer_class = self.model_class[model_type]
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_model_path)
        self.model = model_class.from_pretrained(pretrained_model_path)
        if self.cur_process_num > 1:
            self.model = paddle.DataParallel(self.model)

    def _initialize_optimizer(self, args):
        self.lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                                      args.warmup_steps)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in self.model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        self.optimizer = AdamW(
            learning_rate=self.lr_scheduler,
            parameters=self.model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    def _start_train(self, args):
        # load train data loader
        train_dataset = DialogueDataset(args.train_data_path,
                                        args.batch_size,
                                        self.tokenizer.pad_token_id,
                                        self.tokenizer.cls_token_id,
                                        args.sort_pool_size,
                                        args.seed,
                                        mode='train')
        train_data_loader = DataLoader(train_dataset,
                                       return_list=True,
                                       batch_size=None)

        # initialize optimizer
        self._initialize_optimizer(args)

        global_step = 0
        tic_train = time.time()
        for epoch in range(args.train_epochs):
            for batch in train_data_loader:
                global_step += 1
                token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = batch
                logits = self.model(token_ids, type_ids, pos_ids,
                                    generation_mask, tgt_pos)
                loss = F.cross_entropy(logits, tgt_label)
                if global_step % args.logging_steps == 0:
                    logging.info(
                        f"global step {global_step}, epoch: {epoch+1}/{args.train_epochs},"
                        f" loss: {loss}, speed: {args.logging_steps / (time.time() - tic_train):.2f} step/s"
                    )
                    tic_train = time.time()
                loss.backward()
                self.optimizer.step()
                self.lr_scheduler.step()
                self.optimizer.clear_grad()

            if self.cur_process_rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_{}".format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                # need better way to get inner model of DataParallel
                model_to_save = self.model._layers if isinstance(
                    self.model, paddle.DataParallel) else self.model
                model_to_save.save_pretrained(output_dir)
                self.tokenizer.save_pretrained(output_dir)
                print('Saving checkpoint to:', output_dir)

    @paddle.no_grad()
    def evaluation(self, args):
        self.model.eval()
        valid_dataset = DialogueDataset(args.valid_data_path,
                                        args.batch_size,
                                        self.tokenizer.pad_token_id,
                                        self.tokenizer.cls_token_id,
                                        args.sort_pool_size,
                                        args.seed,
                                        mode='valid')
        valid_data_loader = DataLoader(valid_dataset,
                                       return_list=True,
                                       batch_size=None)

        total_tokens = 0
        total_loss = 0.0
        start_time = time.time()
        step = 0
        for inputs in valid_data_loader:
            step += 1
            token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = inputs
            logits = self.model(token_ids, type_ids, pos_ids, generation_mask,
                                tgt_pos)
            loss = F.cross_entropy(logits, tgt_label, reduction='sum')
            total_loss += loss.numpy()[0]
            total_tokens += tgt_label.shape[0]
        avg_loss = total_loss / total_tokens
        ppl = math.exp(avg_loss)
        avg_speed = (time.time() - start_time) / step
        logging.info('loss: %.4f - ppl: %.4f - %.3fs/step\n' %
                     (avg_loss, ppl, avg_speed))
        self.model.train()

    @paddle.no_grad()
    def _infer(self, args, data_loader):
        self.model.eval()
        total_time = 0.0
        start_time = time.time()
        responses = []
        for step, inputs in enumerate(data_loader, 1):
            logging.info(f"step is {step}")
            token_ids, type_ids, pos_ids, generation_mask = inputs
            ids, scores = self.model.generate(
                input_ids=token_ids,
                token_type_ids=type_ids,
                position_ids=pos_ids,
                attention_mask=generation_mask,
                max_length=args.max_dec_len,
                min_length=args.min_dec_len,
                decode_strategy=args.decode_strategy,
                temperature=args.temperature,
                top_k=args.top_k,
                top_p=args.top_p,
                num_beams=args.num_beams,
                length_penalty=args.length_penalty,
                early_stopping=args.early_stopping,
                num_return_sequences=args.num_samples)
            total_time += (time.time() - start_time)
            if step % args.logging_steps == 0:
                logging.info(
                    f'step {step} - {total_time / args.logging_steps:.3f}s/step'
                )
                total_time = 0.0
            results = select_response(ids, scores, self.tokenizer,
                                      args.max_dec_len, args.num_samples)
            responses.extend(results)
            start_time = time.time()
        self.model.train()
        return responses

    def predict(self, args):
        # [1]. initialize dataset loader
        test_dataset = DialogueDataset(args.test_data_path,
                                       args.batch_size,
                                       self.tokenizer.pad_token_id,
                                       self.tokenizer.cls_token_id,
                                       args.sort_pool_size,
                                       args.seed,
                                       mode='test')
        test_data_loader = DataLoader(test_dataset,
                                      return_list=True,
                                      batch_size=None)
        # [2]. do inference
        responses = self._infer(args, test_data_loader)
        # [3]. save result
        output_path = os.path.join(args.output_dir, "predict.txt")
        with open(output_path, 'w', encoding='utf-8') as f:
            for response in responses:
                f.write(response + '\n')

    def train_and_eval(self, args):
        self._initialize_run_env(args.device, args.seed)
        self._initialize_model(args.model_type, args.pretrained_model_path)
        # start training
        if args.do_train:
            logging.info("start training...")
            self._start_train(args)
            logging.info("train success.")
        # start evaluation
        if args.do_eval:
            logging.info("start evaluating...")
            self.evaluation(args)
            logging.info("evaluate success.")
        # start predicting
        if args.do_predict:
            logging.info("start predicting...")
            self.predict(args)
            logging.info("predict success.")

    @staticmethod
    def set_seed(random_seed):
        random.seed(random_seed)
        np.random.seed(random_seed)
        paddle.seed(random_seed)
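

# --- Illustration (not part of the class above): evaluation() derives
# perplexity from a summed token-level cross-entropy, i.e. exp of the average
# loss per target token. A framework-free restatement; the numbers are made up.
import math


def perplexity(total_loss, total_tokens):
    """exp(average per-token cross-entropy), as computed in evaluation()."""
    return math.exp(total_loss / total_tokens)


# e.g. a summed loss of 1385.0 over 500 target tokens gives ppl = e^2.77 ~ 16
print(perplexity(1385.0, 500))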
def run(args):
    paddle.set_device(args.device)
    world_size = dist.get_world_size()
    if world_size > 1:
        dist.init_parallel_env()
    set_seed(args.seed)

    model = UNIMOLMHeadModel.from_pretrained(args.model_name_or_path)
    tokenizer = UNIMOTokenizer.from_pretrained(args.model_name_or_path)
    if world_size > 1:
        model = paddle.DataParallel(model)

    train_ds = load_dataset(args.dataset_name,
                            splits='train',
                            data_files=args.train_file)
    dev_ds = load_dataset(args.dataset_name,
                          splits='dev',
                          data_files=args.predict_file)

    train_ds, train_data_loader = create_data_loader(train_ds, tokenizer, args,
                                                     'train')
    dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args,
                                                 'test')

    if args.do_train:
        num_training_steps = args.epochs * len(train_data_loader)

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_propotion)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]

        optimizer = AdamW(learning_rate=lr_scheduler,
                          parameters=model.parameters(),
                          weight_decay=args.weight_decay,
                          beta1=args.beta1,
                          beta2=args.beta2,
                          epsilon=args.epsilon,
                          apply_decay_param_fun=lambda x: x in decay_params,
                          grad_clip=paddle.nn.ClipGradByGlobalNorm(
                              args.max_grad_norm))

        step = 0
        total_time = 0.0
        for epoch in range(args.epochs):
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
            batch_start_time = time.time()
            for inputs in train_data_loader:
                step += 1
                labels = inputs[-1]

                logits = model(*inputs[:-1])
                labels = paddle.nn.functional.one_hot(
                    labels, num_classes=logits.shape[-1])
                labels = paddle.nn.functional.label_smooth(labels)
                loss = F.cross_entropy(logits, labels, soft_label=True)

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                total_time += (time.time() - batch_start_time)
                if step % args.logging_steps == 0:
                    ppl = paddle.exp(loss)
                    print(
                        'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                        % (step, loss, ppl, optimizer.get_lr(),
                           total_time / args.logging_steps))
                    total_time = 0.0
                if step % args.save_steps == 0 or step >= num_training_steps:
                    if dist.get_rank() == 0:
                        save_ckpt(model, tokenizer, args.save_dir, step)
                        print('Saved step {} model.\n'.format(step))
                        if args.do_predict:
                            model_eval = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            evaluation(model_eval, dev_data_loader, args,
                                       tokenizer)
                batch_start_time = time.time()

        print('\nTraining completed.')
    elif args.do_predict:
        model_eval = model._layers if isinstance(
            model, paddle.DataParallel) else model
        evaluation(model_eval, dev_data_loader, args, tokenizer)
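

# --- Illustration (not part of the snippet above): the UNIMO loop converts
# hard labels into smoothed soft targets before the cross-entropy. A minimal
# sketch of that transformation on toy tensors; the shapes and the default
# epsilon of 0.1 are the only assumptions.
import paddle
import paddle.nn.functional as F

logits = paddle.randn([4, 10])              # [batch, vocab]
labels = paddle.randint(0, 10, shape=[4])   # hard class ids

one_hot = F.one_hot(labels, num_classes=logits.shape[-1])   # [4, 10]
smoothed = F.label_smooth(one_hot)          # default epsilon = 0.1
loss = F.cross_entropy(logits, smoothed, soft_label=True)
print(loss)   # mean soft-label cross-entropy over the batch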
def train(args, model, train_data_loader, dev_data_loader, metric, rank):
    num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
    max_train_steps = args.epochs * len(train_data_loader)
    warmup_steps = int(max_train_steps * args.warmup_proportion)
    if rank == 0:
        print("Num train examples: %d" % num_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

    factor_fn = partial(compute_lr_factor,
                        warmup_steps=warmup_steps,
                        max_train_steps=max_train_steps)
    lr_scheduler = LambdaDecay(args.learning_rate, factor_fn)
    # Exclude all bias and LayerNorm parameters from weight decay.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
    loss_fn = DGULossFunction(args.task_name)

    load_ckpt(args, model, optimizer)

    step = 0
    best_metric = 0.0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for batch in train_data_loader:
            step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    print_logs(args, step, logits, labels, loss, total_time,
                               metric)
                    total_time = 0.0
                if step % args.save_steps == 0 or step == max_train_steps:
                    save_ckpt(model, optimizer, args.output_dir, step)
                    if args.do_eval:
                        print('\nEval begin...')
                        metric_out = evaluation(args, model, dev_data_loader,
                                                metric)
                        if metric_out > best_metric:
                            best_metric = metric_out
                            save_ckpt(model, optimizer, args.output_dir,
                                      'best')
                            print('Best model, step: %d\n' % step)
            batch_start_time = time.time()
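

# --- Illustration (not part of the snippet above): compute_lr_factor is
# defined elsewhere in the script. A plausible, hypothetical implementation
# consistent with how it is partially applied above is linear warmup followed
# by linear decay, plugged into LambdaDecay as a multiplicative factor; the
# step counts and learning rate below are illustrative.
from functools import partial

from paddle.optimizer.lr import LambdaDecay


def compute_lr_factor(step, warmup_steps, max_train_steps):
    """Hypothetical factor: linear warmup, then linear decay to 0."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (max_train_steps - step) /
               max(1, max_train_steps - warmup_steps))


factor_fn = partial(compute_lr_factor, warmup_steps=100, max_train_steps=1000)
lr_scheduler = LambdaDecay(2e-5, factor_fn)   # lr(step) = 2e-5 * factor(step)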
def main(args):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.FileHandler(
                os.path.join(args.output_dir, "run.log"),
                mode="w",
                encoding="utf-8",
            )
        ],
    )
    logger.info("********** Configuration Arguments **********")
    for arg, value in sorted(vars(args).items()):
        logger.info(f"{arg}: {value}")
    logger.info("**************************************************")
    set_seed(args)

    # metric and label
    label_name = GLUE_PROCESSED[args.task_name][1]
    if label_name:
        label2id = dict(zip(label_name, range(len(label_name))))
    else:
        label2id = None
    metric_list = GLUE_METRICS[args.task_name]
    generate_max_length = label_length_map[args.task_name]

    writer = get_writer(args)

    # get model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
    tokenizer = T5Tokenizer.from_pretrained(args.model_name_or_path)

    # get dataloader
    train_dataloader = get_train_dataloader(tokenizer, args)
    if args.task_name == "mnli":
        dev_dataloader_match = get_mnli_dev_dataloader(tokenizer, args,
                                                       matched=True)
        dev_dataloader_mismatch = get_mnli_dev_dataloader(tokenizer, args,
                                                          matched=False)
    else:
        dev_dataloader = get_dev_dataloader(tokenizer, args)

    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps > 0:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)
    else:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch

    # get lr_scheduler
    lr_scheduler = get_scheduler(
        learning_rate=args.learning_rate,
        scheduler_type=args.scheduler_type,
        num_warmup_steps=args.warmup_steps
        if args.warmup_steps > 0 else args.warmup_radio,
        num_training_steps=args.max_train_steps,
    )

    total_batch_size = args.train_batch_size * args.gradient_accumulation_steps

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
    )

    if args.use_amp:
        scaler = GradScaler(init_loss_scaling=args.scale_loss)

    logger.info("********** Running training **********")
    logger.info(f" Num examples = {len(train_dataloader.dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(f" Instantaneous train batch size = {args.train_batch_size}")
    logger.info(f" Instantaneous eval batch size = {args.eval_batch_size}")
    logger.info(
        f" Total train batch size (w. accumulation) = {total_batch_size}")
    logger.info(
        f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")

    def evaluate_and_save():
        # evaluation + checkpoint saving shared by the periodic (save_steps)
        # and final (max_train_steps) branches below
        logger.info("********** Running evaluating **********")
        logger.info(f"********** Step {global_steps} **********")
        output_dir = os.path.join(args.output_dir, f"step-{global_steps}")
        os.makedirs(output_dir, exist_ok=True)
        if args.task_name == "mnli":
            matched_results = evaluate(model, dev_dataloader_match, tokenizer,
                                       label2id, metric_list,
                                       generate_max_length)
            for k, v in matched_results.items():
                writer.add_scalar(f"eval/matched_{k}", v, global_steps)
                logger.info(f" {k} = {v}")
            mismatched_results = evaluate(model, dev_dataloader_mismatch,
                                          tokenizer, label2id, metric_list,
                                          generate_max_length)
            for k, v in mismatched_results.items():
                writer.add_scalar(f"eval/mismatched_{k}", v, global_steps)
                logger.info(f" {k} = {v}")
        else:
            eval_results = evaluate(model, dev_dataloader, tokenizer, label2id,
                                    metric_list, generate_max_length)
            for k, v in eval_results.items():
                writer.add_scalar(f"eval/{k}", v, global_steps)
                logger.info(f" {k} = {v}")
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        logger.info("********** Evaluating Done **********")

    progress_bar = tqdm(range(args.max_train_steps))
    global_steps = 0
    tr_loss, logging_loss = 0.0, 0.0

    for _ in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            model.train()
            with auto_cast(args.use_amp,
                           custom_white_list=["layer_norm", "softmax"]):
                source_ids, source_mask, labels, target_mask = batch
                outputs = model(
                    input_ids=source_ids,
                    attention_mask=source_mask,
                    labels=labels,
                    decoder_attention_mask=target_mask,
                )
                loss = outputs[0] / args.gradient_accumulation_steps
                tr_loss += loss.item()

            if args.use_amp:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            if (step % args.gradient_accumulation_steps == 0
                    or step == len(train_dataloader) - 1):
                if args.use_amp:
                    scaler.minimize(optimizer, loss)
                else:
                    optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                progress_bar.update(1)
                global_steps += 1

                if (args.logging_steps > 0
                        and global_steps % args.logging_steps == 0):
                    writer.add_scalar("lr", lr_scheduler.get_lr(),
                                      global_steps)
                    writer.add_scalar(
                        "loss", (tr_loss - logging_loss) / args.logging_steps,
                        global_steps)
                    logger.info(
                        "global_steps {} - lr: {:.10f} loss: {:.10f}".format(
                            global_steps, lr_scheduler.get_lr(),
                            (tr_loss - logging_loss) / args.logging_steps))
                    logging_loss = tr_loss

                if (args.save_steps > 0
                        and global_steps % args.save_steps == 0):
                    evaluate_and_save()

                if global_steps >= args.max_train_steps:
                    evaluate_and_save()
                    logger.info("********** Training Done **********")
                    return
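

# --- Illustration (not part of the script above): dividing the loss by
# gradient_accumulation_steps and stepping the optimizer only every k
# micro-batches implements gradient accumulation. A self-contained toy sketch
# of that pattern; the linear model and random data are assumptions.
import paddle

model = paddle.nn.Linear(4, 2)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-3,
                                   parameters=model.parameters())
accumulation_steps = 4

for step in range(8):
    x = paddle.randn([2, 4])
    y = paddle.randint(0, 2, shape=[2])
    # scale each micro-batch loss so the accumulated gradient matches the
    # gradient of one large batch
    loss = paddle.nn.functional.cross_entropy(model(x), y) / accumulation_steps
    loss.backward()                              # gradients add up in place
    if (step + 1) % accumulation_steps == 0:     # one update per 4 micro-batches
        optimizer.step()
        optimizer.clear_grad()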
def train(args):
    paddle.set_device(args.device)
    world_size = dist.get_world_size()
    if world_size > 1:
        dist.init_parallel_env()
    set_seed(args.seed)

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)
    if world_size > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds = load_dataset('duconv', splits=('train', 'dev'))
    train_ds, train_data_loader = create_data_loader(train_ds, tokenizer, args,
                                                     'train')
    dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args,
                                                 'dev')

    lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                             args.warmup_steps)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    step = 0
    total_time = 0.0
    best_ppl = 1e9
    for epoch in range(args.epochs):
        print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for inputs in train_data_loader:
            step += 1
            labels = inputs[-1]

            logits = model(*inputs[:-1])
            loss = F.cross_entropy(logits, labels)

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            total_time += (time.time() - batch_start_time)
            if step % args.logging_steps == 0:
                ppl = paddle.exp(loss)
                print(
                    'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                    % (step, loss, ppl, optimizer.get_lr(),
                       total_time / args.logging_steps))
                total_time = 0.0
            if step % args.save_steps == 0:
                ppl = evaluation(model, dev_data_loader)
                if dist.get_rank() == 0:
                    save_ckpt(model, tokenizer, args.save_dir, step)
                    if ppl < best_ppl:
                        best_ppl = ppl
                        save_ckpt(model, tokenizer, args.save_dir, 'best')
                        print('Saved step {} as best model.\n'.format(step))
            batch_start_time = time.time()
    print('\nTraining completed.')
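

# --- Illustration (not part of the snippet above): passing
# 1 / (warmup_steps * lr**2) as the "d_model" argument makes the Noam schedule
# peak at exactly lr after warmup_steps steps. A framework-free restatement of
# the documented NoamDecay formula; the step counts and peak LR are examples.
import math


def noam_lr(step, warmup_steps, peak_lr):
    """NoamDecay value with d_model chosen as 1 / (warmup_steps * peak_lr**2)."""
    d_model = 1.0 / (warmup_steps * peak_lr**2)
    return d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)


# at step == warmup_steps the two branches meet and the LR equals peak_lr
assert math.isclose(noam_lr(4000, 4000, 1e-4), 1e-4)
print(noam_lr(1, 4000, 1e-4),      # tiny LR at the first step
      noam_lr(4000, 4000, 1e-4),   # peak LR at the end of warmup
      noam_lr(16000, 4000, 1e-4))  # then decays ~ 1/sqrt(step)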
def main(args):
    paddle.set_device('gpu' if args.n_gpus else 'cpu')
    paddle.seed(args.seed)
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1:
        dist.init_parallel_env()

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)
    if world_size > 1:
        model = paddle.DataParallel(model)

    train_dataset = DialogueDataset(args.train_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    args.seed,
                                    mode='train')
    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)
    valid_dataset = DialogueDataset(args.valid_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    mode='valid')
    valid_dataloader = DataLoader(valid_dataset,
                                  return_list=True,
                                  batch_size=None)

    lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                             args.warmup_steps)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    step = 0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for inputs in train_dataloader:
            step += 1
            token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = inputs

            logits = model(token_ids, type_ids, pos_ids, generation_mask,
                           tgt_pos)
            loss = F.cross_entropy(logits, tgt_label)

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    ppl = paddle.exp(loss)
                    print(
                        'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                        % (step, loss, ppl, optimizer.get_lr(),
                           total_time / args.logging_steps))
                    total_time = 0.0
                if step % args.save_steps == 0:
                    evaluation(model, valid_dataloader)
                    save_ckpt(model, tokenizer, args.save_dir, step)
            batch_start_time = time.time()
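

# --- Illustration (not part of the snippet above): DataLoader(...,
# batch_size=None) is used because DialogueDataset already yields fully formed
# batches, so the loader must not add another batch dimension. A hedged sketch
# of the same pattern with a dummy iterable dataset; names and shapes below
# are made up.
import numpy as np
import paddle
from paddle.io import DataLoader, IterableDataset


class PreBatchedDataset(IterableDataset):
    """Yields ready-made batches, like DialogueDataset does."""

    def __iter__(self):
        for _ in range(3):
            token_ids = np.zeros([16, 32], dtype="int64")   # [batch, seq]
            labels = np.zeros([16, 1], dtype="int64")
            yield token_ids, labels


loader = DataLoader(PreBatchedDataset(), batch_size=None, return_list=True)
for token_ids, labels in loader:
    # batch_size=None disables auto-batching: shapes stay [16, 32] / [16, 1]
    print(token_ids.shape, labels.shape)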
def train(args, model, tokenizer):
    set_seed(args)
    generate_max_length = args.max_target_length
    writer = get_writer(args)

    # Distributed Setting
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)

    # get dataloader
    train_dataloader = get_train_dataloader(model=model,
                                            tokenizer=tokenizer,
                                            args=args)
    eval_tasks = load_eval_tasks(model=model, tokenizer=tokenizer,
                                 args=args) if args.do_eval else None

    def math_ceil(x, y):
        return math.ceil(x / float(y))

    num_update_steps_per_epoch = math_ceil(len(train_dataloader),
                                           args.gradient_accumulation_steps)
    if args.logging_steps > num_update_steps_per_epoch:
        args.logging_steps = num_update_steps_per_epoch
    if args.max_steps > 0:
        args.num_train_epochs = math_ceil(args.max_steps,
                                          num_update_steps_per_epoch)
    else:
        args.max_steps = args.num_train_epochs * num_update_steps_per_epoch

    # get lr_scheduler
    lr_scheduler = get_scheduler(
        learning_rate=args.learning_rate,
        scheduler_type=args.lr_scheduler_type,
        num_warmup_steps=args.warmup_steps
        if args.warmup_steps > 0 else args.warmup_ratio,
        num_training_steps=args.max_steps,
    )

    total_batch_size = (args.per_device_train_batch_size *
                        args.gradient_accumulation_steps *
                        paddle.distributed.get_world_size())

    # Exclude all bias and LayerNorm parameters from weight decay.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = AdamW(
        learning_rate=lr_scheduler,
        beta1=args.adam_beta1,
        beta2=args.adam_beta2,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=grad_clip,
    )

    if args.use_amp:
        scaler = GradScaler(init_loss_scaling=args.scale_loss)

    logger.info("********** Running training **********")
    logger.info(f" Num examples = {len(train_dataloader.dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(
        f" Device train batch size = {args.per_device_train_batch_size}")
    logger.info(
        f" Device eval batch size = {args.per_device_eval_batch_size}")
    logger.info(
        f" Total train batch size (w. accumulation) = {total_batch_size}")
    logger.info(
        f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_steps}")

    global_steps = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_score = 0.0

    def logging_lr_loss():
        cur_lr = lr_scheduler.get_lr()
        cur_loss = (tr_loss - logging_loss) / args.logging_steps
        writer.add_scalar("lr", cur_lr, global_steps)
        writer.add_scalar("loss", cur_loss, global_steps)
        logger.info(f"global_steps {global_steps}/{args.max_steps}"
                    f" - lr: {cur_lr:.10f} loss: {cur_loss:.10f}")

    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            model.train()
            with auto_cast(args.use_amp,
                           custom_white_list=["layer_norm", "softmax"]):
                outputs = model(**batch)
                loss = outputs[0] / args.gradient_accumulation_steps
                tr_loss += loss.item()

            if args.use_amp:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            if (step % args.gradient_accumulation_steps == 0
                    or step == len(train_dataloader) - 1):
                if args.use_amp:
                    scaler.minimize(optimizer, loss)
                else:
                    optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                global_steps += 1

                if (args.logging_steps > 0
                        and global_steps % args.logging_steps == 0):
                    if paddle.distributed.get_rank() == 0:
                        logging_lr_loss()
                    logging_loss = tr_loss

        save_checkpoint(tokenizer, model,
                        os.path.join(args.output_dir, f"ckpt_epoch{epoch}"))

        if args.do_eval and paddle.distributed.get_rank() == 0:
            logger.info("********** Running evaluating **********")
            logger.info(f"************* Epoch {epoch} ************")
            eval_overall_results, eval_predictions = eval_all_tasks(
                eval_tasks=eval_tasks,
                model=model,
                tokenizer=tokenizer,
                generate_max_length=generate_max_length,
            )
            for line in better_print_multi(eval_overall_results).split('\n'):
                logger.info(line)
            if args.metric_for_best_model not in eval_overall_results:
                raise ValueError(f"Main metric {args.metric_for_best_model} "
                                 f"is not in {eval_overall_results.keys()}.")
            logger.info("********** Evaluating Done **********")

            current_score = eval_overall_results[args.metric_for_best_model]
            if current_score > best_score:
                logger.info("********** Saving Model **********")
                best_score = current_score
                save_checkpoint(tokenizer, model,
                                os.path.join(args.output_dir, "best"))

    best_ckpt_file = os.path.join(args.output_dir, "best",
                                  "model_state.pdparams")
    if os.path.exists(best_ckpt_file):
        logger.info(f"Load best checkpoint from {best_ckpt_file}")
        model.load_dict(paddle.load(best_ckpt_file))
    save_checkpoint(tokenizer, model, args.output_dir)
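

# --- Illustration (not part of the snippet above): auto_cast and GradScaler
# follow Paddle's dynamic-graph AMP recipe: run the forward pass under
# autocast, scale the loss before backward(), and let the scaler unscale the
# gradients and step the optimizer. A minimal sketch written directly against
# paddle.amp (the script's auto_cast helper is assumed to wrap it); the toy
# model and data are assumptions, and a CUDA build is assumed for real fp16
# gains.
import paddle

model = paddle.nn.Linear(4, 2)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-3,
                                   parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=2**15)

x = paddle.randn([2, 4])
y = paddle.randint(0, 2, shape=[2])

with paddle.amp.auto_cast(custom_white_list=["layer_norm", "softmax"]):
    loss = paddle.nn.functional.cross_entropy(model(x), y)

scaled = scaler.scale(loss)          # scale the loss to avoid fp16 underflow
scaled.backward()
scaler.minimize(optimizer, scaled)   # unscales gradients, then updates params
optimizer.clear_grad()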