def __init__(self):
    super(GPT2_Base_CN, self).__init__()
    # Load the pretrained Chinese GPT2 model shipped with PaddleNLP
    self.model = GPT2ForPretraining.from_pretrained('gpt2-base-cn')
    # Put the model in evaluation mode
    self.model.eval()
    # Load the tokenizer (encoder/decoder)
    self.tokenizer = GPT2ChineseTokenizer.from_pretrained('gpt2-base-cn')
    # Warm up the tokenizer
    _ = self.tokenizer.encode('_')
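
# A minimal usage sketch (not part of the original module): greedy next-token
# prediction with the pieces initialized above. It assumes that calling
# GPT2ForPretraining with token ids alone returns logits of shape
# [batch, seq_len, vocab_size] and that the tokenizer exposes a decode method;
# both signatures have changed across PaddleNLP releases, so treat this as an
# illustration rather than a drop-in method.
def _greedy_next_token_sketch(self, text):
    import paddle
    ids = self.tokenizer.encode(text)                  # list of token ids (assumed)
    input_ids = paddle.to_tensor([ids], dtype='int64')
    logits = self.model(input_ids)                     # assumed: [1, seq_len, vocab]
    next_id = int(paddle.argmax(logits[0, -1]).numpy())
    return self.tokenizer.decode([next_id])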
def __init__(self):
    super(GPT2_CPM_LM, self).__init__()
    # Instantiate the model
    gpt2 = GPT2Model(
        vocab_size=30000,
        hidden_size=2560,
        num_hidden_layers=32,
        num_attention_heads=32,
        intermediate_size=10240,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=1024,
        type_vocab_size=1,
        initializer_range=0.02,
        pad_token_id=0)
    self.model = GPT2ForPretraining(gpt2)
    # Load the CPM-LM checkpoint (stored in FP16)
    state_dict = paddle.load(
        os.path.join(self.directory, 'CPM-LM.pdparams'))
    # Cast FP16 -> FP32
    for param in state_dict:
        state_dict[param] = state_dict[param].astype('float32')
    # Assign the converted parameters to the model
    self.model.set_dict(state_dict)
    # Put the model in evaluation mode
    self.model.eval()
    # Load the tokenizer (encoder/decoder)
    self.tokenizer = GPT2ChineseTokenizer(
        vocab_file=os.path.join(self.directory, 'vocab.json'),
        model_file=os.path.join(self.directory, 'chinese_vocab.model'))
    # Warm up the tokenizer
    _ = self.tokenizer.encode('_')
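
# Back-of-the-envelope check (illustrative, not from the original code): the
# config above matches the 2.6B-parameter CPM-LM release. Each transformer
# layer holds roughly 12 * hidden_size^2 weights (QKV + output projection plus
# two FFN matrices with intermediate_size = 4 * hidden_size), so:
def approx_param_count(vocab_size=30000, hidden_size=2560,
                       num_layers=32, max_positions=1024):
    embeddings = (vocab_size + max_positions) * hidden_size
    per_layer = 12 * hidden_size * hidden_size  # ignoring biases and layer norms
    return embeddings + num_layers * per_layer

# approx_param_count() ~= 2.6e9 parameters. Casting the FP16 checkpoint to
# FP32 as above therefore needs roughly 10.4 GB of memory for the weights
# alone, which is why the conversion is done parameter by parameter.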
def do_eval(args):
    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    paddle.set_device(args.device)
    model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    if args.init_checkpoint_path is not None:
        # Build the model from the predefined configuration and load the
        # checkpoint weights explicitly.
        model = GPT2ForPretraining(
            GPT2Model(**model_class.pretrained_init_configuration[
                args.model_name_or_path]))
        logger.info("Load model checkpoint from %s" %
                    args.init_checkpoint_path)
        model_dict = paddle.load(os.path.join(args.init_checkpoint_path))
        model.set_dict(model_dict)
    else:
        model = model_class.from_pretrained(args.model_name_or_path)

    tic_eval = time.time()
    eval_data_loader = create_eval_dataset(args)
    model.eval()
    total_score = 0
    score_name = "loss" if not args.cloze_eval else "number correct"
    with paddle.no_grad():
        for step, batch in enumerate(eval_data_loader):
            tokens, loss_mask, attention_mask, position_ids, labels = batch
            preds = model(tokens, position_ids, attention_mask)
            if not args.cloze_eval:
                # Language modeling: accumulate per-token cross entropy,
                # normalized by the total number of tokens.
                masked_lm_loss = paddle.nn.functional.cross_entropy(
                    preds, labels, reduction="none")
                loss = paddle.sum(masked_lm_loss * loss_mask)
                total_score += loss.numpy() / (args.num_tokenized_tokens - 1)
            else:
                # Cloze evaluation: an example counts as correct only if every
                # masked position is predicted correctly (product over the
                # sequence); unmasked positions are forced to 1.
                outputs = paddle.argmax(preds, -1)
                acc = paddle.cast(outputs == labels, 'float32')
                acc = paddle.where(
                    paddle.cast(loss_mask, 'bool'), acc,
                    paddle.ones_like(acc))
                acc = paddle.sum(paddle.prod(acc, -1))
                total_score += acc.numpy()
            if step % args.logging_steps == 0:
                logger.info(
                    "step %d, batch: %d, %s: %f, speed: %.2f step/s" %
                    (step, step, score_name, total_score,
                     args.logging_steps / (time.time() - tic_eval)))
                tic_eval = time.time()

    if not args.cloze_eval:
        total_loss = float(total_score)
        ppl = math.exp(min(20, total_loss))
        token_ratio = (args.num_tokenized_tokens - 1) / (
            args.num_original_tokens - 1)
        adjusted_ppl = math.exp(min(20, total_loss * token_ratio))
        string = ' validation results on {} | '.format(args.eval_path)
        string += 'avg loss: {:.4E} | '.format(total_loss)
        string += 'ppl: {:.4E} | '.format(ppl)
        string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
        string += 'token ratio: {} |'.format(token_ratio)
    else:
        num_correct = float(total_score)
        acc = float(num_correct / args.num_examples)
        string = ' validation results on {} | '.format(args.eval_path)
        string += 'number correct: {:.4E} | '.format(num_correct)
        string += 'total examples: {:.4E} | '.format(args.num_examples)
        string += 'avg accuracy: {:.4E}'.format(acc)
    logger.info(string)
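
# A small illustration (hypothetical numbers) of the perplexity adjustment in
# do_eval above: when the model's tokenizer splits the text into more tokens
# than the reference tokenization, the average loss is rescaled by token_ratio
# before exponentiating, so models with different tokenizers stay comparable.
import math

def adjusted_perplexity(avg_loss, num_tokenized_tokens, num_original_tokens):
    token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
    return math.exp(min(20, avg_loss * token_ratio))

# e.g. avg_loss=3.0 over 120000 model tokens for 100000 original tokens:
# plain ppl = e^3.0 ~= 20.1, adjusted ppl = e^(3.0 * 1.2) ~= 36.6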
def do_train(args):
    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()
    set_seed(args)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    eod_id = tokenizer.command_name_map["eod"].Id

    model = GPT2ForPretraining(
        GPT2Model(**model_class.pretrained_init_configuration[
            args.model_name_or_path]))
    if worker_num > 1:
        # Wrap with DataParallel so gradients are synchronized across workers;
        # the checkpoint-saving code below already expects this wrapper.
        model = paddle.DataParallel(model)
    # Create the criterion for the GPT2 model
    criterion = GPT2PretrainingCriterion()

    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    warmup_step = args.warmup_rate * args.decay_steps
    lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
        max_lr=args.max_lr,
        min_lr=args.min_lr,
        warmup_step=warmup_step,
        decay_step=args.decay_steps)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.nn.ClipGradByNorm(clip_norm=args.grad_clip)

    # Exclude bias and norm parameters from weight decay
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        # Each data file yields its own DataLoader; skip intermediate npz_ files
        files = [
            os.path.join(args.input_dir, f)
            for f in os.listdir(args.input_dir)
            if (os.path.isfile(os.path.join(args.input_dir, f)) and
                "npz_" not in str(f))
        ]
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            data_file = files[f_id]
            train_data_loader = create_pretrained_dataset(
                args, data_file, worker_init, worker_index, eod_id=eod_id)
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                tokens, loss_mask, attention_mask, position_ids, labels = batch
                loss_mask.stop_gradient = True
                attention_mask.stop_gradient = True

                preds = model(tokens, position_ids, attention_mask)
                loss = criterion(preds, labels, loss_mask)

                if global_step % args.logging_steps == 0:
                    if worker_index == 0:
                        logger.info(
                            "global step %d, epoch: %d, lr: %.10f, batch: %d, loss: %f, speed: %.2f step/s"
                            % (global_step, epoch, optimizer.get_lr(), step,
                               loss, args.logging_steps /
                               (time.time() - tic_train)))
                        tic_train = time.time()

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_gradients()

                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                if global_step >= args.max_steps:
                    del train_data_loader
                    return
            del train_data_loader
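
# A reference sketch of the schedule that lr.CosineAnnealingWithWarmupDecay is
# assumed to implement above: linear warmup to max_lr, then cosine decay down
# to min_lr. The actual PaddleNLP class may differ in details such as the
# behaviour past decay_step, so this is a sanity-check model, not its source.
import math

def cosine_warmup_lr(step, max_lr, min_lr, warmup_step, decay_step):
    if step < warmup_step:
        return max_lr * step / warmup_step           # linear warmup
    if step > decay_step:
        return min_lr                                # floor after decay window
    progress = (step - warmup_step) / (decay_step - warmup_step)
    coeff = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + coeff * (max_lr - min_lr)        # cosine decay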