def load_checkpoint(self, cfg, checkpoint_path, resume=None):
    if os.path.exists(checkpoint_path + '_net_G.pdparams'):
        if resume is None:
            resume = False
    else:
        current_epoch = 0
        current_iteration = 0
        print("No checkpoint found.")
        return current_epoch, current_iteration
    net_G_dict, opt_G_dict = dg.load_dygraph(checkpoint_path + '_net_G')
    if not self.is_inference:
        net_D_dict, opt_D_dict = dg.load_dygraph(checkpoint_path + '_net_D')
    # the epoch and iteration counters are encoded in the checkpoint file name
    current_epoch, current_iteration = int(
        checkpoint_path.split('_')[-4]), int(checkpoint_path.split('_')[-2])
    if resume:
        self.net_G.set_dict(net_G_dict)
        self.net_D.set_dict(net_D_dict)
        # self.opt_G.set_dict(opt_G_dict)
        self.opt_D.set_dict(opt_D_dict)
        print("Load from: {}".format(checkpoint_path))
    else:
        self.net_G.set_dict(net_G_dict)
        print("Load generator weights only.")
    print("Done with loading the checkpoint.")
    return current_epoch, current_iteration

def build():
    model = InceptionV4()
    path = '/home/aistudio/vid2vid/model/backbones/inceptionv4'
    state_dict, _ = dg.load_dygraph(path)
    model.set_dict(state_dict)
    print("load pretrained inception v4 models from path " + path)
    return model

def build_hand_model():
    hand_model = HandPose()
    state_dict, _ = dg.load_dygraph(
        '/home/aistudio/openpose/pretrained_models/pose_hand_21_102000.pdparams')
    hand_model.load_dict(state_dict)
    return hand_model

def build_face_model():
    face_model = FacePose()
    state_dict, _ = dg.load_dygraph(
        '/home/aistudio/openpose/pretrained_models/pose_face_70_iter_116000.pdparams')
    face_model.load_dict(state_dict)
    return face_model

def extract_and_convert(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    config = json.load(
        open(os.path.join(input_dir, 'ernie_config.json'), 'rt', encoding='utf-8'))
    print('=' * 20 + 'save vocab file' + '=' * 20)
    shutil.copyfile(os.path.join(input_dir, 'vocab.txt'),
                    os.path.join(output_dir, 'vocab.txt'))
    print('=' * 20 + 'extract weights' + '=' * 20)
    state_dict = []
    weight_map = build_params_map(attention_num=config['num_hidden_layers'])
    with fluid.dygraph.guard():
        paddle_paddle_params, _ = D.load_dygraph(
            os.path.join(input_dir, 'params'))
    for weight_name, weight_value in paddle_paddle_params.items():
        if weight_name not in weight_map.keys():
            continue
        # print(weight_name, weight_value.shape)
        if 'w_0' in weight_name \
                or 'post_att_layer_norm_scale' in weight_name \
                or 'post_ffn_layer_norm_scale' in weight_name \
                or 'cls_out_w' in weight_name:
            weight_value = weight_value.transpose()
        state_dict.append({
            'name': weight_map[weight_name],
            'data': Tensor(weight_value)
        })
        print(weight_name, '->', weight_map[weight_name], weight_value.shape)
    save_checkpoint(state_dict, os.path.join(output_dir, "ernie.ckpt"))

def _initialize(self, line=4, word=7):
    """
    initialize with the necessary elements
    """
    if line not in [4, 8]:
        raise ValueError("The line could only be 4 or 8.")
    if word not in [5, 7]:
        raise ValueError("The word could only be 5 or 7.")
    self.line = line
    assets_path = os.path.join(self.directory, "assets")
    gen_checkpoint_path = os.path.join(
        assets_path, "ernie_gen_acrostic_poetry_L%sW%s" % (line, word))
    ernie_cfg_path = os.path.join(assets_path, 'ernie_config.json')
    with open(ernie_cfg_path, encoding='utf8') as ernie_cfg_file:
        ernie_cfg = dict(json.loads(ernie_cfg_file.read()))
    ernie_vocab_path = os.path.join(assets_path, 'vocab.txt')
    with open(ernie_vocab_path, encoding='utf8') as ernie_vocab_file:
        ernie_vocab = {
            j.strip().split('\t')[0]: i
            for i, j in enumerate(ernie_vocab_file.readlines())
        }

    with fluid.dygraph.guard(fluid.CPUPlace()):
        with fluid.unique_name.guard():
            self.model = ErnieModelForGeneration(ernie_cfg)
            finetuned_states, _ = D.load_dygraph(gen_checkpoint_path)
            self.model.set_dict(finetuned_states)

    self.tokenizer = ErnieTokenizer(ernie_vocab)
    self.rev_dict = {v: k for k, v in self.tokenizer.vocab.items()}
    self.rev_dict[self.tokenizer.pad_id] = ''  # replace [PAD]
    self.rev_dict[self.tokenizer.unk_id] = ''  # replace [UNK]
    self.rev_lookup = np.vectorize(lambda i: self.rev_dict[i])

def extract_and_convert(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print('=' * 20 + 'save config file' + '=' * 20)
    config = json.load(
        open(os.path.join(input_dir, 'ernie_config.json'), 'rt', encoding='utf-8'))
    config['layer_norm_eps'] = 1e-5
    if 'sent_type_vocab_size' in config:
        config['type_vocab_size'] = config['sent_type_vocab_size']
    config['intermediate_size'] = 4 * config['hidden_size']
    json.dump(config,
              open(os.path.join(output_dir, 'config.json'), 'wt', encoding='utf-8'),
              indent=4)
    print('=' * 20 + 'save vocab file' + '=' * 20)
    shutil.copyfile(os.path.join(input_dir, 'vocab.txt'),
                    os.path.join(output_dir, 'vocab.txt'))
    print('=' * 20 + 'extract weights' + '=' * 20)
    state_dict = collections.OrderedDict()
    weight_map = build_params_map(attention_num=config['num_hidden_layers'])
    with fluid.dygraph.guard():
        paddle_paddle_params, _ = D.load_dygraph(
            os.path.join(input_dir, 'saved_weights'))
    for weight_name, weight_value in paddle_paddle_params.items():
        if 'weight' in weight_name:
            if 'encoder_stack' in weight_name or 'pooler' in weight_name or 'mlm.' in weight_name:
                weight_value = weight_value.transpose()
        state_dict[weight_map[weight_name]] = torch.FloatTensor(weight_value)
        print(weight_name, '->', weight_map[weight_name], weight_value.shape)
    torch.save(state_dict, os.path.join(output_dir, "pytorch_model.bin"))

def main():
    place = paddle.fluid.CUDAPlace(0)
    with fluid.dygraph.guard(place):
        model = Transformer(image_size=512,
                            num_classes=15,
                            hidden_unit_num=1024,
                            layer_num=2,
                            head_num=16,
                            dropout=0.8,
                            decoder_name='PUP',
                            hyber=True,
                            visualable=False)
        preprocess = Transform(512)
        dataloader_1 = Dataloader('/home/aistudio/dataset',
                                  '/home/aistudio/dataset/val_list.txt',
                                  transform=preprocess,
                                  shuffle=True)
        val_load = fluid.io.DataLoader.from_generator(capacity=1,
                                                      use_multiprocess=False)
        val_load.set_sample_generator(dataloader_1, batch_size=1, places=place)
        model_dic, optic_dic = load_dygraph(
            "./output/SETR-NotZero-Epoch-2-Loss-0.161517-MIOU-0.325002")
        model.load_dict(model_dic)
        model.eval()
        # result = get_infer_data("/home/aistudio/dataset/infer")
        # infer_load = Load_infer('/home/aistudio/dataset', result,
        #                         transform=preprocess, shuffle=False)
        # loader_infer = fluid.io.DataLoader.from_generator(capacity=1,
        #                                                   use_multiprocess=False)
        # loader_infer.set_sample_generator(infer_load, batch_size=1, places=place)
        # process_image(model, loader_infer, result)
        validation(val_load, model, 15)

def _initialize(self): """ initialize with the necessary elements """ assets_path = os.path.join(self.directory, "assets") gen_checkpoint_path = os.path.join(assets_path, "ernie_gen") ernie_cfg_path = os.path.join(assets_path, 'ernie_config.json') with open(ernie_cfg_path, encoding='utf8') as ernie_cfg_file: ernie_cfg = dict(json.loads(ernie_cfg_file.read())) ernie_vocab_path = os.path.join(assets_path, 'vocab.txt') with open(ernie_vocab_path, encoding='utf8') as ernie_vocab_file: ernie_vocab = { j.strip().split('\t')[0]: i for i, j in enumerate(ernie_vocab_file.readlines()) } with fluid.dygraph.guard(fluid.CPUPlace()): with fluid.unique_name.guard(): self.model = ErnieModelForGeneration(ernie_cfg) finetuned_states, _ = D.load_dygraph(gen_checkpoint_path) self.model.set_dict(finetuned_states) self.tokenizer = ErnieTokenizer(ernie_vocab) self.rev_dict = {v: k for k, v in self.tokenizer.vocab.items()} self.rev_dict[self.tokenizer.pad_id] = '' # replace [PAD] self.rev_dict[self.tokenizer.unk_id] = '' # replace [PAD] self.rev_lookup = np.vectorize(lambda i: self.rev_dict[i])
def load_parameters(model,
                    optimizer=None,
                    checkpoint_dir=None,
                    iteration=None,
                    checkpoint_path=None):
    """Load a specific model checkpoint from disk.

    Args:
        model (obj): model to load parameters.
        optimizer (obj, optional): optimizer to load states if needed.
            Defaults to None.
        checkpoint_dir (str, optional): the directory where checkpoints are saved.
        iteration (int, optional): if specified, load the specific checkpoint;
            if not specified, load the latest one. Defaults to None.
        checkpoint_path (str, optional): if specified, load the checkpoint stored
            in checkpoint_path and the argument 'checkpoint_dir' will be ignored.
            Defaults to None.

    Returns:
        iteration (int): number of iterations that the loaded checkpoint has
            been trained.
    """
    if checkpoint_path is not None:
        iteration = int(os.path.basename(checkpoint_path).split("-")[-1])
    elif checkpoint_dir is not None:
        if iteration is None:
            iteration = _load_latest_checkpoint(checkpoint_dir)
        if iteration == 0:
            return iteration
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "step-{}".format(iteration))
    else:
        raise ValueError(
            "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
        )

    local_rank = dg.parallel.Env().local_rank
    model_dict, optimizer_dict = dg.load_dygraph(checkpoint_path)

    state_dict = model.state_dict()
    # cast to desired data type, for mixed-precision training/inference.
    for k, v in model_dict.items():
        if k in state_dict and convert_np_dtype(v.dtype) != state_dict[k].dtype:
            model_dict[k] = v.astype(state_dict[k].numpy().dtype)

    model.set_dict(model_dict)
    print("[checkpoint] Rank {}: loaded model from {}.pdparams".format(
        local_rank, checkpoint_path))

    if optimizer and optimizer_dict:
        optimizer.set_dict(optimizer_dict)
        print("[checkpoint] Rank {}: loaded optimizer state from {}.pdopt".format(
            local_rank, checkpoint_path))

    return iteration

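# A minimal usage sketch for load_parameters, not from the original sources.
# It assumes a checkpoint directory laid out as "step-<iteration>.pdparams" /
# "step-<iteration>.pdopt", as implied by the function above; build_model and
# build_optimizer are hypothetical helpers standing in for the caller's code.
def resume_training(checkpoint_dir):
    model = build_model()                 # hypothetical model constructor
    optimizer = build_optimizer(model)    # hypothetical optimizer constructor
    # Load the latest checkpoint if one exists; a return value of 0 means
    # nothing was found and training starts from scratch.
    iteration = load_parameters(model,
                                optimizer=optimizer,
                                checkpoint_dir=checkpoint_dir)
    return model, optimizer, iteration
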
def load_checkpoint(step, model_path):
    model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
    for param in model_dict:
        # strip the "_layers." prefix added when the model was saved from a
        # DataParallel wrapper
        if param.startswith('_layers.'):
            new_state_dict[param[8:]] = model_dict[param]
        else:
            new_state_dict[param] = model_dict[param]
    return new_state_dict, opti_dict

def load_model(init_model, model_path):
    if os.path.exists(model_path + ".pdparams"):
        logging.info("load model from {}".format(model_path))
        start_time = time.time()
        sd, _ = D.load_dygraph(model_path)
        init_model.set_dict(sd)
        logging.info("cost time: %.4fs" % (time.time() - start_time))
    else:
        logging.info("cannot find model file: {}".format(model_path + ".pdparams"))

def load_D(path='data/anime-biggan-256px-run39-607250.discriminator'):
    place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
    fluid.enable_dygraph(place)
    discriminator = Discriminator(n_class=1000,
                                  chn=96,
                                  blocks_with_attention="B2",
                                  resolution=256)
    discriminator.set_dict(dg.load_dygraph(path)[0])
    model_cache.D = discriminator

def load_G(path='data/anime-biggan-256px-run39-607250.generator'):
    place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
    fluid.enable_dygraph(place)
    generator = Generator(code_dim=140,
                          n_class=1000,
                          chn=96,
                          blocks_with_attention="B5",
                          resolution=256)
    generator.set_dict(dg.load_dygraph(path)[0])
    model_cache.G = generator

def load_model(init_model, model_path):
    """
    Load trained parameters into a Paddle dygraph model.
    [in] init_model: the constructed model object
         model_path: str, path of the saved model (without the .pdparams suffix)
    """
    if os.path.exists(model_path + ".pdparams"):
        logging.info("load model from {}".format(model_path))
        start_time = time.time()
        sd, _ = D.load_dygraph(model_path)
        init_model.set_dict(sd)
        logging.info("cost time: %.4fs" % (time.time() - start_time))
    else:
        logging.info("cannot find model file: {}".format(model_path + ".pdparams"))

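# A minimal usage sketch, not from the original sources: ErnieTinyClassifier
# and the checkpoint path are placeholders. Per the docstring above, model_path
# is passed without the ".pdparams" suffix; load_model only logs a message
# (instead of raising) when the file is missing.
with D.guard():
    net = ErnieTinyClassifier()            # hypothetical dygraph Layer
    load_model(net, "./output/step_1000")  # reads ./output/step_1000.pdparams
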
def load_wavenet(model, path):
    wavenet_dict, _ = dg.load_dygraph(path)
    encoder_dict = OrderedDict()
    teacher_dict = OrderedDict()
    for k, v in wavenet_dict.items():
        if k.startswith("encoder."):
            encoder_dict[k.split('.', 1)[1]] = v
        else:
            # k starts with "decoder."
            teacher_dict[k.split('.', 1)[1]] = v
    model.encoder.set_dict(encoder_dict)
    model.teacher.set_dict(teacher_dict)
    print("loaded the encoder part and teacher part from wavenet model.")

def build_body_model(body_points=25):
    if body_points == 25:
        body_model = BodyPose25()
        state_dict, _ = dg.load_dygraph(
            '/home/aistudio/openpose/pretrained_models/pose_body_25_iter_584000.pdparams')
        body_model.load_dict(state_dict)
    elif body_points == 18:
        body_model = BodyPose18()
        state_dict, _ = dg.load_dygraph(
            '/home/aistudio/openpose/pretrained_models/pose_body_18_iter_440000.pdparams')
        body_model.load_dict(state_dict)
    elif body_points == 15:
        body_model = BodyPose15()
        state_dict, _ = dg.load_dygraph(
            '/home/aistudio/openpose/pretrained_models/pose_body_15_iter_160000.pdparams')
        body_model.load_dict(state_dict)
    else:
        raise ValueError("body_points must be one of 25, 18 or 15.")
    return body_model

def from_pretrained(cls, pretrain_dir_or_url, force_download=False, **kwargs):
    if not Path(pretrain_dir_or_url).exists() and \
            pretrain_dir_or_url in cls.resource_map:
        url = cls.resource_map[pretrain_dir_or_url]
        logger.info('get pretrain dir from %s' % url)
        pretrain_dir = Path(_fetch_from_remote(url, force_download))
    else:
        logger.info('pretrain dir %s not in %s, read from local' %
                    (pretrain_dir_or_url, repr(cls.resource_map)))
        pretrain_dir = Path(pretrain_dir_or_url)

    if not pretrain_dir.exists():
        raise ValueError('pretrain dir not found: %s' % pretrain_dir)
    param_path = pretrain_dir / 'params'
    state_dict_path = pretrain_dir / 'saved_weights'
    config_path = pretrain_dir / 'ernie_config.json'

    if not config_path.exists():
        raise ValueError('config path not found: %s' % config_path)
    name_prefix = kwargs.pop('name', None)
    cfg_dict = dict(json.loads(config_path.open().read()), **kwargs)
    model = cls(cfg_dict, name=name_prefix)

    logger.info('loading pretrained model from %s' % pretrain_dir)
    # if os.path.exists(param_path):
    #     raise NotImplementedError()
    #     logger.debug('load pretrained weight from program state')
    #     F.io.load_program_state(param_path)  # buggy in dygraph.guard, push paddle to fix
    if state_dict_path.with_suffix('.pdparams').exists():
        m, _ = D.load_dygraph(state_dict_path.as_posix())
        for k, v in model.state_dict().items():
            if k not in m:
                logger.warn('param:%s not set in pretrained model, skip' % k)
                m[k] = v
        model.set_dict(m)
    else:
        raise ValueError('weight file not found in pretrain dir: %s' % pretrain_dir)
    return model

def from_pretrained(cls, pretrain_dir_or_url, force_download=False, **kwargs):
    if pretrain_dir_or_url in cls.resource_map:
        url = cls.resource_map[pretrain_dir_or_url]
        log.info('get pretrain dir from %s' % url)
        pretrain_dir = _fetch_from_remote(url, force_download)
    else:
        log.info('pretrain dir %s not in %s, read from local' %
                 (pretrain_dir_or_url, repr(cls.resource_map)))
        pretrain_dir = pretrain_dir_or_url

    if not os.path.exists(pretrain_dir):
        raise ValueError('pretrain dir not found: %s' % pretrain_dir)
    param_path = os.path.join(pretrain_dir, 'params')
    state_dict_path = os.path.join(pretrain_dir, 'saved_weights')
    config_path = os.path.join(pretrain_dir, 'ernie_config.json')

    if not os.path.exists(config_path):
        raise ValueError('config path not found: %s' % config_path)
    name_prefix = kwargs.pop('name', None)
    cfg_dict = dict(json.loads(open(config_path).read()), **kwargs)
    model = cls(cfg_dict, name=name_prefix)

    log.info('loading pretrained model from %s' % pretrain_dir)
    # if os.path.exists(param_path):
    #     raise NotImplementedError()
    #     log.debug('load pretrained weight from program state')
    #     F.io.load_program_state(param_path)  # buggy in dygraph.guard, push paddle to fix
    if os.path.exists(state_dict_path + '.pdparams'):
        m, _ = D.load_dygraph(state_dict_path)
        for k, v in model.state_dict().items():
            if k not in m:
                log.warn('param:%s not set in pretrained model, skip' % k)
                m[k] = v  # FIXME: no need to do this in the future
        model.set_dict(m)
    else:
        raise ValueError('weight file not found in pretrain dir: %s' % pretrain_dir)
    return model

                a = L.argmax(logits, -1) == label
                acc.append(a.numpy())
            model.train()
        log.debug('acc %.5f' % np.concatenate(acc).mean())
    if args.save_dir is not None:
        F.save_dygraph(model.state_dict(), args.save_dir)
else:
    feature_column = propeller.data.FeatureColumns([
        propeller.data.TextColumn('seg_a',
                                  unk_id=tokenizer.unk_id,
                                  vocab_dict=tokenizer.vocab,
                                  tokenizer=tokenizer.tokenize),
    ])

    assert args.save_dir is not None
    sd, _ = FD.load_dygraph(args.save_dir)
    model.set_dict(sd)
    model.eval()

    def map_fn(seg_a):
        seg_a, _ = tokenizer.truncate(seg_a, [], seqlen=args.max_seqlen)
        sentence, segments = tokenizer.build_for_ernie(seg_a, [])
        return sentence, segments

    predict_ds = feature_column.build_dataset_from_stdin('predict') \
        .map(map_fn) \
        .padded_batch(args.bsz)

    shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen])
    types = ('int64', 'int64')

    predict_ds.data_shapes = shapes

             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    with dg.guard(place) as g:
        pyreader = fluid.io.PyReader(capacity=10, return_list=True)
        pyreader.decorate_batch_generator(data_loader, place)

        model = make_deepvoice3_from_hparams(hparams)
        optimizer, clipper = make_optimizer_from_hparams(hparams)
        print("Log event path: {}".format(tensorboard_dir))
        writer = SummaryWriter(tensorboard_dir) if local_rank == 0 else None
        criterion = make_loss_from_hparams(hparams)

        # loading saved model
        if args.train_postnet_only or args.train_seq2seq_only:
            assert args.checkpoint is not None, \
                "you must train part of the model from a trained whole model"
        if args.train_postnet_only:
            assert hparams.use_decoder_state_for_postnet_input is False, \
                "when training only the postnet, there is no decoder states"

        if args.checkpoint is not None:
            model_dict, optimizer_dict = dg.load_dygraph(args.checkpoint)

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = MyDataParallel(model, strategy)

        train_model(model, pyreader, criterion, optimizer, clipper, writer,
                    args, hparams)
    print("Done!")

print(" {}: {}".format(k, v)) # Load preset if specified if preset is not None: with io.open(preset) as f: hparams.parse_json(f.read()) # Override hyper parameters hparams.parse(args.hparams) assert hparams.name == "deepvoice3" place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() with dg.guard(place): # Model model = make_deepvoice3_from_hparams(hparams) dry_run(model) model_dict, _ = dg.load_dygraph(args.checkpoint) model.set_dict(model_dict) checkpoint_name = splitext(basename(checkpoint_path))[0] model.seq2seq.decoder.max_decoder_steps = max_decoder_steps if not os.path.exists(dst_dir): os.makedirs(dst_dir) with io.open(text_list_file_path, "rt", encoding="utf-8") as f: lines = f.readlines() for idx, line in enumerate(lines): text = line[:-1] words = nltk.word_tokenize(text) waveform, alignment, _, _ = tts(model, text,
    # print(np.percentile([len(row[0]) for row in dev_features], [0, 50, 95, 99, 100]))
    # to batch
    print('start training...')
    bst_f1, global_step = 0, 0
    args.max_steps = (len(train_features) // args.bsz + 1) * args.epochs

    try:
        place = F.CUDAPlace(0)
    except:
        place = F.CPUPlace()

    with FD.guard(place):
        if 'ernie' in args.from_pretrained:
            model = ErnieModelForSequenceClassification.from_pretrained(
                args.from_pretrained, num_labels=2, name='')
            if args.init_checkpoint is not None:
                print('loading checkpoint from %s' % args.init_checkpoint)
                sd, _ = FD.load_dygraph(args.init_checkpoint)
                model.set_dict(sd)
        elif 'wwm' in args.from_pretrained:
            config = json.load(
                open(os.path.join(args.from_pretrained, 'ernie_config.json'),
                     'rt', encoding='utf-8'))
            config['num_labels'] = 2
            model = ErnieModelForSequenceClassification(config)
            # print(model)
            print('loading checkpoint from %s' % 'chinese_roberta_wwm_pp')
            sd, _ = FD.load_dygraph('%s/roberta_wwm.pdparams' %
                                    args.from_pretrained)
            for k, v in model.state_dict().items():
                if k not in sd:
                    print('param:%s not set in pretrained model, skip' % k)

    # print(np.percentile([len(row[0]) for row in train_features], [0, 50, 95, 99, 100]))
    # print(np.percentile([len(row[0]) for row in dev_features], [0, 50, 95, 99, 100]))
    # to batch
    try:
        place = F.CUDAPlace(0)
    except:
        place = F.CPUPlace()

    with FD.guard(place):
        if 'wwm' in args.from_pretrained:
            config = json.load(
                open(os.path.join(args.from_pretrained, 'ernie_config.json'),
                     'rt', encoding='utf-8'))
            config['num_labels'] = 2
            model = ErnieModelForSequenceClassification(config)
            # print(model)
            print('loading checkpoint from %s' % 'chinese_roberta_wwm_pp')
            sd, _ = FD.load_dygraph('%s/roberta_wwm.pdparams' %
                                    args.from_pretrained)
            for k, v in model.state_dict().items():
                if k not in sd:
                    print('param:%s not set in pretrained model, skip' % k)
                    sd[k] = v  # FIXME: no need to do this in the future
            model.set_dict(sd)
        else:
            model = ErnieModelForSequenceClassification.from_pretrained(
                args.from_pretrained, num_labels=2, name='')
            if args.init_checkpoint is not None:
                print('loading checkpoint from %s' % args.init_checkpoint)
                sd, _ = FD.load_dygraph(args.init_checkpoint)
                model.set_dict(sd)

        test_batch_data = batchify(test_features, args.bsz, args.max_seqlen)
        if args.debug:
            print(len(test_batch_data))

                        default=None,
                        help='inference model output directory')
    parser.add_argument('--init_checkpoint', type=str, default=None)
    parser.add_argument('--save_dir', type=str, default=None,
                        help='model output directory')
    parser.add_argument('--wd', type=float, default=0.01,
                        help='weight decay, aka L2 regularizer')
    args = parser.parse_args()

    place = F.CUDAPlace(D.parallel.Env().dev_id)
    D.guard(place).__enter__()

    ernie = ErnieModelForGeneration.from_pretrained(args.from_pretrained)
    tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained,
                                               mask_token=None)
    rev_dict = {v: k for k, v in tokenizer.vocab.items()}
    rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
    rev_dict[tokenizer.unk_id] = ''  # replace [UNK]

    if args.init_checkpoint is not None:
        log.info('loading checkpoint from %s' % args.init_checkpoint)
        sd, _ = D.load_dygraph(args.init_checkpoint)
        ernie.set_dict(sd)

    seq2seq(ernie, tokenizer, args)

    for epoch in range(EPOCH):
        for step, (ids_student, ids, sids, labels) in enumerate(
                train_ds.start(place)):
            loss, logits = teacher_model(ids, labels=labels)
            loss.backward()
            if step % 10 == 0:
                print('[step %03d] teacher train loss %.5f lr %.3e' %
                      (step, loss.numpy(), opt.current_step_lr()))
            opt.minimize(loss, grad_clip=g_clip)
            teacher_model.clear_gradients()
            if step % 100 == 0:
                f1 = evaluate_teacher(teacher_model, dev_ds)
                print('teacher f1: %.5f' % f1)
    D.save_dygraph(teacher_model.state_dict(), './teacher_model')
else:
    state_dict, _ = D.load_dygraph('./teacher_model')
    teacher_model.set_dict(state_dict)
    f1 = evaluate_teacher(teacher_model, dev_ds)
    print('teacher f1: %.5f' % f1)

# hyperparameters for finetuning the student model
SEQLEN = 256
BATCH = 100
EPOCH = 10
LR = 1e-4


def evaluate_student(model, dataset):
    all_pred, all_label = [], []
    with D.base._switch_tracer_mode_guard_(is_train=False):
        model.eval()

def finetune(self,
             train_path,
             dev_path=None,
             save_dir="ernie_gen_result",
             init_ckpt_path=None,
             use_gpu=True,
             max_steps=500,
             batch_size=8,
             max_encode_len=50,
             max_decode_len=50,
             learning_rate=5e-5,
             warmup_proportion=0.1,
             weight_decay=0.1,
             noise_prob=0,
             label_smooth=0,
             beam_width=5,
             length_penalty=1.0,
             log_interval=100,
             save_interval=200):
    """
    Finetune with the specified dataset.

    Args:
        train_path(str): the train dataset path.
        dev_path(str): the dev dataset path.
        save_dir(str): the path where model params and dev dataset predictions are saved.
        init_ckpt_path(str): checkpoint path to load for incremental training.
        use_gpu(bool): use gpu or not.
        max_steps(int): max training steps.
        batch_size(int): the batch size.
        max_encode_len(int): the max encode length.
        max_decode_len(int): the max decode length.
        learning_rate(float): the learning rate.
        warmup_proportion(float): the warmup proportion.
        weight_decay(float): the weight decay magnitude.
        noise_prob(float): the noise probability. See the ERNIE-GEN paper for details.
        label_smooth(float): the label smoothing magnitude.
        beam_width(int): the beam size used when evaluating the dev dataset.
        length_penalty(float): the length penalty used when evaluating the dev dataset.
        log_interval(int): the logging interval.
        save_interval(int): the saving interval. The dev dataset is evaluated after each save.

    Return:
        result(dict): A dictionary of shape::

            {
                last_save_path(str): last model save path.
                last_ppl(float): last model ppl.
            }
    """
    self.max_encode_len = max_encode_len
    self.max_decode_len = max_decode_len
    self.noise_prob = noise_prob

    place = F.CUDAPlace(0) if use_gpu else F.CPUPlace()

    with F.dygraph.guard(place):
        if init_ckpt_path is not None:
            logger.info('loading checkpoint from %s' % init_ckpt_path)
            sd, _ = D.load_dygraph(init_ckpt_path)
            self.model.set_dict(sd)

        feature_column = propeller.data.FeatureColumns([
            propeller.data.LabelColumn('id'),
            propeller.data.TextColumn('src',
                                      unk_id=self.tokenizer.unk_id,
                                      vocab_dict=self.tokenizer.vocab,
                                      tokenizer=self.tokenizer.tokenize),
            propeller.data.TextColumn('tgt',
                                      unk_id=self.tokenizer.unk_id,
                                      vocab_dict=self.tokenizer.vocab,
                                      tokenizer=self.tokenizer.tokenize),
        ])

        train_ds = feature_column.build_dataset('train', data_file=train_path, shuffle=False, repeat=True, use_gz=False) \
            .map(self._map_fn).shuffle(10000).padded_batch(batch_size).map(self._after_padding)
        train_ds.data_shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
        train_ds.data_types = ['int64'] * 11

        if dev_path:
            dev_ds = feature_column.build_dataset('dev', data_file=dev_path, shuffle=False, repeat=False, use_gz=False) \
                .map(self._map_fn) \
                .padded_batch(1) \
                .map(self._after_padding)
            dev_ds.data_shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
            dev_ds.data_types = ['int64'] * 11

        vocab_size, _ = self.model.word_emb.weight.shape
        g_clip = F.clip.GradientClipByGlobalNorm(1.0)
        opt = AdamW(
            learning_rate=LinearDecay(learning_rate,
                                      int(warmup_proportion * max_steps),
                                      max_steps),
            parameter_list=self.model.parameters(),
            weight_decay=weight_decay,
            grad_clip=g_clip)

        loss = None
        save_path = None
        ppl = None

        if save_dir and not os.path.exists(save_dir):
            os.makedirs(save_dir)
        for step, data in enumerate(train_ds.start(place)):
            (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
             tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels) = data

            _, __, info = self.model(
                src_ids,
                sent_ids=src_sids,
                pos_ids=src_pids,
                attn_bias=mask_src_2_src,
                encode_only=True)
            cached_k, cached_v = info['caches']
            _, __, info = self.model(
                tgt_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_tgt_2_srctgt,
                past_cache=(cached_k, cached_v),
                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [
                L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
            ]
            if label_smooth > 0.:
                tgt_labels = L.label_smooth(
                    F.one_hot(tgt_labels, vocab_size), epsilon=label_smooth)
            loss, _, __ = self.model(
                attn_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_attn_2_srctgtattn,
                past_cache=(past_cache_k, past_cache_v),
                tgt_labels=tgt_labels,
                tgt_pos=L.where(attn_ids == self.tokenizer.vocab['[MASK]']))
            loss.backward()
            opt.minimize(loss)
            self.model.clear_gradients()

            if step % log_interval == 0:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                logger.info(
                    '[step %d / %d]train loss %.5f, ppl %.5f, elr %.3e' %
                    (step, max_steps, loss_np, ppl, opt.current_step_lr()))
            if save_dir and step % save_interval == 0 and step > 0:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                save_name = "step_%s_ppl_%.5f" % (step, ppl)
                save_path = os.path.join(save_dir, save_name)
                logger.info("save the model in %s" % save_path)
                F.save_dygraph(self.model.state_dict(), save_path)

                if dev_path:
                    logger.info('evaluating...')
                    res = self._evaluate(dev_ds, place, beam_width,
                                         length_penalty)
                    output_path = os.path.join(
                        save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                    logger.info('save the predict result in %s' % output_path)
                    with open(output_path, 'w') as fout:
                        fout.write('\n'.join(res))
            if step > max_steps:
                break

        if loss:
            loss_np = loss.numpy()
            ppl = np.exp(loss_np)
            logger.info('[final step %d]train loss %.5f, ppl %.5f, elr %.3e' %
                        (step, loss_np, ppl, opt.current_step_lr()))
            if save_dir:
                save_name = "step_%s_ppl_%.5f" % (step, ppl)
                save_path = os.path.join(save_dir, save_name)
                logger.info("save the model in %s" % save_path)
                F.save_dygraph(self.model.state_dict(), save_path)

                if dev_path:
                    logger.info('evaluating...')
                    res = self._evaluate(dev_ds, place, beam_width,
                                         length_penalty)
                    output_path = os.path.join(
                        save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                    logger.info('save the predict result in %s' % output_path)
                    with open(output_path, 'w') as fout:
                        fout.write('\n'.join(res))

        result = {
            "last_save_path": "%s.pdparams" % save_path,
            "last_ppl": ppl[0],
        }
        return result

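# A minimal usage sketch, not from the original sources: `module` stands for a
# PaddleHub-style object exposing the finetune() method above, and the file
# paths are placeholders. Each line of the data files is expected to carry the
# id / src / tgt columns declared by the feature_column inside finetune().
result = module.finetune(
    train_path='train.txt',
    dev_path='dev.txt',
    save_dir='ernie_gen_result',
    max_steps=500,
    batch_size=8)
print(result['last_save_path'], result['last_ppl'])
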