def evaluate(model, ds, all_examples, all_features, tokenizer, args):
    """Run span-extraction MRC evaluation and return ``(f1, em)``.

    Switches ``model`` to eval mode, scores every dev batch under
    ``P.no_grad()``, converts the raw start/end logits to answer text via
    ``mrc_metrics.make_results`` and scores them against the ground-truth
    dev file.  The model is restored to train mode before returning.

    Args:
        model: ERNIE-style MRC model returning (_, start_logits, end_logits).
        ds: batched dev dataset consumable by ``P.io.DataLoader``.
        all_examples / all_features: preprocessed dev examples/features
            required by ``mrc_metrics.make_results``.
        tokenizer: tokenizer whose ``.lower`` flag drives lower-casing.
        args: namespace with dev_file, n_best_size, max_answer_length.

    Returns:
        (f1, em) floats from ``mrc_metrics.evaluate``.
    """
    # Context manager instead of a bare open(): the original leaked the handle.
    with open(args.dev_file, encoding='utf8') as f:
        dev_file = json.loads(f.read())
    with P.no_grad():
        log.debug('start eval')
        model.eval()
        all_res = []
        for step, (uids, token_ids, token_type_ids, _, __) in enumerate(
                P.io.DataLoader(ds,
                                places=P.CUDAPlace(env.dev_id),
                                batch_size=None)):
            _, start_logits, end_logits = model(token_ids, token_type_ids)
            res = [
                mrc_metrics.RawResult(unique_id=u,
                                      start_logits=s,
                                      end_logits=e)
                for u, s, e in zip(uids.numpy(), start_logits.numpy(),
                                   end_logits.numpy())
            ]
            all_res += res
        # Debug dump of raw results; close the handle instead of leaking it.
        with open('all_res', 'wb') as f:
            f.write(pickle.dumps(all_res))
        all_pred, all_nbests = mrc_metrics.make_results(
            tokenizer,
            all_examples,
            all_features,
            all_res,
            n_best_size=args.n_best_size,
            max_answer_length=args.max_answer_length,
            do_lower_case=tokenizer.lower)
        f1, em, _, __ = mrc_metrics.evaluate(dev_file, all_pred)
        model.train()
        log.debug('done eval')
        return f1, em
def evaluate(model, ds, all_examples, all_features, tokenizer, args):
    """Run span-extraction MRC evaluation (legacy dygraph API); return ``(f1, em)``.

    Same contract as the Paddle-2.x variant: eval mode, score all dev
    batches with gradients disabled, post-process logits into predictions,
    score against the dev file, restore train mode.

    NOTE(review): iterates ``ds.start(place)`` — ``place`` is not a parameter
    and is presumably a module-level global; confirm it is defined.
    """
    # Explicit utf8 + context manager: the original used the platform-default
    # encoding (the sibling evaluate() passes encoding='utf8') and leaked
    # the file handle.
    with open(args.dev_file, encoding='utf8') as f:
        dev_file = json.loads(f.read())
    with D.base._switch_tracer_mode_guard_(is_train=False):
        log.debug('start eval')
        model.eval()
        all_res = []
        for step, (uids, token_ids, token_type_ids, _, __) in enumerate(
                ds.start(place)):
            _, start_logits, end_logits = model(token_ids, token_type_ids)
            res = [
                mrc_metrics.RawResult(unique_id=u,
                                      start_logits=s,
                                      end_logits=e)
                for u, s, e in zip(uids.numpy(), start_logits.numpy(),
                                   end_logits.numpy())
            ]
            all_res += res
        # Debug dump of raw results; close the handle instead of leaking it.
        with open('all_res', 'wb') as f:
            f.write(pickle.dumps(all_res))
        all_pred, all_nbests = mrc_metrics.make_results(
            tokenizer,
            all_examples,
            all_features,
            all_res,
            n_best_size=args.n_best_size,
            max_answer_length=args.max_answer_length,
            do_lower_case=tokenizer.lower)
        f1, em, _, __ = mrc_metrics.evaluate(dev_file, all_pred)
        model.train()
        log.debug('done eval')
        return f1, em
def build_bb(from_file, to_file):
    """Convert blank-line-separated text records into serialized examples.

    Reads ``from_file`` line by line, accumulating parsed lines ("slots")
    until a blank line (or EOF) terminates the record, then transposes the
    accumulated slots, builds one example and writes it gzip-compressed to
    ``to_file``.

    Args:
        from_file: iterable of text lines; records are separated by blank lines.
        to_file: destination handle/path forwarded to ``write_gz``.

    NOTE(review): relies on module-level ``args``, ``parse_txt``,
    ``build_example`` and ``write_gz``.
    """

    def _flush(slots):
        # One record is complete: columns-from-rows transpose, serialize, write.
        transposed_slots = list(zip(*slots))
        ex = build_example(transposed_slots)
        write_gz(ex.SerializeToString(), to_file)

    slots = []
    for i, line in enumerate(from_file):
        line = line.strip()
        if args.verbose and i % 10000 == 0:
            log.debug(i)  # progress heartbeat every 10k input lines
        if len(line) == 0:
            # Blank line ends the current record (if any was accumulated).
            if len(slots) != 0:
                _flush(slots)
                slots = []
            continue
        slots.append(parse_txt(line))
    # Flush a trailing record that was not followed by a blank line.
    if len(slots) != 0:
        _flush(slots)
def train(model, train_dataset, dev_dataset, dev_examples, dev_features,
          tokenizer, args):
    """Fine-tune an MRC model under legacy Paddle dygraph data parallelism.

    Wraps ``model`` in ``DataParallel``, trains with AdamW and global-norm
    gradient clipping, logs the loss every 10 steps and runs ``evaluate``
    every 100 steps (rank 0 only), stopping after ``max_steps``.

    Args:
        model: ERNIE-style model returning (loss, start_logits, end_logits)
            when called with start/end positions.
        train_dataset / dev_dataset: dataset pipelines with
            ``.repeat/.shard/.shuffle/.padded_batch`` and ``.start``.
        dev_examples, dev_features, tokenizer: forwarded to ``evaluate``.
        args: namespace with epoch, bsz, lr, wd.
    """
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)
    # NOTE(review): `train_features` is not a parameter — presumably a
    # module-level global (the dataset's feature list); confirm it is
    # defined, otherwise this raises NameError.
    max_steps = len(train_features) * args.epoch // args.bsz
    opt = AdamW(learning_rate=args.lr,
                parameter_list=model.parameters(),
                weight_decay=args.wd)
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)  #experimental
    # Infinite repeat + per-rank shard + shuffle buffer + padded batching;
    # termination is handled by the `step > max_steps` check below.
    train_dataset = train_dataset \
        .repeat() \
        .shard(D.parallel.Env().nranks, D.parallel.Env().dev_id) \
        .shuffle(1000) \
        .padded_batch(args.bsz)
    log.debug('init training with args: %s' % repr(args))
    # NOTE(review): `place` is not a parameter — presumably a module global.
    for step, (_, token_ids, token_type_ids, start_pos,
               end_pos) in enumerate(train_dataset.start(place)):
        loss, _, __ = model(token_ids,
                            token_type_ids,
                            start_pos=start_pos,
                            end_pos=end_pos)
        # DataParallel protocol: scale the loss, backprop, then all-reduce
        # gradients across ranks before the optimizer step.
        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss, grad_clip=g_clip)
        model.clear_gradients()
        if D.parallel.Env().dev_id == 0 and step % 10 == 0:
            log.debug('[step %d] train loss %.5f lr %.3e' %
                      (step, loss.numpy(), opt.current_step_lr()))
        if D.parallel.Env().dev_id == 0 and step % 100 == 0:
            f1, em = evaluate(model, dev_dataset, dev_examples, dev_features,
                              tokenizer, args)
            log.debug('[step %d] eval result: f1 %.5f em %.5f' %
                      (step, f1, em))
        if step > max_steps:
            break
def tokenizer(sen):
    """Split a byte string into tokens on single ASCII spaces."""
    log.debug(sen)
    pieces = sen.split(b' ')
    return pieces
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 use_lamb=False,
                 use_dynamic_loss_scaling=False,
                 init_loss_scaling=1.0,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8,
                 layer_decay_rate=0.0,
                 n_layers=12):
    """Attach an optimizer (Adam or Lamb) with LR scheduling, optional fp16
    master-weight training, manual weight decay and layer-wise LR decay to
    ``train_program``.

    Returns:
        (scheduled_lr, loss_scaling): the learning-rate variable and the
        loss-scaling global variable (meaningful when ``use_fp16``).

    Raises:
        ValueError: for an unknown ``scheduler`` name when warmup is enabled.
    """

    def exclude_from_weight_decay(param):
        # Parameters excluded from manual weight decay: any layer-norm
        # parameter and any bias.
        # BUG FIX: the original used name.rstrip('.master'), but str.rstrip
        # strips a trailing run of *characters* from the set
        # {'.','m','a','s','t','e','r'}, not the literal suffix — e.g. it
        # turns 'encoder' into 'encod' — corrupting the suffix matching
        # below.  Strip the fp16 master-weight suffix explicitly instead.
        name = param.name
        if name.endswith('.master'):
            name = name[:-len('.master')]
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
                .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
                            warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        if not use_lamb:
            log.debug('using Adam')
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        else:
            log.debug('using Lamb')
            optimizer = fluid.optimizer.Lamb(
                learning_rate=scheduled_lr,
                lamb_weight_decay=weight_decay,
                exclude_from_weight_decay_fn=exclude_from_weight_decay)
    else:
        # No warmup: use a constant LR held in a persistable global var so
        # it can still be fetched/overwritten at runtime.
        scheduled_lr = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        if not use_lamb:
            log.debug('using Adam')
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        else:
            log.debug('using Lamb')
            optimizer = fluid.optimizer.Lamb(
                learning_rate=scheduled_lr,
                lamb_weight_decay=weight_decay,
                exclude_from_weight_decay_fn=exclude_from_weight_decay)
        # Register the manually-created LR var with the optimizer so
        # current-LR lookups resolve against the main program.
        optimizer._learning_rate_map[fluid.default_main_program(
        )] = scheduled_lr

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))

    param_list = dict()

    loss_scaling = fluid.layers.create_global_var(
        name=fluid.unique_name.generate("loss_scaling"),
        shape=[1],
        value=init_loss_scaling,
        dtype='float32',
        persistable=True)

    if use_fp16:
        from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling
        loss *= loss_scaling
        param_grads = optimizer.backward(loss)
        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)
        # Snapshot pre-update master weights (stop_gradient copies) so the
        # decoupled weight-decay term below uses the old parameter value.
        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True
        if use_dynamic_loss_scaling:
            apply_dynamic_loss_scaling(loss_scaling, master_param_grads,
                                       incr_every_n_steps,
                                       decr_every_n_nan_or_inf, incr_ratio,
                                       decr_ratio)
        optimizer.apply_gradients(master_param_grads)
        # Lamb applies weight decay internally; otherwise decay manually.
        if not use_lamb and weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)
        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)
    else:
        # Snapshot pre-update weights (stop_gradient copies) for the manual
        # weight-decay / layer-decay terms below.
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True
        _, param_grads = optimizer.minimize(loss)
        if layer_decay_rate > 0:
            for param, grad in param_grads:
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("layer_decay"):
                    param_decay = layer_decay(param, param_list[param.name],
                                              scheduled_lr, layer_decay_rate,
                                              n_layers)
                    if param_decay:
                        fluid.layers.assign(output=param, input=param_decay)
        if not use_lamb and weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
loss.backward() scaler.minimize(opt, loss) model.clear_gradients() lr_scheduler.step() if step % 10 == 0: _lr = lr_scheduler.get_lr() if args.use_amp: _l = (loss / scaler._scale).numpy() msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % ( step, _l, _lr, scaler._scale.numpy()) else: _l = loss.numpy() msg = '[step-%d] train loss %.5f lr %.3e' % ( step, _l, _lr) log.debug(msg) log_writer.add_scalar('loss', _l, step=step) log_writer.add_scalar('lr', _lr, step=step) if step % 100 == 0: acc = [] with P.no_grad(): model.eval() for step, d in enumerate( P.io.DataLoader(dev_ds, places=P.CUDAPlace(0), batch_size=None)): ids, sids, label = d loss, logits = model(ids, sids, labels=label) a = (logits.argmax(-1) == label) acc.append(a.numpy())
args.from_pretrained, num_labels=3, name='') opt = AdamW(learning_rate=LinearDecay( args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd) g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental for epoch in range(args.epoch): for step, d in enumerate( tqdm(train_ds.start(place), desc='training')): ids, sids, label = d loss, _ = model(ids, sids, labels=label) loss.backward() if step % 10 == 0: log.debug('train loss %.5f lr %.3e' % (loss.numpy(), opt.current_step_lr())) opt.minimize(loss, grad_clip=g_clip) model.clear_gradients() if step % 100 == 0: acc = [] with FD.base._switch_tracer_mode_guard_(is_train=False): model.eval() for step, d in enumerate( tqdm(dev_ds.start(), desc='evaluating %d' % epoch)): ids, sids, label = d loss, logits = model(ids, sids, labels=label) #print('\n'.join(map(str, logits.numpy().tolist()))) a = L.argmax(logits, -1) == label acc.append(a.numpy()) model.train()
def greedy_search_infilling(model,
                            q_ids,
                            q_sids,
                            sos_id,
                            eos_id,
                            attn_id,
                            max_encode_len=640,
                            max_decode_len=100):
    """Greedy decoding for ERNIE-GEN-style infilling.

    Encodes the source once, then decodes one token per step by feeding a
    (generated-token, [ATTN]) pair and growing the key/value caches, until
    every sequence in the batch has emitted ``eos_id`` or ``max_decode_len``
    is reached.

    Args:
        model: ERNIE-GEN model returning (_, logits, info) with
            ``info['caches']`` = (keys, values) per layer.
        q_ids / q_sids: source token and sentence-type id tensors; id 0 is
            treated as padding for length computation.
        sos_id / eos_id / attn_id: special token ids.
        max_encode_len / max_decode_len: length limits (encode limit unused
            here; kept for interface compatibility).

    Returns:
        np.ndarray of shape [batch, emitted_steps] with generated token ids
        (entries after a sequence's EOS are padding of later greedy picks).
    """
    model.eval()
    # Encode the source once; info['caches'] seeds the incremental decoder.
    _, logits, info = model(q_ids, q_sids)
    d_batch, d_seqlen = q_ids.shape
    # Per-sample true length (id 0 == pad), used to offset decode positions.
    seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True)
    log.debug(seqlen.numpy())
    log.debug(d_seqlen)
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24; np.bool_ is
    # the drop-in replacement with identical behavior.
    has_stopped = np.zeros([d_batch], dtype=np.bool_)
    gen_seq_len = np.zeros([d_batch], dtype=np.int64)
    output_ids = []

    past_cache = info['caches']

    cls_ids = L.ones([d_batch], dtype='int64') * sos_id
    attn_ids = L.ones([d_batch], dtype='int64') * attn_id
    ids = L.stack([cls_ids, attn_ids], -1)
    for step in range(max_decode_len):
        log.debug('decode step %d' % step)
        bias = gen_bias(q_ids, ids, step)
        pos_ids = D.to_variable(
            np.tile(np.array([[step, step + 1]], dtype=np.int64),
                    [d_batch, 1]))
        pos_ids += seqlen
        _, logits, info = model(ids,
                                L.ones_like(ids) * 3,
                                pos_ids=pos_ids,
                                attn_bias=bias,
                                past_cache=past_cache)
        gen_ids = L.argmax(logits, -1)

        past_cached_k, past_cached_v = past_cache
        cached_k, cached_v = info['caches']
        # Append only the first of the two new positions (the real token;
        # the [ATTN] slot is a query placeholder) to the running caches.
        cached_k = [
            L.concat([pk, k[:, :1, :]], 1)
            for pk, k in zip(past_cached_k, cached_k)
        ]  # concat cached
        cached_v = [
            L.concat([pv, v[:, :1, :]], 1)
            for pv, v in zip(past_cached_v, cached_v)
        ]
        past_cache = (cached_k, cached_v)

        # Position 1 holds the prediction made at the [ATTN] query slot.
        gen_ids = gen_ids[:, 1]
        ids = L.stack([gen_ids, attn_ids], 1)

        gen_ids = gen_ids.numpy()
        has_stopped |= (gen_ids == eos_id).astype(np.bool_)
        gen_seq_len += (1 - has_stopped.astype(np.int64))
        output_ids.append(gen_ids.tolist())
        if has_stopped.all():
            break
    output_ids = np.array(output_ids).transpose([1, 0])
    return output_ids
def create_model(args, phase, micro_bsz, dp_sharding_rank, dp_worldsize, topo):
    """Build the static-graph ERNIE pretraining model and its data loader.

    Declares feed inputs (MLM-only, or MLM+SOP when ``args.use_sop``),
    constructs the ErnieModel and its loss, and wires a generator-backed
    DataLoader over the pretraining dataset shard for this worker.

    Args:
        args: namespace (use_sop, max_seq_len, data_dir, vocab_file,
            ernie_config_file, preln, num_pp).
        phase: unused here beyond documentation ('train').
        micro_bsz: micro-batch size for the reader.
        dp_sharding_rank / dp_worldsize: this worker's data shard id / count.
        topo: parallelism topology (dp/pp/mp/sharding).

    Returns:
        dict of graph variables: data_loader, losses, checkpoints, and
        sop metrics when enabled.
    """
    # Reader implementation differs depending on whether SOP is trained.
    if args.use_sop:
        from reader.pretraining_ds_ernie_full_sent import make_pretrain_dataset
    else:
        from reader.pretraining_ds_mlm import make_pretrain_dataset
    # mask_label, mask_pos for mlm, labels for sop
    if args.use_sop:
        input_fields = {
            'names': ['src_ids', 'sent_ids', 'mask_label', 'mask_pos', 'labels'],
            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                       [-1, 1], [-1, 1], [-1, 1]],
            'dtypes': ['int64', 'int64', 'int64', 'int64', 'int64'],
            'lod_levels': [0, 0, 0, 0, 0],
        }
    else:
        input_fields = {
            'names': ['src_ids', 'sent_ids', 'mask_label', 'mask_pos'],
            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                       [-1, 1], [-1, 1]],
            'dtypes': ['int64', 'int64', 'int64', 'int64'],
            'lod_levels': [0, 0, 0, 0],
        }

    # Feeds must live on the first pipeline stage.
    with fluid.device_guard("gpu:0"):
        inputs = [
            fluid.data(name=input_fields['names'][i],
                       shape=input_fields['shapes'][i],
                       dtype=input_fields['dtypes'][i],
                       lod_level=input_fields['lod_levels'][i])
            for i in range(len(input_fields['names']))
        ]

    if args.use_sop:
        (src_ids, sent_ids, mask_label, mask_pos, labels) = inputs
    else:
        (src_ids, sent_ids, mask_label, mask_pos) = inputs

    train_file_list = glob.glob(args.data_dir + "/*")
    # Vocab file format: one "<token>\t<id>" pair per line.
    vocab = {}
    with open(args.vocab_file) as r:
        for line in r:
            lines = line.strip().split('\t')
            vocab[lines[0]] = int(lines[1])

    log.debug("========= worker: {} of {} ==========".format(
        dp_sharding_rank, dp_worldsize))
    data_reader = make_pretrain_dataset('pt', train_file_list, True, vocab,
                                        micro_bsz, len(vocab),
                                        args.max_seq_len, dp_sharding_rank,
                                        dp_worldsize)

    with fluid.device_guard("gpu:0"):
        data_loader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                         capacity=70,
                                                         iterable=False)
    places = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    def data_gen():
        # Thin wrapper so the DataLoader re-creates the reader iterator.
        yield from data_reader

    data_loader.set_batch_generator(data_gen, places)

    ernie_config = ErnieConfig(args.ernie_config_file)._config_dict
    ernie_config["preln"] = args.preln
    weight_sharing = (topo.mp.size == 1 and topo.pp.size == 1
                      )  # pp mp should not do weight sharing
    with fluid.device_guard("gpu:0"):
        ernie = ErnieModel(src_ids,
                           sent_ids,
                           ernie_config,
                           weight_sharing=weight_sharing,
                           topo=topo)
    checkpoints = ernie._checkpoints
    checkpoints.pop(-1)
    # Losses are computed on the last pipeline stage.
    with fluid.device_guard(f'gpu:{args.num_pp-1}'):
        mask_lm_loss, mean_mask_lm_loss = ernie.get_lm_output(
            mask_label, mask_pos)
        total_loss = mean_mask_lm_loss
        if args.use_sop:
            sop_acc, mean_sop_loss = ernie.get_next_sentence_output(labels)
            total_loss += mean_sop_loss

    # Under pipeline parallelism the fetched vars must be persistable so
    # they survive across micro-batch scopes.
    if topo.pp.size > 1:
        mask_lm_loss.persistable = True
        mean_mask_lm_loss.persistable = True
        # checkpoints.extend([mask_lm_loss.name, mean_mask_lm_loss.name])
        if args.use_sop:
            mean_sop_loss.persistable = True
            sop_acc.persistable = True
            # checkpoints.extend([mean_sop_loss.name, sop_acc.name])
        total_loss.persistable = True
        # checkpoints.append(total_loss.name)

    if args.use_sop:
        graph_vars = {
            'data_loader': data_loader,
            'mask_lm_loss': mask_lm_loss,
            'mean_mask_lm_loss': mean_mask_lm_loss,
            'sop_loss': mean_sop_loss,
            'sop_acc': sop_acc,
            'total_loss': total_loss,
            'checkpoints': checkpoints
        }
    else:
        graph_vars = {
            'data_loader': data_loader,
            'mask_lm_loss': mask_lm_loss,
            'mean_mask_lm_loss': mean_mask_lm_loss,
            'total_loss': total_loss,
            'checkpoints': checkpoints,
        }
    return graph_vars
def train(args):
    """Run distributed ERNIE pretraining with fleet (dp/pp/mp/sharding).

    Builds the static train program via ``create_model``, wraps the
    optimizer with fleet's distributed strategies (recompute, sharding,
    pipeline, AMP, Lamb), then runs the fetch/log/save loop.

    Args:
        args: full pretraining configuration namespace (batch sizes,
            parallelism degrees, LR schedule, checkpointing, logging...).
    """
    log.info("pretraining start")
    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    # Seed every RNG source; the local seed is offset per worker so dropout
    # differs across ranks while remaining reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    # define execution strategy
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    # define distribution strategy
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3

    if args.use_recompute:
        log.info("using recompute.")
    dist_strategy.recompute = args.use_recompute
    dist_strategy.sharding = args.use_sharding
    dist_strategy.pipeline = args.num_pp > 1

    # define topology structure for dp/pp/mp
    topo = Topology(rank=fleet.worker_index(),
                    world_size=fleet.worker_num(),
                    dp=args.num_dp,
                    pp=args.num_pp,
                    sharding=args.num_sharding,
                    mp=args.num_mp)

    is_last = False
    if topo.pp.rank == (topo.pp.size - 1):
        is_last = True

    dp_sharding_rank = topo.dp.rank * topo.sharding.size + topo.sharding.rank
    dp_worldsize = topo.dp.size * topo.sharding.size
    bsz_per_dp = args.global_bsz // dp_worldsize

    micro_bsz = args.micro_bsz
    # BUG FIX: the failure message referenced nonexistent `args.bsz`, which
    # would raise AttributeError instead of the intended AssertionError.
    assert args.global_bsz % micro_bsz == 0, f"cannot do gradient accumulate, global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}"
    acc_steps = bsz_per_dp // micro_bsz

    # sharding \ model parallel \ pipeline
    assert dist_strategy.sharding == True
    dist_strategy.sharding_configs = {
        "segment_broadcast_MB": 32,
        "sharding_degree": args.num_sharding,
        "mp_degree": args.num_mp,
        "pp_degree": args.num_pp,
        "dp_degree": args.num_dp,
        "optimize_offload": True,
    }
    dist_strategy.pipeline_configs = {
        "schedule_mode": "1F1B",
        "micro_batch_size": micro_bsz,
        "accumulate_steps": acc_steps,
    }
    log.info(
        f"using globa_bsz: {args.global_bsz} micro_bsz: {micro_bsz}, acc_steps: {acc_steps}"
    )

    dist_strategy.amp = args.use_amp
    dist_strategy.amp_configs = {
        "custom_white_list": ['softmax', 'layer_norm', 'gelu'],
        "init_loss_scaling": 32768,
        "decr_every_n_nan_or_inf": 2,
        "incr_every_n_steps": 1000,
        "incr_ratio": 2.0,
        "use_dynamic_loss_scaling": True,
        "decr_ratio": 0.5,
        "use_pure_fp16": False,
        "use_fp16_guard": False,
    }

    dist_strategy.lamb = args.use_lamb
    dist_strategy.lamb_configs = {
        'lamb_weight_decay': 0.01,
        'exclude_from_weight_decay':
        ['layer_norm_bias', 'layer_norm_scale', '.b_0']
    }

    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            graph_vars = create_model(args, 'train', micro_bsz,
                                      dp_sharding_rank, dp_worldsize, topo)
            data_loader = graph_vars['data_loader']
            for op in train_program.global_block().ops:
                if op.type == 'fill_constant':
                    op._set_attr(
                        'op_device', "gpu:0"
                    )  # XXX: hack: https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/tensor.py#L1376
            if args.use_recompute:
                dist_strategy.recompute_configs = {
                    "checkpoints": graph_vars['checkpoints'],
                    # "enable_offload": args.use_offload,
                    # "checkpoint_shape": [micro_bsz, args.max_seq_len, 4096],
                }

            log.debug("base lr: {}".format(args.learning_rate))
            scheduled_lr = linear_warmup_decay(
                learning_rate=args.learning_rate,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps)

            clip_norm_thres = 1.0
            # Prefer the fused AdamW kernel when it JIT-compiles; plain Adam
            # otherwise (weight decay then handled externally).
            if paddlenlp.ops.optimizer._jit_compile():
                optimizer = paddlenlp.ops.optimizer.AdamwOptimizer(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    weight_decay=args.weight_decay,
                    apply_decay_param_fun=apply_weight_decay_fun)
            else:
                optimizer = fluid.optimizer.Adam(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    #multi_precision=True,
                    #weight_decay=args.weight_decay, # merge this pr to use weight_decay: https://github.com/PaddlePaddle/Paddle/pull/29248
                    #exclude_from_weight_decay_fn=exclude_from_weight_decay
                )

            optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
            log.info(f"using dist strategy: {dist_strategy}")
            optimizer.minimize(graph_vars['total_loss'])

    final_strategy = fleet._final_strategy()
    applied_meta_list = fleet._get_applied_meta_list()
    log.info("final strategy: {}".format(final_strategy))
    log.info("applied_meta_list: {}".format(applied_meta_list))

    # Dump the compiled programs for offline debugging, one file per GPU.
    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)
    with open(
            program_desc_dir + "/main_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(train_program))
    with open(
            program_desc_dir + "/startup_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(startup_program))

    exe = fluid.Executor(place)
    exe.run(startup_program)
    # NOTE(review): amp_init is called unconditionally, even when
    # args.use_amp is off — confirm this is intended/safe.
    optimizer.amp_init(place)

    #save_path = os.path.join(args.output_dir, 'step_0')
    #log.debug("saving models to {}".format(save_path))
    #save_persistables(exe, save_path, train_program)

    if args.init_checkpoint and args.init_checkpoint != "":
        log.info(' ')
        log.info(
            '############################WARNING############################')
        log.info(
            '####### using ini_checkpoint, not init_pretraining_params ####')
        log.info(
            '## meaning hyper param e.g. lr will inherit from checkpoint ##')
        log.info(
            '###############################################################')
        init_checkpoint(exe, args.init_checkpoint, train_program)
        log.info(' ')

    output_dir = args.output_dir
    save_steps = args.save_steps
    total_time = 0
    cost_vals, lm_losses, sop_accs = [], [], []
    # Kept for the (currently commented-out) resume-skip logic below.
    global_steps = args.global_steps + 1
    steps = 0
    log_path = 'train_log/node-%d' % fleet.worker_index()
    start_time = time.time()
    with LogWriter(os.path.join(args.output_dir, log_path)) as swriter:
        data_loader.start()
        while True:
            #if steps < global_steps:
            #    steps += 1
            #    continue
            # Only the last pipeline stage holds the losses to fetch.
            if not is_last:
                fetch_list = []
            else:
                fetch_list = [
                    graph_vars['total_loss'],
                    graph_vars['mean_mask_lm_loss'], scheduled_lr
                ]
                if args.use_sop:
                    fetch_list.extend(
                        [graph_vars['sop_acc'], graph_vars['sop_loss']])
                if args.use_amp:
                    loss_scaling = train_program.global_block(
                    ).vars['loss_scaling_0']
                    fetch_list.append(loss_scaling)
            ret = exe.run(train_program, fetch_list=fetch_list
                          )  # run one mini-batch(=acc_steps micro-batch)
            #use_program_cache=True)
            steps += 1
            if is_last:
                # Unpack in the same order the fetch_list was assembled.
                if args.use_sop and args.use_amp:
                    cost_val, lm_loss, lr, sop_acc, sop_loss, loss_scaling_0 = ret
                elif args.use_sop:
                    cost_val, lm_loss, lr, sop_acc, sop_loss = ret
                elif args.use_amp:
                    cost_val, lm_loss, lr, loss_scaling_0 = ret
                else:
                    cost_val, lm_loss, lr = ret
                cost_vals.append(cost_val[0])
                lm_losses.append(lm_loss[0])
                if args.use_sop:
                    sop_accs.append(sop_acc[0])

                if steps > 0 and (steps % args.log_steps) == 0:
                    end_time = time.time()
                    total_time = end_time - start_time
                    cost_val = np.mean(cost_vals)
                    lm_loss = np.mean(lm_losses)
                    swriter.add_scalar('loss/total_loss', cost_val, steps)
                    swriter.add_scalar('loss/mlm_loss', lm_loss, steps)
                    swriter.add_scalar('lr/scheduled_lr', lr[0], steps)
                    if args.use_sop:
                        sop_acc = np.mean(sop_accs)
                        swriter.add_scalar('loss/sop_loss', sop_loss, steps)
                        swriter.add_scalar('train/sop_acc', sop_acc, steps)
                    else:
                        sop_acc = 0.0
                    if args.use_amp:
                        swriter.add_scalar('lr/loss_scaling',
                                           loss_scaling_0[0], steps)
                    else:
                        loss_scaling_0 = [0.0]
                    log.info(
                        "worker_index: %d, step: %d, cost: %f, "
                        "mlm loss: %f, sentence order acc: %f, "
                        "speed: %f steps/s, "
                        "speed: %f samples/s, "
                        "speed: %f tokens/s, "
                        "learning rate: %.3e, loss_scalings: %f" %
                        (fleet.worker_index(), steps, cost_val, lm_loss,
                         sop_acc, args.log_steps / total_time,
                         args.log_steps * args.global_bsz / total_time,
                         args.log_steps * args.global_bsz * args.max_seq_len /
                         total_time, lr[0], loss_scaling_0[0]))
                    cost_vals, lm_losses, sop_accs = [], [], []
                    start_time = time.time()

            # TODO: add evaluation
            if steps > 0 and args.eval_steps > 0 and steps % args.eval_steps == 0:
                pass

            if steps > 0 and args.save_steps > 0 and steps % args.save_steps == 0:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir, 'step_' + str(steps))
                log.debug("saving models to {}".format(save_path))
                save_persistables(exe, save_path, train_program)

            if steps == args.num_train_steps:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir,
                                         'final_step_' + str(steps))
                save_persistables(exe, save_path, train_program)
                log.debug("saving final models to {}".format(save_path))
                # NOTE(review): no `break` here — the loop apparently only
                # terminates when the data loader is exhausted; confirm.
                log.debug("end of training, total steps: {}".format(steps))
with FD.guard(): model = ErnieModelForTokenClassification.from_pretrained( args.from_pretrained, num_labels=7, name='') opt = AdamW(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters(), weight_decay=0.01) #opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters()) for epoch in range(args.epoch): for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start())): loss, _ = model(ids, sids, labels=aligned_label) loss.backward() if step % 10 == 0: log.debug('train loss %.5f' % loss.numpy()) opt.minimize(loss) model.clear_gradients() if step % 100 == 0: all_pred, all_label = [], [] with FD.base._switch_tracer_mode_guard_(is_train=False): model.eval() for step, (ids, sids, aligned_label, label, orig_pos) in enumerate( tqdm(dev_ds.start())): loss, logits = model(ids, sids, labels=aligned_label) #print('\n'.join(map(str, logits.numpy().tolist()))) for pos, lo, la in zip(orig_pos.numpy(),
def train(model, train_dataset, dev_dataset, dev_examples, dev_features,
          tokenizer, args):
    """Fine-tune an MRC model with Paddle 2.x DataParallel + optional AMP.

    Trains with AdamW under a warmup-then-linear-decay LR schedule and
    global-norm gradient clipping; rank 0 logs loss every 10 steps, and
    every 100 steps runs ``evaluate`` and checkpoints the model.

    Args:
        model: ERNIE-style model returning (loss, start_logits, end_logits).
        train_dataset / dev_dataset: dataset pipelines
            (``.cache_shuffle_shard`` / ``.padded_batch``).
        dev_examples, dev_features, tokenizer: forwarded to ``evaluate``.
        args: namespace with epoch, bsz, lr, wd, warmup_proportion,
            use_amp, save_dir.
    """
    model = P.DataParallel(model)
    # NOTE(review): `train_features` is not a parameter — presumably a
    # module-level global (the training feature list); confirm it is
    # defined, otherwise this raises NameError.
    max_steps = len(train_features) * args.epoch // args.bsz

    g_clip = P.nn.ClipGradByGlobalNorm(1.0)  #experimental
    lr_scheduler = P.optimizer.lr.LambdaDecay(
        args.lr,
        get_warmup_and_linear_decay(max_steps,
                                    int(args.warmup_proportion * max_steps)))

    opt = P.optimizer.AdamW(lr_scheduler,
                            parameters=model.parameters(),
                            weight_decay=args.wd,
                            grad_clip=g_clip)

    # Shard per rank (drop_last keeps batch counts aligned across ranks),
    # then pad-batch.
    train_dataset = train_dataset \
        .cache_shuffle_shard(env.nranks, env.dev_id, drop_last=True) \
        .padded_batch(args.bsz)

    log.debug('init training with args: %s' % repr(args))
    # GradScaler is a no-op pass-through when AMP is disabled.
    scaler = P.amp.GradScaler(enable=args.use_amp)
    create_if_not_exists(args.save_dir)
    with P.amp.auto_cast(enable=args.use_amp):
        for step, (_, token_ids, token_type_ids, start_pos,
                   end_pos) in enumerate(
                       P.io.DataLoader(train_dataset,
                                       places=P.CUDAPlace(env.dev_id),
                                       batch_size=None)):
            loss, _, __ = model(token_ids,
                                token_type_ids,
                                start_pos=start_pos,
                                end_pos=end_pos)
            loss = scaler.scale(loss)
            loss.backward()
            # scaler.minimize unscales gradients, skips the step on
            # inf/nan, and updates the loss scale.
            scaler.minimize(opt, loss)
            model.clear_gradients()
            lr_scheduler.step()

            if env.dev_id == 0 and step % 10 == 0:
                _lr = lr_scheduler.get_lr()
                if args.use_amp:
                    # Un-scale the loss for logging only.
                    _l = (loss / scaler._scale).numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
                else:
                    _l = loss.numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
                        env.dev_id, step, _l, _lr)
                log.debug(msg)

            if env.dev_id == 0 and step % 100 == 0:
                f1, em = evaluate(model, dev_dataset, dev_examples,
                                  dev_features, tokenizer, args)
                log.debug('[step %d] eval result: f1 %.5f em %.5f' %
                          (step, f1, em))
                # NOTE(review): assumes args.save_dir is a pathlib.Path
                # (uses the `/` operator) — confirm against the caller.
                if env.dev_id == 0 and args.save_dir is not None:
                    P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
            if step > max_steps:
                break
def seq2seq(model, tokenizer, args):
    """Train an ERNIE-GEN style seq2seq model (paddle 2.x API).

    Builds source/target/attention-query token streams with the special
    `[ATTN]` token, the three ERNIE-GEN attention masks, and runs an
    AMP-enabled DataParallel training loop with AdamW + warmup/linear
    decay. Periodically checkpoints and calls the module-level
    `evaluate` for prediction. See https://arxiv.org/abs/2001.11314.

    Relies on module-level globals: `env`, `log`, `propeller`, `P`, `F`,
    `np`, `re`, `os`, `deepcopy`, `get_warmup_and_linear_decay`,
    `create_if_not_exists`, `evaluate`.
    """
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        # Build a (batch, query_len, key_len) float mask from padded ids.
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:
            # NOTE(review): the next line is a no-op comparison, not an
            # assignment; harmless since this branch already builds the
            # all-zero ("empty") mask below — presumably a leftover.
            mask_type == 'empty'
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        # Corrupt a `noise_prob` fraction of target ids in place, either
        # with random vocab ids or with the dedicated [NOISE] token.
        if args.use_random_noice:
            noice_ids = np.random.randint(
                1, len(tokenizer.vocab), size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        # Per-example featurization: truncate, add ERNIE special tokens,
        # build position/segment ids and the [ATTN] query stream.
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))
        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # continues position
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id
        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            # Keep the clean ids as labels before corrupting the input.
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  #corrupted
        else:
            tgt_labels = tgt_ids
        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        Build the three attention masks used by ERNIE-GEN
        (for details, see Fig.3 of https://arxiv.org/abs/2001.11314):

            ***   src, tgt, attn
            src    00,  01,  11
            tgt    10,  11,  12
            attn   20,  21,  22

                  s1, s2 | t1 t2 t3 | attn1 attn2 attn3
          s1       1,  1 |  0  0  0 |   0     0     0
          s2       1,  1 |  0  0  0 |   0     0     0
          t1       1,  1 |  1  0  0 |   0     0     0
          t2       1,  1 |  1  1  0 |   0     0     0
          t3       1,  1 |  1  1  1 |   0     0     0
          attn1    1,  1 |  0  0  0 |   1     0     0
          attn2    1,  1 |  1  0  0 |   0     1     0
          attn3    1,  1 |  1  1  0 |   0     0     1
        '''
        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)
        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)
        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        The full joint mask/ids (kept for reference, unused):
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)
        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)
        '''
        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)
        # Drop padding (id 0) positions from the label stream.
        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn(
            'src', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
        propeller.data.TextColumn(
            'tgt', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
    ])
    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
        .map(map_fn) \
        .padded_batch(args.bsz) \
        .map(after_padding)
    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
        .map(map_fn) \
        .padded_batch(args.eval_bsz) \
        .map(after_padding) \
        .shard(env.nranks, env.dev_id)
    vocab_size, _ = model.word_emb.weight.shape
    model = P.DataParallel(model)
    g_clip = P.nn.ClipGradByGlobalNorm(1.0)
    # Exclude LayerNorm scales/biases and bias params from weight decay.
    param_name_to_exclue_from_weight_decay = re.compile(
        r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
    lr_scheduler = P.optimizer.lr.LambdaDecay(
        args.lr,
        get_warmup_and_linear_decay(
            args.max_steps, int(args.warmup_proportion * args.max_steps)))
    opt = P.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.wd,
        apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n),
        grad_clip=g_clip)
    scaler = P.amp.GradScaler(enable=args.use_amp)
    attn_id = tokenizer.vocab[args.attn_token]
    create_if_not_exists(args.save_dir)
    if args.predict_output_dir:
        create_if_not_exists(args.predict_output_dir)
    with P.amp.auto_cast(enable=args.use_amp):
        for step, data in enumerate(
                P.io.DataLoader(
                    train_ds, places=P.CUDAPlace(env.dev_id),
                    batch_size=None)):
            (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
             tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels) = data
            # Pass 1: encode the source; cache keys/values per layer.
            _, __, info = model(
                src_ids,
                sent_ids=src_sids,
                pos_ids=src_pids,
                attn_bias=mask_src_2_src,
                encode_only=True)
            cached_k, cached_v = info['caches']
            # Pass 2: encode the (possibly corrupted) target on top of
            # the source cache.
            _, __, info = model(
                tgt_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_tgt_2_srctgt,
                past_cache=(cached_k, cached_v),
                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [
                P.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                P.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
            ]
            tgt_labels = F.one_hot(tgt_labels, vocab_size)
            if args.label_smooth > 0.:
                tgt_labels = F.label_smooth(
                    tgt_labels, epsilon=args.label_smooth)
            # Pass 3: the [ATTN] query stream attends over src+tgt+attn
            # and produces the LM loss at the [ATTN] positions.
            loss, _, __ = model(
                attn_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_attn_2_srctgtattn,
                past_cache=(past_cache_k, past_cache_v),
                tgt_labels=tgt_labels,
                tgt_pos=P.nonzero(attn_ids == attn_id))
            loss = scaler.scale(loss)
            loss.backward()
            scaler.minimize(opt, loss)
            model.clear_gradients()
            lr_scheduler.step()
            if step % 10 == 0:
                _lr = lr_scheduler.get_lr()
                if args.use_amp:
                    _l = (loss / scaler._scale).numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
                else:
                    _l = loss.numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
                        env.dev_id, step, _l, _lr)
                log.debug(msg)
            if args.save_dir is not None and step % 1000 == 0 and env.dev_id == 0:
                P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
            if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
                assert args.predict_output_dir.exists(), \
                    'predict_output_dir not found: %s' % args.predict_output_dir
                log.debug('doing predict on gpu %d...' % env.dev_id)
                evaluate(model, dev_ds, step, args)
            if step > args.max_steps:
                break
    # Final evaluation and checkpoint after the loop exits.
    evaluate(model, dev_ds, step, args)
    if args.save_dir is not None:
        P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
b"2": 2, }), ]) def map_fn(seg_a, seg_b, label): seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen) sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b) return sentence, segments, label train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=False, repeat=True, use_gz=False) \ .map(map_fn) \ .padded_batch(args.bsz, (0, 0, 0)) train_ds = train_ds.shard(propeller.train.distribution.status.num_replica, propeller.train.distribution.status.replica_id) log.debug('shard %d/%d' % (propeller.train.distribution.status.num_replica, propeller.train.distribution.status.replica_id)) train_ds = train_ds.shuffle(10000) dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ .map(map_fn) \ .padded_batch(args.bsz, (0, 0, 0)) shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1]) types = ('int64', 'int64', 'int64') train_ds.data_shapes = shapes train_ds.data_types = types dev_ds.data_shapes = shapes dev_ds.data_types = types place = F.CUDAPlace(FD.parallel.Env().dev_id)
# ERNIE pretraining loop (legacy dygraph API): builds the pretraining model,
# AdamW with LinearDecay, wraps it for multi-GPU collective training, then
# iterates MLM+NSP batches. Gradients flow through the dygraph DataParallel
# loss-scaling protocol (scale_loss / apply_collective_grads). Device 0
# checkpoints every 10000 steps. Relies on module-level `train_ds`, `args`,
# `log`, `tqdm`, `D` (paddle.fluid.dygraph), `F` (paddle.fluid).
place = F.CUDAPlace(D.parallel.Env().dev_id)
with D.guard(place):
    model = ErnieModelForPretraining.from_pretrained(args.from_pretrained)
    opt = AdamW(learning_rate=LinearDecay(args.lr, args.warmup_steps,
                                          args.max_steps),
                parameter_list=model.parameters(),
                weight_decay=0.01)
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)
    for step, samples in enumerate(tqdm(train_ds.start(place))):
        (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples
        loss, mlmloss, nsploss = model(src_ids, sent_ids, labels=mlm_label,
                                       mlm_pos=mask_pos,
                                       nsp_labels=nsp_label)
        # DataParallel requires scaling the loss before backward and an
        # explicit all-reduce of gradients before the optimizer step.
        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss)
        model.clear_gradients()
        if step % 10 == 0:
            log.debug('train loss %.5f scaled loss %.5f' %
                      (loss.numpy(), scaled_loss.numpy()))
        if step % 10000 == 0 and D.parallel.Env().dev_id == 0 and args.save_dir is not None:
            F.save_dygraph(model.state_dict(), args.save_dir)
# --- FRAGMENT (truncated at start): single-sentence classification pipeline --
# Tail of a FeatureColumns list, then `before`/`after` mappers (build one
# sentence pair with cls/sep ids, expand dims for static-graph feeding) and
# the train/dev dataset pipelines batched by hparams.batch_size with declared
# shapes/dtypes. The enclosing definition starts before this chunk, so the
# code is left byte-identical.
]) def before(seg_a, label): sentence, segments = utils.data.build_1_pair( seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id) return sentence, segments, label def after(sentence, segments, label): sentence, segments, label = utils.data.expand_dims( sentence, segments, label) return sentence, segments, label log.debug(os.path.join(args.data_dir, 'train')) train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \ .map(before) \ .padded_batch(hparams.batch_size, (0, 0, 0)) \ .map(after) dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ .map(before) \ .padded_batch(hparams.batch_size, (0, 0, 0)) \ .map(after) shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1]) types = ('int64', 'int64', 'int64') train_ds.data_shapes = shapes train_ds.data_types = types
# --- FRAGMENT (truncated at end): classification fine-tuning loop (dygraph) --
# Loads an init checkpoint, builds AdamW (with or without LinearDecay per
# args.use_lr_decay) with global-norm clipping, trains per epoch, then enters
# an eval pass that accumulates a confusion matrix (TP/FP/FN/TN) from argmax
# predictions. Cut off mid eval-loop body (`length = a.shape[0]`), so the
# code is left byte-identical.
sd, _ = FD.load_dygraph(args.init_checkpoint) model.set_dict(sd) g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental if args.use_lr_decay: opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip) else: opt = AdamW(args.lr, parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip) for epoch in range(args.epoch): for step, d in enumerate(tqdm(train_ds.start(place), desc='training')): ids, sids, label = d loss, _ = model(ids, sids, labels=label) loss.backward() if step % 10 == 0: log.debug('train loss %.5f lr %.3e' % (loss.numpy(), opt.current_step_lr())) opt.minimize(loss) model.clear_gradients() with FD.base._switch_tracer_mode_guard_(is_train=False): model.eval() FP = 0 TP = 0 FN = 0 TN = 0 for step, d in enumerate(tqdm(dev_ds.start(place), desc='evaluating %d' % epoch)): ids, sids, label = d loss, logits = model(ids, sids, labels=label) #print('\n'.join(map(str, logits.numpy().tolist()))) a = L.argmax(logits, -1).numpy() label = label.numpy() length = a.shape[0]
# --- FRAGMENT (truncated at start): AMP pretraining loop (paddle 2.x) --------
# Interior of a pretraining step loop: MLM+NSP forward, GradScaler
# scale/backward/minimize, LR scheduler step, periodic rank-tagged loss
# logging (unscaling the loss for display when AMP is on), device-0
# checkpointing every 1000 steps, stop at args.max_steps. The enclosing
# loop/DataLoader call starts before this chunk, so the code is left
# byte-identical.
batch_size=0)): (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples loss, mlmloss, nsploss = model(src_ids, sent_ids, labels=mlm_label, mlm_pos=mask_pos, nsp_labels=nsp_label) loss = scaler.scale(loss) loss.backward() scaler.minimize(opt, loss) model.clear_gradients() lr_scheduler.step() if step % 10 == 0: _lr = lr_scheduler.get_lr() if args.use_amp: _l = (loss / scaler._scale).numpy() msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % ( env.dev_id, step, _l, _lr, scaler._scale.numpy()) else: _l = loss.numpy() msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % ( env.dev_id, step, _l, _lr) log.debug(msg) if step % 1000 == 0 and env.dev_id == 0: log.debug('saveing...') P.save(model.state_dict(), args.save_dir / 'ckpt.bin') if step > args.max_steps: break log.info('done')
# ERNIE-GEN seq2seq training, legacy dygraph variant of the function above:
# the same gen_mask / make_some_noice / map_fn / after_padding helpers and the
# same three-pass (encode src, encode tgt, decode via [ATTN]) training scheme
# (Fig.3 of https://arxiv.org/abs/2001.11314), but using the old fluid dygraph
# API: `L.concat`/`L.where` instead of `P.concat`/`P.nonzero`, AdamW with
# LinearDecay, DataParallel scale_loss/apply_collective_grads instead of
# GradScaler, declared data_shapes/data_types, and F.save_dygraph checkpoints.
# NOTE(review): here label smoothing and one-hot are only applied when
# args.label_smooth > 0 (otherwise integer ids are passed as tgt_labels),
# unlike the paddle-2.x variant which always one-hots — presumably the model
# accepts both forms; confirm before unifying. Logic is order-sensitive and
# mirrors the version above, so code is left byte-identical.
def seq2seq(model, tokenizer, args): log.info('Training starts with args: %r' % args) attn_id = tokenizer.vocab[args.attn_token] def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0): if query_len is None: query_len = batch_ids.shape[1] if mask_type != 'empty': mask = (batch_ids != pad_value).astype(np.float32) mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1]) if mask_type == 'causal': assert query_len == batch_ids.shape[1] mask = np.tril(mask) elif mask_type == 'causal_without_diag': assert query_len == batch_ids.shape[1] mask = np.tril(mask, -1) elif mask_type == 'diag': assert query_len == batch_ids.shape[1] mask = np.stack([np.diag(np.diag(m)) for m in mask], 0) else: mask_type == 'empty' mask = np.zeros_like(batch_ids).astype(np.float32) mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1]) return mask def make_some_noice(ids): if args.use_random_noice: noice_ids = np.random.randint(1, len(tokenizer.vocab), size=ids.shape) else: noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]'] pos, = np.where(np.ones_like(ids)) np.random.shuffle(pos) pos = pos[:int(args.noise_prob * len(pos))] ids[pos, ] = noice_ids[pos, ] return ids def map_fn(example_id, src_ids, tgt_ids): src_ids = src_ids[:args.max_encode_len] tgt_ids = tgt_ids[:args.max_decode_len] src_ids, src_sids = tokenizer.build_for_ernie(src_ids) src_pids = np.arange(len(src_ids)) tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids) tgt_pids = np.arange(len(tgt_ids)) + len(src_ids) # continues position tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id attn_ids = np.ones_like(tgt_ids) * attn_id if args.noise_prob > 0.: tgt_labels = deepcopy(tgt_ids) tgt_ids = make_some_noice(tgt_ids) #corrupted else: tgt_labels = tgt_ids return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids, tgt_sids, attn_ids, tgt_labels) def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids, tgt_sids, attn_ids, tgt_labels): ''' attention mask: *** src, tgt, attn src 
00, 01, 11 tgt 10, 11, 12 attn 20, 21, 22 *** s1, s2 | t1 t2 t3| attn1 attn2 attn3 s1 1, 1 | 0, 0, 0,| 0, 0, 0, s2 1, 1 | 0, 0, 0,| 0, 0, 0, - t1 1, 1, | 1, 0, 0,| 0, 0, 0, t2 1, 1, | 1, 1, 0,| 0, 0, 0, t3 1, 1, | 1, 1, 1,| 0, 0, 0, - attn1 1, 1, | 0, 0, 0,| 1, 0, 0, attn2 1, 1, | 1, 0, 0,| 0, 1, 0, attn3 1, 1, | 1, 1, 0,| 0, 0, 1, for details, see Fig3. https://arxiv.org/abs/2001.11314 ''' src_len = src_ids.shape[1] tgt_len = tgt_ids.shape[1] mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len) mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len) mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len) mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len) mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len) mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len) mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len) mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len) mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len) ''' mask = np.concatenate([ np.concatenate([mask_00, mask_01, mask_02], 2), np.concatenate([mask_10, mask_11, mask_12], 2), np.concatenate([mask_20, mask_21, mask_22], 2), ], 1) ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1) pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1) sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1) ''' mask_src_2_src = mask_00 mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2) mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2) tgt_labels = tgt_labels[np.where(tgt_labels != 0)] return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels) bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()} feature_column = propeller.data.FeatureColumns([ propeller.data.LabelColumn('id'), propeller.data.TextColumn('src', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab), propeller.data.TextColumn('tgt', unk_id=tokenizer.unk_id, 
vocab_dict=bytes_vocab), ]) train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=False, repeat=True, use_gz=False) \ .map(map_fn) dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ .map(map_fn) \ .padded_batch(args.eval_bsz) \ .map(after_padding) log.debug('shard %d of %d' % (D.parallel.Env().dev_id, D.parallel.Env().nranks)) train_ds = train_ds.shard( D.parallel.Env().nranks, D.parallel.Env().dev_id).shuffle(10000).padded_batch( args.bsz).map(after_padding) dev_ds = dev_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id) shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]] types = ['int64'] * 11 train_ds.data_shapes = shapes train_ds.data_types = types dev_ds.data_shapes = shapes dev_ds.data_types = types vocab_size, _ = model.word_emb.weight.shape ctx = D.parallel.prepare_context() model = D.parallel.DataParallel(model, ctx) g_clip = F.clip.GradientClipByGlobalNorm(1.0) opt = AdamW(learning_rate=LinearDecay( args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip) attn_id = tokenizer.vocab[args.attn_token] for step, data in enumerate(train_ds.start(place)): (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels) = data _, __, info = model(src_ids, sent_ids=src_sids, pos_ids=src_pids, attn_bias=mask_src_2_src, encode_only=True) cached_k, cached_v = info['caches'] _, __, info = model(tgt_ids, sent_ids=tgt_sids, pos_ids=tgt_pids, attn_bias=mask_tgt_2_srctgt, past_cache=(cached_k, cached_v), encode_only=True) cached_k2, cached_v2 = info['caches'] past_cache_k = [ L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2) ] past_cache_v = [ L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2) ] if args.label_smooth > 0.: 
tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size), epsilon=args.label_smooth) loss, _, __ = model(attn_ids, sent_ids=tgt_sids, pos_ids=tgt_pids, attn_bias=mask_attn_2_srctgtattn, past_cache=(past_cache_k, past_cache_v), tgt_labels=tgt_labels, tgt_pos=L.where(attn_ids == attn_id)) scaled_loss = model.scale_loss(loss) scaled_loss.backward() model.apply_collective_grads() opt.minimize(scaled_loss) model.clear_gradients() if step % 10 == 0: loss = loss.numpy() ppl = np.exp(loss) log.debug('[step %d]train loss %.5f, ppl %.5f, lr %.3e' % (step, loss, ppl, opt.current_step_lr())) if args.save_dir is not None and step % 1000 == 0 and D.parallel.Env( ).dev_id == 0: F.save_dygraph(model.state_dict(), args.save_dir) if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0: assert os.path.exists( args.predict_output_dir ), 'predict_output_dir not found: %s' % args.predict_output_dir log.debug('doing predict on gpu %d...' % D.parallel.Env().dev_id) evaluate(model, dev_ds, step, args) if step > args.max_steps: break evaluate(model, dev_ds, step, args) if args.save_dir is not None: F.save_dygraph(model.state_dict(), args.save_dir)
# --- FRAGMENT (truncated at start): weighted token-classification training ---
# Interior of an AdamW(...) construction followed by the training loop: the
# per-token loss is weighted by `ids > tokenizer.mask_id` (the trailing
# comment notes [MASK] is the largest special token, so this masks out special
# tokens), with periodic f1 evaluation and a final save. The enclosing
# assignment starts before this chunk, so the code is left byte-identical.
args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip) #opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters()) for epoch in range(args.epoch): for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start(place))): loss, logits = model( ids, sids, labels=aligned_label, loss_weights=L.cast( ids > tokenizer.mask_id, 'float32')) # [MASK] is the largest special token loss.backward() if step % 10 == 0: log.debug('train loss %.5f, lr %.3e' % (loss.numpy(), opt.current_step_lr())) opt.minimize(loss) model.clear_gradients() if step % 100 == 0: f1 = evaluate(model, dev_ds) log.debug('eval f1: %.5f' % f1) f1 = evaluate(model, dev_ds) log.debug('final eval f1: %.5f' % f1) if args.save_dir is not None: F.save_dygraph(model.state_dict(), args.save_dir)
# --- FRAGMENT (truncated at both ends): classification train/eval interior ---
# Step body of an AMP classification loop: optimizer step via GradScaler,
# optional LR scheduler step (`lr_scheduler and lr_scheduler.step()` guards a
# possibly-None scheduler), VisualDL-style scalar logging, and an every-100-
# steps accuracy evaluation under P.no_grad(). Cut mid-loop on both sides, so
# the code is left byte-identical.
step += 1 scaler.minimize(opt, loss) model.clear_gradients() lr_scheduler and lr_scheduler.step() if step % 10 == 0: _lr = lr_scheduler.get_lr() if args.use_amp: _l = (loss / scaler._scale).numpy() msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % ( step, _l, _lr, scaler._scale.numpy()) else: _l = loss.numpy() msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l, _lr) log.debug(msg) log_writer.add_scalar('loss', _l, step=step) log_writer.add_scalar('lr', _lr, step=step) if step % 100 == 0: acc = [] with P.no_grad(): model.eval() for ids, sids, label in P.io.DataLoader( dev_ds, places=P.CUDAPlace(0), batch_size=None): loss, logits = model(ids, sids, labels=label) #print('\n'.join(map(str, logits.numpy().tolist()))) a = (logits.argmax(-1) == label) acc.append(a.numpy()) model.train() acc = np.concatenate(acc).mean()
# --- FRAGMENT (truncated at start): sequence-labeling train loop interior ----
# AMP step (backward, scaler.minimize, scheduler step), scalar logging to a
# log_writer, f1 evaluation plus checkpointing every 100 steps, and the final
# post-loop evaluation/save. Starts mid-loop (the surrounding `for` is not
# visible), so the code is left byte-identical.
loss.backward() scaler.minimize(opt, loss) model.clear_gradients() lr_scheduler.step() if step % 10 == 0: _lr = lr_scheduler.get_lr() if args.use_amp: _l = (loss / scaler._scale).numpy() msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % ( step, _l, _lr, scaler._scale.numpy()) else: _l = loss.numpy() msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l, _lr) log.debug(msg) log_writer.add_scalar('loss', _l, step=step) log_writer.add_scalar('lr', _lr, step=step) if step % 100 == 0: f1 = evaluate(model, dev_ds) log.debug('eval f1: %.5f' % f1) log_writer.add_scalar('eval/f1', f1, step=step) if args.save_dir is not None: P.save(model.state_dict(), args.save_dir / 'ckpt.bin') f1 = evaluate(model, dev_ds) log.debug('final eval f1: %.5f' % f1) log_writer.add_scalar('eval/f1', f1, step=step) if args.save_dir is not None: P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
# --- FRAGMENT (truncated at both ends): dygraph fine-tune loop with accuracy -
# Builds AdamW + LinearDecay with gradient clipping, trains per epoch, and
# every 100 steps evaluates dev-set accuracy under the legacy tracer eval
# guard. Cut off mid eval-loop (`acc.append(...)`), so the code is left
# byte-identical.
g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental opt = AdamW(learning_rate=LinearDecay( args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip) for epoch in range(args.epoch): for step, d in enumerate( tqdm(train_ds.start(place), desc='training')): ids, sids, label = d loss, _ = model(ids, sids, labels=label) loss.backward() if step % 10 == 0: log.debug('train loss %.5f lr %.3e' % (loss.numpy(), opt.current_step_lr())) opt.minimize(loss) model.clear_gradients() if step % 100 == 0: acc = [] with FD.base._switch_tracer_mode_guard_( is_train=False): model.eval() for step, d in enumerate( tqdm(dev_ds.start(place), desc='evaluating %d' % epoch)): ids, sids, label = d loss, logits = model(ids, sids, labels=label) #print('\n'.join(map(str, logits.numpy().tolist()))) a = L.argmax(logits, -1) == label acc.append(a.numpy())
# --- FRAGMENT (truncated at start): MRC (SQuAD-style) setup and entry point --
# Tail of a feature map_fn (defaults a missing end_position to 0, returns the
# numpy feature tuple), then builds the train/dev datasets from pre-computed
# features, declares shapes/dtypes, enters a dygraph guard manually via
# `D.guard(place).__enter__()` (NOTE(review): never exited — presumably
# intentional for a script, but leaks the context), constructs the QA model,
# trains, and on device 0 runs the final evaluation and saves. The enclosing
# def starts before this chunk, so the code is left byte-identical.
if end_position is None: end_position = 0 return np.array(unique_id), np.array(token_ids), np.array(text_type_ids), np.array(start_position), np.array(end_position) train_dataset = propeller.data.Dataset.from_list(train_features).map(map_fn) dev_dataset = propeller.data.Dataset.from_list(dev_features).map(map_fn).padded_batch(args.bsz) shapes = ([-1], [-1, args.max_seqlen], [-1, args.max_seqlen], [-1], [-1]) types = ('int64', 'int64', 'int64', 'int64', 'int64') train_dataset.name = 'train' dev_dataset.name = 'dev' train_dataset.data_shapes = shapes train_dataset.data_types = types dev_dataset.data_shapes = shapes dev_dataset.data_types = types place = F.CUDAPlace(D.parallel.Env().dev_id) D.guard(place).__enter__() model = ErnieModelForQuestionAnswering.from_pretrained(args.from_pretrained, name='') train(model, train_dataset, dev_dataset, dev_examples, dev_features, tokenizer, args) if D.parallel.Env().dev_id == 0: f1, em = evaluate(model, dev_dataset, dev_examples, dev_features, tokenizer, args) log.debug('final eval result: f1 %.5f em %.5f' % (f1, em)) if D.parallel.Env().dev_id == 0 and args.save_dir is not None: F.save_dygraph(model.state_dict(), args.save_dir)