def infer(args):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    test_ds = MolDataset(args, raw_dataset, mode="test")
    fn = MgfCollateFn(args, mode="test")
    test_loader = Dataloader(
        test_ds, batch_size=args.batch_size, num_workers=1, collate_fn=fn)
    test_loader = PDataset.from_generator_func(test_loader)

    est = propeller.Learner(MgfModel, args, args.model_config)

    mgf_list = []
    for soft_mgf in est.predict(
            test_loader,
            ckpt_path=args.model_path_for_infer,
            split_batch=True):
        mgf_list.append(soft_mgf)

    mgf = np.concatenate(mgf_list)
    log.info("saving features")
    np.save(
        "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
        mgf)
def __init__(self, config, mode, run_config):
    for k, v in config.items():
        log.info("%s: %s" % (k, repr(v)))
    self.hidden_size = config['hidden_size']
    self.vocab_size = config['vocab_size']
    self.embedding_size = config['embedding_size']
    self.num_layers = config['num_layers']
    self.learning_rate = config['learning_rate']
    self.mode = mode
def __init__(self, address, batch_size=128, num_coroutine=10, timeout=10.):
    self.loop = asyncio.new_event_loop()
    asyncio.set_event_loop(self.loop)
    context = zmq.asyncio.Context()
    self.socket_pool = [
        context.socket(zmq.REQ) for _ in range(num_coroutine)
    ]
    log.info("Connecting to server... %s" % address)
    for socket in self.socket_pool:
        socket.connect(address)
    self.num_coroutine = num_coroutine
    self.batch_size = batch_size
    self.timeout = int(timeout * 1000)  # zmq timeouts are in milliseconds
def layer_decay(param, param_last, learning_rate, decay_rate, n_layers):
    # encoder params
    delta = param - param_last
    encoder_layer_m = re.search("encoder_layer_([0-9]*)_", param.name)
    if encoder_layer_m is not None:
        layer = int(encoder_layer_m.group(1))
        ratio = decay_rate**(n_layers + 1 - layer)
        log.info('layer decay %s: ratio %s.' % (param.name, ratio))
        param_update = param + (ratio - 1) * delta
    elif "embedding" in param.name:
        ratio = decay_rate**(n_layers + 2)
        param_update = param + (ratio - 1) * delta
    else:
        param_update = None
    return param_update
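# Hedged worked example (not part of the original source; values are
# illustrative): with decay_rate=0.9 and n_layers=12 the ratios above become
#   embedding        : 0.9 ** 14 ~= 0.229
#   encoder_layer_1  : 0.9 ** 12 ~= 0.282
#   encoder_layer_12 : 0.9 ** 1   = 0.9
# so parameters closer to the input receive proportionally smaller updates.
for layer in (1, 6, 12):
    print('encoder_layer_%d ratio: %.3f' % (layer, 0.9**(12 + 1 - layer)))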
def __init__(self, config, raw_dataset, mode='train'):
    self.config = config
    self.raw_dataset = raw_dataset
    self.mode = mode
    log.info("preprocess graph data in %s" % self.__class__.__name__)

    self.graph_list = []
    log.info("loading mgf feature")
    mgf_feature = np.load(self.config.mgf_file)
    log.info(["the shape of mgf feature is: ", mgf_feature.shape])
    for i in range(len(self.raw_dataset)):
        # num_nodes, edge_index, node_feat, edge_feat, label
        graph, label = self.raw_dataset[i]
        num_nodes = graph['num_nodes']
        node_feat = graph['node_feat'].copy()
        edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
        edge_feat = graph['edge_feat'].copy()

        new_graph = {}
        new_graph['num_nodes'] = num_nodes
        new_graph['node_feat'] = node_feat
        new_graph['edges'] = edges
        new_graph['edge_feat'] = edge_feat
        new_graph['mgf'] = mgf_feature[i, :].reshape(-1, )
        self.graph_list.append(new_graph)
def __init__(self, graph_work_path):
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
    log.info("trainer_id: %s, trainer_count: %s." %
             (trainer_id, trainer_count))

    edges = np.load(
        os.path.join(graph_work_path, "train_data.npy"), allow_pickle=True)
    # edges is bidirectional.
    train_usr = edges[trainer_id::trainer_count, 0]
    train_ad = edges[trainer_id::trainer_count, 1]
    returns = {"train_data": [train_usr, train_ad]}

    if os.path.exists(os.path.join(graph_work_path, "neg_samples.npy")):
        neg_samples = np.load(
            os.path.join(graph_work_path, "neg_samples.npy"),
            allow_pickle=True)
        if neg_samples.size != 0:
            train_negs = neg_samples[trainer_id::trainer_count]
            returns["train_data"].append(train_negs)
    log.info("Load train_data done.")
    self.data = returns
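# Hedged sketch (not part of the original source): the strided slices above
# give each trainer a disjoint, near-even shard of the edge list without
# shuffling or copying. With 3 trainers and 7 edges:
import numpy as np

edges = np.arange(14).reshape(7, 2)
shards = [edges[tid::3] for tid in range(3)]
# shards[0] -> rows 0, 3, 6; shards[1] -> rows 1, 4; shards[2] -> rows 2, 5
assert sum(len(s) for s in shards) == len(edges)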
def seq2seq(model, tokenizer, args):
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:  # 'empty': attend to nothing
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        if args.use_random_noice:
            noice_ids = np.random.randint(
                1, len(tokenizer.vocab), size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # continuous position ids
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  # corrupted
        else:
            tgt_labels = tgt_ids
        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***   src,  tgt, attn
        src   00,   01,   02
        tgt   10,   11,   12
        attn  20,   21,   22

        ***    s1, s2 | t1 t2 t3| attn1 attn2 attn3
        s1     1,  1  | 0, 0, 0,| 0,    0,    0,
        s2     1,  1  | 0, 0, 0,| 0,    0,    0,
        -
        t1     1,  1, | 1, 0, 0,| 0,    0,    0,
        t2     1,  1, | 1, 1, 0,| 0,    0,    0,
        t3     1,  1, | 1, 1, 1,| 0,    0,    0,
        -
        attn1  1,  1, | 0, 0, 0,| 1,    0,    0,
        attn2  1,  1, | 1, 0, 0,| 0,    1,    0,
        attn3  1,  1, | 1, 1, 0,| 0,    0,    1,

        for details, see Fig3. https://arxiv.org/abs/2001.11314
        '''
        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)

        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)
        '''
        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn(
            'src', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
        propeller.data.TextColumn(
            'tgt', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'),
                                            shuffle=True, repeat=True, use_gz=False) \
        .map(map_fn) \
        .padded_batch(args.bsz) \
        .map(after_padding)

    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'),
                                          shuffle=False, repeat=False, use_gz=False) \
        .map(map_fn) \
        .padded_batch(args.eval_bsz) \
        .map(after_padding) \
        .shard(env.nranks, env.dev_id)

    vocab_size, _ = model.word_emb.weight.shape
    model = P.DataParallel(model)
    g_clip = P.nn.ClipGradByGlobalNorm(1.0)
    param_name_to_exclue_from_weight_decay = re.compile(
        r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
    lr_scheduler = P.optimizer.lr.LambdaDecay(
        args.lr,
        get_warmup_and_linear_decay(
            args.max_steps, int(args.warmup_proportion * args.max_steps)))
    opt = P.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.wd,
        # decay everything except LayerNorm params and biases
        apply_decay_param_fun=lambda n: not param_name_to_exclue_from_weight_decay.match(n),
        grad_clip=g_clip)
    scaler = P.amp.GradScaler(enable=args.use_amp)
    attn_id = tokenizer.vocab[args.attn_token]
    create_if_not_exists(args.save_dir)
    if args.predict_output_dir:
        create_if_not_exists(args.predict_output_dir)

    with P.amp.auto_cast(enable=args.use_amp):
        for step, data in enumerate(
                P.io.DataLoader(
                    train_ds, places=P.CUDAPlace(env.dev_id),
                    batch_size=None)):
            (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
             tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels) = data

            _, __, info = model(
                src_ids,
                sent_ids=src_sids,
                pos_ids=src_pids,
                attn_bias=mask_src_2_src,
                encode_only=True)
            cached_k, cached_v = info['caches']
            _, __, info = model(
                tgt_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_tgt_2_srctgt,
                past_cache=(cached_k, cached_v),
                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [
                P.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                P.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
            ]
            tgt_labels = F.one_hot(tgt_labels, vocab_size)
            if args.label_smooth > 0.:
                tgt_labels = F.label_smooth(
                    tgt_labels, epsilon=args.label_smooth)
            loss, _, __ = model(
                attn_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_attn_2_srctgtattn,
                past_cache=(past_cache_k, past_cache_v),
                tgt_labels=tgt_labels,
                tgt_pos=P.nonzero(attn_ids == attn_id))

            loss = scaler.scale(loss)
            loss.backward()
            scaler.minimize(opt, loss)
            model.clear_gradients()
            lr_scheduler.step()

            if step % 10 == 0:
                _lr = lr_scheduler.get_lr()
                if args.use_amp:
                    _l = (loss / scaler._scale).numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
                else:
                    _l = loss.numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
                        env.dev_id, step, _l, _lr)
                log.debug(msg)

            if args.save_dir is not None and step % 1000 == 0 and env.dev_id == 0:
                P.save(model.state_dict(), args.save_dir / 'ckpt.bin')

            if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
                assert args.predict_output_dir.exists(), \
                    'predict_output_dir not found: %s' % args.predict_output_dir
                log.debug('doing predict on gpu %d...' % env.dev_id)
                evaluate(model, dev_ds, step, args)
            if step > args.max_steps:
                break
    evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
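# Hedged sketch (not part of the original source): a toy check of how the
# mask blocks above line up. With one source of length 2 and a target of
# length 3, mask_tgt_2_srctgt has shape (1, 3, 5): each target step sees all
# source tokens plus itself and the earlier target tokens.
import numpy as np

src = np.array([[11, 12]])
tgt = np.array([[21, 22, 23]])
bidi = np.tile((src != 0).astype(np.float32)[:, None, :], [1, 3, 1])
causal = np.tril(np.tile((tgt != 0).astype(np.float32)[:, None, :], [1, 3, 1]))
print(np.concatenate([bidi, causal], 2)[0])
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 0.]
#  [1. 1. 1. 1. 1.]]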
def train(args):
    log.info("pretraining start")
    profile = False

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    # set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    # define execution strategy
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    # define distribution strategy
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3

    if args.use_recompute:
        log.info("using recompute.")
    dist_strategy.recompute = args.use_recompute
    dist_strategy.sharding = args.use_sharding
    dist_strategy.pipeline = args.num_pp > 1

    # define topology structure for dp/pp/mp
    topo = Topology(
        rank=fleet.worker_index(),
        world_size=fleet.worker_num(),
        dp=args.num_dp,
        pp=args.num_pp,
        sharding=args.num_sharding,
        mp=args.num_mp)

    is_last = False
    if topo.pp.rank == (topo.pp.size - 1):
        is_last = True

    dp_sharding_rank = topo.dp.rank * topo.sharding.size + topo.sharding.rank
    dp_worldsize = topo.dp.size * topo.sharding.size
    bsz_per_dp = args.global_bsz // dp_worldsize

    micro_bsz = args.micro_bsz
    assert args.global_bsz % micro_bsz == 0, \
        f"cannot do gradient accumulation, global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}"
    acc_steps = bsz_per_dp // micro_bsz

    # sharding \ model parallel \ pipeline
    assert dist_strategy.sharding == True
    dist_strategy.sharding_configs = {
        "segment_broadcast_MB": 32,
        "sharding_degree": args.num_sharding,
        "mp_degree": args.num_mp,
        "pp_degree": args.num_pp,
        "dp_degree": args.num_dp,
        "optimize_offload": True,
    }
    dist_strategy.pipeline_configs = {
        "schedule_mode": "1F1B",
        "micro_batch_size": micro_bsz,
        "accumulate_steps": acc_steps,
    }
    log.info(
        f"using global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}, acc_steps: {acc_steps}"
    )

    dist_strategy.amp = args.use_amp
    dist_strategy.amp_configs = {
        "custom_white_list": ['softmax', 'layer_norm', 'gelu'],
        "init_loss_scaling": 32768,
        "decr_every_n_nan_or_inf": 2,
        "incr_every_n_steps": 1000,
        "incr_ratio": 2.0,
        "use_dynamic_loss_scaling": True,
        "decr_ratio": 0.5,
        "use_pure_fp16": False,
        "use_fp16_guard": False,
    }

    dist_strategy.lamb = args.use_lamb
    dist_strategy.lamb_configs = {
        'lamb_weight_decay': 0.01,
        'exclude_from_weight_decay':
        ['layer_norm_bias', 'layer_norm_scale', '.b_0']
    }

    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            graph_vars = create_model(args, 'train', micro_bsz,
                                      dp_sharding_rank, dp_worldsize, topo)
            data_loader = graph_vars['data_loader']
            for op in train_program.global_block().ops:
                if op.type == 'fill_constant':
                    # XXX: hack: https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/tensor.py#L1376
                    op._set_attr('op_device', "gpu:0")
            if args.use_recompute:
                dist_strategy.recompute_configs = {
                    "checkpoints": graph_vars['checkpoints'],
                    # "enable_offload": args.use_offload,
                    # "checkpoint_shape": [micro_bsz, args.max_seq_len, 4096],
                }

            log.debug("base lr: {}".format(args.learning_rate))
            scheduled_lr = linear_warmup_decay(
                learning_rate=args.learning_rate,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps)

            clip_norm_thres = 1.0
            if paddlenlp.ops.optimizer._jit_compile():
                optimizer = paddlenlp.ops.optimizer.AdamwOptimizer(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    weight_decay=args.weight_decay,
                    apply_decay_param_fun=apply_weight_decay_fun)
            else:
                optimizer = fluid.optimizer.Adam(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    #multi_precision=True,
                    #weight_decay=args.weight_decay,  # merge this pr to use weight_decay: https://github.com/PaddlePaddle/Paddle/pull/29248
                    #exclude_from_weight_decay_fn=exclude_from_weight_decay
                )

            optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
            log.info(f"using dist strategy: {dist_strategy}")
            optimizer.minimize(graph_vars['total_loss'])

    final_strategy = fleet._final_strategy()
    applied_meta_list = fleet._get_applied_meta_list()
    log.info("final strategy: {}".format(final_strategy))
    log.info("applied_meta_list: {}".format(applied_meta_list))

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)

    with open(program_desc_dir + "/main_program.txt.%d" %
              (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(train_program))

    with open(program_desc_dir + "/startup_program.txt.%d" %
              (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(startup_program))

    exe = fluid.Executor(place)
    exe.run(startup_program)

    optimizer.amp_init(place)

    #save_path = os.path.join(args.output_dir, 'step_0')
    #log.debug("saving models to {}".format(save_path))
    #save_persistables(exe, save_path, train_program)

    if args.init_checkpoint and args.init_checkpoint != "":
        log.info(' ')
        log.info(
            '############################WARNING############################')
        log.info(
            '###### using init_checkpoint, not init_pretraining_params ####')
        log.info(
            '## meaning hyper params e.g. lr will inherit from checkpoint ##')
        log.info(
            '###############################################################')
        init_checkpoint(exe, args.init_checkpoint, train_program)
        log.info(' ')

    output_dir = args.output_dir
    save_steps = args.save_steps
    total_time = 0
    cost_vals, lm_losses, sop_accs = [], [], []
    global_steps = args.global_steps + 1
    steps = 0
    log_path = 'train_log/node-%d' % fleet.worker_index()
    start_time = time.time()
    with LogWriter(os.path.join(args.output_dir, log_path)) as swriter:
        data_loader.start()
        while True:
            #if steps < global_steps:
            #    steps += 1
            #    continue
            if not is_last:
                fetch_list = []
            else:
                fetch_list = [
                    graph_vars['total_loss'],
                    graph_vars['mean_mask_lm_loss'], scheduled_lr
                ]
                if args.use_sop:
                    fetch_list.extend(
                        [graph_vars['sop_acc'], graph_vars['sop_loss']])
                if args.use_amp:
                    loss_scaling = train_program.global_block(
                    ).vars['loss_scaling_0']
                    fetch_list.append(loss_scaling)

            # run one mini-batch (= acc_steps micro-batches)
            ret = exe.run(train_program, fetch_list=fetch_list)
            #use_program_cache=True)
            steps += 1

            if is_last:
                if args.use_sop and args.use_amp:
                    cost_val, lm_loss, lr, sop_acc, sop_loss, loss_scaling_0 = ret
                elif args.use_sop:
                    cost_val, lm_loss, lr, sop_acc, sop_loss = ret
                elif args.use_amp:
                    cost_val, lm_loss, lr, loss_scaling_0 = ret
                else:
                    cost_val, lm_loss, lr = ret
                cost_vals.append(cost_val[0])
                lm_losses.append(lm_loss[0])
                if args.use_sop:
                    sop_accs.append(sop_acc[0])

                if steps > 0 and (steps % args.log_steps) == 0:
                    end_time = time.time()
                    total_time = end_time - start_time
                    cost_val = np.mean(cost_vals)
                    lm_loss = np.mean(lm_losses)
                    swriter.add_scalar('loss/total_loss', cost_val, steps)
                    swriter.add_scalar('loss/mlm_loss', lm_loss, steps)
                    swriter.add_scalar('lr/scheduled_lr', lr[0], steps)

                    if args.use_sop:
                        sop_acc = np.mean(sop_accs)
                        swriter.add_scalar('loss/sop_loss', sop_loss, steps)
                        swriter.add_scalar('train/sop_acc', sop_acc, steps)
                    else:
                        sop_acc = 0.0

                    if args.use_amp:
                        swriter.add_scalar('lr/loss_scaling',
                                           loss_scaling_0[0], steps)
                    else:
                        loss_scaling_0 = [0.0]

                    log.info(
                        "worker_index: %d, step: %d, cost: %f, "
                        "mlm loss: %f, sentence order acc: %f, "
                        "speed: %f steps/s, "
                        "speed: %f samples/s, "
                        "speed: %f tokens/s, "
                        "learning rate: %.3e, loss_scalings: %f" %
                        (fleet.worker_index(), steps, cost_val, lm_loss,
                         sop_acc, args.log_steps / total_time,
                         args.log_steps * args.global_bsz / total_time,
                         args.log_steps * args.global_bsz * args.max_seq_len /
                         total_time, lr[0], loss_scaling_0[0]))

                    cost_vals, lm_losses, sop_accs = [], [], []
                    start_time = time.time()

            # TODO: add evaluation
            if steps > 0 and args.eval_steps > 0 and steps % args.eval_steps == 0:
                pass

            if steps > 0 and args.save_steps > 0 and steps % args.save_steps == 0:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir, 'step_' + str(steps))
                log.debug("saving models to {}".format(save_path))
                save_persistables(exe, save_path, train_program)

            if steps == args.num_train_steps:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir,
                                         'final_step_' + str(steps))
                save_persistables(exe, save_path, train_program)
                log.debug("saving final models to {}".format(save_path))
                log.debug("end of training, total steps: {}".format(steps))
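# Hedged worked example (not part of the original source; values are
# illustrative): how the topology arithmetic above resolves. With
# global_bsz=4096, num_dp=2, num_sharding=4, micro_bsz=8:
#   dp_worldsize = num_dp * num_sharding = 8 data-parallel replicas
#   bsz_per_dp   = 4096 // 8 = 512 samples per replica per step
#   acc_steps    = 512 // 8  = 64 micro-batches accumulated per step
# The "1F1B" pipeline schedule then interleaves those 64 micro-batches
# through the pipeline stages.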
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='gnn')
    parser.add_argument("--config", type=str, default="./config.yaml")
    parser.add_argument("--task_name", type=str, default="task_name")
    parser.add_argument("--infer_model", type=str, default=None)
    parser.add_argument("--log_id", type=str, default=None)
    args = parser.parse_args()

    if args.infer_model is not None:
        config = prepare_config(args.config, isCreate=False, isSave=False)
        config.model_path_for_infer = args.infer_model
        infer(config)
    else:
        config = prepare_config(args.config, isCreate=True, isSave=True)
        log_to_file(log, config.log_dir, config.log_filename)
        if config.warm_start_from is not None:
            log.info("loading model config from %s" %
                     config.pretrained_config_file)
            pretrained_config = prepare_config(config.pretrained_config_file)
            pretrained_model_config = pretrained_config.pretrained_model_config
        else:
            pretrained_model_config = config.model_config

        config.log_id = args.log_id
        train(config, pretrained_model_config)
parser.add_argument('--lr', type=float, default=3e-5, help='learning rate')
parser.add_argument(
    '--save_dir', type=str, default=None, help='model output directory')
parser.add_argument(
    '--n_best_size', type=int, default=20, help='n-best predictions to keep')
parser.add_argument(
    '--max_answer_length', type=int, default=100, help='max answer span')
parser.add_argument(
    '--wd', type=float, default=0.00, help='weight decay, aka L2 regularizer')
args = parser.parse_args()

tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)

if not os.path.exists(args.train_file):
    raise RuntimeError('input data not found at %s' % args.train_file)
if not os.path.exists(args.dev_file):
    raise RuntimeError('input data not found at %s' % args.dev_file)

log.info('making train/dev data...')
train_examples = mrc_reader.read_files(args.train_file, is_training=True)
train_features = mrc_reader.convert_example_to_features(
    train_examples, args.max_seqlen, tokenizer, is_training=True)

dev_examples = mrc_reader.read_files(args.dev_file, is_training=False)
dev_features = mrc_reader.convert_example_to_features(
    dev_examples, args.max_seqlen, tokenizer, is_training=False)

log.info('train examples: %d, features: %d' %
         (len(train_examples), len(train_features)))


def map_fn(unique_id, example_index, doc_span_index, tokens,
           token_to_orig_map, token_is_max_context, token_ids, position_ids,
           text_type_ids, start_position, end_position):
    if start_position is None:
        start_position = 0
    if end_position is None:
        end_position = 0
    return np.array(unique_id), np.array(token_ids), \
        np.array(text_type_ids), np.array(start_position), \
        np.array(end_position)
            ex = build_example(transposed_slots)
            write_gz(ex.SerializeToString(), to_file)
            slots = []


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Pretrain Data Maker')
    parser.add_argument('src', type=str)
    parser.add_argument('tgt', type=str)
    parser.add_argument('--vocab', type=str, required=True)
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('-c', '--check', action='store_true')
    args = parser.parse_args()
    log.setLevel(logging.DEBUG)

    from tokenizing_ernie import _wordpiece
    pat = re.compile(r'([a-zA-Z0-9]+|\S)')

    vocab = {
        j.strip().split(b'\t')[0].decode('utf8'): i
        for i, j in enumerate(open(args.vocab, 'rb'))
    }
    vocab_set = set(vocab.keys())

    with open(args.src, 'rb') as from_file, \
            gzip.open(args.tgt, 'wb') as to_file:
        log.info('making gz from bb %s ==> %s' % (from_file, to_file))
        build_bb(from_file, to_file)
    log.info('done: %s' % to_file)
def cmp_fn(old, new):
    if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
        log.info("best %s eval result: %s" % (args.metrics, new['eval']))
        return True
    else:
        return False
def seq2seq(model, tokenizer, args):
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:  # 'empty': attend to nothing
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        if args.use_random_noice:
            noice_ids = np.random.randint(
                1, len(tokenizer.vocab), size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # continuous position ids
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  # corrupted
        else:
            tgt_labels = tgt_ids
        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***   src,  tgt, attn
        src   00,   01,   02
        tgt   10,   11,   12
        attn  20,   21,   22

        ***    s1, s2 | t1 t2 t3| attn1 attn2 attn3
        s1     1,  1  | 0, 0, 0,| 0,    0,    0,
        s2     1,  1  | 0, 0, 0,| 0,    0,    0,
        -
        t1     1,  1, | 1, 0, 0,| 0,    0,    0,
        t2     1,  1, | 1, 1, 0,| 0,    0,    0,
        t3     1,  1, | 1, 1, 1,| 0,    0,    0,
        -
        attn1  1,  1, | 0, 0, 0,| 1,    0,    0,
        attn2  1,  1, | 1, 0, 0,| 0,    1,    0,
        attn3  1,  1, | 1, 1, 0,| 0,    0,    1,

        for details, see Fig3. https://arxiv.org/abs/2001.11314
        '''
        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)

        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)
        '''
        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn(
            'src', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
        propeller.data.TextColumn(
            'tgt', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'),
                                            shuffle=False, repeat=True, use_gz=False) \
        .map(map_fn)

    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'),
                                          shuffle=False, repeat=False, use_gz=False) \
        .map(map_fn) \
        .padded_batch(args.eval_bsz) \
        .map(after_padding)

    log.debug('shard %d of %d' %
              (D.parallel.Env().dev_id, D.parallel.Env().nranks))
    train_ds = train_ds.shard(
        D.parallel.Env().nranks,
        D.parallel.Env().dev_id).shuffle(10000).padded_batch(
            args.bsz).map(after_padding)
    dev_ds = dev_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id)

    shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
    types = ['int64'] * 11

    train_ds.data_shapes = shapes
    train_ds.data_types = types
    dev_ds.data_shapes = shapes
    dev_ds.data_types = types

    vocab_size, _ = model.word_emb.weight.shape
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = AdamW(
        learning_rate=LinearDecay(
            args.lr, int(args.warmup_proportion * args.max_steps),
            args.max_steps),
        parameter_list=model.parameters(),
        weight_decay=args.wd,
        grad_clip=g_clip)
    attn_id = tokenizer.vocab[args.attn_token]

    for step, data in enumerate(train_ds.start(place)):
        (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
         attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
         tgt_labels) = data

        _, __, info = model(
            src_ids,
            sent_ids=src_sids,
            pos_ids=src_pids,
            attn_bias=mask_src_2_src,
            encode_only=True)
        cached_k, cached_v = info['caches']
        _, __, info = model(
            tgt_ids,
            sent_ids=tgt_sids,
            pos_ids=tgt_pids,
            attn_bias=mask_tgt_2_srctgt,
            past_cache=(cached_k, cached_v),
            encode_only=True)
        cached_k2, cached_v2 = info['caches']
        past_cache_k = [
            L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
        ]
        past_cache_v = [
            L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
        ]
        if args.label_smooth > 0.:
            tgt_labels = L.label_smooth(
                F.one_hot(tgt_labels, vocab_size),
                epsilon=args.label_smooth)
        loss, _, __ = model(
            attn_ids,
            sent_ids=tgt_sids,
            pos_ids=tgt_pids,
            attn_bias=mask_attn_2_srctgtattn,
            past_cache=(past_cache_k, past_cache_v),
            tgt_labels=tgt_labels,
            tgt_pos=L.where(attn_ids == attn_id))

        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss)
        model.clear_gradients()

        if step % 10 == 0:
            loss = loss.numpy()
            ppl = np.exp(loss)
            log.debug('[step %d] train loss %.5f, ppl %.5f, lr %.3e' %
                      (step, loss, ppl, opt.current_step_lr()))
        if args.save_dir is not None and step % 1000 == 0 and D.parallel.Env(
        ).dev_id == 0:
            F.save_dygraph(model.state_dict(), args.save_dir)
        if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
            assert os.path.exists(
                args.predict_output_dir
            ), 'predict_output_dir not found: %s' % args.predict_output_dir
            log.debug('doing predict on gpu %d...' % D.parallel.Env().dev_id)
            evaluate(model, dev_ds, step, args)
        if step > args.max_steps:
            break
    evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        F.save_dygraph(model.state_dict(), args.save_dir)
def make_pretrain_dataset(name, dir, vocab, hparams, args):
    gz_files = glob(dir)
    if not gz_files:
        raise ValueError('train data not found in %s' % dir)

    log.info('read from %s' % '\n'.join(gz_files))
    max_input_seqlen = args.max_seqlen
    max_pretrain_seqlen = lambda: max_input_seqlen if r.random() > 0.15 \
        else r.randint(1, max_input_seqlen)  # short sentence rate

    def _parse_gz(record_str):  # function that takes python_str as input
        ex = propeller.data.example_pb2.SequenceExample()
        ex.ParseFromString(record_str)
        doc = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['txt'].feature
        ]
        doc_seg = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['segs'].feature
        ]
        return doc, doc_seg

    def bb_to_segments(filename):
        ds = Dataset.from_record_file(filename).map(_parse_gz)

        def gen():
            buf, size = [], 0
            iterator = iter(ds)
            while 1:
                doc, doc_seg = next(iterator)
                for line, line_seg in zip(doc, doc_seg):
                    #line = np.array(sp_model.SampleEncodeAsIds(line, -1, 0.1), dtype=np.int64)  # 0.1 means large variance on sentence piece result
                    if len(line) == 0:
                        continue
                    line = np.array(line)
                    line_seg = np.array(line_seg)
                    size += len(line)
                    buf.append(np.stack([line, line_seg]).transpose())
                    if size > max_input_seqlen:
                        yield buf,
                        buf, size = [], 0

            if len(buf) != 0:
                yield buf,
                buf, size = [], 0

        return Dataset.from_generator_func(gen)

    def sample_negative(dataset):
        def gen():
            iterator = iter(dataset)
            while True:
                chunk_a, = next(iterator)
                #chunk_b, = next(iterator)
                seqlen = max_pretrain_seqlen()
                seqlen_a = r.randint(1, seqlen)
                seqlen_b = seqlen - seqlen_a
                len_a = list(accumulate([len(c) for c in chunk_a]))
                buf_a = [c for c, l in zip(chunk_a, len_a)
                         if l < seqlen_a]  # always take the first one
                buf_b = [
                    c for c, l in zip(chunk_a, len_a)
                    if seqlen_a <= l < seqlen
                ]

                if r.random() < 0.5:  # pos or neg
                    label = np.int64(1)
                else:
                    label = np.int64(0)
                    buf_a, buf_b = buf_b, buf_a

                if not (len(buf_a) and len(buf_b)):
                    continue
                a = np.concatenate(buf_a)
                b = np.concatenate(buf_b)
                #log.debug(a)
                #log.debug(b)
                sample, seg_info, token_type = build_pair(
                    a, b, args.max_seqlen,
                    vocab)  # negative sample might exceed max seqlen
                yield sample, seg_info, token_type, label

        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(sentence, seg_info, segments, label):
        batch_size, seqlen = sentence.shape
        sentence, mask_pos, mlm_label = apply_mask(
            sentence, seg_info, args.mask_rate, hparams.vocab_size, vocab)

        ra = r.random()
        if ra < args.check:
            print('***')
            print('\n'.join([
                str(j) + '\t' + '|'.join(map(str, i))
                for i, j in zip(sentence.tolist(), label)
            ]))
            print('***')
            print('\n'.join(
                ['|'.join(map(str, i)) for i in seg_info.tolist()]))
            print('***')
            print('|'.join(map(str, mlm_label.tolist())))
            print('***')

        return sentence, segments, mlm_label, mask_pos, label

    # pretrain pipeline
    dataset = Dataset.from_list(gz_files)
    if propeller.train.distribution.status.mode == propeller.train.distribution.DistributionMode.NCCL:
        log.info('Apply sharding in distribution env')
        dataset = dataset.shard(
            propeller.train.distribution.status.num_replica,
            propeller.train.distribution.status.replica_id)
    dataset = dataset.repeat().shuffle(buffer_size=len(gz_files))

    dataset = dataset.interleave(
        map_fn=bb_to_segments, cycle_length=len(gz_files), block_length=1)
    dataset = dataset.shuffle(
        buffer_size=1000)  # must shuffle to ensure negative sample randomness
    dataset = sample_negative(dataset)
    dataset = dataset.padded_batch(hparams.batch_size, (0, 0, 0, 0)) \
                     .map(after)
    dataset.name = name
    return dataset
    .padded_batch(hparams.batch_size, (0, 0)) \
    .map(after)


def unsuperve_before(text_a, teacher_text_a):
    teacher_sentence, teacher_segments = utils.data.build_1_pair(
        teacher_text_a,
        max_seqlen=args.teacher_max_seqlen,
        cls_id=teacher_cls_id,
        sep_id=teacher_sep_id)
    sentence_a = text_a[:args.max_seqlen]
    return sentence_a, teacher_sentence, teacher_segments


client = InferenceClient(
    args.teacher_host,
    batch_size=args.server_batch_size,
    num_coroutine=args.num_coroutine)
log.info('teacher host %s' % args.teacher_host)


def ask_teacher_for_label(sentence_a, teacher_sentence, teacher_segments):
    sentence_a, teacher_sentence, teacher_segments = utils.data.expand_dims(
        sentence_a, teacher_sentence, teacher_segments)
    teacher_label, = client(teacher_sentence, teacher_segments)
    teacher_label = teacher_label[:, :]
    return sentence_a, teacher_label


unsup_train_ds = unsupervise_feature_column.build_dataset(
        'unsup_train',
        data_dir=os.path.join(args.data_dir, 'unsup_train_aug'),
        shuffle=True,
        repeat=True,
        use_gz=False) \
    .buffered(100) \
    .map(unsuperve_before) \
    .padded_batch(hparams.batch_size, (0, 0, 0)) \
    .map(ask_teacher_for_label)

train_ds = utils.data.interleave(train_ds, unsup_train_ds)
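# Hedged sketch (not part of the original source): `utils.data.interleave`
# is repo-internal; assuming it alternates batches pairwise between the
# supervised and the teacher-labelled unsupervised streams, a generator-level
# emulation looks like this:
def interleave_pairwise(a, b):
    for x, y in zip(a, b):
        yield x
        yield y

mixed = list(interleave_pairwise(iter('AB'), iter('ab')))
assert mixed == ['A', 'a', 'B', 'b']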
def train(args, pretrained_model_config=None):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    train_ds = MolDataset(args, raw_dataset)

    args.eval_steps = math.ceil(len(train_ds) / args.batch_size)
    log.info("Total %s steps (eval_steps) every epoch." % (args.eval_steps))

    fn = MgfCollateFn(args)

    train_loader = Dataloader(
        train_ds,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=args.shuffle,
        stream_shuffle_size=args.shuffle_size,
        collate_fn=fn)

    # for evaluating
    eval_train_loader = train_loader
    eval_train_loader = PDataset.from_generator_func(eval_train_loader)

    train_loader = multi_epoch_dataloader(train_loader, args.epochs)
    train_loader = PDataset.from_generator_func(train_loader)

    if args.warm_start_from is not None:
        # warm start setting
        def _fn(v):
            if not isinstance(v, F.framework.Parameter):
                return False
            if os.path.exists(os.path.join(args.warm_start_from, v.name)):
                return True
            return False

        ws = propeller.WarmStartSetting(
            predicate_fn=_fn, from_dir=args.warm_start_from)
    else:
        ws = None

    def cmp_fn(old, new):
        if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
            log.info("best %s eval result: %s" % (args.metrics, new['eval']))
            return True
        else:
            return False

    if args.log_id is not None:
        save_best_model = int(args.log_id) == 5
    else:
        save_best_model = True
    best_exporter = propeller.exporter.BestResultExporter(
        args.output_dir, (cmp_fn, save_best_model))

    eval_datasets = {"eval": eval_train_loader}

    propeller.train.train_and_eval(
        model_class_or_model_fn=MgfModel,
        params=pretrained_model_config,
        run_config=args,
        train_dataset=train_loader,
        eval_dataset=eval_datasets,
        warm_start_setting=ws,
        exporters=[best_exporter],
    )
def _worker():
    for i in range(epochs):
        log.info("BEGIN: epoch %s ..." % i)
        for batch in loader():
            yield batch
        log.info("END: epoch %s ..." % i)
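# Hedged usage sketch (not part of the original source): the generator above
# flattens `epochs` passes over `loader` into one stream, so a step-based
# consumer needs no explicit epoch loop. Toy stand-ins:
def multi_epoch(loader, epochs):
    for _ in range(epochs):
        for batch in loader():
            yield batch

batches = list(multi_epoch(lambda: iter([[1, 2], [3, 4]]), 2))
assert batches == [[1, 2], [3, 4], [1, 2], [3, 4]]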
        'layer9',
        'layer8',
        'layer7',
        'layer6',
        'layer5',
        'layer4',
        'layer3',
        'layer2',
        'layer1',
    ],
    default='pooler')
args = parser.parse_args()

if args.verbose:
    log.setLevel(logging.DEBUG)

cuda_env = os.getenv("CUDA_VISIBLE_DEVICES")
if cuda_env is None:
    raise RuntimeError('CUDA_VISIBLE_DEVICES not set')
n_devices = len(cuda_env.split(","))

if args.encode_layer.lower() == 'pooler':
    model_dir = os.path.join(args.model_dir, 'pooler')
else:
    pat = re.compile(r'layer(\d+)')
    match = pat.match(args.encode_layer.lower())
    layer = int(match.group(1))
    model_dir = os.path.join(args.model_dir, 'enc%d' % layer)

server = InferenceServer(model_dir, n_devices)
log.info('propeller server listens on port %d' % args.port)
server.listen(args.port)
            batch_size=0)):
        (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples
        loss, mlmloss, nsploss = model(
            src_ids,
            sent_ids,
            labels=mlm_label,
            mlm_pos=mask_pos,
            nsp_labels=nsp_label)
        loss = scaler.scale(loss)
        loss.backward()
        scaler.minimize(opt, loss)
        model.clear_gradients()
        lr_scheduler.step()

        if step % 10 == 0:
            _lr = lr_scheduler.get_lr()
            if args.use_amp:
                _l = (loss / scaler._scale).numpy()
                msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                    env.dev_id, step, _l, _lr, scaler._scale.numpy())
            else:
                _l = loss.numpy()
                msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
                    env.dev_id, step, _l, _lr)
            log.debug(msg)
        if step % 1000 == 0 and env.dev_id == 0:
            log.debug('saving...')
            P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
        if step > args.max_steps:
            break
log.info('done')
    default=None,
    help='inference model output directory')
parser.add_argument('--init_checkpoint', type=str, default=None)
parser.add_argument(
    '--save_dir', type=str, default=None, help='model output directory')
parser.add_argument(
    '--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
args = parser.parse_args()

place = F.CUDAPlace(D.parallel.Env().dev_id)
D.guard(place).__enter__()

ernie = ErnieModelForGeneration.from_pretrained(args.from_pretrained)
tokenizer = ErnieTokenizer.from_pretrained(
    args.from_pretrained, mask_token=None)
rev_dict = {v: k for k, v in tokenizer.vocab.items()}
rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
rev_dict[tokenizer.unk_id] = ''  # replace [UNK]

if args.init_checkpoint is not None:
    log.info('loading checkpoint from %s' % args.init_checkpoint)
    sd, _ = D.load_dygraph(args.init_checkpoint)
    ernie.set_dict(sd)

seq2seq(ernie, tokenizer, args)
def make_pretrain_dataset(name, gz_files, is_train, vocab, batch_size,
                          vocab_size, max_seqlen, global_rank, world_size):
    max_input_seqlen = max_seqlen
    max_pretrain_seqlen = lambda: max_input_seqlen if r.random() > 0.15 \
        else r.randint(1, max_input_seqlen)  # short sentence rate

    def _parse_gz(record_str):  # function that takes python_str as input
        ex = propeller.data.example_pb2.SequenceExample()
        ex.ParseFromString(record_str)
        doc = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['txt'].feature
        ]
        doc_seg = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['segs'].feature
        ]
        return doc, doc_seg

    def _mereg_docseg(doc_seg):  # ngram masking
        ret, span_ctr, ngram_ctr, ngram, last = [], 0, 1, sample_geo(), None
        for s in doc_seg:
            if s != -1 and last is not None and s != last:
                ngram_ctr += 1
                if ngram_ctr > ngram:
                    ngram = sample_geo()
                    ngram_ctr = 1
                    span_ctr += 1
            last = s
            ret.append(span_ctr)
        ret = np.array(ret)
        assert len(doc_seg) == len(ret)
        return ret

    def bb_to_segments(filename):
        ds = Dataset.from_record_file(filename).map(_parse_gz)

        def gen():
            buf, size = [], 0
            iterator = iter(ds)
            while 1:
                doc, doc_seg = next(iterator)
                for line, line_seg in zip(doc, doc_seg):
                    if len(line) == 0:
                        continue
                    line = np.array(line)
                    line_seg = np.array(line_seg)
                    line_seg = _mereg_docseg(line_seg)  # mask span
                    size += len(line)
                    buf.append(np.stack([line, line_seg]).transpose())
                    if size > max_input_seqlen:
                        yield buf,
                        buf, size = [], 0

            if len(buf) != 0:
                yield buf,
                buf, size = [], 0

        return Dataset.from_generator_func(gen)

    def sample_negative(dataset):
        def gen():
            iterator = iter(dataset)
            while True:
                chunk_a, = next(iterator)
                #chunk_b, = next(iterator)
                seqlen = max_pretrain_seqlen()
                seqlen_a = r.randint(1, seqlen)
                seqlen_b = seqlen - seqlen_a
                len_a = list(accumulate([len(c) for c in chunk_a]))
                buf_a = [c for c, l in zip(chunk_a, len_a)
                         if l < seqlen_a]  # always take the first one
                buf_b = [
                    c for c, l in zip(chunk_a, len_a)
                    if seqlen_a <= l < seqlen
                ]

                if not (len(buf_a) and len(buf_b)):
                    continue
                a = np.concatenate(buf_a)
                b = np.concatenate(buf_b)
                #log.debug(a)
                #log.debug(b)
                sample, seg_info, token_type = build_pair(
                    a, b, max_seqlen,
                    vocab)  # negative sample might exceed max seqlen
                yield sample, seg_info, token_type

        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(sentence, seg_info, segments):
        batch_size, seqlen = sentence.shape
        sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info, 1.,
                                                   0.15, vocab_size, vocab)
        #return {'input_ids': sentence, 'token_type_ids': segments, 'sentence_order_label': label, 'labels': mlm_label, 'mlm_mask': mlm_mask}
        sentence = sentence.reshape([-1, seqlen, 1])
        segments = segments.reshape([-1, seqlen, 1])
        mlm_label = mlm_label.reshape([-1, 1])
        mask_pos_reshape = []
        for i, p in zip(mask_pos[0], mask_pos[1]):
            p += i * seqlen
            mask_pos_reshape.append(p)
        mask_pos = np.array(mask_pos_reshape).reshape([-1, 1])
        return sentence, segments, mlm_label, mask_pos

    # pretrain pipeline
    dataset = Dataset.from_list(gz_files)
    log.info('Apply sharding in distribution env %d/%d' %
             (global_rank, world_size))
    dataset = dataset.shard(world_size, global_rank)
    log.info('read from %s' % ','.join(list(iter(dataset))))
    cycle_length = len(range(global_rank, len(gz_files), world_size))
    if is_train:
        dataset = dataset.repeat()
        #dataset = dataset.repeat().shuffle(buffer_size=len(gz_files))
    #dataset = dataset.shuffle(buffer_size=len(gz_files))
    dataset = dataset.interleave(
        map_fn=bb_to_segments, cycle_length=cycle_length, block_length=1)
    dataset = dataset.shuffle(
        buffer_size=10000)  # must shuffle to ensure negative sample randomness
    dataset = sample_negative(dataset)
    dataset = dataset.padded_batch(batch_size, (0, -1, 0), max_seqlen) \
                     .map(after)
    dataset.name = name
    return dataset
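# Hedged worked example (not part of the original source): emulating
# _mereg_docseg above with the geometric sample pinned to 2 words per span.
# Word ids [0, 0, 1, 2, 2, 3] (repeats mark wordpieces of the same word)
# collapse to span ids [0, 0, 0, 1, 1, 1], so apply_mask can mask whole
# multi-word spans at once (the "ngram masking" noted above).
def merge_docseg(doc_seg, ngram_fixed=2):
    ret, span_ctr, ngram_ctr, ngram, last = [], 0, 1, ngram_fixed, None
    for s in doc_seg:
        if s != -1 and last is not None and s != last:
            ngram_ctr += 1
            if ngram_ctr > ngram:
                ngram, ngram_ctr = ngram_fixed, 1
                span_ctr += 1
        last = s
        ret.append(span_ctr)
    return ret

assert merge_docseg([0, 0, 1, 2, 2, 3]) == [0, 0, 0, 1, 1, 1]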
    help='checkpoint to warm start from')
parser.add_argument(
    '--use_amp',
    action='store_true',
    help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
)
args = parser.parse_args()

if args.bsz > args.micro_bsz:
    assert args.bsz % args.micro_bsz == 0, \
        'cannot perform gradient accumulation with bsz:%d micro_bsz:%d' % (
            args.bsz, args.micro_bsz)
    acc_step = args.bsz // args.micro_bsz
    log.info(
        'performing gradient accumulation: global_bsz:%d, micro_bsz:%d, accumulate_steps:%d'
        % (args.bsz, args.micro_bsz, acc_step))
    args.bsz = args.micro_bsz
else:
    acc_step = 1

tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
#tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained)

feature_column = propeller.data.FeatureColumns([
    propeller.data.TextColumn(
        'seg_a',
        unk_id=tokenizer.unk_id,
        vocab_dict=tokenizer.vocab,
        tokenizer=tokenizer.tokenize),
    propeller.data.TextColumn(
        'seg_b',
        unk_id=tokenizer.unk_id,
def __init__(self, address):
    self.context = zmq.Context()
    self.address = address
    self.socket = self.context.socket(zmq.REQ)
    self.socket.connect(address)
    log.info("Connecting to server... %s" % address)
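# Hedged sketch (not part of the original source): the REQ socket above
# follows zmq's strict send/recv alternation. A minimal round trip against
# a hypothetical REP echo server (address is illustrative):
import zmq

context = zmq.Context()
socket = context.socket(zmq.REQ)
socket.connect("tcp://127.0.0.1:5571")
socket.send(b"ping")    # REQ must send first...
reply = socket.recv()   # ...then block for exactly one reply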
    args.host,
    args.port,
    batch_size=args.batch_size,
    num_coroutine=args.num_coroutine)

inputs = [
    i.strip().split(b'\t') for i in open(args.input, 'rb').readlines()
]
if len(inputs) == 0:
    raise ValueError('empty input')

send_batch = args.num_coroutine * args.batch_size
send_num = len(inputs) // send_batch + 1
rets = []
start = time()
for i in range(send_num):
    slice = inputs[i * send_batch:(i + 1) * send_batch]
    if len(slice) == 0:
        continue
    columns = list(zip(*slice))
    if len(columns) > 2:
        raise ValueError('inputs file has more than 2 columns')
    ret = client(*columns)
    if len(ret.shape) == 3:
        ret = ret[:, 0, :]  # take [CLS]
    rets.append(ret)
end = time()

with open(args.output, 'wb') as outf:
    arr = np.concatenate(rets, 0)
    np.save(outf, arr)

log.info('query num: %d average latency %.5f' %
         (len(inputs), (end - start) / len(inputs)))
def make_pretrain_dataset(name, gz_files, is_train, vocab, batch_size,
                          vocab_size, max_seqlen, global_rank, world_size):
    max_input_seqlen = max_seqlen
    max_pretrain_seqlen = lambda: max_input_seqlen if r.random() > 0.15 \
        else r.randint(1, max_input_seqlen)  # short sentence rate

    def _parse_gz(record_str):  # function that takes python_str as input
        ex = propeller.data.example_pb2.SequenceExample()
        ex.ParseFromString(record_str)
        doc = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['txt'].feature
        ]
        doc_seg = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['segs'].feature
        ]
        return doc, doc_seg

    def random_n_sub_sentence():
        # sampling probabilities are proportional to the number of
        # permutations per branch: 1/33, 2/33, 6/33, 24/33
        ratio = r.random()
        n_sub_sentence = 4
        if ratio < float(1) / float(33):
            n_sub_sentence = 1
        elif float(1) / float(33) <= ratio < float(3) / float(33):
            n_sub_sentence = 2
        elif float(3) / float(33) <= ratio < float(9) / float(33):
            n_sub_sentence = 3
        else:
            n_sub_sentence = 4
        return n_sub_sentence

    def gen_interval(l, n_interval):
        n_needed = n_interval - 1
        split_points = sorted(r.sample(range(1, l), n_needed))
        index = [0] + split_points + [l]
        return [(index[i], index[i + 1]) for i in range(len(index) - 1)]

    def joint_sentences(buf, n_sub_sentence, interval_start_ends):
        # buf = [[text1, seg1], [text2, seg2]]
        tokens_of_sub_sentence = [[] for _ in range(n_sub_sentence)]
        segs_of_sub_sentence = [[] for _ in range(n_sub_sentence)]
        assert (len(interval_start_ends) == len(tokens_of_sub_sentence))
        for (start, end), tokens, segs in zip(interval_start_ends,
                                              tokens_of_sub_sentence,
                                              segs_of_sub_sentence):
            for chunk in buf[start:end]:
                tokens.extend(chunk[0])
                segs.extend(chunk[1])

        new_buf = []
        for t_merge, s_merge in zip(tokens_of_sub_sentence,
                                    segs_of_sub_sentence):
            new_buf.append([t_merge, s_merge])
        return new_buf

    def _mereg_docseg(doc_seg):  # ngram masking
        ret, span_ctr, ngram_ctr, ngram, last = [], 0, 1, sample_geo(), None
        for s in doc_seg:
            if s != -1 and last is not None and s != last:
                ngram_ctr += 1
                if ngram_ctr > ngram:
                    ngram = sample_geo()
                    ngram_ctr = 1
                    span_ctr += 1
            last = s
            ret.append(span_ctr)
        ret = np.array(ret)
        assert len(doc_seg) == len(ret)
        return ret

    def bb_to_segments(filename):
        ds = Dataset.from_record_file(filename).map(_parse_gz)

        def gen():
            buf, size = [], 0
            iterator = iter(ds)
            while 1:
                doc, doc_seg = next(iterator)
                n_sub_sentence = random_n_sub_sentence()
                max_num_tokens = max_input_seqlen - (n_sub_sentence + 1)
                for line, line_seg in zip(doc, doc_seg):
                    if len(line) == 0:
                        continue
                    line = list(line)
                    line_seg = np.array(line_seg)
                    line_seg = list(_mereg_docseg(line_seg))  # mask span
                    #size += len(line)
                    #buf.append([line, line_seg])
                    if size + len(line) > max_num_tokens:
                        if len(buf) > n_sub_sentence:
                            interval = gen_interval(len(buf), n_sub_sentence)
                            buf = joint_sentences(buf, n_sub_sentence,
                                                  interval)
                        elif len(buf) < n_sub_sentence:
                            max_num_tokens = max_input_seqlen - (len(buf) + 1)
                        if len(buf) > 0:
                            truncate_seqs(buf, max_num_tokens)
                            yield buf,
                        buf, size = [[line, line_seg]], len(line)
                        n_sub_sentence = random_n_sub_sentence()
                        max_num_tokens = max_input_seqlen - (n_sub_sentence + 1)
                    else:
                        size += len(line)
                        buf.append([line, line_seg])

            if len(buf) != 0:
                if len(buf) > n_sub_sentence:
                    interval = gen_interval(len(buf), n_sub_sentence)
                    buf = joint_sentences(buf, n_sub_sentence, interval)
                elif len(buf) < n_sub_sentence:
                    max_num_tokens = max_input_seqlen - (len(buf) + 1)
                truncate_seqs(buf, max_num_tokens)
                yield buf,
                buf, size = [], 0

        return Dataset.from_generator_func(gen)

    def sample_negative(dataset):
        cls_id = vocab["[CLS]"]
        sep_id = vocab["[SEP]"]
        premutation_1_sent = [[0]]
        premutation_2_sent = [[0, 1], [1, 0]]
        premutation_3_sent = [[0, 1, 2], [0, 2, 1], [1, 0, 2], [1, 2, 0],
                              [2, 0, 1], [2, 1, 0]]
        premutation_4_sent = [
            [0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1],
            [0, 3, 1, 2], [0, 3, 2, 1], [1, 0, 2, 3], [1, 0, 3, 2],
            [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],
            [2, 0, 1, 3], [2, 0, 3, 1], [2, 1, 0, 3], [2, 1, 3, 0],
            [2, 3, 0, 1], [2, 3, 1, 0], [3, 0, 1, 2], [3, 0, 2, 1],
            [3, 1, 0, 2], [3, 1, 2, 0], [3, 2, 0, 1], [3, 2, 1, 0]
        ]

        def gen():
            iterator = iter(dataset)
            while True:
                chunks, = next(iterator)
                sample = [vocab['[CLS]']]
                seg_info = [-1]
                token_type = [0]
                label = 0
                if len(chunks) == 1:  # one sentence, label in [0]
                    choice_index = np.random.choice(1)
                    for index, order in enumerate(
                            premutation_1_sent[choice_index]):
                        sample += chunks[order][0] + [sep_id]
                        seg_info += chunks[order][1] + [-1]
                        token_type += [index] * len(chunks[order][0]) + [index]
                    label += choice_index
                elif len(chunks) == 2:  # two sentences, label in [1, 2]
                    choice_index = np.random.choice(2)
                    for index, order in enumerate(
                            premutation_2_sent[choice_index]):
                        sample += chunks[order][0] + [sep_id]
                        seg_info += chunks[order][1] + [-1]
                        token_type += [index] * len(chunks[order][0]) + [index]
                    label += choice_index + 1
                elif len(chunks) == 3:  # three sentences, label in [3, ..., 8]
                    choice_index = np.random.choice(6)
                    for index, order in enumerate(
                            premutation_3_sent[choice_index]):
                        sample += chunks[order][0] + [sep_id]
                        seg_info += chunks[order][1] + [-1]
                        token_type += [index] * len(chunks[order][0]) + [index]
                    label += choice_index + 3
                else:  # four sentences, label in [9, ..., 32]
                    choice_index = np.random.choice(24)
                    for index, order in enumerate(
                            premutation_4_sent[choice_index]):
                        sample += chunks[order][0] + [sep_id]
                        seg_info += chunks[order][1] + [-1]
                        token_type += [index] * len(chunks[order][0]) + [index]
                    label += choice_index + 9

                sample = np.array(sample)
                if len(sample) < 128:
                    continue
                seg_info = np.array(seg_info)
                token_type = np.array(token_type)
                label = np.int64(label)
                yield sample, seg_info, token_type, label

        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(sentence, seg_info, segments, label):
        batch_size, seqlen = sentence.shape
        sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info, 1.,
                                                   0.15, vocab_size, vocab)
        #return {'input_ids': sentence, 'token_type_ids': segments, 'sentence_order_label': label, 'labels': mlm_label, 'mlm_mask': mlm_mask}
        sentence = sentence.reshape([-1, seqlen, 1])
        segments = segments.reshape([-1, seqlen, 1])
        mlm_label = mlm_label.reshape([-1, 1])
        mask_pos_reshape = []
        for i, p in zip(mask_pos[0], mask_pos[1]):
            p += i * seqlen
            mask_pos_reshape.append(p)
        mask_pos = np.array(mask_pos_reshape).reshape([-1, 1])
        label = label.reshape([-1, 1])
        return sentence, segments, mlm_label, mask_pos, label

    # pretrain pipeline
    dataset = Dataset.from_list(gz_files)
    log.info('Apply sharding in distribution env %d/%d' %
             (global_rank, world_size))
    dataset = dataset.shard(world_size, global_rank)
    log.info('read from %s' % ','.join(list(iter(dataset))))
    cycle_length = len(range(global_rank, len(gz_files), world_size))
    if is_train:
        dataset = dataset.repeat()
    dataset = dataset.interleave(
        map_fn=bb_to_segments, cycle_length=cycle_length, block_length=1)
    dataset = dataset.shuffle(
        buffer_size=10000)  # must shuffle to ensure negative sample randomness
    dataset = sample_negative(dataset)
    dataset = dataset.padded_batch(batch_size, (0, -1, 0, 0), max_seqlen) \
                     .map(after)
    dataset.name = name
    return dataset
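# Hedged worked example (not part of the original source): the sentence-order
# label above packs "how many sub-sentences" and "which permutation" into one
# class id. The offsets follow the branches in sample_negative:
#   1 sentence : 1 order   -> label 0
#   2 sentences: 2 orders  -> labels 1..2   (choice_index + 1)
#   3 sentences: 6 orders  -> labels 3..8   (choice_index + 3)
#   4 sentences: 24 orders -> labels 9..32  (choice_index + 9)
# 1 + 2 + 6 + 24 = 33 classes in total, which also matches the 1/33, 3/33,
# and 9/33 thresholds in random_n_sub_sentence: each branch is sampled in
# proportion to its number of permutations.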