def main(args):
    paddle.seed(12345)
    config = load_yaml(args.config_yaml)
    use_gpu = config.get("dygraph.use_gpu", True)
    test_data_dir = config.get("dygraph.test_data_dir", None)
    feature_size = config.get('hyper_parameters.feature_size', None)
    print_interval = config.get("dygraph.print_interval", None)
    model_load_path = config.get("dygraph.infer_load_path", "model_output")
    start_epoch = config.get("dygraph.infer_start_epoch", -1)
    end_epoch = config.get("dygraph.infer_end_epoch", 10)
    dense_input_dim = config.get('hyper_parameters.dense_input_dim', None)
    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    print("***********************************")
    logger.info(
        "use_gpu: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}"
        .format(use_gpu, test_data_dir, start_epoch, end_epoch,
                print_interval, model_load_path))
    print("***********************************")

    fm_model = create_model(config)
    file_list = [
        os.path.join(test_data_dir, x) for x in os.listdir(test_data_dir)
    ]
    print("read data")
    dataset = CriteoDataset(file_list)
    test_dataloader = create_data_loader(dataset, place=place, config=config)

    auc_metric = paddle.metric.Auc("ROC")
    acc_metric = paddle.metric.Accuracy()
    epoch_begin = time.time()
    interval_begin = time.time()

    for epoch_id in range(start_epoch + 1, end_epoch):
        logger.info("load model epoch {}".format(epoch_id))
        model_path = os.path.join(model_load_path, str(epoch_id))
        load_model(model_path, fm_model)
        for batch_id, batch in enumerate(test_dataloader()):
            batch_size = len(batch[0])
            label, sparse_tensor, dense_tensor = create_feeds(
                batch, dense_input_dim)
            pred = fm_model(sparse_tensor, dense_tensor)
            label_int = paddle.cast(label, 'int64')

            # for auc
            predict_2d = paddle.concat(x=[1 - pred, pred], axis=1)
            auc_metric.update(
                preds=predict_2d.numpy(), labels=label_int.numpy())

            if batch_id % print_interval == 0:
                logger.info(
                    "infer epoch: {}, batch_id: {}, auc: {:.6f}, speed: {:.2f} ins/s"
                    .format(epoch_id, batch_id,
                            auc_metric.accumulate(), print_interval *
                            batch_size / (time.time() - interval_begin)))
                interval_begin = time.time()

        logger.info(
            "infer epoch: {} done, auc: {:.6f}, epoch time: {:.2f} s".format(
                epoch_id, auc_metric.accumulate(), time.time() - epoch_begin))
def set_global_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    paddle.seed(seed)
def do_train(args): if args.use_gpu: rank = dist.get_rank() trainer_count = dist.get_world_size() else: rank = 0 trainer_count = 1 paddle.set_device("cpu") if trainer_count > 1: dist.init_parallel_env() random_seed = eval(str(args.random_seed)) if random_seed is not None: paddle.seed(random_seed) vocab = get_lm_vocab(args) train_loader = get_lm_data_loader(args, vocab, "train") eval_loader = get_lm_data_loader(args, vocab, "valid") cutoffs, tie_projs = [], [False] if args.adaptive: assert args.dataset in ['wt103', 'lm1b'] if args.dataset == 'wt103': cutoffs = [20000, 40000, 200000] tie_projs += [True] * len(cutoffs) elif args.dataset == 'lm1b': cutoffs = [60000, 100000, 640000] tie_projs += [False] * len(cutoffs) mem_transformer = MemTransformerLM( args.ntokens, args.n_layer, args.n_head, args.d_model, args.d_head, args.d_inner_hid, args.dropout, args.attn_dropout, tie_weight=args.tie_weight, d_embed=args.d_model, div_val=args.div_val, tie_projs=tie_projs, normalize_before=args.normalize_before, tgt_len=args.tgt_len, ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, same_length=args.same_length, attn_type=args.attn_type, clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) if args.scheduler == 'cosine': scheduler = paddle.optimizer.lr.CosineAnnealingDecay( learning_rate=args.learning_rate, T_max=args.max_step, eta_min=args.eta_min) elif args.scheduler == 'noam': scheduler = paddle.optimizer.lr.NoamDecay( d_model=args.d_model, warmup_steps=args.warmup_steps, learning_rate=args.learning_rate) elif args.scheduler == 'dev_perf': # fluid api scheduler = paddle.fluid.dygraph.ReduceLROnPlateau( learning_rate=args.learning_rate, decay_rate=args.decay_rate, patience=args.patience, min_lr=args.lr_min) elif args.scheduler == 'constant': scheduler = args.learning_rate clip = paddle.nn.ClipGradByGlobalNorm(args.clip) if args.optim.lower() == 'momentum': optimizer = paddle.optimizer.Momentum( learning_rate=scheduler, parameters=mem_transformer.parameters(), momentum=args.mom, grad_clip=clip) elif args.optim.lower() == 'adam': optimizer = paddle.optimizer.Adam( learning_rate=scheduler, parameters=mem_transformer.parameters(), beta1=args.beta1, beta2=args.beta2, epsilon=eval(args.eps), grad_clip=clip) elif args.optim.lower() == 'adagrad': optimizer = paddle.optimizer.Adagrad( learning_rate=scheduler, parameters=mem_transformer.parameters(), grad_clip=clip) # Init from some checkpoint, to resume the previous training if args.init_from_checkpoint: model_dict = paddle.load( os.path.join(args.init_from_checkpoint, "mem_transformer.pdparams")) opt_dict = paddle.load( os.path.join(args.init_from_checkpoint, "mem_transformer.pdopt")) mem_transformer.set_state_dict(model_dict) optimizer.set_state_dict(opt_dict) print("loaded from checkpoint.") # Init from some pretrain models, to better solve the current task if args.init_from_pretrain_model: model_dict = paddle.load( os.path.join(args.init_from_pretrain_model, "mem_transformer.pdparams")) mem_transformer.set_state_dict(model_dict) print("loaded from pre-trained model.") if trainer_count > 1: mem_transformer = paddle.DataParallel(mem_transformer) step_idx = 0 train_loss = 0.0 log_start_time = time.time() for pass_id in range(args.epoch): batch_id = 0 mems = tuple() for input_data in train_loader: (src, target, seq_len) = input_data ret = mem_transformer(src, target, *mems) loss = ret[0] mems = ret[1:] train_loss += loss.numpy() loss.backward() optimizer.step() optimizer.clear_grad() if step_idx > 0 and step_idx % args.print_step == 0 and rank 
== 0: cur_loss = train_loss / args.print_step elapsed = time.time() - log_start_time if args.scheduler == "constant": lr = optimizer.get_lr() else: lr = scheduler.get_lr() logger_info = "step_idx: %d, epoch: %d, batch: %d, learning rate: %.8f, " \ "speed: %f ms/batch, loss: %f" % \ (step_idx, pass_id, batch_id, lr, elapsed * 1000.0 / args.print_step, cur_loss) if args.dataset in ["enwik8", "text8"]: logger_info = logger_info + ", bpc: %f" % (cur_loss / np.log(2)) else: logger_info = logger_info + ", ppl: %f" % (np.exp(cur_loss)) logger.info(logger_info) train_loss = 0.0 log_start_time = time.time() if step_idx % args.save_step == 0 and step_idx != 0: # Do validation. mem_transformer.eval() # TODO(FrostML): simplify this. if args.mem_len == 0: if dist.get_world_size() == 1: mem_transformer.reset_length( tgt_len=args.eval_tgt_len, ext_len=args.ext_len + args.tgt_len - args.eval_tgt_len, mem_len=args.mem_len) else: mem_transformer._layers.reset_length( tgt_len=args.eval_tgt_len, ext_len=args.ext_len + args.tgt_len - args.eval_tgt_len, mem_len=args.mem_len) else: if dist.get_world_size() == 1: mem_transformer.reset_length( tgt_len=args.eval_tgt_len, ext_len=args.ext_len, mem_len=args.mem_len + args.tgt_len - args.eval_tgt_len) else: mem_transformer._layers.reset_length( tgt_len=args.eval_tgt_len, ext_len=args.ext_len, mem_len=args.mem_len + args.tgt_len - args.eval_tgt_len) total_len, total_loss = 0, 0. eval_mems = tuple() with paddle.no_grad(): for i, (src, target, seq_len) in enumerate(eval_loader): if args.max_eval_steps > 0 and i >= args.max_eval_steps: break ret = mem_transformer(src, target, *eval_mems) loss, eval_mems = ret[0], ret[1:] eval_cur_loss = seq_len * loss.numpy() total_loss += eval_cur_loss total_len += seq_len eval_loss = total_loss / total_len logger_info = "Validation, step_idx: %d, validation loss: %f" % \ (step_idx, eval_loss) if args.dataset in ['enwik8', 'text8']: logger_info = logger_info + ", bpc: %f" % (eval_loss / np.log(2)) else: logger_info = logger_info + ", ppl: %f" % (np.exp(eval_loss) ) logger.info(logger_info) if args.save_model and rank == 0: model_dir = os.path.join( args.save_model, "step_" + str(step_idx) + "_" + str(eval_loss)) if not os.path.exists(model_dir): os.makedirs(model_dir) paddle.save( mem_transformer.state_dict(), os.path.join(model_dir, "mem_transformer.pdparams")) paddle.save( optimizer.state_dict(), os.path.join(model_dir, "mem_transformer.pdopt")) if args.scheduler == 'dev_perf': scheduler.step(eval_loss) # TODO(FrostML): simplify this. 
if dist.get_world_size() == 1: mem_transformer.reset_length( tgt_len=args.tgt_len, ext_len=args.ext_len, mem_len=args.mem_len) else: mem_transformer._layers.reset_length( tgt_len=args.tgt_len, ext_len=args.ext_len, mem_len=args.mem_len) mem_transformer.train() step_idx += 1 batch_id += 1 if args.scheduler in ['cosine', 'dev_perf']: if step_idx < args.warmup_steps: curr_lr = args.learning_rate * step_idx / args.warmup_steps scheduler.base_lr = curr_lr else: if args.scheduler == 'cosine': scheduler.step() elif args.scheduler == 'constant': if step_idx < args.warmup_steps: curr_lr = args.learning_rate * step_idx / args.warmup_steps optimizer.set_lr(curr_lr) elif args.scheduler == 'noam': scheduler.step() if step_idx >= args.max_step: return if args.save_model and rank == 0: model_dir = os.path.join(args.save_model, "step_final") if not os.path.exists(model_dir): os.makedirs(model_dir) paddle.save(mem_transformer.state_dict(), os.path.join(model_dir, "mem_transformer.pdparams")) paddle.save(optimizer.state_dict(), os.path.join(model_dir, "mem_transformer.pdopt"))
def set_random_seed(seed, dp_id, rank_id):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed + dp_id)
    paddle.seed(seed + dp_id)
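# Illustrative usage sketch (added here, not part of the original sources): the
# helper above keeps `random` seeded identically on every worker but offsets the
# numpy/paddle seeds by the data-parallel id, so each worker gets a distinct yet
# reproducible stream. The loop below simply fakes two workers in one process;
# in a real launch dp_id would come from the distributed runtime.
import random

import numpy as np
import paddle


def set_random_seed(seed, dp_id, rank_id):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed + dp_id)
    paddle.seed(seed + dp_id)


if __name__ == "__main__":
    for dp_id in range(2):  # pretend there are two data-parallel workers
        set_random_seed(2021, dp_id=dp_id, rank_id=0)
        print(dp_id, np.random.randint(0, 100, size=3))  # differs per dp_id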
def set_seed(seed): """sets random seed""" random.seed(seed) np.random.seed(seed) paddle.seed(seed)
def testSetNumpyBeforeTrain(self): seed = 90 hidden_size = 10 vocab_size = 1000 num_layers = 1 num_steps = 3 init_scale = 0.1 batch_size = 4 batch_num = 200 with fluid.dygraph.guard(): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel(hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers, num_steps=num_steps, init_scale=init_scale) bd = [] lr_arr = [0.0] # this a fake lr decay strategy for i in range(1, 10): bd.append(100 * i) # set lr to 0.0, not update parameter new_lr = 0.0 lr_arr.append(new_lr) place = fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0) scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr_arr) adam = Adam(learning_rate=scheduler, beta1=0.8, beta2=0.6, parameters=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None last_hidden = None last_cell = None np_opti_dict = {} np_state_dict = {} for k, v in self.opti_dict.items(): if isinstance(v, core.VarBase): np_opti_dict[v.name] = v.numpy() else: np_opti_dict[k] = v for k, v in self.state_dict.items(): np_state_dict[k] = v.numpy() adam.set_state_dict(np_opti_dict) ptb_model.set_dict(np_state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') y_data = np.arange(1, 13).reshape(4, 3).astype('int64') y_data = y_data.reshape((-1, 1)) init_hidden_data = np.zeros( (num_layers, batch_size, hidden_size), dtype='float32') init_cell_data = np.zeros( (num_layers, batch_size, hidden_size), dtype='float32') x = to_variable(x_data) y = to_variable(y_data) init_hidden = to_variable(init_hidden_data) init_cell = to_variable(init_cell_data) dy_loss, last_hidden, last_cell = ptb_model( x, y, init_hidden, init_cell) dy_loss.backward() scheduler.step() adam.minimize(dy_loss) ptb_model.clear_gradients() opti_dict = adam.state_dict() for k, v in opti_dict.items(): if k == "LR_Scheduler": self.assertTrue( np.array_equal(v['last_epoch'], self.base_opti[k]['last_epoch'] + 1)) if k.find("beta1_pow_acc_0") > 0: self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name] * adam._beta1)) if k.find("beta2_pow_acc_0") > 0: self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name] * adam._beta2)) # check parameter state_dict = ptb_model.state_dict() for k, v in state_dict.items(): new_t = v.numpy() base_t = self.model_base[k] self.assertTrue(np.array_equal(new_t, base_t))
def main(args):
    paddle.seed(12345)

    config = get_config(args.config, overrides=args.override, show=True)
    # assign the place
    use_gpu = config.get("use_gpu", True)
    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    trainer_num = paddle.distributed.get_world_size()
    use_data_parallel = trainer_num != 1
    config["use_data_parallel"] = use_data_parallel

    if config["use_data_parallel"]:
        paddle.distributed.init_parallel_env()

    net = program.create_model(config.ARCHITECTURE, config.classes_num)
    optimizer, lr_scheduler = program.create_optimizer(
        config, parameter_list=net.parameters())

    dp_net = net
    if config["use_data_parallel"]:
        find_unused_parameters = config.get("find_unused_parameters", False)
        dp_net = paddle.DataParallel(
            net, find_unused_parameters=find_unused_parameters)

    # load model from checkpoint or pretrained model
    init_model(config, net, optimizer)

    train_dataloader = Reader(config, 'train', places=place)()
    if config.validate:
        valid_dataloader = Reader(config, 'valid', places=place)()

    last_epoch_id = config.get("last_epoch", -1)
    best_top1_acc = 0.0  # best top1 acc record
    best_top1_epoch = last_epoch_id

    vdl_writer_path = config.get("vdl_dir", None)
    vdl_writer = None
    if vdl_writer_path:
        from visualdl import LogWriter
        vdl_writer = LogWriter(vdl_writer_path)

    # Ensure that the vdl log file can be closed normally
    try:
        for epoch_id in range(last_epoch_id + 1, config.epochs):
            net.train()
            # 1. train with train dataset
            program.run(train_dataloader, config, dp_net, optimizer,
                        lr_scheduler, epoch_id, 'train', vdl_writer)

            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                net.eval()
                with paddle.no_grad():
                    top1_acc = program.run(valid_dataloader, config, net,
                                           None, None, epoch_id, 'valid',
                                           vdl_writer)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    best_top1_epoch = epoch_id
                    model_path = os.path.join(config.model_save_dir,
                                              config.ARCHITECTURE["name"])
                    save_model(net, optimizer, model_path, "best_model")
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, best_top1_epoch)
                logger.info(message)

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(net, optimizer, model_path, epoch_id)
    except Exception as e:
        logger.error(e)
    finally:
        vdl_writer.close() if vdl_writer else None
def main(args):
    paddle.seed(args.seed)
    np.random.seed(args.seed)

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    node_labels = pd.read_csv(
        os.path.join(args.data_path, 'node_labels.txt'),
        header=None,
        sep="\t").values.astype("int64")
    node_labels = node_labels[:, 1:2]
    print(node_labels.shape)
    num_nodes = len(node_labels)

    train_idx = pd.read_csv(
        os.path.join(args.data_path, 'train_idx.txt'),
        header=None,
        sep="\t").values.astype("int64").reshape(-1, ).tolist()
    test_idx = pd.read_csv(
        os.path.join(args.data_path, 'test_idx.txt'),
        header=None,
        sep="\t").values.astype("int64").reshape(-1, ).tolist()

    g = build_heter_graph(os.path.join(args.data_path, "edges"), num_nodes)

    model = RGCN(
        num_nodes=num_nodes,
        input_size=args.input_size,
        hidden_size=args.hidden_size,
        num_class=args.num_class,
        num_layers=args.num_layers,
        etypes=g.edge_types,
        num_bases=args.num_bases, )
    model = paddle.DataParallel(model)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    optim = paddle.optimizer.Adam(
        learning_rate=args.lr,
        parameters=model.parameters(),
        weight_decay=0.001)

    test_acc_list = []
    g.tensor()
    train_labels = paddle.to_tensor(node_labels[train_idx])
    test_labels = paddle.to_tensor(node_labels[test_idx])
    train_idx = paddle.to_tensor(train_idx)
    test_idx = paddle.to_tensor(test_idx)

    for epoch in range(args.epochs):
        logits = model(g)

        train_logits = paddle.gather(logits, train_idx)
        train_loss = criterion(train_logits, train_labels)
        train_loss.backward()
        train_acc = paddle.metric.accuracy(
            train_logits, label=train_labels, k=1)
        optim.step()
        optim.clear_grad()

        test_logits = paddle.gather(logits, test_idx)
        test_loss = criterion(test_logits, test_labels)
        test_acc = paddle.metric.accuracy(test_logits, label=test_labels, k=1)

        msg = "epoch: %s" % epoch
        msg += " | train_loss: %.4f | train_acc: %.4f" \
            % (train_loss.numpy()[0], train_acc.numpy()[0])
        msg += " | test_loss: %.4f | test_acc: %.4f" \
            % (test_loss.numpy()[0], test_acc.numpy()[0])
        log.info(msg)

        test_acc_list.append(test_acc.numpy()[0])

    log.info("best test acc result: %.4f" % (np.max(test_acc_list)))
def main(args):
    paddle.set_device('gpu' if args.n_gpus else 'cpu')
    paddle.seed(args.seed)

    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1:
        dist.init_parallel_env()

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)
    if world_size > 1:
        model = paddle.DataParallel(model)

    train_dataset = DialogueDataset(args.train_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    args.seed,
                                    mode='train')
    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)
    valid_dataset = DialogueDataset(args.valid_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    mode='valid')
    valid_dataloader = DataLoader(valid_dataset,
                                  return_list=True,
                                  batch_size=None)

    lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                             args.warmup_steps)
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in [
                          p.name for n, p in model.named_parameters()
                          if not any(nd in n for nd in ["bias", "norm"])
                      ],
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    step = 0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for inputs in train_dataloader:
            step += 1
            token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = inputs

            logits = model(token_ids, type_ids, pos_ids, generation_mask,
                           tgt_pos)
            loss = F.cross_entropy(logits, tgt_label)

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    ppl = paddle.exp(loss)
                    print(
                        'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                        % (step, loss, ppl, optimizer.get_lr(),
                           total_time / args.logging_steps))
                    total_time = 0.0
                if step % args.save_steps == 0:
                    evaluation(model, valid_dataloader)
                    save_ckpt(model, tokenizer, args.save_dir, step)
            batch_start_time = time.time()
def _test(self, place, use_tensor=True, use_fluid_api=True, use_global_beta_pow=False, flatten_param_grads=False): paddle.enable_static() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() SEED = 2021 paddle.seed(SEED) np.random.seed(SEED) a_np = np.random.random(size=(2, 2)).astype('float32') b_np = np.random.random(size=(2, 2)).astype('float32') label_np = np.random.randint(2, size=(2, 1)).astype('int64') weight_attr1 = paddle.ParamAttr( name="weight1", initializer=fluid.initializer.Constant(value=1.0), trainable=True) weight_attr2 = paddle.ParamAttr( name="weight2", initializer=fluid.initializer.Constant(value=2.0), trainable=True) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) with paddle.static.program_guard(main_prog, startup_prog): with paddle.utils.unique_name.guard(): a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') b = paddle.static.data(name="b", shape=[2, 2], dtype='float32') label = paddle.static.data(name="label", shape=[2, 1], dtype='int64') sum = paddle.add(a, b) z = paddle.pow(sum, 2.0) fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1) prediction = fluid.layers.fc(input=fc_1, size=2, param_attr=weight_attr2, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) loss = fluid.layers.reduce_mean(cost) beta1_init = 0.9 beta2_init = 0.999 epsilon_init = 1e-8 if use_tensor: beta1 = fluid.layers.create_global_var( shape=[1], value=float(beta1_init), dtype='float32', persistable=True, name="beta1") beta2 = fluid.layers.create_global_var( shape=[1], value=float(beta2_init), dtype='float32', persistable=True, name="beta2") epsilon = fluid.layers.create_global_var( shape=[1], value=float(epsilon_init), dtype='float32', persistable=True, name="epsilon") if use_fluid_api: adam = fluid.optimizer.Adam( learning_rate=0.01, beta1=beta1, beta2=beta2, epsilon=epsilon, use_global_beta_pow=use_global_beta_pow, flatten_param_grads=flatten_param_grads, align_size=256, grad_clip=clip) else: adam = paddle.optimizer.Adam(learning_rate=0.01, beta1=beta1, beta2=beta2, epsilon=epsilon, grad_clip=clip) else: if use_fluid_api: adam = fluid.optimizer.Adam( learning_rate=0.01, beta1=beta1_init, beta2=beta2_init, epsilon=epsilon_init, use_global_beta_pow=use_global_beta_pow, flatten_param_grads=flatten_param_grads, align_size=256, grad_clip=clip) else: adam = fluid.optimizer.Adam(learning_rate=0.01, beta1=beta1_init, beta2=beta2_init, epsilon=epsilon_init, grad_clip=clip) adam.minimize(loss) scope = fluid.Scope() with fluid.scope_guard(scope): exe = paddle.static.Executor(place) exe.run(startup_prog) print("Start run on {}".format(place)) for epoch in range(10): pred_res, loss_res = exe.run(main_prog, feed={ "a": a_np, "b": b_np, "label": label_np }, fetch_list=[prediction, loss]) print("Epoch {} | Prediction[0]: {}, Loss: {}".format( epoch, pred_res[0], loss_res)) paddle.disable_static() return pred_res, loss_res
def compress(args): num_workers = 4 shuffle = True if args.ce_test: # set seed seed = 111 paddle.seed(seed) np.random.seed(seed) random.seed(seed) num_workers = 0 shuffle = False paddle.set_device('gpu' if args.use_gpu else 'cpu') train_reader = None test_reader = None if args.data == "cifar10": transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = paddle.vision.datasets.Cifar10(mode="train", backend="cv2", transform=transform) val_dataset = paddle.vision.datasets.Cifar10(mode="test", backend="cv2", transform=transform) class_dim = 10 image_shape = [3, 32, 32] pretrain = False elif args.data == "imagenet": train_dataset = ImageNetDataset("data/ILSVRC2012", mode='train', image_size=224, resize_short_size=256) val_dataset = ImageNetDataset("data/ILSVRC2012", mode='val', image_size=224, resize_short_size=256) class_dim = 1000 image_shape = [3, 224, 224] pretrain = True else: raise ValueError("{} is not supported.".format(args.data)) assert args.model in model_list, "{} is not in lists: {}".format( args.model, model_list) inputs = [Input([None] + image_shape, 'float32', name='image')] labels = [Input([None, 1], 'int64', name='label')] # model definition net = models.__dict__[args.model](pretrained=pretrain, num_classes=class_dim) _logger.info("FLOPs before pruning: {}GFLOPs".format( flops(net, [1] + image_shape) / 1000)) net.eval() if args.criterion == 'fpgm': pruner = paddleslim.dygraph.FPGMFilterPruner(net, [1] + image_shape) elif args.criterion == 'l1_norm': pruner = paddleslim.dygraph.L1NormFilterPruner(net, [1] + image_shape) params = get_pruned_params(args, net) ratios = {} for param in params: ratios[param] = args.pruned_ratio plan = pruner.prune_vars(ratios, [0]) _logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( flops(net, [1] + image_shape) / 1000, plan.pruned_flops)) for param in net.parameters(): if "conv2d" in param.name: print("{}\t{}".format(param.name, param.shape)) net.train() model = paddle.Model(net, inputs, labels) steps_per_epoch = int(np.ceil(len(train_dataset) * 1. / args.batch_size)) opt = create_optimizer(args, net.parameters(), steps_per_epoch) model.prepare(opt, paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy(topk=(1, 5))) if args.checkpoint is not None: model.load(args.checkpoint) model.fit(train_data=train_dataset, eval_data=val_dataset, epochs=args.num_epochs, batch_size=args.batch_size // ParallelEnv().nranks, verbose=1, save_dir=args.model_path, num_workers=num_workers, shuffle=shuffle)
import argparse
import os
import os.path as osp
import random
import sys
import time
import json

import numpy as np
import paddle
import paddle.distributed as dist
from mmcv import Config

from dataset import build_data_loader
from models import build_model
from utils import AverageMeter

dist.get_world_size()
dist.init_parallel_env()

paddle.seed(123456)
np.random.seed(123456)
random.seed(123456)

cnt = 0


def train(train_loader, model, optimizer, epoch, start_iter, cfg):
    model.train()

    # meters
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_text = AverageMeter()
    losses_kernels = AverageMeter()
def train(args, fake_data_reader, to_static):
    program_translator = ProgramTranslator()
    program_translator.enable(to_static)

    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    random.seed(0)
    np.random.seed(0)
    with fluid.dygraph.guard(place):
        paddle.seed(1000)
        paddle.framework.random._manual_program_seed(1000)
        video_model = TSM_ResNet("TSM", train_config, 'Train')
        optimizer = create_optimizer(train_config.TRAIN,
                                     video_model.parameters())

        train_reader = fake_data_reader.create_reader()

        ret = []
        for epoch in range(train_config.TRAIN.epoch):
            video_model.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0
            for batch_id, data in enumerate(train_reader()):
                x_data = np.array([item[0] for item in data])
                y_data = np.array([item[1] for item in data]).reshape([-1, 1])

                imgs = to_variable(x_data)
                labels = to_variable(y_data)
                labels.stop_gradient = True

                outputs = video_model(imgs)
                loss = fluid.layers.cross_entropy(
                    input=outputs, label=labels, ignore_index=-1)
                avg_loss = fluid.layers.mean(loss)
                acc_top1 = fluid.layers.accuracy(
                    input=outputs, label=labels, k=1)
                acc_top5 = fluid.layers.accuracy(
                    input=outputs, label=labels, k=5)

                avg_loss.backward()
                optimizer.minimize(avg_loss)
                video_model.clear_gradients()

                total_loss += avg_loss.numpy()[0]
                total_acc1 += acc_top1.numpy()[0]
                total_acc5 += acc_top5.numpy()[0]
                total_sample += 1

                print('TRAIN Epoch {}, iter {}, loss = {}, acc1 {}, acc5 {}'.
                      format(epoch, batch_id,
                             avg_loss.numpy()[0],
                             acc_top1.numpy()[0], acc_top5.numpy()[0]))
                ret.extend([
                    avg_loss.numpy()[0], acc_top1.numpy()[0],
                    acc_top5.numpy()[0]
                ])

            print(
                'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}'
                .format(epoch, total_loss / total_sample,
                        total_acc1 / total_sample, total_acc5 / total_sample))
        return ret
def init_seed(seed):
    """Set seed for reproducible results."""
    paddle.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
def testSetNumpy(self): seed = 90 hidden_size = 10 vocab_size = 1000 num_layers = 1 num_steps = 3 init_scale = 0.1 batch_size = 4 batch_num = 200 with fluid.dygraph.guard(): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel(hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers, num_steps=num_steps, init_scale=init_scale) bd = [] lr_arr = [1.0] # this a fake lr decay strategy for i in range(1, 10): bd.append(100 * i) new_lr = 1.0 lr_arr.append(new_lr) place = fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0) scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr_arr) adam = Adam(learning_rate=scheduler, parameters=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None last_hidden = None last_cell = None for i in range(batch_num): x_data = np.arange(12).reshape(4, 3).astype('int64') y_data = np.arange(1, 13).reshape(4, 3).astype('int64') y_data = y_data.reshape((-1, 1)) init_hidden_data = np.zeros( (num_layers, batch_size, hidden_size), dtype='float32') init_cell_data = np.zeros( (num_layers, batch_size, hidden_size), dtype='float32') x = to_variable(x_data) y = to_variable(y_data) init_hidden = to_variable(init_hidden_data) init_cell = to_variable(init_cell_data) dy_loss, last_hidden, last_cell = ptb_model( x, y, init_hidden, init_cell) if i == 0: for param in ptb_model.parameters(): dy_param_init[param.name] = param.numpy() dy_loss.backward() adam.minimize(dy_loss) scheduler.step() ptb_model.clear_gradients() if i == batch_num - 1: for param in ptb_model.parameters(): dy_param_updated[param.name] = param.numpy() # check optimizer opti_dict = adam.state_dict() np_opti_dict = {} # set to zero for k, v in opti_dict.items(): if isinstance(v, core.VarBase): np_t = v.numpy() np_opti_dict[v.name] = np_t var = v.value().get_tensor() var.set(np.zeros_like(np_t), place) self.assertTrue(np.sum(np.abs(v.numpy())) == 0) else: np_opti_dict[k] = v if isinstance(adam._learning_rate, LearningRateDecay): adam._learning_rate.step_num = 0 adam.set_state_dict(np_opti_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, core.VarBase): self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name])) else: self.assertEqual(v, self.base_opti[k]) # check parameter state_dict = ptb_model.state_dict() np_state_dict = {} for k, v in state_dict.items(): np_t = v.numpy() np_state_dict[k] = np_t var = v.value().get_tensor() var.set(np.zeros_like(np_t), place) ptb_model.set_dict(np_state_dict) state_dict = ptb_model.state_dict() for k, v in state_dict.items(): new_t = v.numpy() base_t = self.model_base[k] self.assertTrue(np.array_equal(new_t, base_t))
def eval(args, num_states, num_actions):
    log_writer = LogWriter(logdir='log')
    # Fix the random seed
    paddle.seed(123)
    # Use the GPU for inference
    if paddle.is_compiled_with_cuda():
        paddle.set_device("gpu:0")
    # Choose the game action set
    if args.action_type == "right":
        actions = RIGHT_ONLY
    elif args.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    # Create the game environment
    env = create_train_env(args.world, args.stage, actions)
    # Build the network model
    local_model = Model(num_states, num_actions)
    # Switch to evaluation mode
    local_model.eval()
    # Convert the image to a Paddle tensor
    state = paddle.to_tensor(env.reset(), dtype="float32")
    # Load the model parameters right at the start
    done = True
    # Step counter for logging
    step = 0
    # MD5 of the previously loaded model file
    old_model_file_md5 = ''
    # Total game score
    total_reward = 0
    while True:
        # Reload the model parameters whenever an episode ends
        if done:
            try:
                model_path = "{}/model_{}_{}.pdparams".format(
                    args.saved_path, args.world, args.stage)
                # Use the file MD5 so that each saved model is evaluated only once
                with open(model_path, 'rb') as f:
                    file = f.read()
                file_md5 = hashlib.md5(file).hexdigest()
                if file_md5 == old_model_file_md5:
                    continue
                else:
                    model_dict = paddle.load(model_path)
                    old_model_file_md5 = file_md5
            except:
                continue
            total_reward = 0
            local_model.load_dict(model_dict)
        # Predict action probabilities and the value estimate
        logits, value = local_model(state)
        # Pick the action index
        policy = F.softmax(logits, axis=1)
        action = paddle.argmax(policy)[0]
        # Step the game
        state, reward, done, info = env.step(int(action))
        total_reward += reward
        # Render the screen
        if args.show_play:
            env.render()
        # The stage was cleared
        if info["flag_get"]:
            print("World {} stage {} cleared".format(args.world, args.stage))
            paddle.save(local_model.state_dict(),
                        "{}/model_{}_{}_finish.pdparams".format(
                            args.saved_path, args.world, args.stage))
        # Reset the game state
        if done:
            step += 1
            state = env.reset()
            print('Total score: %f' % total_reward)
            log_writer.add_scalar(
                tag='Eval reward', value=total_reward, step=step)
        # Convert the per-step game state to a tensor
        state = paddle.to_tensor(state, dtype="float32")
def testSetVariableBeforeTrain(self): seed = 90 hidden_size = 10 vocab_size = 1000 num_layers = 1 num_steps = 3 init_scale = 0.1 batch_size = 4 batch_num = 200 with fluid.dygraph.guard(): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel(hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers, num_steps=num_steps, init_scale=init_scale) place = fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0) adam = Adam(learning_rate=0.0, beta1=0.8, beta2=0.6, parameters=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None last_hidden = None last_cell = None adam.set_state_dict(self.opti_dict) ptb_model.set_dict(self.state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') y_data = np.arange(1, 13).reshape(4, 3).astype('int64') y_data = y_data.reshape((-1, 1)) init_hidden_data = np.zeros( (num_layers, batch_size, hidden_size), dtype='float32') init_cell_data = np.zeros( (num_layers, batch_size, hidden_size), dtype='float32') x = to_variable(x_data) y = to_variable(y_data) init_hidden = to_variable(init_hidden_data) init_cell = to_variable(init_cell_data) dy_loss, last_hidden, last_cell = ptb_model( x, y, init_hidden, init_cell) dy_loss.backward() adam.minimize(dy_loss) ptb_model.clear_gradients() opti_dict = adam.state_dict() for k, v in opti_dict.items(): if k == "global_step": self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) if k.find("beta1_pow_acc_0") > 0: self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name] * adam._beta1)) if k.find("beta2_pow_acc_0") > 0: self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name] * adam._beta2)) state_dict = ptb_model.state_dict() for k, v in state_dict.items(): new_t = v.numpy() base_t = self.model_base[k] self.assertTrue(np.array_equal(new_t, base_t))
def check_network_convergence(cls, method, use_device=DeviceType.CUDA, iter=5, batch_size=None, feed_dict=None, feed_data_reader=None, get_data_from_feeder=None, use_parallel_executor=True, use_reduce=False, use_ir_memory_optimize=False, enable_inplace=True, fuse_elewise_add_act_ops=False, fuse_all_optimizer_ops=False, fuse_all_reduce_ops=False, fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False, enable_sequential_execution=False): def run_executor(exe, binary, feed, fetch_list): if feed_data_reader is None: res = exe.run(binary, feed=feed, fetch_list=fetch_list) else: res = exe.run(binary, feed=feed_data_reader.get_next(exe, binary), fetch_list=fetch_list) return res if feed_data_reader is not None: assert isinstance( feed_data_reader, FeedDataReader ), "feed_data_reader must be type of FeedDataReader" paddle.seed(1) paddle.framework.random._manual_program_seed(1) main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder, main, method, optimizer) place = fluid.CUDAPlace( 0) if use_device == DeviceType.CUDA else fluid.XPUPlace( 0) if use_device == DeviceType.XPU else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) build_strategy, exec_strategy = cls.set_strategy( enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops, fuse_all_reduce_ops, fuse_elewise_add_act_ops, fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize, use_reduce, use_device) if use_parallel_executor: binary = compiler.CompiledProgram(main).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) else: binary = main if batch_size is not None: batch_size *= fluid.core.get_cuda_device_count( ) if use_device == DeviceType.CUDA else fluid.core.get_xpu_device_count( ) if use_device == DeviceType.XPU else int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) begin = time.time() first_loss, = run_executor( exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) for _ in range(iter): run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[]) last_loss, = run_executor( exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) end = time.time() if batch_size is not None: print("%.4f Instance per second" % ( (batch_size * iter + 2) / (end - begin))) avg_last_loss_val = np.array(last_loss).mean() avg_first_loss_val = np.array(first_loss).mean() if math.isnan(float(avg_last_loss_val)) or math.isnan( float(avg_first_loss_val)): sys.exit("got NaN loss, training failed.") print(first_loss, last_loss) # self.assertGreater(first_loss[0], last_loss[0]) return first_loss, last_loss
# See the License for the specific language governing permissions and
# limitations under the License.
"""MolTrans backbone model."""

from helper import utils
import paddle
from paddle import nn
import paddle.io as Data
import paddle.nn.functional as F
import numpy as np
import math
import pdb
#from pahelix.networks.involution_block import Involution2D

# Set seed for reproducibility
paddle.seed(2)
np.random.seed(3)


class MolTransModel(nn.Sequential):
    """
    Interaction Module
    """

    def __init__(self, model_config):
        """
        Initialization
        """
        super(MolTransModel, self).__init__()
        # Basic config
        self.model_config = model_config
        self.drug_max_seq = model_config['drug_max_seq']
import random

import numpy as np
import paddle
import paddle.distributed.auto_parallel as auto

from .cost_model import estimate_cost
from .dist_op import DistributedOperator
from .process_group import _g_process_group_map
from .process_group import ProcessGroup, get_process_group
from .operators.common import is_elementwise_op
from .operators.common import get_distributed_operator_impl_container
from .utils import update_op_dims_mapping_by_default_dist_impl
from .utils import update_op_dims_mapping_by_elementwise_like_dist_impl
from .utils import get_all_distributed_main_program
from .dist_context import DistributedContext, DistributedOperatorContext
from .dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute

paddle.enable_static()
paddle.seed(123)
random.seed(123)
np.random.seed(123)


class PlanFilter:

    @staticmethod
    def check_dims_mapping_for_tensor(process_mesh_topology, tensor_shape,
                                      dims_mapping):
        valid = True
        assert len(tensor_shape) == len(dims_mapping)
        for idx, dim_mapping in enumerate(dims_mapping):
            if dim_mapping != -1:
                if tensor_shape[idx] % process_mesh_topology[
                        dim_mapping] != 0 or dims_mapping.count(
def compress(args): shuffle = True if args.ce_test: # set seed seed = 111 paddle.seed(seed) np.random.seed(seed) random.seed(seed) args.num_workers = 0 shuffle = False env = os.environ num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1)) use_data_parallel = num_trainers > 1 if use_data_parallel: # Fleet step 1: initialize the distributed environment role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) train_reader = None test_reader = None if args.data == "mnist": transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = paddle.vision.datasets.MNIST( mode='train', backend="cv2", transform=transform) val_dataset = paddle.vision.datasets.MNIST( mode='test', backend="cv2", transform=transform) class_dim = 10 image_shape = "1,28,28" args.pretrained_model = False elif args.data == "cifar10": transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = paddle.vision.datasets.Cifar10( mode="train", backend="cv2", transform=transform) val_dataset = paddle.vision.datasets.Cifar10( mode="test", backend="cv2", transform=transform) class_dim = 10 image_shape = "3, 32, 32" args.pretrained_model = False elif args.data == "imagenet": import imagenet_reader as reader train_dataset = reader.ImageNetDataset(mode='train') val_dataset = reader.ImageNetDataset(mode='val') class_dim = 1000 image_shape = "3,224,224" else: raise ValueError("{} is not supported.".format(args.data)) image_shape = [int(m) for m in image_shape.split(",")] assert args.model in model_list, "{} is not in lists: {}".format(args.model, model_list) if args.use_gpu: places = paddle.static.cuda_places() else: places = paddle.static.cpu_places() place = places[0] exe = paddle.static.Executor(place) image = paddle.static.data( name='image', shape=[None] + image_shape, dtype='float32') label = paddle.static.data(name='label', shape=[None, 1], dtype='int64') batch_size_per_card = args.batch_size batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=batch_size_per_card, shuffle=shuffle, drop_last=True) train_loader = paddle.io.DataLoader( train_dataset, places=place, batch_sampler=batch_sampler, feed_list=[image, label], return_list=False, use_shared_memory=True, num_workers=args.num_workers) valid_loader = paddle.io.DataLoader( val_dataset, places=place, feed_list=[image, label], drop_last=False, return_list=False, use_shared_memory=True, batch_size=args.batch_size_for_validation, shuffle=False) step_per_epoch = int( np.ceil(len(train_dataset) * 1. / args.batch_size / num_trainers)) # model definition model = models.__dict__[args.model]() out = model.net(input=image, class_dim=class_dim) if args.data == 'cifar10': label = paddle.reshape(label, [-1, 1]) cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label) avg_cost = paddle.mean(x=cost) acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1) acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5) val_program = paddle.static.default_main_program().clone(for_test=True) opt, learning_rate = create_optimizer(args, step_per_epoch) # Fleet step 2: distributed strategy if use_data_parallel: dist_strategy = DistributedStrategy() dist_strategy.sync_batch_norm = False dist_strategy.exec_strategy = paddle.static.ExecutionStrategy() dist_strategy.fuse_all_reduce_ops = False train_program = paddle.static.default_main_program() if args.pruning_strategy == 'gmp': # GMP pruner step 0: define configs for GMP, no need to define configs for the base training. 
configs = { 'stable_iterations': args.stable_epochs * step_per_epoch, 'pruning_iterations': args.pruning_epochs * step_per_epoch, 'tunning_iterations': args.tunning_epochs * step_per_epoch, 'resume_iteration': (args.last_epoch + 1) * step_per_epoch, 'pruning_steps': args.pruning_steps, 'initial_ratio': args.initial_ratio, } elif args.pruning_strategy == 'base': configs = None # GMP pruner step 1: initialize a pruner object by calling entry function. pruner = create_unstructured_pruner( train_program, args, place, configs=configs) if use_data_parallel: # Fleet step 3: decorate the origial optimizer and minimize it opt = fleet.distributed_optimizer(opt, strategy=dist_strategy) opt.minimize(avg_cost, no_grad_set=pruner.no_grad_set) exe.run(paddle.static.default_startup_program()) if args.last_epoch > -1: assert args.checkpoint is not None and os.path.exists( args.checkpoint), "Please specify a valid checkpoint path." paddle.fluid.io.load_persistables( executor=exe, dirname=args.checkpoint, main_program=train_program) elif args.pretrained_model: assert os.path.exists( args. pretrained_model), "Pretrained model path {} doesn't exist".format( args.pretrained_model) def if_exist(var): return os.path.exists(os.path.join(args.pretrained_model, var.name)) _logger.info("Load pretrained model from {}".format( args.pretrained_model)) # NOTE: We are using fluid.io.load_vars() because the pretrained model is from an older version which requires this API. # Please consider using paddle.static.load(program, model_path) when possible paddle.fluid.io.load_vars( exe, args.pretrained_model, predicate=if_exist) def test(epoch, program): acc_top1_ns = [] acc_top5_ns = [] _logger.info( "The current sparsity of the inference model is {}%".format( round(100 * UnstructuredPruner.total_sparse( paddle.static.default_main_program()), 2))) for batch_id, data in enumerate(valid_loader): start_time = time.time() acc_top1_n, acc_top5_n = exe.run( program, feed=data, fetch_list=[acc_top1.name, acc_top5.name]) end_time = time.time() if batch_id % args.log_period == 0: _logger.info( "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}". format(epoch, batch_id, np.mean(acc_top1_n), np.mean(acc_top5_n), end_time - start_time)) acc_top1_ns.append(np.mean(acc_top1_n)) acc_top5_ns.append(np.mean(acc_top5_n)) _logger.info("Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format( epoch, np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns)))) def train(epoch, program): train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() for batch_id, data in enumerate(train_loader): train_reader_cost += time.time() - reader_start train_start = time.time() loss_n, acc_top1_n, acc_top5_n = exe.run( program, feed=data, fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name]) # GMP pruner step 2: step() to update ratios and other internal states of the pruner. pruner.step() train_run_cost += time.time() - train_start total_samples += args.batch_size loss_n = np.mean(loss_n) acc_top1_n = np.mean(acc_top1_n) acc_top5_n = np.mean(acc_top5_n) if batch_id % args.log_period == 0: _logger.info( "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec". 
format(epoch, batch_id, learning_rate.get_lr(), loss_n, acc_top1_n, acc_top5_n, train_reader_cost / args.log_period, ( train_reader_cost + train_run_cost ) / args.log_period, total_samples / args.log_period, total_samples / (train_reader_cost + train_run_cost ))) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 learning_rate.step() reader_start = time.time() if use_data_parallel: # Fleet step 4: get the compiled program from fleet compiled_train_program = fleet.main_program else: compiled_train_program = paddle.static.CompiledProgram( paddle.static.default_main_program()) for i in range(args.last_epoch + 1, args.num_epochs): train(i, compiled_train_program) # GMP pruner step 3: update params before summrizing sparsity, saving model or evaluation. pruner.update_params() _logger.info("The current sparsity of the pruned model is: {}%".format( round(100 * UnstructuredPruner.total_sparse( paddle.static.default_main_program()), 2))) if (i + 1) % args.test_period == 0: test(i, val_program) if (i + 1) % args.model_period == 0: if use_data_parallel: fleet.save_persistables(executor=exe, dirname=args.model_path) else: paddle.fluid.io.save_persistables( executor=exe, dirname=args.model_path)
def set_seed(seed):
    paddle.seed(seed)
    random.seed(seed)
    np.random.seed(seed)
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
def do_train(args): # Initialize the paddle and paddle fleet execute environment paddle.enable_static() fleet.init(is_collective=True) # Create the random seed for the worker random.seed(args.seed) np.random.seed(args.seed) paddle.seed(args.seed) get_rng_state_tracker().add('global_seed', args.seed) get_rng_state_tracker().add('local_seed', args.seed + fleet.worker_index() + 2021) if args.use_amp and args.amp_level == "O2": assert (args.mp_degree == 1 and args.pp_degree == 1 ), "When amp level is O2, mp_degree and pp_degree should be 1." assert (args.use_sharding == False ), "When amp level is O2, use_sharding should be False." assert args.device in [ "cpu", "gpu", "xpu" ], "Invalid device! Available device should be cpu, gpu, or xpu." place = paddle.set_device(args.device) worker_num = fleet.worker_num() worker_index = fleet.worker_index() local_rank = 0 if fleet.local_rank() is None else int(fleet.local_rank()) topo = Topology(device_rank=worker_index, world_size=worker_num, dp_degree=args.dp_degree, pp_degree=args.pp_degree, sharding_degree=args.sharding_degree, mp_degree=args.mp_degree) logger.info("The topo of hybrid parallelism:\n{}".format(topo)) dist_strategy = dist_optimizer(args, topo) # Create log write, train results show on last card of pipeline. if topo.is_last: log_writer_path = os.path.join( args.output_dir, "train_log", "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format( args.model_name_or_path, args.global_batch_size, args.use_amp, args.use_recompute, worker_index).lower()) if os.path.exists(log_writer_path): import shutil shutil.rmtree(log_writer_path) log_writer = LogWriter(log_writer_path) # Define the input data in the static mode model_class, tokenizer_class = MODEL_CLASSES[args.model_type] pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) data_file = get_train_data_file(args) main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() with paddle.static.program_guard(main_program, startup_program): with paddle.utils.unique_name.guard(): with paddle.static.device_guard('gpu:0'): data_holders = create_data_holder(args) [tokens, loss_mask, position_ids, labels] = data_holders tokenizer = tokenizer_class.from_pretrained( args.model_name_or_path) eos_id = tokenizer.eos_token_id train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset( args, data_file, local_rank=local_rank, data_world_size=topo.data_info.size, data_world_rank=topo.data_info.rank, eos_id=eos_id, max_seq_len=args.max_seq_len, places=paddle.static.cuda_places(), data_holders=data_holders, pipeline_mode=False, ) if args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ args.model_name_or_path] model_config[ "hidden_dropout_prob"] = args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob model_config["topo"] = topo model = guard(f'gpu:{args.pp_degree -1}')( GPTForPretraining)( guard(f'gpu:0')(GPTModel)(**model_config)) else: model, _ = GPTForPretraining.from_pretrained( args.model_name_or_path, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args. 
attention_probs_dropout_prob, topo=topo) # Create the model for the gpt pretrain preds = model(tokens, position_ids) criterion = guard(f'gpu:{args.pp_degree -1}')( GPTPretrainingCriterion)(topo) loss = criterion(preds, labels, loss_mask) # Create the learning_rate sheduler and optimizer if args.decay_steps is None: args.decay_steps = args.max_steps warmup_step = args.warmup_rate * args.decay_steps # TODO @ZHUI Use paddle network to support lr scheduler lr_scheduler = lr.CosineAnnealingWithWarmupDecay( max_lr=args.max_lr, min_lr=args.min_lr, warmup_step=warmup_step, decay_step=args.decay_steps) clip = None if args.grad_clip > 0: clip = paddle.fluid.clip.GradientClipByGlobalNorm( clip_norm=args.grad_clip) decay_param = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=args.adam_beta1, beta2=args.adam_beta2, epsilon=args.adam_epsilon, grad_clip=clip, weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_param) # alias optimizer.apply_optimize = optimizer._apply_optimize if args.use_recompute: dist_strategy.recompute = True dist_strategy.recompute_configs = { "checkpoints": model.gpt.checkpoints } # Use the fleet api to compile the distributed optimizer optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) optimizer.minimize(loss) logger.info(f'final strategy: {fleet._final_strategy()}') logger.info("The training meta optimizer is/are %s" % fleet._get_applied_meta_list()) program_desc_dir = os.path.join(args.output_dir, "program_desc") if not os.path.isdir(program_desc_dir): os.mkdir(program_desc_dir) with open(program_desc_dir + "/main_program.txt.%d" % worker_index, 'w') as f: f.write(str(main_program)) with open(program_desc_dir + "/startup_program.txt.%d" % worker_index, 'w') as f: f.write(str(startup_program)) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) test_program = main_program.clone(for_test=True) if args.use_amp and args.amp_level == "O2": optimizer.amp_init(place) if args.model_name_or_path not in pretrained_models_list: logger.info("Try to load checkpoint from %s " % args.model_name_or_path) dygrah_path = os.path.join(args.model_name_or_path, "model_state.pdparams") static_path = os.path.join(args.model_name_or_path, "static_vars") flag_loaded = False if os.path.exists(static_path): if args.mp_degree > 1: logger.warning("MP should init with dygraph params") else: logger.info("Loading parameters from %s" % static_path) paddle.static.load(main_program, static_path, exe) flag_loaded = True if not flag_loaded and os.path.exists(dygrah_path): if args.sharding_degree > 1: logger.warning("Sharding should init with static vars") else: logger.info("Loading parameters from %s" % dygrah_path) init_static_with_params( model, paddle.load(dygrah_path, return_numpy=True), topo, main_program) flag_loaded = True if not flag_loaded: logger.error("No checkpoint load.") global_step = 0 tic_train = time.time() epoch = 0 learning_rate = main_program.global_block().vars["learning_rate_0"] while True: fetchs = [] if topo.is_last: fetchs = [loss, learning_rate] # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader # many times. and start a new random dataloader. 
valid_data_loader = valid_data_loader() test_data_loader = test_data_loader() train_reader_cost = 0.0 train_run_cost = 0.0 reader_start = time.time() for step, batch in enumerate(train_data_loader()): train_reader_cost += time.time() - reader_start train_start = time.time() global_step += 1 ret = exe.run(main_program, feed=batch, fetch_list=fetchs, use_program_cache=True) # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() train_run_cost += time.time() - train_start # Profile for model benchmark profiler.add_profiler_step(args.profiler_options) if global_step % args.logging_freq == 0: if topo.is_last: loss_return, lr_return = ret #speed = args.logging_freq / (time.time() - tic_train) speed = args.logging_freq / (train_reader_cost + train_run_cost) avg_reader_cost = train_reader_cost / args.logging_freq logger.info( "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f steps/s, ips: %.0f tokens/s, learning rate: %.5e" % (global_step, epoch, step, loss_return[0], avg_reader_cost, 1. / speed, speed, speed * args.global_batch_size * args.max_seq_len, lr_return[0])) log_writer.add_scalar("loss", loss_return[0], global_step) log_writer.add_scalar("learning_rate", lr_return[0], global_step) tic_train = time.time() train_reader_cost = 0.0 train_run_cost = 0.0 if args.check_accuracy: if global_step >= args.max_steps: return else: continue if global_step % args.eval_freq == 0: # TODO, check the input data of validation eval_fetch = [] if topo.is_last: eval_fetch = [loss] run_evaluate(valid_data_loader, exe, test_program, args.eval_iters, log_writer, global_step, args, epoch, topo.is_last, eval_fetch, "valid") tic_train = time.time() if global_step % args.save_steps == 0 or global_step >= args.max_steps: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) logger.debug("saving models to {}".format(output_dir)) save_persistables(exe, os.path.join(output_dir, "static_vars"), main_program) if global_step <= args.save_steps: model.init_config["init_args"][0].init_config.pop( "topo", None) model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) tic_train = time.time() if global_step >= args.max_steps: eval_fetch = [] if topo.is_last: eval_fetch = [loss] run_evaluate(test_data_loader, exe, test_program, args.test_iters, log_writer, global_step, args, epoch, topo.is_last, eval_fetch, "test") del train_data_loader return reader_start = time.time() epoch += 1
def set_seed(args):
    random.seed(args.seed + paddle.distributed.get_rank())
    np.random.seed(args.seed + paddle.distributed.get_rank())
    paddle.seed(args.seed + paddle.distributed.get_rank())
def setUp(self):
    # enable dygraph mode
    fluid.enable_dygraph()

    # config seed
    paddle.seed(SEED)
    paddle.framework.random._manual_program_seed(SEED)
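# Note on the seeding pattern above (my reading of these tests, not stated in
# the sources): `paddle.seed` fixes the dygraph-side random generators, while
# the internal helper `paddle.framework.random._manual_program_seed` appears to
# also pin the random seed of the default static programs, which several of the
# unit tests in this collection rely on for run-to-run determinism. A minimal
# sketch with an arbitrary, illustrative seed value:
import paddle

SEED = 2021
paddle.seed(SEED)  # dygraph / global generator seed
paddle.framework.random._manual_program_seed(SEED)  # default-program seed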
def main(args):
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir
    # modify config from command
    if args.opt:
        for parameter in args.opt:
            parameter = parameter.strip()
            key, value = parameter.split("=")
            if type(config.get(key)) is int:
                value = int(value)
            if type(config.get(key)) is bool:
                value = (True if value.lower() == "true" else False)
            config[key] = value

    # tools.vars
    use_gpu = config.get("runner.use_gpu", True)
    config["runner.train_data_dir"] = "../../../test_tipc/data/train"
    train_data_dir = config.get("runner.train_data_dir", None)
    epochs = config.get("runner.epochs", None)
    print_interval = config.get("runner.print_interval", None)
    model_save_path = config.get("runner.model_save_path", "model_output")
    model_init_path = config.get("runner.model_init_path", None)
    end_epoch = config.get("runner.infer_end_epoch", 0)
    CE = config.get("runner.CE", False)
    batch_size = config.get("train_batch_size", 1024)

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}"
        .format(use_gpu, train_data_dir, epochs, print_interval,
                model_save_path))
    logger.info("**************common.configs**********")

    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    dy_model = dy_model_class.create_model(config)

    if not CE:
        model_save_path = os.path.join(model_save_path, str(end_epoch - 1))
    load_model(model_init_path, dy_model)

    paddle.enable_static()
    num_nodes = paddle.static.data(
        name='num_nodes', shape=[None], dtype='int32')
    dy_model = paddle.jit.to_static(
        dy_model,
        input_spec=[
            paddle.static.InputSpec(
                shape=[None, 2], dtype='int32', name='edges'),
            paddle.static.InputSpec(
                shape=[None, 1], dtype='float32', name='node_feat'),
            paddle.static.InputSpec(
                shape=[None, 2], dtype='float32', name='edge_feat'),
            paddle.static.InputSpec(
                shape=[None], dtype='int32', name='segment_ids')
        ])
    save_jit_model(dy_model, model_save_path, prefix='tostatic')
import ast
import time

import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.distributed import fleet
from paddle.fluid.dygraph import nn
from paddle.fluid.framework import _test_eager_guard
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler

epoch = 10
paddle.seed(2021)
np.random.seed(2021)
base_lr = 0.1
momentum_rate = 0.9
l2_decay = 1e-4


# A plain three-layer MLP used as the toy model in these sharding examples.
class MLP(fluid.Layer):

    def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
        super(MLP, self).__init__()

        self._linear1 = Linear(linear_size, linear_size)
        self._linear2 = Linear(linear_size, linear_size)
        self._linear3 = Linear(linear_size, 10)

    def forward(self, inputs):
        y = self._linear1(inputs)
        y = self._linear2(y)
        y = self._linear3(y)
        return y
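# A minimal training sketch (an assumption, not the original script) showing
# how the MLP above might be trained with the hyper-parameters defined earlier
# (`base_lr`, `momentum_rate`, `l2_decay`, `epoch`); random data stands in for
# a real dataset.
def train_mlp_sketch():
    model = MLP(linear_size=1000)
    optimizer = paddle.optimizer.Momentum(
        learning_rate=base_lr,
        momentum=momentum_rate,
        weight_decay=paddle.regularizer.L2Decay(l2_decay),
        parameters=model.parameters())
    for _ in range(epoch):
        data = paddle.to_tensor(np.random.rand(16, 1000).astype("float32"))
        label = paddle.to_tensor(np.random.randint(0, 10, (16, 1)))
        logits = model(data)
        loss = paddle.nn.functional.cross_entropy(logits, label)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()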
def do_train(args):
    paddle.set_device(args.device)
    trainer_count = dist.get_world_size()
    rank = dist.get_rank()

    if trainer_count > 1:
        dist.init_parallel_env()

    # Set seed for CE
    random_seed = eval(str(args.random_seed))
    if random_seed is not None:
        paddle.seed(random_seed)

    # Define data loader
    (train_loader), (eval_loader) = reader.create_data_loader(
        args, places=paddle.get_device())

    # Define model
    transformer = SimultaneousTransformer(
        args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
        args.n_layer, args.n_head, args.d_model, args.d_inner_hid,
        args.dropout, args.weight_sharing, args.bos_idx, args.eos_idx,
        args.waitk)
    print('waitk=', args.waitk)

    # Define loss
    criterion = CrossEntropyCriterion(args.label_smooth_eps, args.bos_idx)

    # Define optimizer
    scheduler = paddle.optimizer.lr.NoamDecay(args.d_model, args.warmup_steps,
                                              args.learning_rate)
    optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                      beta1=args.beta1,
                                      beta2=args.beta2,
                                      epsilon=float(args.eps),
                                      parameters=transformer.parameters())

    # Init from some checkpoint, to resume the previous training
    if args.init_from_checkpoint:
        model_dict = paddle.load(
            os.path.join(args.init_from_checkpoint, "transformer.pdparams"))
        opt_dict = paddle.load(
            os.path.join(args.init_from_checkpoint, "transformer.pdopt"))
        transformer.set_state_dict(model_dict)
        optimizer.set_state_dict(opt_dict)
        print("loaded from checkpoint.")

    # Init from some pretrain models, to better solve the current task
    if args.init_from_pretrain_model:
        model_dict = paddle.load(
            os.path.join(args.init_from_pretrain_model,
                         "transformer.pdparams"))
        transformer.set_state_dict(model_dict)
        print("loaded from pre-trained model.")

    if trainer_count > 1:
        transformer = paddle.DataParallel(transformer)

    # The best cross-entropy value with label smoothing
    loss_normalizer = -(
        (1. - args.label_smooth_eps) * np.log(
            (1. - args.label_smooth_eps)) +
        args.label_smooth_eps * np.log(args.label_smooth_eps /
                                       (args.trg_vocab_size - 1) + 1e-20))

    step_idx = 0

    # For logging
    reader_cost_avg = AverageStatistical()
    batch_cost_avg = AverageStatistical()
    batch_ips_avg = AverageStatistical()

    # Train loop
    for pass_id in range(args.epoch):
        epoch_start = time.time()

        batch_id = 0
        batch_start = time.time()
        for input_data in train_loader:
            train_reader_cost = time.time() - batch_start
            (src_word, trg_word, lbl_word) = input_data

            if args.use_amp:
                scaler = paddle.amp.GradScaler(
                    init_loss_scaling=args.scale_loss)
                with paddle.amp.auto_cast():
                    logits = transformer(src_word=src_word, trg_word=trg_word)
                    sum_cost, avg_cost, token_num = criterion(logits, lbl_word)

                scaled_loss = scaler.scale(avg_cost)  # scale the loss
                scaled_loss.backward()  # do backward
                scaler.minimize(optimizer, scaled_loss)  # update parameters
                optimizer.clear_grad()
            else:
                logits = transformer(src_word=src_word, trg_word=trg_word)
                sum_cost, avg_cost, token_num = criterion(logits, lbl_word)

                avg_cost.backward()
                optimizer.step()
                optimizer.clear_grad()

            if args.max_iter and step_idx + 1 == args.max_iter:
                return

            tokens_per_cards = token_num.numpy()
            train_batch_cost = time.time() - batch_start
            reader_cost_avg.record(train_reader_cost)
            batch_cost_avg.record(train_batch_cost)
            batch_ips_avg.record(train_batch_cost, tokens_per_cards)

            if step_idx % args.print_step == 0:
                total_avg_cost = avg_cost.numpy()

                if step_idx == 0:
                    logger.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f " %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)])))
                else:
                    train_avg_batch_cost = args.print_step / batch_cost_avg.get_total_time()
                    logger.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f, avg_speed: %.2f step/sec, "
                        "batch_cost: %.5f sec, reader_cost: %.5f sec, tokens: %d, "
                        "ips: %.5f words/sec" %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)]),
                         train_avg_batch_cost, batch_cost_avg.get_average(),
                         reader_cost_avg.get_average(),
                         batch_ips_avg.get_total_cnt(),
                         batch_ips_avg.get_average_per_sec()))

                reader_cost_avg.reset()
                batch_cost_avg.reset()
                batch_ips_avg.reset()

            if step_idx % args.save_step == 0 and step_idx != 0:
                # Validation
                transformer.eval()
                total_sum_cost = 0
                total_token_num = 0
                with paddle.no_grad():
                    for input_data in eval_loader:
                        (src_word, trg_word, lbl_word) = input_data
                        logits = transformer(src_word=src_word,
                                             trg_word=trg_word)
                        sum_cost, avg_cost, token_num = criterion(
                            logits, lbl_word)
                        total_sum_cost += sum_cost.numpy()
                        total_token_num += token_num.numpy()
                        total_avg_cost = total_sum_cost / total_token_num
                    logger.info(
                        "validation, step_idx: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f" %
                        (step_idx, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)])))
                transformer.train()

                if args.save_model and rank == 0:
                    model_dir = os.path.join(args.save_model,
                                             "step_" + str(step_idx))
                    if not os.path.exists(model_dir):
                        os.makedirs(model_dir)
                    paddle.save(
                        transformer.state_dict(),
                        os.path.join(model_dir, "transformer.pdparams"))
                    paddle.save(
                        optimizer.state_dict(),
                        os.path.join(model_dir, "transformer.pdopt"))

            batch_id += 1
            step_idx += 1
            scheduler.step()
            batch_start = time.time()

        train_epoch_cost = time.time() - epoch_start
        logger.info("train epoch: %d, epoch_cost: %.5f s" %
                    (pass_id, train_epoch_cost))

    if args.save_model and rank == 0:
        model_dir = os.path.join(args.save_model, "step_final")
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        paddle.save(transformer.state_dict(),
                    os.path.join(model_dir, "transformer.pdparams"))
        paddle.save(optimizer.state_dict(),
                    os.path.join(model_dir, "transformer.pdopt"))
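# `AverageStatistical` is used for logging above but defined elsewhere. As a
# hedged sketch, a helper consistent with the methods called (record, reset,
# get_average, get_total_time, get_total_cnt, get_average_per_sec) might look
# like this; it is an assumption, not the original utility.
class AverageStatistical(object):

    def __init__(self):
        self.reset()

    def reset(self):
        self.total_cnt = 0
        self.time = 0.0

    def record(self, val, cnt=1):
        # Accumulate elapsed time and the number of items (steps or tokens).
        self.time += val
        self.total_cnt += cnt

    def get_average(self):
        return self.time / self.total_cnt if self.total_cnt > 0 else 0.0

    def get_average_per_sec(self):
        return self.total_cnt / self.time if self.time > 0.0 else 0.0

    def get_total_cnt(self):
        return self.total_cnt

    def get_total_time(self):
        return self.time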
def test_gnn_float32(self):
    paddle.seed(90)
    paddle.framework.random._manual_program_seed(90)
    startup = fluid.Program()
    main = fluid.Program()
    scope = fluid.core.Scope()

    with new_program_scope(main=main, startup=startup, scope=scope):
        features = fluid.layers.data(name='features',
                                     shape=[1, 100, 50],
                                     dtype='float32',
                                     append_batch_size=False)
        # Use selected rows when it's supported.
        adj = fluid.layers.data(name='adj',
                                shape=[1, 100, 100],
                                dtype='float32',
                                append_batch_size=False)
        labels = fluid.layers.data(name='labels',
                                   shape=[100, 1],
                                   dtype='int64',
                                   append_batch_size=False)

        model = GCN('test_gcn', 50)
        logits = model(features, adj)
        logits = fluid.layers.reshape(logits, logits.shape[1:])
        # In other example, it's nll with log_softmax. However, paddle's
        # log_loss only supports binary classification now.
        loss = fluid.layers.softmax_with_cross_entropy(logits, labels)
        loss = fluid.layers.reduce_sum(loss)
        adam = AdamOptimizer(learning_rate=1e-3)
        adam.minimize(loss)

        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))
        exe.run(startup)
        static_loss = exe.run(feed={
            'features': np.ones([1, 100, 50], dtype=np.float32),
            'adj': np.ones([1, 100, 100], dtype=np.float32),
            'labels': np.ones([100, 1], dtype=np.int64)
        },
                              fetch_list=[loss])[0]

        static_weight = np.array(
            scope.find_var(model.gc.weight.name).get_tensor())

    with fluid.dygraph.guard():
        paddle.seed(90)
        paddle.framework.random._manual_program_seed(90)

        features = np.ones([1, 100, 50], dtype=np.float32)
        # Use selected rows when it's supported.
        adj = np.ones([1, 100, 100], dtype=np.float32)
        labels = np.ones([100, 1], dtype=np.int64)

        model = GCN('test_gcn', 50)
        logits = model(to_variable(features), to_variable(adj))
        logits = fluid.layers.reshape(logits, logits.shape[1:])
        # In other example, it's nll with log_softmax. However, paddle's
        # log_loss only supports binary classification now.
        loss = fluid.layers.softmax_with_cross_entropy(logits,
                                                       to_variable(labels))
        loss = fluid.layers.reduce_sum(loss)
        loss.backward()
        adam = AdamOptimizer(learning_rate=1e-3,
                             parameter_list=model.parameters())
        adam.minimize(loss)
        model.clear_gradients()

        loss_value = loss.numpy()
        model_gc_weight_value = model.gc.weight.numpy()

    with fluid.dygraph.guard():
        paddle.seed(90)
        paddle.framework.random._manual_program_seed(90)

        features2 = np.ones([1, 100, 50], dtype=np.float32)
        # Use selected rows when it's supported.
        adj2 = np.ones([1, 100, 100], dtype=np.float32)
        labels2 = np.ones([100, 1], dtype=np.int64)

        model2 = GCN('test_gcn', 50)
        logits2 = model2(to_variable(features2), to_variable(adj2))
        logits2 = fluid.layers.reshape(logits2, logits2.shape[1:])
        # In other example, it's nll with log_softmax. However, paddle's
        # log_loss only supports binary classification now.
        loss2 = fluid.layers.softmax_with_cross_entropy(
            logits2, to_variable(labels2))
        loss2 = fluid.layers.reduce_sum(loss2)
        loss2.backward()
        adam2 = AdamOptimizer(learning_rate=1e-3,
                              parameter_list=model2.parameters())
        adam2.minimize(loss2)
        model2.clear_gradients()

        loss2_value = loss2.numpy()
        model2_gc_weight_value = model2.gc.weight.numpy()

    self.assertEqual(static_loss, loss_value)
    self.assertTrue(np.allclose(static_weight, model_gc_weight_value))
    self.assertEqual(static_loss, loss2_value)
    self.assertTrue(np.allclose(static_weight, model2_gc_weight_value))
    sys.stderr.write('%s %s\n' % (static_loss, loss_value))
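# The GCN model exercised by the test above is defined elsewhere in the suite.
# Purely for illustration, a minimal graph-convolution layer consistent with
# `model.gc.weight` being an [in_features, out_features] parameter could look
# like the sketch below; it is an assumption, not the tested implementation.
import numpy as np
import paddle


class GraphConvolutionSketch(paddle.nn.Layer):

    def __init__(self, in_features, out_features):
        super(GraphConvolutionSketch, self).__init__()
        self.weight = self.create_parameter(shape=[in_features, out_features],
                                            dtype='float32')

    def forward(self, features, adj):
        # features: [batch, nodes, in_features], adj: [batch, nodes, nodes]
        support = paddle.matmul(features, self.weight)
        return paddle.matmul(adj, support)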