def optimize(self, loss, optimizer_type, lr):
    optimizer = F.optimizer.Adam(learning_rate=lr)
    dist_strategy = DistributedStrategy()
    dist_strategy.enable_sequential_execution = True
    optimizer = cfleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    _, param_grads = optimizer.minimize(loss, F.default_startup_program())
def optimize(self, metrics):
    """ Optimize the model by metrics (mainly `metrics["loss"]`). """
    # TODO: support dygraph
    if self.warmup_steps > 0:
        scheduled_lr = layers.learning_rate_scheduler.noam_decay(
            1 / (self.warmup_steps * (self.learning_rate**2)), self.warmup_steps)
    else:
        scheduled_lr = layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=self.learning_rate,
            dtype="float32",
            persistable=True)
    grad_clip = fluid.clip.GradientClipByGlobalNorm(self.max_grad_norm)
    self.optimizer = AdamW(
        learning_rate=scheduled_lr,
        grad_clip=grad_clip,
        weight_decay=self.weight_decay)

    if self.is_distributed:
        self.optimizer = fleet.distributed_optimizer(
            self.optimizer, strategy=self.dist_strategy)

    self.optimizer.minimize(metrics["loss"])
    return scheduled_lr
def test_open_sync_batch_norm(self):
    import paddle.fluid as fluid
    import paddle.fluid.incubate.fleet.base.role_maker as role_maker
    from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

    if not fluid.core.is_compiled_with_cuda():
        # Operator "gen_nccl_id" has not been registered
        return

    data = fluid.layers.data(name='X', shape=[1], dtype='float32')
    hidden = fluid.layers.fc(input=data, size=10)
    loss = fluid.layers.mean(hidden)

    optimizer = fluid.optimizer.AdamOptimizer()

    role = role_maker.UserDefinedCollectiveRoleMaker(0, ['127.0.0.1:6170'])
    fleet.init(role)

    dist_strategy = DistributedStrategy()
    dist_strategy.sync_batch_norm = True

    dist_optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    dist_optimizer.minimize(loss)

    self.assertEqual(dist_strategy.exec_strategy.num_threads, 1)
def build_program(is_train, main_prog, startup_prog, args, dist_strategy=None):
    image_shape = [int(m) for m in args.image_shape.split(",")]
    model_name = args.model
    model_list = [m for m in dir(models) if "__" not in m]
    assert model_name in model_list, "{} is not in lists: {}".format(args.model, model_list)
    model = models.__dict__[model_name]()
    with fluid.program_guard(main_prog, startup_prog):
        use_mixup = args.use_mixup
        if is_train and use_mixup:
            py_reader = fluid.layers.py_reader(
                capacity=16,
                shapes=[[-1] + image_shape, [-1, 1], [-1, 1], [-1, 1]],
                lod_levels=[0, 0, 0, 0],
                dtypes=["float32", "int64", "int64", "float32"],
                use_double_buffer=True)
        else:
            py_reader = fluid.layers.py_reader(
                capacity=16,
                shapes=[[-1] + image_shape, [-1, 1]],
                lod_levels=[0, 0],
                dtypes=["float32", "int64"],
                use_double_buffer=True)

        with fluid.unique_name.guard():
            if is_train and use_mixup:
                image, y_a, y_b, lam = fluid.layers.read_file(py_reader)
                avg_cost = net_config(image=image, y_a=y_a, y_b=y_b, lam=lam,
                                      model=model, args=args, label=0, is_train=True)
                avg_cost.persistable = True
                build_program_out = [py_reader, avg_cost]
            else:
                image, label = fluid.layers.read_file(py_reader)
                avg_cost, acc_top1, acc_top5 = net_config(image, model, args,
                                                          label=label, is_train=is_train)
                avg_cost.persistable = True
                acc_top1.persistable = True
                acc_top5.persistable = True
                build_program_out = [py_reader, avg_cost, acc_top1, acc_top5]

            if is_train:
                params = model.params
                params["total_images"] = args.total_images
                params["lr"] = args.lr
                params["num_epochs"] = args.num_epochs
                params["learning_strategy"]["batch_size"] = args.batch_size
                params["learning_strategy"]["name"] = args.lr_strategy
                params["l2_decay"] = args.l2_decay
                params["momentum_rate"] = args.momentum_rate

                optimizer = optimizer_setting(params)
                global_lr = optimizer._global_learning_rate()

                if args.fp16:
                    optimizer = fluid.contrib.mixed_precision.decorate(optimizer)

                dist_optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
                _, param_grads = dist_optimizer.minimize(avg_cost)

                global_lr.persistable = True
                build_program_out.append(global_lr)

    return build_program_out
def get_distributed_optimizer(optimizer):
    """
    Get the default collective distributed optimizer under fleet.
    """
    dist_strategy = DistributedStrategy()
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
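# Hedged usage sketch (not from the original sources): how the helper above is
# typically wired into a tiny static-graph network. The layer sizes and learning
# rate are illustrative assumptions, and the script is expected to run under a
# distributed launcher that sets the PaddleCloud trainer environment variables.
import paddle.fluid as fluid

x = fluid.data(name='x', shape=[-1, 2], dtype='float32')
y = fluid.layers.fc(x, size=1)
loss = fluid.layers.reduce_sum(y)

# wrap a plain optimizer with the collective fleet optimizer returned above
sgd = fluid.optimizer.SGD(learning_rate=0.01)
dist_sgd = get_distributed_optimizer(sgd)
dist_sgd.minimize(loss)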
def setup_optimizer(optimizer, use_cuda, is_distributed):
    """ Setup the optimizer """
    if use_cuda:
        if is_distributed:
            dist_strategy = DistributedStrategy()
            optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
def distributed_optimize(optimizer):
    '''
    A part of configuration for distributed training
    '''
    strategy = DistributedStrategy()
    strategy.fuse_all_reduce_ops = True
    strategy.nccl_comm_num = 2
    strategy.fuse_elewise_add_act_ops = True
    strategy.fuse_bn_act_ops = True
    return fleet.distributed_optimizer(optimizer, strategy=strategy)
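# Hedged usage sketch (assumed names): the strategy-tuning helper above only wraps
# the optimizer; minimization and program fetching still follow the usual fleet flow,
# assuming fleet.init(role) has already been called. `avg_cost` is an assumed loss var.
opt = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
opt = distributed_optimize(opt)
opt.minimize(avg_cost)
train_prog = fleet.main_program  # distributed program to pass to exe.run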
def build_program(is_train, main_prog, startup_prog, args, dist_strategy=None,
                  data_layout="NCHW"):
    model_name = args.model
    model_list = [m for m in dir(models) if "__" not in m]
    assert model_name in model_list, "{} is not in lists: {}".format(args.model, model_list)
    model = models.__dict__[model_name]()
    with fluid.program_guard(main_prog, startup_prog):
        use_mixup = args.use_mixup
        data_loader, data = utility.create_data_loader(is_train, args, data_layout=data_layout)
        with fluid.unique_name.guard():
            if is_train and use_mixup:
                image, y_a, y_b, lam = data[0], data[1], data[2], data[3]
                avg_cost = net_config(image=image, y_a=y_a, y_b=y_b, lam=lam,
                                      model=model, args=args, label=0, is_train=True,
                                      data_format=data_layout)
                avg_cost.persistable = True
                build_program_out = [data_loader, avg_cost]
            else:
                image, label = data[0], data[1]
                avg_cost, acc_top1, acc_top5 = net_config(image, model, args, label=label,
                                                          is_train=is_train,
                                                          data_format=data_layout)
                avg_cost.persistable = True
                acc_top1.persistable = True
                acc_top5.persistable = True
                build_program_out = [data_loader, avg_cost, acc_top1, acc_top5]

            if is_train:
                params = model.params
                params["total_images"] = args.total_images
                params["lr"] = args.lr
                params["num_epochs"] = args.num_epochs
                params["learning_strategy"]["batch_size"] = args.batch_size
                params["learning_strategy"]["name"] = args.lr_strategy
                params["l2_decay"] = args.l2_decay
                params["momentum_rate"] = args.momentum_rate
                params["use_dgc"] = args.use_dgc
                params["rampup_begin_step"] = args.rampup_begin_step

                optimizer = optimizer_setting(params)
                global_lr = optimizer._global_learning_rate()

                if args.fp16:
                    optimizer = fluid.contrib.mixed_precision.decorate(
                        optimizer,
                        init_loss_scaling=args.scale_loss,
                        use_dynamic_loss_scaling=args.use_dynamic_loss_scaling)

                if args.use_recompute:
                    dist_strategy.forward_recompute = True
                    dist_strategy.enable_sequential_execution = True
                    dist_strategy.recompute_checkpoints = model.checkpoints

                dist_optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
                _, param_grads = dist_optimizer.minimize(avg_cost)

                global_lr.persistable = True
                build_program_out.append(global_lr)

    return build_program_out
def setup_optimizer(optimizer, model, use_cuda, is_distributed):
    """ Setup the optimizer """
    if use_cuda:
        if is_distributed:
            # assumption: a fresh DistributedStrategy is built here; the original
            # snippet referenced `dist_strategy` from an enclosing scope
            dist_strategy = DistributedStrategy()
            dist_strategy.recompute_checkpoints = model.checkpoints
            optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
        else:
            optimizer = fluid.optimizer.RecomputeOptimizer(optimizer)
            optimizer._set_checkpoints(model.checkpoints)
def test_distributed_basic(self):
    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)
    fs.delete(checker.hdfs_checkpoint_path)
    self._reset_generator()

    logger.info("begin test_distributed_basic")
    fs = LocalFS()
    save_dir = "./run_save_0"
    fs.delete(save_dir)

    # basic
    exe, main_prog, startup_prog = self._generate()
    compiled, data_loader, optimizer, loss, image, label = \
        self._init_env(exe, main_prog, startup_prog, minimize=False)

    # fleet
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    with fluid.program_guard(main_prog, startup_prog):
        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(loss)

    exe.run(startup_prog)

    o = None
    i = 0
    name = None
    for i in acp.train_epoch_range(3, 0):
        o = acp._get_train_epoch_range()
        name = o.name
        logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

        for data in data_loader():
            fetch = exe.run(fleet.main_program, feed=data, fetch_list=[loss])

        self.assertEqual(len(o._exe_status), 1)

    o = acp._get_train_epoch_range()
    assert o is None, "the train epoch range must not exist now"
    self.assertEqual(i, 2)

    fs.delete(save_dir)
    logger.info("end test_distributed_basic")
def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    predict = cnn_model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size_tensor)

    inference_program = fluid.default_main_program().clone()

    # Optimization
    # TODO(typhoonzero): fix distributed adam optimizer
    # opt = fluid.optimizer.AdamOptimizer(
    #     learning_rate=0.001, beta1=0.9, beta2=0.999)
    if not use_dgc:
        opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
    else:
        opt = fluid.optimizer.DGCMomentumOptimizer(
            learning_rate=self.lr, momentum=0.9, rampup_begin_step=0)

    # Reader
    train_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=batch_size)
    test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=batch_size)

    if dist_strategy:
        dist_opt = fleet.distributed_optimizer(optimizer=opt, strategy=dist_strategy)
        _, param_grads = dist_opt.minimize(avg_cost)
    else:
        opt.minimize(avg_cost)

    return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
def run_gpu_fleet_api_trainer(self, args):
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    # 1. enable dygraph
    paddle.disable_static()

    # 2. init seed
    seed = 90
    paddle.static.default_startup_program().random_seed = seed
    paddle.static.default_main_program().random_seed = seed
    np.random.seed(seed)
    random.seed(seed)

    # get trainer id
    args.trainer_id = paddle.distributed.get_rank()

    # 3. init parallel env
    if args.update_method == "nccl2":
        fleet.init(is_collective=True)

    # 4. train model
    model, train_reader, opt = self.get_model()
    if args.update_method == "nccl2":
        opt = fleet.distributed_optimizer(opt)
        model = fleet.distributed_model(model)

    out_losses = []
    for step_id, data in enumerate(train_reader()):
        data = self._get_data(data, args)
        if step_id == RUN_STEP:
            break
        loss = self.run_one_loop(model, opt, data)
        out_losses.append(loss.numpy())

        if args.update_method == "nccl2":
            loss = model.scale_loss(loss)

        loss.backward()

        if args.update_method == "nccl2":
            model.apply_collective_grads()

        opt.step()
        opt.clear_grad()

    print_to_out(out_losses)
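# Condensed hedged sketch of the dygraph collective pattern used in the trainer above.
# `MyLayer` and `batch` are placeholders for a user-defined paddle.nn.Layer and an
# input tensor; the fleet calls mirror the ones already present in that snippet.
import paddle
import paddle.distributed.fleet as fleet

paddle.disable_static()
fleet.init(is_collective=True)

model = MyLayer()                              # assumed user-defined layer
opt = paddle.optimizer.Adam(parameters=model.parameters())
opt = fleet.distributed_optimizer(opt)
model = fleet.distributed_model(model)

loss = model(batch).mean()                     # `batch` is an assumed input tensor
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
opt.step()
opt.clear_grad()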
def dist_optimizer(config, optimizer):
    """
    Create a distributed optimizer based on a normal optimizer

    Args:
        config(dict):
        optimizer(): a normal optimizer

    Returns:
        optimizer: a distributed optimizer
    """
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 3
    exec_strategy.num_iteration_per_drop_scope = 10

    dist_strategy = DistributedStrategy()
    dist_strategy.nccl_comm_num = 1
    dist_strategy.fuse_all_reduce_ops = True
    dist_strategy.exec_strategy = exec_strategy

    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
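# Hedged usage sketch (assumed names): wiring the helper above into a training program,
# assuming fleet.init(role) has already been called. Note that the helper currently only
# documents `config` and does not read it, so an empty dict works for illustration.
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer = dist_optimizer(config={}, optimizer=optimizer)
optimizer.minimize(avg_loss)  # `avg_loss` is an assumed loss variable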
def _test_check_point(self, fs, dir_path):
    file_name = "persistables"

    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
    feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace())
    predict = fluid.layers.fc(input=image, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=predict, label=label)
    avg_loss = fluid.layers.mean(loss)

    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

    dist_optimizer = fleet.distributed_optimizer(optimizer)
    dist_optimizer.minimize(avg_loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    status = TrainStatus(2)
    fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
    n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)

    status2 = fleet.load_check_point(exe, dir_path, trainer_id=0, fs=fs)
    self.assertEqual(status2, status)

    fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
    n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
    self.assertEqual(n2, n1 + 1)

    fleet.clean_redundant_check_points(dir_path, fs=fs)
def set_optimizer(self, FLAGS, net_output):
    """ set optimizer """
    optimizer = net_output['optimizer']
    if self.is_multi_gpu(FLAGS):
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM"))
        trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        logging.info("train_id:%s, num_trainers:%s, trainer_endpoints:%s" %
                     (trainer_id, num_trainers, trainer_endpoints))
        trainer_endpoints = trainer_endpoints.split(',')

        role = role_maker.UserDefinedCollectiveRoleMaker(
            current_id=trainer_id, worker_endpoints=trainer_endpoints)
        fleet.init(role)

        dist_strategy = DistributedStrategy()
        # num_nodes = len(set([x.split(':')[0] for x in trainer_endpoints]))
        # if num_nodes == 1:
        #     dist_strategy.use_local_sgd = True
        # dist_strategy.mode = "collective"  # multi node is nccl2
        # dist_strategy.collective_mode = "local_sgd"  # local_sgd or grad_allreduce
        # logging.info("use local sgd, not nccl2 for single node.")

        """
        #TODO:
        dist_strategy.enable_inplace = FLAGS.with_inplace
        if FLAGS.fuse_ops:
            dist_strategy.fuse_all_reduce_ops = 1
        dist_strategy.nccl_comm_num = FLAGS.nccl_comm_num
        """

        optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)

    return optimizer.minimize(net_output['loss'])
fleet.init(role)

# PE training
main_prog = fluid.Program()
start_prog = fluid.Program()
with fluid.program_guard(main_prog, start_prog):
    x = fluid.data(name='x', shape=[-1, 2], dtype='float32')
    label = fluid.data(name='label', shape=[-1, 1], dtype='float32')
    y = fluid.layers.fc(x, size=1, param_attr=fluid.initializer.Constant(1.0))
    cost = fluid.layers.square_error_cost(y, label)
    loss = fluid.layers.reduce_sum(cost)

    optimizer = fluid.optimizer.SGD(learning_rate=0.0)
    strategy = DistributedStrategy()
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(loss, start_prog)

place = fluid.CUDAPlace(int(os.environ['FLAGS_selected_gpus']))
exe = fluid.Executor(place)
exe.run(start_prog)

train_prog = fleet.main_program

x_data = np.ones(shape=[1, 2], dtype=np.float32)
label_data = np.ones(shape=[1, 1], dtype=np.float32)
out = exe.run(train_prog,
              feed={'x': x_data, 'label': label_data},
              fetch_list=[loss.name])
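# Hedged note: the script above reads FLAGS_selected_gpus and assumes the PaddleCloud
# collective role maker, so each process is expected to be started by a distributed
# launcher. For a single local trainer, the same environment can be set by hand, mirroring
# the checkpoint tests elsewhere in this collection; the port below is illustrative.
import os
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
os.environ["FLAGS_selected_gpus"] = "0"
role = role_maker.PaddleCloudRoleMaker(is_collective=True)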
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_dynamic_loss_scaling=False,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8,
                 dist_strategy=None,
                 use_lamb=False):
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
                .noam_decay(1 / (warmup_steps * (learning_rate ** 2)), warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unknown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        if use_lamb:
            optimizer = fluid.optimizer.LambOptimizer(learning_rate=scheduled_lr)
        else:
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    else:
        scheduled_lr = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        if use_lamb:
            optimizer = fluid.optimizer.LambOptimizer(learning_rate=scheduled_lr)
        else:
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        optimizer._learning_rate_map[fluid.default_main_program()] = scheduled_lr

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))

    def exclude_from_weight_decay(name):
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()
    for param in train_program.global_block().all_parameters():
        param_list[param.name] = param * 1.0
        param_list[param.name].stop_gradient = True

    if dist_strategy is not None:
        # use fleet api
        optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)

    _, param_grads = optimizer.minimize(loss)

    if weight_decay > 0:
        for param, grad in param_grads:
            if exclude_from_weight_decay(param.name):
                continue
            with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                updated_param = param - param_list[
                    param.name] * weight_decay * scheduled_lr
                fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr
def main(args): cfg = XiaoduHiConfig() cfg.scene_sensor_algo = 'yolov4' wae_ndarray = np.load(os.path.join(args.wae_dir, 'raw_wae.npy')) start_epoch = 0 train_program = fluid.Program() startup_program = fluid.Program() with fluid.program_guard(train_program, startup_program): attention_ctrl = AttentionController( inputs_type=args.inputs_type, num_actions=wae_ndarray.shape[0], act_tr_dim=wae_ndarray.shape[1], act_emb_ndarray=wae_ndarray, num_frames=cfg.ob_window_len, tokens_per_frame=cfg.tokens_per_frame, visual_token_dim=cfg.visual_token_dim, model_dim=args.model_dim, num_decoder_blocks=args.num_decoder_blocks, num_heads=args.num_heads, ffn_dim=args.ffn_dim, dropout=args.dropout, normalize_before=args.normalize_before, frame_emb_trainable=args.frame_emb_trainable, trigger_loss_coef=args.trigger_loss_coef, obj_loss_coef=args.obj_loss_coef, act_loss_coef=args.act_loss_coef, use_last_act_loss=args.use_last_act_loss, mode='train') preds = attention_ctrl.predict() test_program = train_program.clone(for_test=True) optimizer = fluid.optimizer.AdamOptimizer( learning_rate=args.lr, regularization=fluid.regularizer.L2Decay( regularization_coeff=0.1)) if args.distributed_training: optimizer = fleet.distributed_optimizer(optimizer) role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) optimizer.minimize(attention_ctrl.loss) if args.distributed_training: place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) else: place = fluid.CUDAPlace(args.gpu) exe = fluid.Executor(place) exe.run(startup_program) if args.inputs_type.startswith('inst_crop') and \ args.inputs_type != 'inst_crop_wo_crop': fluid.io.load_vars( exe, MobileNetV2_Pretrained, main_program=train_program, predicate=lambda v: os.path.exists( os.path.join(MobileNetV2_Pretrained, v.name))) print('Loaded weights from {}'.format(MobileNetV2_Pretrained)) if args.init_params is not None: base = os.path.basename(args.init_params) if base.startswith('epoch_'): start_epoch = int(base[len('epoch_'):]) + 1 tb_state = os.path.join(args.init_params, 'tb_state.txt') if os.path.exists(tb_state): global _update_step global _eval_step with open(tb_state, 'r') as f: update_step, eval_step = f.readline().split(' ') _update_step = int(update_step) _eval_step = int(eval_step) fluid.io.load_vars( exe, args.init_params, main_program=train_program, predicate=lambda v: os.path.exists( os.path.join(args.init_params, v.name))) print('Loaded weights from {}'.format(args.init_params)) if args.distributed_training: train_worker_gpus = [int(os.environ.get('FLAGS_selected_gpus', 0))] test_worker_gpus = train_worker_gpus else: train_worker_gpus = convert_gpu_ids(args.data_worker_gpus_for_train) test_worker_gpus = convert_gpu_ids(args.data_worker_gpus_for_test) if not os.path.exists(args.save): os.makedirs(args.save) if not args.use_decord: train_dataloader = XiaoduHiDataloaderv2( attention_ctrl.feed_list, [place], args.yolov4_model_dir, args.video_tracking_dir, args.train_dataset, full_neg_txt=args.full_neg_train, batch_size=args.bs, num_workers=args.data_workers_for_train, worker_gpus=train_worker_gpus, roi_feat_resolution=cfg.roi_feat_resolution, ob_window_len=cfg.ob_window_len, interval=cfg.interval, tokens_per_frame=cfg.tokens_per_frame, visual_token_dim=cfg.visual_token_dim, augment=False, resample_negs_per_epoch=True) test_dataloader = XiaoduHiDataloaderv2( attention_ctrl.feed_list, [place], args.yolov4_model_dir, args.video_tracking_dir, args.test_dataset, full_neg_txt=args.full_neg_test, batch_size=args.bs, 
num_workers=args.data_workers_for_test, worker_gpus=test_worker_gpus, roi_feat_resolution=cfg.roi_feat_resolution, ob_window_len=cfg.ob_window_len, interval=cfg.interval, tokens_per_frame=cfg.tokens_per_frame, visual_token_dim=cfg.visual_token_dim, augment=False, resample_negs_per_epoch=False, for_test=True) test_dataloader.save_to_txt( os.path.join(args.save, 'eval_data.txt'), dt=200) else: train_dataloader = XiaoduHiDecordLoader( attention_ctrl.feed_list, [place], args.yolov4_model_dir, args.decord_ds_pkl, decord_readers=args.decord_readers, yolov4_detectors=args.decord_detectors, post_workers=args.decord_post_workers, batch_size=args.bs, detector_gpus=train_worker_gpus, roi_feat_resolution=cfg.roi_feat_resolution, tokens_per_frame=cfg.tokens_per_frame, visual_token_dim=cfg.visual_token_dim, for_test=False) test_dataloader = XiaoduHiDecordLoader( attention_ctrl.feed_list, [place], args.yolov4_model_dir, args.decord_ds_pkl, decord_readers=args.decord_readers, yolov4_detectors=args.decord_detectors, post_workers=args.decord_post_workers, batch_size=args.bs, detector_gpus=test_worker_gpus, roi_feat_resolution=cfg.roi_feat_resolution, tokens_per_frame=cfg.tokens_per_frame, visual_token_dim=cfg.visual_token_dim, for_test=True) train_dataloader.start_workers() test_dataloader.start_workers() train_log = os.path.join(args.save, 'loss.csv') eval_log = os.path.join(args.save, 'eval.txt') with open(os.path.join(args.save, 'args.txt'), 'w') as f: f.write(str(args)) tb_writer = SummaryWriter( logdir=os.path.join(args.save, 'logdir'), purge_step=None if _update_step == 0 else _update_step) worker_index = None if not args.distributed_training \ else fleet.worker_index() # if worker_index == 0: # eval_model(exe, test_program, preds, attention_ctrl.act_loss, # test_dataloader, -1, log_file=eval_log, # tb_writer=tb_writer, worker_index=worker_index) for epoch_id in range(start_epoch, args.epochs): print('--------------- Epoch %d ---------------' % epoch_id) train_epoch(exe, train_program, attention_ctrl, train_dataloader, log_file=train_log, tb_writer=tb_writer, worker_index=worker_index) save_dir = os.path.join(args.save, 'epoch_{}'.format(epoch_id)) shutil.rmtree(save_dir, ignore_errors=True) os.mkdir(save_dir) fluid.io.save_params(exe, save_dir, main_program=train_program) if epoch_id > 0 and epoch_id % args.run_eval_after_epochs == 0: eval_model(exe, test_program, preds, attention_ctrl.act_loss, test_dataloader, epoch_id, log_file=eval_log, tb_writer=tb_writer) tb_state = os.path.join(save_dir, 'tb_state.txt') with open(tb_state, 'w') as f: f.write('{} {}'.format(_update_step, _eval_step)) if epoch_id % args.run_eval_after_epochs != 0: eval_model(exe, test_program, preds, attention_ctrl.act_loss, test_dataloader, epoch_id, log_file=eval_log, tb_writer=tb_writer, worker_index=worker_index) train_dataloader.stop_workers() test_dataloader.stop_workers()
def main(): role = role_maker.PaddleCloudRoleMaker(is_collective=True) # new line 3 fleet.init(role) # new line 4 env = os.environ num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 0)) assert num_trainers != 0, "multi-machine training process must be started using distributed.launch..." trainer_id = int(env.get("PADDLE_TRAINER_ID", 0)) # set different seeds for different trainers random.seed(trainer_id) np.random.seed(trainer_id) if FLAGS.enable_ce: random.seed(0) np.random.seed(0) cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version() save_only = getattr(cfg, 'save_prediction_only', False) if save_only: raise NotImplementedError('The config file only support prediction,' ' training stage is not implemented now') main_arch = cfg.architecture assert cfg.use_gpu == True, "GPU must be supported for multi-machine training..." devices_num = fluid.core.get_cuda_device_count() if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') # build program startup_prog = fluid.Program() train_prog = fluid.Program() if FLAGS.enable_ce: startup_prog.random_seed = 1000 train_prog.random_seed = 1000 with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) if FLAGS.fp16: assert (getattr(model.backbone, 'norm_type', None) != 'affine_channel'), \ '--fp16 currently does not support affine channel, ' \ ' please modify backbone settings to use batch norm' with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx: inputs_def = cfg['TrainReader']['inputs_def'] feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(feed_vars) loss = train_fetches['loss'] if FLAGS.fp16: loss *= ctx.get_loss_scale_var() lr = lr_builder() optimizer = optim_builder(lr) dist_strategy = DistributedStrategy() sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' dist_strategy.sync_batch_norm = sync_bn dist_strategy.nccl_comm_num = 1 exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = 3 exec_strategy.num_iteration_per_drop_scope = 30 dist_strategy.exec_strategy = exec_strategy dist_strategy.fuse_all_reduce_ops = True optimizer = fleet.distributed_optimizer( optimizer, strategy=dist_strategy) # new line 5 optimizer.minimize(loss) if FLAGS.fp16: loss /= ctx.get_loss_scale_var() if 'use_ema' in cfg and cfg['use_ema']: global_steps = _decay_step_counter() ema = ExponentialMovingAverage(cfg['ema_decay'], thres_steps=global_steps) ema.update() # parse train fetches train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(cfg.EvalReader, devices_num=1) # When iterable mode, set set_sample_list_generator(eval_reader, place) eval_loader.set_sample_list_generator(eval_reader) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if 
cfg.metric == 'VOC': extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] if cfg.metric == 'WIDERFACE': extra_keys = ['im_id', 'im_shape', 'gt_bbox'] eval_keys, eval_values, eval_cls = parse_fetches( fetches, eval_prog, extra_keys) exe.run(startup_prog) compiled_train_prog = fleet.main_program if FLAGS.eval: compiled_eval_prog = fluid.CompiledProgram(eval_prog) fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel' ignore_params = cfg.finetune_exclude_pretrained_params \ if 'finetune_exclude_pretrained_params' in cfg else [] start_iter = 0 if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() elif cfg.pretrain_weights and fuse_bn and not ignore_params: checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights) elif cfg.pretrain_weights: checkpoint.load_params(exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params) train_reader = create_reader(cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num, cfg, devices_num=devices_num) # When iterable mode, set set_sample_list_generator(train_reader, place) train_loader.set_sample_list_generator(train_reader) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() # if map_type not set, use default 11point, only use in VOC eval map_type = cfg.map_type if 'map_type' in cfg else '11point' train_stats = TrainingStats(cfg.log_iter, train_keys) train_loader.start() start_time = time.time() end_time = time.time() cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) time_stat = deque(maxlen=cfg.log_iter) best_box_ap_list = [0.0, 0] #[map, iter] # use VisualDL to log data if FLAGS.use_vdl: assert six.PY3, "VisualDL requires Python >= 3.5" from visualdl import LogWriter vdl_writer = LogWriter(FLAGS.vdl_log_dir) vdl_loss_step = 0 vdl_mAP_step = 0 for it in range(start_iter, cfg.max_iters): start_time = end_time end_time = time.time() time_stat.append(end_time - start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) outs = exe.run(compiled_train_prog, fetch_list=train_values) stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])} # use vdl-paddle to log loss if FLAGS.use_vdl: if it % cfg.log_iter == 0: for loss_name, loss_value in stats.items(): vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step) vdl_loss_step += 1 train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0 and trainer_id == 0: strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format( it, np.mean(outs[-1]), logs, time_cost, eta) logger.info(strs) # NOTE : profiler tools, used for benchmark if FLAGS.is_profiler and it == 5: profiler.start_profiler("All") elif FLAGS.is_profiler and it == 10: profiler.stop_profiler("total", FLAGS.profiler_path) return if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \ and trainer_id == 0: save_name = str(it) if it != cfg.max_iters - 1 else "model_final" if 'use_ema' in cfg and cfg['use_ema']: exe.run(ema.apply_program) checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name)) if FLAGS.eval: # evaluation resolution = None if 'Mask' in cfg.architecture: resolution = model.mask_head.resolution results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, 
eval_cls, cfg, resolution=resolution) box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, cfg['EvalReader']['dataset']) # use vdl_paddle to log mAP if FLAGS.use_vdl: vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step) vdl_mAP_step += 1 if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = it checkpoint.save(exe, train_prog, os.path.join(save_dir, "best_model")) logger.info("Best test box ap: {}, in iter: {}".format( best_box_ap_list[0], best_box_ap_list[1])) if 'use_ema' in cfg and cfg['use_ema']: exe.run(ema.restore_program) train_loader.reset()
def do_training(self, fleet, args): """ begin training. Args: fleet (Collective): Collective inherited base class Fleet args (ArgumentParser): run args to config dist fleet. Returns: tuple: the value is train losses """ args = parse_args() logging.info(args) gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4)) place = fluid.CUDAPlace(gpu_id) dev_count = 1 exe = fluid.Executor(place) train_program = fluid.Program() startup_program = fluid.Program() args.num_trainers = fleet.worker_num() args.trainer_id = fleet.worker_index() args.run_params = json.loads(args.run_params) dist_strategy = DistributedStrategy() dist_strategy.enable_inplace = args.run_params['enable_inplace'] dist_strategy.fuse_all_reduce_ops = args.run_params[ 'fuse_all_reduce_ops'] dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num'] dist_strategy.use_local_sgd = args.run_params['use_local_sgd'] dist_strategy.mode = args.run_params["mode"] dist_strategy.collective_mode = args.run_params["collective"] with fluid.program_guard(train_program, startup_program): with fluid.unique_name.guard(): sum_cost, avg_cost, predict, token_num, pyreader = transformer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps, ModelHyperParams.bos_idx, use_py_reader=args.use_py_reader, is_test=False) optimizer = fluid.optimizer.SGD(0.003) if args.run_params["fp16"]: optimizer = decorate(optimizer, init_loss_scaling=64.0) optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) optimizer.minimize(avg_cost, startup_program) train_program = fleet.main_program exe.run(startup_program) train_data = prepare_data_generator( args, is_test=False, count=dev_count, pyreader=pyreader, py_reader_provider_wrapper=py_reader_provider_wrapper) loss_normalizer = -( (1. - TrainTaskConfig.label_smooth_eps) * np.log( (1. - TrainTaskConfig.label_smooth_eps)) + TrainTaskConfig.label_smooth_eps * np.log(TrainTaskConfig.label_smooth_eps / (ModelHyperParams.trg_vocab_size - 1) + 1e-20)) step_idx = 0 init_flag = True result_loss = [] result_ppl = [] train_info = [] for pass_id in six.moves.xrange(args.num_epochs): pass_start_time = time.time() if args.use_py_reader: pyreader.start() data_generator = None else: data_generator = train_data() batch_id = 0 while True: try: feed_dict_list = prepare_feed_dict_list( data_generator, init_flag, dev_count) t1 = time.time() outs = exe.run(program=train_program, fetch_list=[sum_cost.name, token_num.name] if step_idx % args.fetch_steps == 0 else [], feed=feed_dict_list) if step_idx % args.fetch_steps == 0: sum_cost_val, token_num_val = np.array( outs[0]), np.array(outs[1]) total_sum_cost = sum_cost_val.sum() total_token_num = token_num_val.sum() total_avg_cost = total_sum_cost / total_token_num result_loss.append(total_avg_cost - loss_normalizer) result_ppl.append( np.exp([min(total_avg_cost, 100)]).item(0)) train_info.append(result_loss) init_flag = False batch_id += 1 step_idx += 1 if batch_id >= 5: break except (StopIteration, fluid.core.EOFException): if args.use_py_reader: pyreader.reset() break train_info = [round(i, 6) for i in train_info[0]] return train_info
def build_program(self, is_train=True, use_parallel_test=False, dist_strategy=None): model_name = self.model_name assert not (is_train and use_parallel_test), \ "is_train and use_parallel_test cannot be set simultaneously." trainer_id = self.trainer_id num_trainers = self.num_trainers image_shape = [int(m) for m in self.image_shape] # model definition model = self.model if model is None: model = resnet.__dict__[model_name](emb_dim=self.emb_dim) main_program = self.train_program if is_train else self.test_program startup_program = self.startup_program with fluid.program_guard(main_program, startup_program): with fluid.unique_name.guard(): image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') emb, loss, prob = model.get_output( input=image, label=label, num_ranks=num_trainers, rank_id=trainer_id, is_train=is_train, num_classes=self.num_classes, loss_type=self.loss_type, param_attr=self.param_attr, bias_attr=self.bias_attr, margin=self.margin, scale=self.scale) acc1 = None acc5 = None if self.loss_type in ["dist_softmax", "dist_arcface"]: if self.calc_train_acc: shard_prob = loss._get_info("shard_prob") prob_all = fluid.layers.collective._c_allgather( shard_prob, nranks=num_trainers, use_calc_stream=True) prob_list = fluid.layers.split( prob_all, dim=0, num_or_sections=num_trainers) prob = fluid.layers.concat(prob_list, axis=1) label_all = fluid.layers.collective._c_allgather( label, nranks=num_trainers, use_calc_stream=True) acc1 = fluid.layers.accuracy(input=prob, label=label_all, k=1) acc5 = fluid.layers.accuracy(input=prob, label=label_all, k=5) else: if self.calc_train_acc: acc1 = fluid.layers.accuracy(input=prob, label=label, k=1) acc5 = fluid.layers.accuracy(input=prob, label=label, k=5) optimizer = None if is_train: # initialize optimizer optimizer = self._get_optimizer() if self.num_trainers > 1: dist_optimizer = fleet.distributed_optimizer( optimizer, strategy=dist_strategy) dist_optimizer.minimize(loss) else: # single card training optimizer.minimize(loss) if "dist" in self.loss_type or self.use_fp16: optimizer = optimizer._optimizer elif use_parallel_test: emb = fluid.layers.collective._c_allgather( emb, nranks=num_trainers, use_calc_stream=True) return emb, loss, acc1, acc5, optimizer
def optimization(loss, warmup_steps, num_train_steps, learning_rate, train_program, startup_prog, weight_decay, scheduler='linear_warmup_decay', use_fp16=False, use_dynamic_loss_scaling=False, init_loss_scaling=1.0, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, incr_ratio=2.0, decr_ratio=0.8, dist_strategy=None): """ optimization """ if warmup_steps > 0: if scheduler == 'noam_decay': scheduled_lr = fluid.layers.learning_rate_scheduler \ .noam_decay(1 / (warmup_steps * (learning_rate ** 2)), warmup_steps) elif scheduler == 'linear_warmup_decay': scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps, num_train_steps) else: raise ValueError("Unkown learning rate scheduler, should be " "'noam_decay' or 'linear_warmup_decay'") optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr, epsilon=1e-06) # optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) else: scheduled_lr = fluid.layers.create_global_var( name=fluid.unique_name.generate("learning_rate"), shape=[1], value=learning_rate, dtype='float32', persistable=True) optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr, epsilon=1e-06) # optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) optimizer._learning_rate_map[ fluid.default_main_program()] = scheduled_lr fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm( clip_norm=1.0)) def exclude_from_weight_decay(name): """ exclude_from_weight_decay """ if name.find("layer_norm") > -1: return True bias_suffix = ["_bias", "_b", ".b_0"] for suffix in bias_suffix: if name.endswith(suffix): return True return False param_list = dict() loss_scaling = fluid.layers.create_global_var( name=fluid.unique_name.generate("loss_scaling"), shape=[1], value=init_loss_scaling, dtype='float32', persistable=True) if use_fp16: loss *= loss_scaling param_grads = optimizer.backward(loss) master_param_grads = create_master_params_grads( param_grads, train_program, startup_prog, loss_scaling) for param, _ in master_param_grads: param_list[param.name] = param * 1.0 param_list[param.name].stop_gradient = True if use_dynamic_loss_scaling: apply_dynamic_loss_scaling(loss_scaling, master_param_grads, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio) optimizer.apply_gradients(master_param_grads) if weight_decay > 0: for param, grad in master_param_grads: if exclude_from_weight_decay(param.name.rstrip(".master")): continue with param.block.program._optimized_guard( [param, grad]), fluid.framework.name_scope("weight_decay"): updated_param = param - param_list[ param.name] * weight_decay * scheduled_lr fluid.layers.assign(output=param, input=updated_param) master_param_to_train_param(master_param_grads, param_grads, train_program) else: for param in train_program.global_block().all_parameters(): param_list[param.name] = param * 1.0 param_list[param.name].stop_gradient = True if dist_strategy is not None: optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) _, param_grads = optimizer.minimize(loss) if weight_decay > 0: for param, grad in param_grads: if exclude_from_weight_decay(param.name): continue with param.block.program._optimized_guard( [param, grad]), fluid.framework.name_scope("weight_decay"): updated_param = param - param_list[ param.name] * weight_decay * scheduled_lr fluid.layers.assign(output=param, input=updated_param) result = collections.OrderedDict() result['scheduled_lr'] = scheduled_lr if use_fp16: result['loss_scaling'] = loss_scaling return result
def train(args): # implement distributed training by fleet use_fleet = True if use_fleet: role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) args.num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) print('-------------', args.num_trainers, args.trainer_id) if args.trainer_id == 0: if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) # parse config config = parse_config(args.config) train_config = merge_configs(config, 'train', vars(args)) print_configs(train_config, 'Train') train_model = models.get_model(args.model_name, train_config, mode='train') # build model startup = fluid.Program() train_prog = fluid.Program() if args.fix_random_seed: startup.random_seed = 1000 train_prog.random_seed = 1000 with fluid.program_guard(train_prog, startup): with fluid.unique_name.guard(): train_model.build_input(use_dataloader=True) train_model.build_model() # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label train_feeds = train_model.feeds() train_fetch_list = train_model.fetches() train_loss = train_fetch_list[0] optimizer = train_model.optimizer() if use_fleet: optimizer = fleet.distributed_optimizer(optimizer) optimizer.minimize(train_loss) train_dataloader = train_model.dataloader() gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) if args.resume: # if resume weights is given, load resume weights directly assert os.path.exists(args.resume + '.pdparams'), \ "Given resume weight dir {}.pdparams not exist.".format(args.resume) fluid.load(train_prog, model_path=args.resume, executor=exe) else: # if not in resume mode, load pretrain weights if args.pretrain: assert os.path.exists(args.pretrain), \ "Given pretrain weight dir {} not exist.".format(args.pretrain) pretrain = args.pretrain or train_model.get_pretrain_weights() if pretrain: train_model.load_pretrain_params(exe, pretrain, train_prog, place) build_strategy = fluid.BuildStrategy() build_strategy.enable_inplace = True if args.model_name in ['CTCN']: build_strategy.enable_sequential_execution = True exec_strategy = fluid.ExecutionStrategy() if use_fleet: compiled_train_prog = fleet.main_program else: compiled_train_prog = fluid.compiler.CompiledProgram( train_prog).with_data_parallel(loss_name=train_loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) # get reader bs_denominator = 1 if args.use_gpu: # check number of GPUs gpus = os.getenv("CUDA_VISIBLE_DEVICES", "") if gpus == "": pass else: gpus = gpus.split(",") num_gpus = len(gpus) assert num_gpus == train_config.TRAIN.num_gpus, \ "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \ "shoud be the same as that " \ "set in {}({})".format( num_gpus, args.config, train_config.TRAIN.num_gpus) bs_denominator = train_config.TRAIN.num_gpus train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size / bs_denominator) train_reader = get_reader(args.model_name.upper(), 'train', train_config) # get metrics train_metrics = get_metrics(args.model_name.upper(), 'train', train_config) epochs = args.epoch or train_model.epoch_num() exe_places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() train_dataloader.set_batch_generator(train_reader, places=place) train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, train_fetch_list, train_metrics, epochs=epochs, log_interval=args.log_interval, 
save_dir=args.save_dir, num_trainers=args.num_trainers, trainer_id=args.trainer_id, save_model_name=args.model_name, fix_random_seed=args.fix_random_seed, is_profiler=args.is_profiler, profiler_path=args.profiler_path)
def main(args): with open(args.config, 'r') as f: config = json.load(f) logging.info('Load data ...') if len(args.dataset.split(',')) > 1: # for large pretraining dataset, ZINC15 and ChEMBL # directly load the processed npz files train_data_list = [] for ds in args.dataset.split(','): # use processed data.npz train_data_list.extend( load_data(os.path.join(args.root, ds, 'processed'))) # dataset = MoleculeDataset( # args.root, ds, # add_symmetry=False, # add_self_loop=False) # data_list = dataset.get_data_list() # processed_dir = os.path.join(args.root, ds, 'processed') # os.makedirs(processed_dir, exist_ok=True) # save_data_list_to_npz( # data_list, os.path.join(processed_dir, 'data.npz')) # logging.info('Processed {}'.format(ds)) # train_data_list.extend(data_list) else: if args.dataset == 'mutag': train_data_list, _ = load_mutag_dataset( os.path.join(args.root, args.dataset, 'raw')) elif args.dataset == 'ptc_mr': train_data_list, _ = load_ptc_mr_dataset( os.path.join(args.root, args.dataset, 'raw')) else: raise ValueError('Unsupported dataset') if args.is_fleet: train_data_list = [ x for i, x in enumerate(train_data_list) if i % fleet.worker_num() == fleet.worker_index() ] logging.info("Data loaded.") logging.info("Train Examples: %s" % len(train_data_list)) sys.stdout.flush() if args.emb_dir is not None: os.makedirs(args.emb_dir, exist_ok=True) train_prog = F.Program() test_prog = F.Program() startup_prog = F.Program() with F.program_guard(train_prog, startup_prog): with F.unique_name.guard(): agent = create_model(args, config) test_prog = train_prog.clone(for_test=True) opt = F.optimizer.Adam(learning_rate=args.lr) if args.is_fleet: dist_strategy = DistributedStrategy() role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) opt = fleet.distributed_optimizer(opt, strategy=dist_strategy) opt.minimize(agent.loss) place = F.CUDAPlace(0) if args.use_cuda else F.CPUPlace() exe = F.Executor(place) exe.run(startup_prog) if (not args.dont_save_emb) and \ (not args.is_fleet or fleet.worker_index() == 0): save_embedding(args, exe, test_prog, agent, train_data_list, -1) for epoch_id in range(args.max_epoch): train(args, exe, train_prog, agent, train_data_list, epoch_id) if not args.is_fleet or fleet.worker_index() == 0: F.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog) if not args.dont_save_emb: save_embedding(args, exe, test_prog, agent, train_data_list, epoch_id)
def _test_checkpoint(self, fs, dir_path): file_name = "persistables" os.environ["TRAINING_ROLE"] = "TRAINER" os.environ["PADDLE_TRAINER_ID"] = "0" os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070" role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace()) predict = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=predict, label=label) avg_loss = fluid.layers.mean(loss) optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001) dist_optimizer = fleet.distributed_optimizer(optimizer) dist_optimizer.minimize(avg_loss) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) status = ExeTrainStatus() status.epoch_no = 2 _, n1 = fleet.save_checkpoint(exe, dir_path, trainer_id=0, train_status=status, fs=fs) status2 = ExeTrainStatus() fleet.load_checkpoint(exe, dir_path, trainer_id=0, fs=fs, train_status=status2) self.assertEqual(status2, status) _, n2 = fleet.save_checkpoint(exe, dir_path, trainer_id=0, train_status=status, fs=fs, remain_all_checkpoint=False) self.assertEqual(n2, n1 + 1) c = CheckpointSaver(fs) cp_nos = c.get_checkpoint_no(dir_path) assert len(cp_nos) == 1 # cleanup all others # unnormal # test remain_all_checkpoint fleet.save_checkpoint(exe, dir_path, trainer_id=0, train_status=status, fs=fs, remain_all_checkpoint=False) # can't save under a file fs = LocalFS() cache_path = "./.load_cache" fs.touch(cache_path) try: fleet.save_checkpoint(exe, dir_path, trainer_id=0, train_status=status, fs=fs, cache_path=cache_path) self.assertFalse(True) except: pass # can't load under a file try: fleet.load_checkpoint(exe, dir_path, trainer_id=0, train_status=status2, fs=fs, cache_path=cache_path) self.assertFalse(True) except: pass fs.delete(cache_path)
def _make_program(self, mode):
    prog = self._progs.get(mode, None)
    if prog is not None:
        return

    prog = self._orig_prog.clone()
    # NOTE: When defining learning rate scheduling in static-graph mode, ops that
    # increase the global step var and calculate the learning rate are prepended
    # into _orig_prog. The test program made by `_orig_prog.clone` would also
    # include these ops, so they must be pruned from the test program; otherwise
    # the global step would be changed during testing.
    if mode != 'train':
        for op in list(prog.global_block().ops):
            prog.global_block()._remove_op(0)
    if mode == 'train' and self.model._optimizer \
            and self.model._optimizer._learning_rate_map:
        # HACK workaround learning rate map issue
        lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
        self.model._optimizer._learning_rate_map[prog] = lr_var

    losses = []
    metrics = []
    with fluid.program_guard(prog, self._startup_prog):
        ins = self.model._inputs
        lbls = self.model._labels if self.model._labels else []
        inputs = [k.forward() for k in to_list(ins)]
        labels = [k.forward() for k in to_list(lbls)]
        self._label_vars[mode] = labels
        outputs = to_list(self.model.forward(*inputs))

        if mode != 'test' and self.model._loss_function:
            losses = self.model._loss_function(outputs, labels)

        if self._nranks > 1 and mode != 'train':
            outputs = [_all_gather(o, self._nranks) for o in outputs]
            if mode != 'test':
                labels = [_all_gather(l, self._nranks) for l in labels]

        if mode != 'test':
            for metric in self.model._metrics:
                metrics.append(to_list(metric.add_metric_op(outputs, labels)))

        if mode == 'train' and self.model._optimizer:
            self._loss_endpoint = fluid.layers.sum(losses)
            if self._nranks > 1:
                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                fleet.init(role)
                dist_strategy = DistributedStrategy()
                dist_strategy.mode = "collective"
                dist_strategy.collective_mode = "grad_allreduce"
                self.model._optimizer = fleet.distributed_optimizer(
                    self.model._optimizer, strategy=dist_strategy)
            self.model._optimizer.minimize(self._loss_endpoint)

    if mode != 'train':
        # clone again to put it in test mode
        prog = prog.clone(for_test=True)

    self._input_vars[mode] = inputs
    self._progs[mode] = prog
    self._endpoints[mode] = {
        "output": outputs,
        "loss": losses,
        "metric": metrics
    }
def optimization(loss, warmup_steps, num_train_steps, learning_rate, train_program, startup_prog, weight_decay, dist_strategy=None, scheduler='linear_warmup_decay', use_fp16=False, loss_scaling=1.0): if use_fp16: print("fp16 is not supported for now, please contact the author") exit() if warmup_steps > 0: if scheduler == 'noam_decay': scheduled_lr = fluid.layers.learning_rate_scheduler\ .noam_decay(1/(warmup_steps *(learning_rate ** 2)), warmup_steps) elif scheduler == 'linear_warmup_decay': scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps, num_train_steps) else: raise ValueError("Unkown learning rate scheduler, should be " "'noam_decay' or 'linear_warmup_decay'") optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) else: optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) scheduled_lr = learning_rate clip_norm_thres = 1.0 # When using mixed precision training, scale the gradient clip threshold # by loss_scaling if use_fp16 and loss_scaling > 1.0: clip_norm_thres *= loss_scaling fluid.clip.set_gradient_clip( clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres)) def exclude_from_weight_decay(name): if name.find("layer_norm") > -1: return True bias_suffix = ["_bias", "_b", ".b_0"] for suffix in bias_suffix: if name.endswith(suffix): return True return False param_list = dict() """ if use_fp16: param_grads = optimizer.backward(loss) master_param_grads = create_master_params_grads( param_grads, train_program, startup_prog, loss_scaling) for param, _ in master_param_grads: param_list[param.name] = param * 1.0 param_list[param.name].stop_gradient = True optimizer.apply_gradients(master_param_grads) if weight_decay > 0: for param, grad in master_param_grads: if exclude_from_weight_decay(param.name.rstrip(".master")): continue with param.block.program._optimized_guard( [param, grad]), fluid.framework.name_scope("weight_decay"): updated_param = param - param_list[ param.name] * weight_decay * scheduled_lr fluid.layers.assign(output=param, input=updated_param) master_param_to_train_param(master_param_grads, param_grads, train_program) """ if use_fp16: pass else: for param in train_program.global_block().all_parameters(): param_list[param.name] = param * 1.0 param_list[param.name].stop_gradient = True #optimizer = fluid.contrib.mixed_precision.decorate(optimizer, # init_loss_scaling=1.0, # use_dynamic_loss_scaling=True) if dist_strategy is not None: optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) _, param_grads = optimizer.minimize(loss) if weight_decay > 0: for param, grad in param_grads: if exclude_from_weight_decay(param.name): continue with param.block.program._optimized_guard( [param, grad]), fluid.framework.name_scope("weight_decay"): updated_param = param - param_list[ param.name] * weight_decay * scheduled_lr fluid.layers.assign(output=param, input=updated_param) return scheduled_lr
def compress(args): shuffle = True if args.ce_test: # set seed seed = 111 paddle.seed(seed) np.random.seed(seed) random.seed(seed) args.num_workers = 0 shuffle = False env = os.environ num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1)) use_data_parallel = num_trainers > 1 if use_data_parallel: # Fleet step 1: initialize the distributed environment role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) train_reader = None test_reader = None if args.data == "mnist": transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = paddle.vision.datasets.MNIST( mode='train', backend="cv2", transform=transform) val_dataset = paddle.vision.datasets.MNIST( mode='test', backend="cv2", transform=transform) class_dim = 10 image_shape = "1,28,28" args.pretrained_model = False elif args.data == "cifar10": transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = paddle.vision.datasets.Cifar10( mode="train", backend="cv2", transform=transform) val_dataset = paddle.vision.datasets.Cifar10( mode="test", backend="cv2", transform=transform) class_dim = 10 image_shape = "3, 32, 32" args.pretrained_model = False elif args.data == "imagenet": import imagenet_reader as reader train_dataset = reader.ImageNetDataset(mode='train') val_dataset = reader.ImageNetDataset(mode='val') class_dim = 1000 image_shape = "3,224,224" else: raise ValueError("{} is not supported.".format(args.data)) image_shape = [int(m) for m in image_shape.split(",")] assert args.model in model_list, "{} is not in lists: {}".format(args.model, model_list) if args.use_gpu: places = paddle.static.cuda_places() else: places = paddle.static.cpu_places() place = places[0] exe = paddle.static.Executor(place) image = paddle.static.data( name='image', shape=[None] + image_shape, dtype='float32') label = paddle.static.data(name='label', shape=[None, 1], dtype='int64') batch_size_per_card = args.batch_size batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=batch_size_per_card, shuffle=shuffle, drop_last=True) train_loader = paddle.io.DataLoader( train_dataset, places=place, batch_sampler=batch_sampler, feed_list=[image, label], return_list=False, use_shared_memory=True, num_workers=args.num_workers) valid_loader = paddle.io.DataLoader( val_dataset, places=place, feed_list=[image, label], drop_last=False, return_list=False, use_shared_memory=True, batch_size=args.batch_size_for_validation, shuffle=False) step_per_epoch = int( np.ceil(len(train_dataset) * 1. / args.batch_size / num_trainers)) # model definition model = models.__dict__[args.model]() out = model.net(input=image, class_dim=class_dim) if args.data == 'cifar10': label = paddle.reshape(label, [-1, 1]) cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label) avg_cost = paddle.mean(x=cost) acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1) acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5) val_program = paddle.static.default_main_program().clone(for_test=True) opt, learning_rate = create_optimizer(args, step_per_epoch) # Fleet step 2: distributed strategy if use_data_parallel: dist_strategy = DistributedStrategy() dist_strategy.sync_batch_norm = False dist_strategy.exec_strategy = paddle.static.ExecutionStrategy() dist_strategy.fuse_all_reduce_ops = False train_program = paddle.static.default_main_program() if args.pruning_strategy == 'gmp': # GMP pruner step 0: define configs for GMP, no need to define configs for the base training. 
        configs = {
            'stable_iterations': args.stable_epochs * step_per_epoch,
            'pruning_iterations': args.pruning_epochs * step_per_epoch,
            'tunning_iterations': args.tunning_epochs * step_per_epoch,
            'resume_iteration': (args.last_epoch + 1) * step_per_epoch,
            'pruning_steps': args.pruning_steps,
            'initial_ratio': args.initial_ratio,
        }
    elif args.pruning_strategy == 'base':
        configs = None

    # GMP pruner step 1: initialize a pruner object by calling entry function.
    pruner = create_unstructured_pruner(
        train_program, args, place, configs=configs)

    if use_data_parallel:
        # Fleet step 3: decorate the original optimizer and minimize it
        opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
    opt.minimize(avg_cost, no_grad_set=pruner.no_grad_set)

    exe.run(paddle.static.default_startup_program())

    if args.last_epoch > -1:
        assert args.checkpoint is not None and os.path.exists(
            args.checkpoint), "Please specify a valid checkpoint path."
        paddle.fluid.io.load_persistables(
            executor=exe, dirname=args.checkpoint, main_program=train_program)
    elif args.pretrained_model:
        assert os.path.exists(
            args.pretrained_model
        ), "Pretrained model path {} doesn't exist".format(
            args.pretrained_model)

        def if_exist(var):
            return os.path.exists(
                os.path.join(args.pretrained_model, var.name))

        _logger.info("Load pretrained model from {}".format(
            args.pretrained_model))
        # NOTE: fluid.io.load_vars() is used because the pretrained model comes
        # from an older version which requires this API. Please consider using
        # paddle.static.load(program, model_path) when possible.
        paddle.fluid.io.load_vars(
            exe, args.pretrained_model, predicate=if_exist)

    def test(epoch, program):
        acc_top1_ns = []
        acc_top5_ns = []

        _logger.info(
            "The current sparsity of the inference model is {}%".format(
                round(100 * UnstructuredPruner.total_sparse(
                    paddle.static.default_main_program()), 2)))
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program, feed=data, fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
                    format(epoch, batch_id,
                           np.mean(acc_top1_n),
                           np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info(
            "Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
                epoch,
                np.mean(np.array(acc_top1_ns)),
                np.mean(np.array(acc_top5_ns))))

    def train(epoch, program):
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            # GMP pruner step 2: call step() to update the ratio and other
            # internal states of the pruner.
            pruner.step()

            train_run_cost += time.time() - train_start
            total_samples += args.batch_size
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch, batch_id,
                           learning_rate.get_lr(), loss_n, acc_top1_n,
                           acc_top5_n, train_reader_cost / args.log_period,
                           (train_reader_cost + train_run_cost) /
                           args.log_period, total_samples / args.log_period,
                           total_samples /
                           (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            learning_rate.step()
            reader_start = time.time()

    if use_data_parallel:
        # Fleet step 4: get the compiled program from fleet
        compiled_train_program = fleet.main_program
    else:
        compiled_train_program = paddle.static.CompiledProgram(
            paddle.static.default_main_program())

    for i in range(args.last_epoch + 1, args.num_epochs):
        train(i, compiled_train_program)
        # GMP pruner step 3: update the parameters before summarizing sparsity,
        # saving the model or running evaluation.
        pruner.update_params()

        _logger.info(
            "The current sparsity of the pruned model is: {}%".format(
                round(100 * UnstructuredPruner.total_sparse(
                    paddle.static.default_main_program()), 2)))

        if (i + 1) % args.test_period == 0:
            test(i, val_program)
        if (i + 1) % args.model_period == 0:
            if use_data_parallel:
                fleet.save_persistables(executor=exe, dirname=args.model_path)
            else:
                paddle.fluid.io.save_persistables(
                    executor=exe, dirname=args.model_path)
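# Illustrative sketch (not part of the original sources): one common reading of the
# GMP configs built in compress() -- a stable phase with no pruning, a pruning phase
# that ramps sparsity from 'initial_ratio' towards a target, and a tuning phase that
# holds the final ratio. The cubic ramp follows the usual Zhu & Gupta formulation;
# the actual pruner may quantize it into 'pruning_steps' discrete jumps and supplies
# its own target ratio, so treat this only as a reference curve. 'target_ratio' is a
# hypothetical parameter introduced for the sketch.
def gmp_ratio_at(iteration, configs, target_ratio):
    stable = configs['stable_iterations']
    pruning = configs['pruning_iterations']
    initial = configs['initial_ratio']
    if iteration < stable:
        return 0.0  # stable phase: train densely
    if iteration < stable + pruning:
        progress = (iteration - stable) / float(pruning)
        # cubic interpolation from initial_ratio up to target_ratio
        return target_ratio + (initial - target_ratio) * (1.0 - progress)**3
    return target_ratio  # tuning (fine-tuning) phase: sparsity is frozen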
def net(self, args=None):
    """
    ResNet50 training network configured for distributed fleet runs.

    Args:
        args (ArgumentParser): run args used to configure the distributed fleet.

    Returns:
        tuple: the return value contains avg_cost, py_reader
    """
    from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
    from thirdparty.image_classfication.models.resnet import ResNet50
    from thirdparty.image_classfication.train import parser
    from thirdparty.image_classfication.train import optimizer_setting
    parser.add_argument(
        '--update_method',
        type=str,
        required=True,
        choices=['pserver', 'nccl'])
    parser.add_argument(
        '--role', type=str, required=True, choices=['pserver', 'trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument('--current_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    # parser.add_argument('--sync_mode', action='store_true')
    parser.add_argument(
        '--run_params', type=str, required=False, default='{}')
    args = parser.parse_args()
    args.run_params = json.loads(args.run_params)

    image_shape = [3, 224, 224]
    scale_loss = 1.0
    self.py_reader = fluid.layers.py_reader(
        capacity=16,
        shapes=[[-1] + image_shape, [-1, 1]],
        lod_levels=[0, 0],
        dtypes=["float32", "int64"],
        use_double_buffer=True)
    image, label = fluid.layers.read_file(self.py_reader)
    run_model = ResNet50()
    out = run_model.net(image, 4)
    softmax_out = fluid.layers.softmax(out, use_cudnn=False)
    cost, prob = fluid.layers.softmax_with_cross_entropy(
        out, label, return_softmax=True)
    self.avg_cost = fluid.layers.mean(cost)

    params = run_model.params
    params["total_images"] = args.total_images
    params["lr"] = 1e-5
    params["num_epochs"] = args.num_epochs
    params["learning_strategy"]["batch_size"] = args.batch_size
    params["learning_strategy"]["name"] = args.lr_strategy
    params["l2_decay"] = args.l2_decay
    params["momentum_rate"] = args.momentum_rate
    optimizer = optimizer_setting(params)
    global_lr = optimizer._global_learning_rate()
    global_lr.persistable = True

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1
    exec_strategy.num_iteration_per_drop_scope = 30
    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params['fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]

    if args.run_params["fp16"]:
        optimizer = fluid.contrib.mixed_precision.decorate(
            optimizer, init_loss_scaling=128.0, use_dynamic_loss_scaling=True)
    if "use_dgc" in args.run_params and args.run_params["use_dgc"]:
        # DGC requires fused all-reduce ops to be disabled.
        dist_strategy.fuse_all_reduce_ops = False
        optimizer = fluid.optimizer.DGCMomentumOptimizer(
            learning_rate=0.001, momentum=0.9, rampup_begin_step=0)

    dist_optimizer = fleet.distributed_optimizer(
        optimizer, strategy=dist_strategy)
    _, param_grads = dist_optimizer.minimize(self.avg_cost)

    shuffle_seed = 1
    train_reader = reader.train(
        settings=args, data_dir=DATA_DIR, pass_id_as_seed=shuffle_seed)
    self.py_reader.decorate_paddle_reader(
        paddle.batch(train_reader, batch_size=self.batch_size))
    if scale_loss > 1:
        avg_cost = fluid.layers.mean(x=cost) * scale_loss
    return self.avg_cost, self.py_reader
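# Illustrative sketch (not part of the original sources): the keys that net() above
# reads from the --run_params JSON string. The key set is exactly what the snippet
# indexes ('use_dgc' is optional); the values below are only example settings for a
# collective run, not recorded defaults.
import json

example_run_params = json.dumps({
    "enable_inplace": True,          # DistributedStrategy.enable_inplace
    "fuse_all_reduce_ops": True,     # DistributedStrategy.fuse_all_reduce_ops
    "nccl_comm_num": 1,              # DistributedStrategy.nccl_comm_num
    "use_local_sgd": False,          # DistributedStrategy.use_local_sgd
    "mode": "collective",            # DistributedStrategy.mode
    "collective": "grad_allreduce",  # DistributedStrategy.collective_mode
    "fp16": False,                   # wrap optimizer with mixed_precision.decorate
    "use_dgc": False,                # optional: switch to DGCMomentumOptimizer
})
# e.g. --update_method nccl --role trainer --run_params "$RUN_PARAMS"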
def train(args):
    """Distributed training entry point for the Transformer model."""
    logging.info(args)
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)
    dev_count = 1

    exe = fluid.Executor(place)

    train_program = fluid.Program()
    startup_program = fluid.Program()

    # For distributed training.
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    args.num_trainers = fleet.worker_num()
    args.trainer_id = fleet.worker_index()
    dist_strategy = DistributedStrategy()

    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)

            optimizer = None
            if args.sync:
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate
                optimizer = fluid.optimizer.Adam(
                    learning_rate=learning_rate,
                    beta1=TrainTaskConfig.beta1,
                    beta2=TrainTaskConfig.beta2,
                    epsilon=TrainTaskConfig.eps)
            else:
                optimizer = fluid.optimizer.SGD(0.003)
            if args.use_fp16:
                optimizer = decorate(
                    optimizer, init_loss_scaling=args.loss_scaling)
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=dist_strategy)
            optimizer.minimize(avg_cost, startup_program)

    train_program = fleet.main_program
    orig_train_program = fleet._origin_program
    train_loop(args, exe, train_program, orig_train_program, startup_program,
               dev_count, sum_cost, avg_cost, token_num, predict, pyreader)
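# Illustrative sketch (not part of the original sources): the Noam schedule that
# noam_decay(d_model, warmup_steps) implements, from "Attention Is All You Need":
#     lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# train() above multiplies this curve by TrainTaskConfig.learning_rate, while the
# earlier optimization() helper passes d_model = 1 / (warmup_steps * lr^2), which
# makes the curve peak at exactly lr after warmup_steps steps.
def noam_lr(step, d_model, warmup_steps):
    """Pure-Python value of the Noam learning-rate curve at a given step."""
    return d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)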