def main(): env = os.environ FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env if FLAGS.dist: trainer_id = int(env['PADDLE_TRAINER_ID']) import random local_seed = (99 + trainer_id) random.seed(local_seed) np.random.seed(local_seed) cfg = load_config(FLAGS.config) if 'architecture' in cfg: main_arch = cfg.architecture else: raise ValueError("'architecture' not specified in config file.") merge_config(FLAGS.opt) if 'log_iter' not in cfg: cfg.log_iter = 20 # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version() if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int(os.environ.get('CPU_NUM', 1)) if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') # add NAS config = ([(cfg.search_space)]) server_address = (cfg.server_ip, cfg.server_port) load_checkpoint = FLAGS.resume_checkpoint if FLAGS.resume_checkpoint else None sa_nas = SANAS(config, server_addr=server_address, init_temperature=cfg.init_temperature, reduce_rate=cfg.reduce_rate, search_steps=cfg.search_steps, save_checkpoint=cfg.save_dir, load_checkpoint=load_checkpoint, is_server=cfg.is_server) start_iter = 0 train_reader = create_reader(cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num, cfg) eval_reader = create_reader(cfg.EvalReader) constraint = create('Constraint') for step in range(cfg.search_steps): logger.info('----->>> search step: {} <<<------'.format(step)) archs = sa_nas.next_archs()[0] # build program startup_prog = fluid.Program() train_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) if FLAGS.fp16: assert (getattr(model.backbone, 'norm_type', None) != 'affine_channel'), \ '--fp16 currently does not support affine channel, ' \ ' please modify backbone settings to use batch norm' with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx: inputs_def = cfg['TrainReader']['inputs_def'] feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = archs(feed_vars, 'train', cfg) loss = train_fetches['loss'] if FLAGS.fp16: loss *= ctx.get_loss_scale_var() lr = lr_builder() optimizer = optim_builder(lr) optimizer.minimize(loss) if FLAGS.fp16: loss /= ctx.get_loss_scale_var() current_constraint = constraint.compute_constraint(train_prog) logger.info('current steps: {}, constraint {}'.format( step, current_constraint)) if (constraint.max_constraint != None and current_constraint > constraint.max_constraint) or ( constraint.min_constraint != None and current_constraint < constraint.min_constraint): continue # parse train fetches train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = archs(feed_vars, 'eval', cfg) eval_prog = eval_prog.clone(True) eval_loader.set_sample_list_generator(eval_reader, place) extra_keys = ['im_id', 'im_shape', 'gt_bbox'] eval_keys, eval_values, eval_cls = parse_fetches( fetches, eval_prog, extra_keys) # compile program for multi-devices build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_optimizer_ops = False build_strategy.fuse_elewise_add_act_ops = True exec_strategy = fluid.ExecutionStrategy() # iteration number when CompiledProgram tries to drop local execution scopes. # Set it to be 1 to save memory usages, so that unused variables in # local execution scopes can be deleted after each iteration. exec_strategy.num_iteration_per_drop_scope = 1 if FLAGS.dist: dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog, train_prog) exec_strategy.num_threads = 1 exe.run(startup_prog) compiled_train_prog = fluid.CompiledProgram( train_prog).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) if FLAGS.eval: compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog) train_loader.set_sample_list_generator(train_reader, place) train_stats = TrainingStats(cfg.log_smooth_window, train_keys) train_loader.start() end_time = time.time() cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) time_stat = deque(maxlen=cfg.log_smooth_window) ap = 0 for it in range(start_iter, cfg.max_iters): start_time = end_time end_time = time.time() time_stat.append(end_time - start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) outs = exe.run(compiled_train_prog, fetch_list=train_values) stats = { k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1]) } train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0): strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format( it, np.mean(outs[-1]), logs, time_cost, eta) logger.info(strs) if (it > 0 and it == cfg.max_iters - 1) and (not FLAGS.dist or trainer_id == 0): save_name = str( it) if it != cfg.max_iters - 1 else "model_final" checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name)) if FLAGS.eval: # evaluation results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls) ap = calculate_ap_py(results) train_loader.reset() eval_loader.reset() logger.info('rewards: ap is {}'.format(ap)) sa_nas.reward(float(ap)) current_best_tokens = sa_nas.current_info()['best_tokens'] logger.info("All steps end, the best BlazeFace-NAS structure is: ") sa_nas.tokens2arch(current_best_tokens)
def main(): cfg = load_config(FLAGS.config) if 'architecture' in cfg: main_arch = cfg.architecture else: raise ValueError("'architecture' not specified in config file.") merge_config(FLAGS.opt) if 'log_iter' not in cfg: cfg.log_iter = 20 # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) if 'train_feed' not in cfg: train_feed = create(main_arch + 'TrainFeed') else: train_feed = create(cfg.train_feed) if FLAGS.eval: if 'eval_feed' not in cfg: eval_feed = create(main_arch + 'EvalFeed') else: eval_feed = create(cfg.eval_feed) place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') # build program startup_prog = fluid.Program() train_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) train_pyreader, feed_vars = create_feed(train_feed) train_fetches = model.train(feed_vars) loss = train_fetches['loss'] lr = lr_builder() optimizer = optim_builder(lr) optimizer.minimize(loss) train_reader = create_reader(train_feed, cfg.max_iters * devices_num, FLAGS.dataset_dir) train_pyreader.decorate_sample_list_generator(train_reader, place) # parse train fetches train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) eval_pyreader, feed_vars = create_feed(eval_feed) fetches = model.eval(feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir) eval_pyreader.decorate_sample_list_generator(eval_reader, place) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg.metric == 'VOC': extra_keys = ['gt_box', 'gt_label', 'is_difficult'] eval_keys, eval_values, eval_cls = parse_fetches( fetches, eval_prog, extra_keys) # compile program for multi-devices build_strategy = fluid.BuildStrategy() build_strategy.memory_optimize = False build_strategy.enable_inplace = False sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' # only enable sync_bn in multi GPU devices build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ and cfg.use_gpu train_compile_program = fluid.compiler.CompiledProgram( train_prog).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy) if FLAGS.eval: eval_compile_program = fluid.compiler.CompiledProgram(eval_prog) exe.run(startup_prog) fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel' start_iter = 0 if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() elif cfg.pretrain_weights and fuse_bn: checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights) elif cfg.pretrain_weights: checkpoint.load_pretrain(exe, train_prog, cfg.pretrain_weights) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() train_stats = TrainingStats(cfg.log_smooth_window, train_keys) train_pyreader.start() start_time = time.time() end_time = time.time() cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) time_stat = deque(maxlen=cfg.log_iter) best_box_ap_list = [0.0, 0] #[map, iter] for it in range(start_iter, cfg.max_iters): start_time = end_time end_time = time.time() time_stat.append(end_time - start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) outs = exe.run(train_compile_program, fetch_list=train_values) stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])} train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0: strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format( it, np.mean(outs[-1]), logs, time_cost, eta) logger.info(strs) if it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1: save_name = str(it) if it != cfg.max_iters - 1 else "model_final" checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name)) if FLAGS.eval: # evaluation results = eval_run(exe, eval_compile_program, eval_pyreader, eval_keys, eval_values, eval_cls) resolution = None if 'mask' in results[0]: resolution = model.mask_head.resolution box_ap_stats = eval_results(results, eval_feed, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval) if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = it checkpoint.save(exe, train_prog, os.path.join(save_dir, "best_model")) logger.info("Best test box ap: {}, in iter: {}".format( best_box_ap_list[0], best_box_ap_list[1])) train_pyreader.reset()
def main(): env = os.environ FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env if FLAGS.dist: trainer_id = int(env['PADDLE_TRAINER_ID']) local_seed = (99 + trainer_id) random.seed(local_seed) np.random.seed(local_seed) if FLAGS.enable_ce: random.seed(0) np.random.seed(0) cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version() save_only = getattr(cfg, 'save_prediction_only', False) if save_only: raise NotImplementedError('The config file only support prediction,' ' training stage is not implemented now') main_arch = cfg.architecture if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int(os.environ.get('CPU_NUM', 1)) if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') # build program startup_prog = fluid.Program() train_prog = fluid.Program() if FLAGS.enable_ce: startup_prog.random_seed = 1000 train_prog.random_seed = 1000 with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) if FLAGS.fp16: assert (getattr(model.backbone, 'norm_type', None) != 'affine_channel'), \ '--fp16 currently does not support affine channel, ' \ ' please modify backbone settings to use batch norm' with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx: inputs_def = cfg['TrainReader']['inputs_def'] feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(feed_vars) loss = train_fetches['loss'] if FLAGS.fp16: loss *= ctx.get_loss_scale_var() lr = lr_builder() optimizer = optim_builder(lr) optimizer.minimize(loss) if FLAGS.fp16: loss /= ctx.get_loss_scale_var() if 'use_ema' in cfg and cfg['use_ema']: global_steps = _decay_step_counter() ema = ExponentialMovingAverage( cfg['ema_decay'], thres_steps=global_steps) ema.update() # parse train fetches train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(cfg.EvalReader, devices_num=1) eval_loader.set_sample_list_generator(eval_reader, place) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg.metric == 'VOC': extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] if cfg.metric == 'WIDERFACE': extra_keys = ['im_id', 'im_shape', 'gt_bbox'] eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog, extra_keys) # compile program for multi-devices build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_optimizer_ops = False # only enable sync_bn in multi GPU devices sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ and cfg.use_gpu exec_strategy = fluid.ExecutionStrategy() # iteration number when CompiledProgram tries to drop local execution scopes. # Set it to be 1 to save memory usages, so that unused variables in # local execution scopes can be deleted after each iteration. exec_strategy.num_iteration_per_drop_scope = 1 if FLAGS.dist: dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog, train_prog) exec_strategy.num_threads = 1 exe.run(startup_prog) compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) if FLAGS.eval: compiled_eval_prog = fluid.CompiledProgram(eval_prog) fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel' ignore_params = cfg.finetune_exclude_pretrained_params \ if 'finetune_exclude_pretrained_params' in cfg else [] start_iter = 0 if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() elif cfg.pretrain_weights and fuse_bn and not ignore_params: checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights) elif cfg.pretrain_weights: checkpoint.load_params( exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params) train_reader = create_reader( cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num, cfg, devices_num=devices_num) train_loader.set_sample_list_generator(train_reader, place) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() # if map_type not set, use default 11point, only use in VOC eval map_type = cfg.map_type if 'map_type' in cfg else '11point' train_stats = TrainingStats(cfg.log_smooth_window, train_keys) train_loader.start() start_time = time.time() end_time = time.time() cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) time_stat = deque(maxlen=cfg.log_smooth_window) best_box_ap_list = [0.0, 0] #[map, iter] # use VisualDL to log data if FLAGS.use_vdl: from visualdl import LogWriter vdl_writer = LogWriter(FLAGS.vdl_log_dir) vdl_loss_step = 0 vdl_mAP_step = 0 for it in range(start_iter, cfg.max_iters): start_time = end_time end_time = time.time() time_stat.append(end_time - start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) outs = exe.run(compiled_train_prog, fetch_list=train_values) stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])} # use vdl-paddle to log loss if FLAGS.use_vdl: if it % cfg.log_iter == 0: for loss_name, loss_value in stats.items(): vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step) vdl_loss_step += 1 train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0): strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format( it, np.mean(outs[-1]), logs, time_cost, eta) logger.info(strs) # NOTE : profiler tools, used for benchmark if FLAGS.is_profiler and it == 5: profiler.start_profiler("All") elif FLAGS.is_profiler and it == 10: profiler.stop_profiler("total", FLAGS.profiler_path) return if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \ and (not FLAGS.dist or trainer_id == 0): save_name = str(it) if it != cfg.max_iters - 1 else "model_final" if 'use_ema' in cfg and cfg['use_ema']: exe.run(ema.apply_program) checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name)) if FLAGS.eval: # evaluation resolution = None if 'Mask' in cfg.architecture: resolution = model.mask_head.resolution results = eval_run( exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls, cfg, resolution=resolution) box_ap_stats = eval_results( results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, cfg['EvalReader']['dataset']) # use vdl_paddle to log mAP if FLAGS.use_vdl: vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step) vdl_mAP_step += 1 if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = it checkpoint.save(exe, train_prog, os.path.join(save_dir, "best_model")) logger.info("Best test box ap: {}, in iter: {}".format( best_box_ap_list[0], best_box_ap_list[1])) if 'use_ema' in cfg and cfg['use_ema']: exe.run(ema.restore_program) train_loader.reset()
def main(): env = os.environ cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) main_arch = cfg.architecture if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int(os.environ.get('CPU_NUM', 1)) if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) # build program model = create(main_arch) inputs_def = cfg['TrainReader']['inputs_def'] train_feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(train_feed_vars) loss = train_fetches['loss'] start_iter = 0 train_reader = create_reader(cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num, cfg) train_loader.set_sample_list_generator(train_reader, place) # get all student variables student_vars = [] for v in fluid.default_main_program().list_vars(): try: student_vars.append((v.name, v.shape)) except: pass # uncomment the following lines to print all student variables # print("="*50 + "student_model_vars" + "="*50) # print(student_vars) eval_prog = fluid.Program() with fluid.program_guard(eval_prog, fluid.default_startup_program()): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] test_feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(test_feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(cfg.EvalReader) eval_loader.set_sample_list_generator(eval_reader, place) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg.metric == 'VOC': extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog, extra_keys) teacher_cfg = load_config(FLAGS.teacher_config) merge_config(FLAGS.opt) teacher_arch = teacher_cfg.architecture teacher_program = fluid.Program() teacher_startup_program = fluid.Program() with fluid.program_guard(teacher_program, teacher_startup_program): with fluid.unique_name.guard(): teacher_feed_vars = OrderedDict() for name, var in train_feed_vars.items(): teacher_feed_vars[name] = teacher_program.global_block( )._clone_variable(var, force_persistable=False) model = create(teacher_arch) train_fetches = model.train(teacher_feed_vars) teacher_loss = train_fetches['loss'] # get all teacher variables teacher_vars = [] for v in teacher_program.list_vars(): try: teacher_vars.append((v.name, v.shape)) except: pass # uncomment the following lines to print all teacher variables # print("="*50 + "teacher_model_vars" + "="*50) # print(teacher_vars) exe.run(teacher_startup_program) assert FLAGS.teacher_pretrained, "teacher_pretrained should be set" checkpoint.load_params(exe, teacher_program, FLAGS.teacher_pretrained) teacher_program = teacher_program.clone(for_test=True) cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) data_name_map = { 'target0': 'target0', 'target1': 'target1', 'target2': 'target2', 'image': 'image', 'gt_bbox': 'gt_bbox', 'gt_class': 'gt_class', 'gt_score': 'gt_score' } merge(teacher_program, fluid.default_main_program(), data_name_map, place) yolo_output_names = [ 'strided_slice_0.tmp_0', 'strided_slice_1.tmp_0', 'strided_slice_2.tmp_0', 'strided_slice_3.tmp_0', 'strided_slice_4.tmp_0', 'transpose_0.tmp_0', 'strided_slice_5.tmp_0', 'strided_slice_6.tmp_0', 'strided_slice_7.tmp_0', 'strided_slice_8.tmp_0', 'strided_slice_9.tmp_0', 'transpose_2.tmp_0', 'strided_slice_10.tmp_0', 'strided_slice_11.tmp_0', 'strided_slice_12.tmp_0', 'strided_slice_13.tmp_0', 'strided_slice_14.tmp_0', 'transpose_4.tmp_0' ] distill_pairs = [['teacher_conv2d_6.tmp_1', 'conv2d_20.tmp_1'], ['teacher_conv2d_14.tmp_1', 'conv2d_28.tmp_1'], ['teacher_conv2d_22.tmp_1', 'conv2d_36.tmp_1']] distill_loss = l2_distill( distill_pairs, 100) if not cfg.use_fine_grained_loss else split_distill( yolo_output_names, 1000) loss = distill_loss + loss lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') lr = lr_builder() opt = optim_builder(lr) opt.minimize(loss) exe.run(fluid.default_startup_program()) fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel' ignore_params = cfg.finetune_exclude_pretrained_params \ if 'finetune_exclude_pretrained_params' in cfg else [] if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, fluid.default_main_program(), FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() elif cfg.pretrain_weights and fuse_bn and not ignore_params: checkpoint.load_and_fusebn(exe, fluid.default_main_program(), cfg.pretrain_weights) elif cfg.pretrain_weights: checkpoint.load_params(exe, fluid.default_main_program(), cfg.pretrain_weights, ignore_params=ignore_params) build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_reduce_ops = False build_strategy.fuse_all_optimizer_ops = False # only enable sync_bn in multi GPU devices sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ and cfg.use_gpu exec_strategy = fluid.ExecutionStrategy() # iteration number when CompiledProgram tries to drop local execution scopes. # Set it to be 1 to save memory usages, so that unused variables in # local execution scopes can be deleted after each iteration. exec_strategy.num_iteration_per_drop_scope = 1 parallel_main = fluid.CompiledProgram( fluid.default_main_program()).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() map_type = cfg.map_type if 'map_type' in cfg else '11point' best_box_ap_list = [0.0, 0] #[map, iter] cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) train_loader.start() for step_id in range(start_iter, cfg.max_iters): teacher_loss_np, distill_loss_np, loss_np, lr_np = exe.run( parallel_main, fetch_list=[ 'teacher_' + teacher_loss.name, distill_loss.name, loss.name, lr.name ]) if step_id % cfg.log_iter == 0: logger.info( "step {} lr {:.6f}, loss {:.6f}, distill_loss {:.6f}, teacher_loss {:.6f}" .format(step_id, lr_np[0], loss_np[0], distill_loss_np[0], teacher_loss_np[0])) if step_id % cfg.snapshot_iter == 0 and step_id != 0 or step_id == cfg.max_iters - 1: save_name = str( step_id) if step_id != cfg.max_iters - 1 else "model_final" checkpoint.save(exe, fluid.default_main_program(), os.path.join(save_dir, save_name)) if FLAGS.save_inference: feeded_var_names = ['image', 'im_size'] targets = list(fetches.values()) fluid.io.save_inference_model(save_dir + '/infer', feeded_var_names, targets, exe, eval_prog) # eval results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls, cfg) resolution = None box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, cfg['EvalReader']['dataset']) if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = step_id checkpoint.save(exe, fluid.default_main_program(), os.path.join(save_dir, "best_model")) if FLAGS.save_inference: feeded_var_names = ['image', 'im_size'] targets = list(fetches.values()) fluid.io.save_inference_model(save_dir + '/infer', feeded_var_names, targets, exe, eval_prog) logger.info("Best test box ap: {}, in step: {}".format( best_box_ap_list[0], best_box_ap_list[1])) train_loader.reset()
def main(): env = os.environ cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) check_version() main_arch = cfg.architecture if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int(os.environ.get('CPU_NUM', 1)) if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) # build program model = create(main_arch) inputs_def = cfg['TrainReader']['inputs_def'] train_feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(train_feed_vars) loss = train_fetches['loss'] start_iter = 0 train_reader = create_reader(cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num, cfg) # When iterable mode, set set_sample_list_generator(train_reader, place) train_loader.set_sample_list_generator(train_reader) eval_prog = fluid.Program() with fluid.program_guard(eval_prog, fluid.default_startup_program()): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] test_feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(test_feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(cfg.EvalReader) # When iterable mode, set set_sample_list_generator(eval_reader, place) eval_loader.set_sample_list_generator(eval_reader) teacher_cfg = load_config(FLAGS.teacher_config) merge_config(FLAGS.opt) teacher_arch = teacher_cfg.architecture teacher_program = fluid.Program() teacher_startup_program = fluid.Program() with fluid.program_guard(teacher_program, teacher_startup_program): with fluid.unique_name.guard(): teacher_feed_vars = OrderedDict() for name, var in train_feed_vars.items(): teacher_feed_vars[name] = teacher_program.global_block( )._clone_variable(var, force_persistable=False) model = create(teacher_arch) train_fetches = model.train(teacher_feed_vars) teacher_loss = train_fetches['loss'] exe.run(teacher_startup_program) assert FLAGS.teacher_pretrained, "teacher_pretrained should be set" checkpoint.load_params(exe, teacher_program, FLAGS.teacher_pretrained) teacher_program = teacher_program.clone(for_test=True) target_number = len(model.yolo_head.anchor_masks) data_name_map = { 'image': 'image', 'gt_bbox': 'gt_bbox', 'gt_class': 'gt_class', 'gt_score': 'gt_score' } for i in range(target_number): data_name_map['target{}'.format(i)] = 'target{}'.format(i) merge(teacher_program, fluid.default_main_program(), data_name_map, place) output_names = [ [ 'strided_slice_0.tmp_0', 'strided_slice_1.tmp_0', 'strided_slice_2.tmp_0', 'strided_slice_3.tmp_0', 'strided_slice_4.tmp_0', 'transpose_0.tmp_0' ], [ 'strided_slice_5.tmp_0', 'strided_slice_6.tmp_0', 'strided_slice_7.tmp_0', 'strided_slice_8.tmp_0', 'strided_slice_9.tmp_0', 'transpose_2.tmp_0' ], [ 'strided_slice_10.tmp_0', 'strided_slice_11.tmp_0', 'strided_slice_12.tmp_0', 'strided_slice_13.tmp_0', 'strided_slice_14.tmp_0', 'transpose_4.tmp_0' ], ] yolo_output_names = [] for i in range(target_number): yolo_output_names.extend(output_names[i]) assert cfg.use_fine_grained_loss, \ "Only support use_fine_grained_loss=True, Please set it in config file or '-o use_fine_grained_loss=true'" distill_loss = split_distill(yolo_output_names, 1000, target_number) loss = distill_loss + loss lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') lr = lr_builder() opt = optim_builder(lr) opt.minimize(loss) exe.run(fluid.default_startup_program()) checkpoint.load_params(exe, fluid.default_main_program(), cfg.pretrain_weights) assert FLAGS.pruned_params is not None, \ "FLAGS.pruned_params is empty!!! Please set it by '--pruned_params' option." pruned_params = FLAGS.pruned_params.strip().split(",") logger.info("pruned params: {}".format(pruned_params)) pruned_ratios = [float(n) for n in FLAGS.pruned_ratios.strip().split(",")] logger.info("pruned ratios: {}".format(pruned_ratios)) assert len(pruned_params) == len(pruned_ratios), \ "The length of pruned params and pruned ratios should be equal." assert pruned_ratios > [0] * len(pruned_ratios) and pruned_ratios < [1] * len(pruned_ratios), \ "The elements of pruned ratios should be in range (0, 1)." assert FLAGS.prune_criterion in ['l1_norm', 'geometry_median'], \ "unsupported prune criterion {}".format(FLAGS.prune_criterion) pruner = Pruner(criterion=FLAGS.prune_criterion) distill_prog = pruner.prune(fluid.default_main_program(), fluid.global_scope(), params=pruned_params, ratios=pruned_ratios, place=place, only_graph=False)[0] base_flops = flops(eval_prog) eval_prog = pruner.prune(eval_prog, fluid.global_scope(), params=pruned_params, ratios=pruned_ratios, place=place, only_graph=True)[0] pruned_flops = flops(eval_prog) logger.info("FLOPs -{}; total FLOPs: {}; pruned FLOPs: {}".format( float(base_flops - pruned_flops) / base_flops, base_flops, pruned_flops)) build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_reduce_ops = False build_strategy.fuse_all_optimizer_ops = False build_strategy.fuse_elewise_add_act_ops = True # only enable sync_bn in multi GPU devices sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ and cfg.use_gpu exec_strategy = fluid.ExecutionStrategy() # iteration number when CompiledProgram tries to drop local execution scopes. # Set it to be 1 to save memory usages, so that unused variables in # local execution scopes can be deleted after each iteration. exec_strategy.num_iteration_per_drop_scope = 1 parallel_main = fluid.CompiledProgram(distill_prog).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) compiled_eval_prog = fluid.CompiledProgram(eval_prog) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg.metric == 'VOC': extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog, extra_keys) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() map_type = cfg.map_type if 'map_type' in cfg else '11point' best_box_ap_list = [0.0, 0] #[map, iter] cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) train_loader.start() for step_id in range(start_iter, cfg.max_iters): teacher_loss_np, distill_loss_np, loss_np, lr_np = exe.run( parallel_main, fetch_list=[ 'teacher_' + teacher_loss.name, distill_loss.name, loss.name, lr.name ]) if step_id % cfg.log_iter == 0: logger.info( "step {} lr {:.6f}, loss {:.6f}, distill_loss {:.6f}, teacher_loss {:.6f}" .format(step_id, lr_np[0], loss_np[0], distill_loss_np[0], teacher_loss_np[0])) if step_id % cfg.snapshot_iter == 0 and step_id != 0 or step_id == cfg.max_iters - 1: save_name = str( step_id) if step_id != cfg.max_iters - 1 else "model_final" checkpoint.save(exe, distill_prog, os.path.join(save_dir, save_name)) # eval results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls, cfg) resolution = None box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, cfg['EvalReader']['dataset']) if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = step_id checkpoint.save(exe, distill_prog, os.path.join("./", "best_model")) logger.info("Best test box ap: {}, in step: {}".format( best_box_ap_list[0], best_box_ap_list[1])) train_loader.reset()
def main(): env = os.environ FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env if FLAGS.dist: trainer_id = int(env['PADDLE_TRAINER_ID']) import random local_seed = (99 + trainer_id) random.seed(local_seed) np.random.seed(local_seed) cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version() main_arch = cfg.architecture if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int(os.environ.get('CPU_NUM', 1)) if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') # build program startup_prog = fluid.Program() train_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) if FLAGS.fp16: assert (getattr(model.backbone, 'norm_type', None) != 'affine_channel'), \ '--fp16 currently does not support affine channel, ' \ ' please modify backbone settings to use batch norm' with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx: inputs_def = cfg['TrainReader']['inputs_def'] feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(feed_vars) loss = train_fetches['loss'] if FLAGS.fp16: loss *= ctx.get_loss_scale_var() lr = lr_builder() optimizer = optim_builder(lr) optimizer.minimize(loss) if FLAGS.fp16: loss /= ctx.get_loss_scale_var() # parse train fetches train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.print_params: param_delimit_str = '-' * 20 + "All parameters in current graph" + '-' * 20 print(param_delimit_str) for block in train_prog.blocks: for param in block.all_parameters(): print("parameter name: {}\tshape: {}".format(param.name, param.shape)) print('-' * len(param_delimit_str)) return if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(cfg.EvalReader) eval_loader.set_sample_list_generator(eval_reader, place) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg.metric == 'VOC': extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] if cfg.metric == 'WIDERFACE': extra_keys = ['im_id', 'im_shape', 'gt_bbox'] eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog, extra_keys) # compile program for multi-devices build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_optimizer_ops = False build_strategy.fuse_elewise_add_act_ops = True # only enable sync_bn in multi GPU devices sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ and cfg.use_gpu exec_strategy = fluid.ExecutionStrategy() # iteration number when CompiledProgram tries to drop local execution scopes. # Set it to be 1 to save memory usages, so that unused variables in # local execution scopes can be deleted after each iteration. exec_strategy.num_iteration_per_drop_scope = 1 if FLAGS.dist: dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog, train_prog) exec_strategy.num_threads = 1 exe.run(startup_prog) fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel' start_iter = 0 if cfg.pretrain_weights: checkpoint.load_params(exe, train_prog, cfg.pretrain_weights) pruned_params = FLAGS.pruned_params assert FLAGS.pruned_params is not None, \ "FLAGS.pruned_params is empty!!! Please set it by '--pruned_params' option." pruned_params = FLAGS.pruned_params.strip().split(",") logger.info("pruned params: {}".format(pruned_params)) pruned_ratios = [float(n) for n in FLAGS.pruned_ratios.strip().split(",")] logger.info("pruned ratios: {}".format(pruned_ratios)) assert len(pruned_params) == len(pruned_ratios), \ "The length of pruned params and pruned ratios should be equal." assert (pruned_ratios > [0] * len(pruned_ratios) and pruned_ratios < [1] * len(pruned_ratios) ), "The elements of pruned ratios should be in range (0, 1)." assert FLAGS.prune_criterion in ['l1_norm', 'geometry_median'], \ "unsupported prune criterion {}".format(FLAGS.prune_criterion) pruner = Pruner(criterion=FLAGS.prune_criterion) train_prog = pruner.prune( train_prog, fluid.global_scope(), params=pruned_params, ratios=pruned_ratios, place=place, only_graph=False)[0] compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) if FLAGS.eval: base_flops = flops(eval_prog) eval_prog = pruner.prune( eval_prog, fluid.global_scope(), params=pruned_params, ratios=pruned_ratios, place=place, only_graph=True)[0] pruned_flops = flops(eval_prog) logger.info("FLOPs -{}; total FLOPs: {}; pruned FLOPs: {}".format( float(base_flops - pruned_flops) / base_flops, base_flops, pruned_flops)) compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog) if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() train_reader = create_reader(cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num, cfg) train_loader.set_sample_list_generator(train_reader, place) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() # if map_type not set, use default 11point, only use in VOC eval map_type = cfg.map_type if 'map_type' in cfg else '11point' train_stats = TrainingStats(cfg.log_smooth_window, train_keys) train_loader.start() start_time = time.time() end_time = time.time() cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) time_stat = deque(maxlen=cfg.log_smooth_window) best_box_ap_list = [0.0, 0] #[map, iter] # use tb-paddle to log data if FLAGS.use_tb: from tb_paddle import SummaryWriter tb_writer = SummaryWriter(FLAGS.tb_log_dir) tb_loss_step = 0 tb_mAP_step = 0 if FLAGS.eval: # evaluation results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls, cfg) resolution = None if 'mask' in results[0]: resolution = model.mask_head.resolution dataset = cfg['EvalReader']['dataset'] box_ap_stats = eval_results( results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, dataset=dataset) for it in range(start_iter, cfg.max_iters): start_time = end_time end_time = time.time() time_stat.append(end_time - start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) outs = exe.run(compiled_train_prog, fetch_list=train_values) stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])} # use tb-paddle to log loss if FLAGS.use_tb: if it % cfg.log_iter == 0: for loss_name, loss_value in stats.items(): tb_writer.add_scalar(loss_name, loss_value, tb_loss_step) tb_loss_step += 1 train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0): strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format( it, np.mean(outs[-1]), logs, time_cost, eta) logger.info(strs) if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \ and (not FLAGS.dist or trainer_id == 0): save_name = str(it) if it != cfg.max_iters - 1 else "model_final" checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name)) if FLAGS.eval: # evaluation results = eval_run( exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls, cfg=cfg) resolution = None if 'mask' in results[0]: resolution = model.mask_head.resolution box_ap_stats = eval_results( results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, dataset=dataset) # use tb_paddle to log mAP if FLAGS.use_tb: tb_writer.add_scalar("mAP", box_ap_stats[0], tb_mAP_step) tb_mAP_step += 1 if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = it checkpoint.save(exe, train_prog, os.path.join(save_dir, "best_model")) logger.info("Best test box ap: {}, in iter: {}".format( best_box_ap_list[0], best_box_ap_list[1])) train_loader.reset()
def fl_trainer( trainer_id: int, trainer_ep: str, scheduler_ep: str, main_program, startup_program, send_program, recv_program, feed_names, target_names, strategy, feeds, config, algorithm_config, ): import numpy as np import paddle.fluid as fluid from ppdet.utils import checkpoint logging.basicConfig( filename="trainer.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%M-%Y %H:%M:%S", level=logging.DEBUG, ) with open(config) as f: config_json = json.load(f) max_iter = config_json["max_iter"] device = config_json.get("device", "cpu") use_vdl = config_json.get("use_vdl", False) with open(algorithm_config) as f: algorithm_config_dict = yaml.load(f) batch_size = algorithm_config_dict.get("batch_size", 128) need_shuffle = algorithm_config_dict.get("need_shuffle", True) logging.debug(f"training program begin") trainer = FedAvgTrainer(scheduler_ep=scheduler_ep, trainer_ep=trainer_ep) logging.debug(f"job program loading") trainer.load_job( main_program=main_program, startup_program=startup_program, send_program=send_program, recv_program=recv_program, feed_names=feed_names, target_names=target_names, strategy=strategy, ) logging.debug(f"job program loaded") place = fluid.CPUPlace() if device != "cuda" else fluid.CUDAPlace(0) logging.debug(f"trainer starting with place {place}") trainer.start(place) logging.debug(f"trainer stared") logging.debug(f"loading data") feed_list = trainer.load_feed_list(feeds) feeder = fluid.DataFeeder(feed_list=feed_list, place=place) logging.debug(f"data loader ready") epoch_id = -1 step = 0 reader = paddle.dataset.mnist.reader_creator( image_filename=os.path.join( get_data_dir(), "mnist", "train-images-idx3-ubyte.gz", ), label_filename=os.path.join(get_data_dir(), "mnist", "train-labels-idx1-ubyte.gz"), buffer_size=100, ) if need_shuffle: reader = fluid.io.shuffle( reader=reader, buf_size=1000, ) mnist_loader = paddle.batch(reader=reader, batch_size=batch_size) if use_vdl: from visualdl import LogWriter vdl_writer = LogWriter("vdl_log") while epoch_id < max_iter: epoch_id += 1 if not trainer.scheduler_agent.join(epoch_id): logging.debug(f"not join, waiting next round") continue logging.debug(f"epoch {epoch_id} start train") for step_id, data in enumerate(mnist_loader()): outs = trainer.run(feeder.feed(data), fetch=trainer._target_names) if use_vdl: stats = { k: np.array(v).mean() for k, v in zip(trainer._target_names, outs) } for loss_name, loss_value in stats.items(): vdl_writer.add_scalar(loss_name, loss_value, step) step += 1 logging.debug(f"step: {step}, outs: {outs}") # save model logging.debug(f"saving model at {epoch_id}-th epoch") trainer.save_model(f"model/{epoch_id}") # info scheduler trainer.scheduler_agent.finish() checkpoint.save(trainer.exe, trainer._main_program, f"checkpoint/{epoch_id}") logging.debug(f"reach max iter, finish training")
def main(): env = os.environ FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env if FLAGS.dist: trainer_id = int(env['PADDLE_TRAINER_ID']) import random local_seed = (99 + trainer_id) random.seed(local_seed) np.random.seed(local_seed) cfg = load_config(FLAGS.config) if 'architecture' in cfg: main_arch = cfg.architecture else: raise ValueError("'architecture' not specified in config file.") merge_config(FLAGS.opt) if 'log_iter' not in cfg: cfg.log_iter = 20 # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version() if not FLAGS.dist or trainer_id == 0: print_total_cfg(cfg) if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int(os.environ.get('CPU_NUM', 1)) if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') # build program startup_prog = fluid.Program() train_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['TrainReader']['inputs_def'] feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(feed_vars) loss = train_fetches['loss'] lr = lr_builder() optimizer = optim_builder(lr) optimizer.minimize(loss) # parse train fetches train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(cfg.EvalReader) eval_loader.set_sample_list_generator(eval_reader, place) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg.metric == 'VOC': extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] if cfg.metric == 'WIDERFACE': extra_keys = ['im_id', 'im_shape', 'gt_bbox'] eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog, extra_keys) # compile program for multi-devices build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_optimizer_ops = False build_strategy.fuse_elewise_add_act_ops = True build_strategy.fuse_all_reduce_ops = False # only enable sync_bn in multi GPU devices sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' sync_bn = False build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ and cfg.use_gpu exec_strategy = fluid.ExecutionStrategy() # iteration number when CompiledProgram tries to drop local execution scopes. # Set it to be 1 to save memory usages, so that unused variables in # local execution scopes can be deleted after each iteration. exec_strategy.num_iteration_per_drop_scope = 1 if FLAGS.dist: dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog, train_prog) exec_strategy.num_threads = 1 exe.run(startup_prog) not_quant_pattern = [] if FLAGS.not_quant_pattern: not_quant_pattern = FLAGS.not_quant_pattern config = { 'weight_quantize_type': 'channel_wise_abs_max', 'activation_quantize_type': 'moving_average_abs_max', 'quantize_op_types': ['depthwise_conv2d', 'mul', 'conv2d'], 'not_quant_pattern': not_quant_pattern } ignore_params = cfg.finetune_exclude_pretrained_params \ if 'finetune_exclude_pretrained_params' in cfg else [] fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel' if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() elif cfg.pretrain_weights and fuse_bn and not ignore_params: checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights) elif cfg.pretrain_weights: checkpoint.load_params( exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params) # insert quantize op in train_prog, return type is CompiledProgram train_prog = quant_aware(train_prog, place, config, for_test=False) compiled_train_prog = train_prog.with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) if FLAGS.eval: # insert quantize op in eval_prog eval_prog = quant_aware(eval_prog, place, config, for_test=True) compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog) start_iter = 0 train_reader = create_reader(cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num) train_loader.set_sample_list_generator(train_reader, place) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() # if map_type not set, use default 11point, only use in VOC eval map_type = cfg.map_type if 'map_type' in cfg else '11point' train_stats = TrainingStats(cfg.log_smooth_window, train_keys) train_loader.start() start_time = time.time() end_time = time.time() cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) time_stat = deque(maxlen=cfg.log_smooth_window) best_box_ap_list = [0.0, 0] #[map, iter] for it in range(start_iter, cfg.max_iters): start_time = end_time end_time = time.time() time_stat.append(end_time - start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) outs = exe.run(compiled_train_prog, fetch_list=train_values) stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])} train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0): strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format( it, np.mean(outs[-1]), logs, time_cost, eta) logger.info(strs) if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \ and (not FLAGS.dist or trainer_id == 0): save_name = str(it) if it != cfg.max_iters - 1 else "model_final" checkpoint.save(exe, eval_prog, os.path.join(save_dir, save_name)) if FLAGS.eval: # evaluation results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls) resolution = None if 'mask' in results[0]: resolution = model.mask_head.resolution box_ap_stats = eval_results( results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, cfg['EvalReader']['dataset']) if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = it checkpoint.save(exe, eval_prog, os.path.join(save_dir, "best_model")) logger.info("Best test box ap: {}, in iter: {}".format( best_box_ap_list[0], best_box_ap_list[1])) train_loader.reset()
def main(): # 配置 cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) if 'architecture' in cfg: main_arch = cfg.architecture else: raise ValueError("'architecture' not specified in config file.") check_gpu(cfg.use_gpu) check_version() # 执行器 place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) # 模型 lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') startup_prog = fluid.Program() train_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg.TrainReader['inputs_def'] feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(feed_vars) loss = train_fetches['loss'] lr = lr_builder() optimizer = optim_builder(lr) optimizer.minimize(loss) train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg.EvalReader['inputs_def'] feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(feed_vars) eval_prog = eval_prog.clone(True) extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] eval_keys, eval_values, _ = parse_fetches(fetches, eval_prog, extra_keys) eval_reader = create_reader(cfg.EvalReader) eval_loader.set_sample_list_generator(eval_reader, place) ##### 运行 #### exe.run(startup_prog) ## 恢复与迁移 ignore_params = cfg.finetune_exclude_pretrained_params \ if 'finetune_exclude_pretrained_params' in cfg else [] start_iter = 0 if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() + 1 elif cfg.pretrain_weights: checkpoint.load_params( exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params) ## 数据迭代器 train_reader = create_reader(cfg.TrainReader, cfg.max_iters - start_iter, cfg) train_loader.set_sample_list_generator(train_reader, place) ## 训练循环 train_loader.start() # 过程跟踪 train_stats = TrainingStats(cfg.log_smooth_window, train_keys) start_time = time.time() end_time = time.time() time_stat = deque(maxlen=cfg.log_smooth_window) cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) best_box_ap_list = [0.0, 0] if FLAGS.use_vdl: log_writter = LogWriter(FLAGS.vdl_log_dir, sync_cycle=5) with log_writter.mode("train") as vdl_logger: train_scalar_loss = vdl_logger.scalar(tag="loss") with log_writter.mode("val") as vdl_logger: val_scalar_map = vdl_logger.scalar(tag="map") for it in range(start_iter, cfg.max_iters): # 运行程序 outs = exe.run(train_prog, fetch_list=train_values) stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])} # 日志与可视化窗口 start_time = end_time end_time = time.time() time_stat.append(end_time - start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0: # log strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format( it, np.mean(outs[-1]), logs, time_cost, eta) logger.info(strs) # vdl if FLAGS.use_vdl: train_scalar_loss.add_record(it//cfg.log_iter, stats['loss']) # 模型保存与评价窗口 if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1): # 模型保存 save_name = str(it) if it != cfg.max_iters - 1 else "final" checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name)) ## 模型评价 if FLAGS.eval: current_step = it//cfg.snapshot_iter if it % cfg.snapshot_iter == 0 \ else it//cfg.snapshot_iter+1 ## 训练集评价 ## 验证集评价 results = eval_run(exe, eval_prog, eval_loader, eval_keys, eval_values) box_ap_stats = eval_results(results, cfg.num_classes) logger.info("eval box op: {}, in iter: {}".format( box_ap_stats, it)) if FLAGS.use_vdl: val_scalar_map.add_record(current_step, box_ap_stats) ## 保存最佳模型 if box_ap_stats > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats best_box_ap_list[1] = it checkpoint.save(exe, train_prog, os.path.join(save_dir, "best_model")) # 日志 logger.info("Best eval box ap: {}, in iter: {}".format( best_box_ap_list[0], best_box_ap_list[1])) train_loader.reset()
def fl_trainer( trainer_id: int, trainer_ep: str, scheduler_ep: str, main_program, startup_program, send_program, recv_program, feed_names, target_names, strategy, feeds, config, algorithm_config, ): import numpy as np import paddle.fluid as fluid from ppdet.core.workspace import load_config from ppdet.data import create_reader from ppdet.utils import checkpoint from ppdet.utils.check import check_config, check_version logging.basicConfig( filename="trainer.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%M-%Y %H:%M:%S", level=logging.DEBUG, ) with open(config) as f: config_json = json.load(f) max_iter = config_json["max_iter"] device = config_json.get("device", "cpu") use_vdl = config_json.get("use_vdl", False) logging.debug(f"training program begin") trainer = FedAvgTrainer(scheduler_ep=scheduler_ep, trainer_ep=trainer_ep) logging.debug(f"job program loading") trainer.load_job( main_program=main_program, startup_program=startup_program, send_program=send_program, recv_program=recv_program, feed_names=feed_names, target_names=target_names, strategy=strategy, ) logging.debug(f"job program loaded") place = fluid.CPUPlace() if device != "cuda" else fluid.CUDAPlace(0) logging.debug(f"trainer starting with place {place}") trainer.start(place) logging.debug(f"trainer stared") cfg = load_config(algorithm_config) check_config(cfg) check_version() logging.debug(f"loading data") feed_list = trainer.load_feed_list(feeds) feeder = fluid.DataFeeder(feed_list=feed_list, place=place) logging.debug(f"data loader ready") epoch_id = -1 step = 0 # redirect dataset path to Fedvision/data cfg.TrainReader["dataset"].dataset_dir = os.path.join( get_data_dir(), cfg.TrainReader["dataset"].dataset_dir ) data_loader = create_reader( cfg.TrainReader, max_iter, cfg, devices_num=1, num_trainers=1 ) logging.error(f"{cfg.TrainReader['dataset']}") if use_vdl: from visualdl import LogWriter vdl_writer = LogWriter("vdl_log") while epoch_id < max_iter: epoch_id += 1 if not trainer.scheduler_agent.join(epoch_id): logging.debug(f"not join, waiting next round") continue logging.debug(f"epoch {epoch_id} start train") for step_id, data in enumerate(data_loader()): outs = trainer.run(feeder.feed(data), fetch=trainer._target_names) if use_vdl: stats = { k: np.array(v).mean() for k, v in zip(trainer._target_names, outs) } for loss_name, loss_value in stats.items(): vdl_writer.add_scalar(loss_name, loss_value, step) step += 1 logging.debug(f"step: {step}, outs: {outs}") # save model logging.debug(f"saving model at {epoch_id}-th epoch") trainer.save_model(f"model/{epoch_id}") # info scheduler trainer.scheduler_agent.finish() checkpoint.save(trainer.exe, trainer._main_program, f"checkpoint/{epoch_id}") logging.debug(f"reach max iter, finish training")