def __init__(
        self,
        evaluator,
        cudas=None,
        popsize=5,
        output_dir=None,
):
    """Initialise search state, the output directory and the summary writer.

    Args:
        evaluator: Object exposing ``get_init_params()`` and a ``params``
            mapping whose ``'param_list'`` entries each carry a ``"name"``.
        cudas: Device identifiers to schedule on. ``None`` (the default)
            means ``["0"]``.
        popsize: Number of hyperparameter sets tracked in
            ``current_hparams``.
        output_dir: Output directory. When ``None`` a name of the form
            ``output_<YYYYmmddHHMMSS>`` is derived from the local time.
    """
    # Build a private list: the former mutable default ``["0"]`` was shared
    # by every instance created without an explicit ``cudas`` argument.
    cudas = ["0"] if cudas is None else list(cudas)

    self._num_thread = len(cudas)
    self._popsize = popsize
    self.cudas = cudas
    # The "free" pool gets its own copy so that moving devices between
    # "free" and "busy" never alters ``self.cudas`` or the caller's list.
    self.is_cuda_free = {"free": list(cudas), "busy": []}
    self._round = 0

    self.evaluator = evaluator
    self.init_input = evaluator.get_init_params()
    self.num_hparam = len(self.init_input)
    self.best_hparams_all_pop = []
    self.best_reward_all_pop = INF
    # One independent row per population member; ``[[0] * n] * popsize``
    # would repeat a reference to a single shared row.
    self.current_hparams = [[0] * self.num_hparam
                            for _ in range(self._popsize)]
    self.hparams_name_list = [
        param["name"] for param in evaluator.params['param_list']
    ]

    if output_dir is None:
        now = int(time.time())
        time_str = time.strftime("%Y%m%d%H%M%S", time.localtime(now))
        self._output_dir = "output_" + time_str
    else:
        self._output_dir = output_dir

    self.writer = SummaryWriter(logdir=self._output_dir + '/visualization')
def tb_writer(self):
    """Return the summary writer, creating it on first access.

    The checkpoint directory is created through ``mkdir`` when it does not
    exist yet. The writer is stored in ``self._tb_writer`` and points at the
    ``visualization`` sub-directory of the checkpoint directory.
    """
    checkpoint_dir_missing = not os.path.exists(self.config.checkpoint_dir)
    if checkpoint_dir_missing:
        mkdir(self.config.checkpoint_dir)

    log_path = os.path.join(self.config.checkpoint_dir, "visualization")

    # Guard clause: reuse the writer built by an earlier access.
    if self._tb_writer:
        return self._tb_writer

    self._tb_writer = SummaryWriter(log_path)
    return self._tb_writer
def __init__(
        self,
        evaluator,
        cudas=None,
        popsize=5,
        output_dir=None,
):
    """Initialise search state, summary writers and the MPI helper.

    Args:
        evaluator: Object exposing ``get_init_params()`` and a ``params``
            mapping whose ``'param_list'`` entries each carry a ``"name"``.
        cudas: Device identifiers to schedule on. ``None`` (the default)
            means ``["0"]``.
        popsize: Number of hyperparameter sets tracked in
            ``current_hparams``; also the number of per-population writers.
        output_dir: Output directory. When ``None`` a name of the form
            ``output_<YYYYmmddHHMMSS>`` is derived from the local time.
    """
    # Build a private list: the former mutable default ``["0"]`` was shared
    # by every instance created without an explicit ``cudas`` argument.
    cudas = ["0"] if cudas is None else list(cudas)

    self._num_thread = len(cudas)
    self._popsize = popsize
    self.cudas = cudas
    # The "free" pool gets its own copy so that moving devices between
    # "free" and "busy" never alters ``self.cudas`` or the caller's list.
    self.is_cuda_free = {"free": list(cudas), "busy": []}
    self._round = 0

    self.evaluator = evaluator
    self.init_input = evaluator.get_init_params()
    self.num_hparam = len(self.init_input)
    self.best_hparams_all_pop = []
    self.best_reward_all_pop = INF
    # One independent row per population member; ``[[0] * n] * popsize``
    # would repeat a reference to a single shared row.
    self.current_hparams = [[0] * self.num_hparam
                            for _ in range(self._popsize)]
    self.hparams_name_list = [
        param["name"] for param in evaluator.params['param_list']
    ]

    if output_dir is None:
        now = int(time.time())
        time_str = time.strftime("%Y%m%d%H%M%S", time.localtime(now))
        self._output_dir = "output_" + time_str
    else:
        self._output_dir = output_dir

    # record the information for the whole auto finetune
    self.writer = SummaryWriter(logdir=self._output_dir + '/visualization')

    # record the information for per population in all round
    # ``self._popsize`` is the attribute assigned above, so this does not
    # depend on a ``popsize`` accessor being defined elsewhere.
    self.writer_pop_trails = [
        SummaryWriter(
            logdir=self._output_dir + '/visualization/pop_{}'.format(i))
        for i in range(self._popsize)
    ]

    # for parallel on mpi
    self.mpi = MPIHelper()
    if self.mpi.multi_machine:
        print("Autofinetune multimachine mode: running on {}".format(
            self.mpi.gather(self.mpi.name)))
def main():
    """Run detection inference on the test images and save annotated copies.

    Reads the configuration named by ``FLAGS.config``, builds the inference
    program for ``cfg.architecture``, loads ``cfg.weights`` when set, and
    iterates the test loader. For every image id in a batch the source image
    is opened, drawn on by ``visualize_results`` and saved under the name
    returned by ``get_save_image_name(FLAGS.output_dir, image_path)``.
    When ``FLAGS.use_tb`` is set, the original and the annotated image are
    also logged through ``tb_paddle.SummaryWriter``.
    """
    cfg = load_config(FLAGS.config)

    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    main_arch = cfg.architecture

    dataset = cfg.TestReader['dataset']

    # Point the test dataset at the images selected on the command line.
    test_images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img)
    dataset.set_images(test_images)

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    model = create(main_arch)

    # Build the inference graph inside its own program pair.
    startup_prog = fluid.Program()
    infer_prog = fluid.Program()
    with fluid.program_guard(infer_prog, startup_prog):
        with fluid.unique_name.guard():
            inputs_def = cfg['TestReader']['inputs_def']
            inputs_def['iterable'] = True
            feed_vars, loader = model.build_inputs(**inputs_def)
            test_fetches = model.test(feed_vars)
    # NOTE(review): the positional True is presumably ``for_test`` -- confirm
    # against the fluid ``Program.clone`` signature.
    infer_prog = infer_prog.clone(True)

    reader = create_reader(cfg.TestReader, devices_num=1)
    loader.set_sample_list_generator(reader, place)

    exe.run(startup_prog)
    if cfg.weights:
        checkpoint.load_params(exe, infer_prog, cfg.weights)

    # parse infer fetches
    assert cfg.metric in ['COCO', 'VOC', 'OID', 'WIDERFACE'], \
        "unknown metric type {}".format(cfg.metric)
    extra_keys = []
    if cfg['metric'] in ['COCO', 'OID']:
        extra_keys = ['im_info', 'im_id', 'im_shape']
    if cfg['metric'] == 'VOC' or cfg['metric'] == 'WIDERFACE':
        extra_keys = ['im_id', 'im_shape']
    keys, values, _ = parse_fetches(test_fetches, infer_prog, extra_keys)

    # parse dataset category
    # The metric selects which module supplies bbox2out / get_category_info.
    # Only the COCO branch binds mask2out, which is used below whenever the
    # fetched results contain a 'mask' entry.
    if cfg.metric == 'COCO':
        from ppdet.utils.coco_eval import bbox2out, mask2out, get_category_info
    if cfg.metric == 'OID':
        from ppdet.utils.oid_eval import bbox2out, get_category_info
    if cfg.metric == "VOC":
        from ppdet.utils.voc_eval import bbox2out, get_category_info
    if cfg.metric == "WIDERFACE":
        from ppdet.utils.widerface_eval_utils import bbox2out, get_category_info

    anno_file = dataset.get_anno()
    with_background = dataset.with_background
    use_default_label = dataset.use_default_label

    clsid2catid, catid2name = get_category_info(anno_file, with_background,
                                                use_default_label)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # use tb-paddle to log image
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_image_step = 0
        tb_image_frame = 0  # each frame can display ten pictures at most.

    imid2path = dataset.get_imid2path()
    for iter_id, data in enumerate(loader()):
        outs = exe.run(infer_prog,
                       feed=data,
                       fetch_list=values,
                       return_numpy=False)
        # Pair each fetched tensor's array with its sequence lengths.
        res = {
            k: (np.array(v), v.recursive_sequence_lengths())
            for k, v in zip(keys, outs)
        }
        logger.info('Infer iter {}'.format(iter_id))

        bbox_results = None
        mask_results = None
        if 'bbox' in res:
            bbox_results = bbox2out([res], clsid2catid, is_bbox_normalized)
        if 'mask' in res:
            mask_results = mask2out([res], clsid2catid,
                                    model.mask_head.resolution)

        # visualize result
        im_ids = res['im_id'][0]
        for im_id in im_ids:
            image_path = imid2path[int(im_id)]
            image = Image.open(image_path).convert('RGB')

            # use tb-paddle to log original image
            if FLAGS.use_tb:
                original_image_np = np.array(image)
                tb_writer.add_image("original/frame_{}".format(tb_image_frame),
                                    original_image_np,
                                    tb_image_step,
                                    dataformats='HWC')

            image = visualize_results(image, int(im_id), catid2name,
                                      FLAGS.draw_threshold, bbox_results,
                                      mask_results)

            # use tb-paddle to log image with bbox
            if FLAGS.use_tb:
                infer_image_np = np.array(image)
                tb_writer.add_image("bbox/frame_{}".format(tb_image_frame),
                                    infer_image_np,
                                    tb_image_step,
                                    dataformats='HWC')
                # After ten logged images, restart the step counter and move
                # on to the next frame tag.
                tb_image_step += 1
                if tb_image_step % 10 == 0:
                    tb_image_step = 0
                    tb_image_frame += 1

            save_name = get_save_image_name(FLAGS.output_dir, image_path)
            logger.info("Detection bbox results save in {}".format(save_name))
            image.save(save_name, quality=95)
def train(cfg):
    """Train the segmentation model described by ``cfg``.

    Builds the training program with ``build_model``, optionally resumes
    from a checkpoint or loads shape-compatible pretrained parameters, then
    runs epochs ``begin_epoch`` .. ``cfg.SOLVER.NUM_EPOCHS`` (inclusive).
    Progress is printed every ``args.log_steps`` steps and, when
    ``args.use_tb`` is set, written through ``tb_paddle.SummaryWriter``.
    Trainer 0 saves a checkpoint every ``cfg.TRAIN.SNAPSHOT_EPOCH`` epochs
    and a ``'final'`` checkpoint at the end.

    Args:
        cfg: Configuration object; the fields read are visible below.

    Raises:
        ValueError: If ``begin_epoch`` is larger than
            ``cfg.SOLVER.NUM_EPOCHS``.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    drop_last = True

    dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST,
                         mode=ModelPhase.TRAIN,
                         shuffle=True,
                         data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        """Yield samples one by one, grouped into per-trainer batches."""
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            # Release samples only once a full per-trainer batch is buffered.
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If use sync batch norm strategy, drop last batch if number of samples
        # in batch_data is less than cfg.BATCH_SIZE to avoid NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    # place = places[0]
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPU
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE can be divided by GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # If use multi-gpu training mode, batch data will be allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    py_reader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    py_reader.decorate_sample_generator(data_generator,
                                        batch_size=batch_size_per_dev,
                                        drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iteration
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR)
        load_vars = []
        load_fail_vars = []

        def var_shape_matched(var, shape):
            """
            Check whether a persistable variable's stored shape matches the current network
            """
            var_exist = os.path.exists(
                os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
            if var_exist:
                var_shape = parse_shape_from_file(
                    os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
                return var_shape == shape
            return False

        # Split parameters into those whose file exists with a matching
        # shape and those that have to be skipped.
        for x in train_prog.list_vars():
            if isinstance(x, fluid.framework.Parameter):
                shape = tuple(fluid.global_scope().find_var(
                    x.name).get_tensor().shape())
                if var_shape_matched(x, shape):
                    load_vars.append(x)
                else:
                    load_fail_vars.append(x)

        fluid.io.load_vars(exe,
                           dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR,
                           vars=load_vars)
        for var in load_vars:
            print_info("Parameter[{}] loaded sucessfully!".format(var.name))
        for var in load_fail_vars:
            print_info(
                "Parameter[{}] don't exist or shape does not match current network, skip"
                " to load it.".format(var.name))
        print_info("{}/{} pretrained parameters loaded successfully!".format(
            len(load_vars),
            len(load_vars) + len(load_fail_vars)))
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    # Fetch targets are captured by name here, before ``avg_loss`` and ``lr``
    # are rebound to plain values further down.
    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(precision=4,
                            suppress=True,
                            linewidth=160,
                            floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_tb:
        if not args.tb_log_dir:
            print_info("Please specify the log directory by --tb_log_dir.")
            # NOTE(review): ``exit`` is the helper injected by the ``site``
            # module; ``sys.exit`` would not depend on it.
            exit(1)

        from tb_paddle import SummaryWriter
        log_writer = SummaryWriter(args.tb_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    global_step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    # ``drop_last`` is set to True at the top of this function, so the
    # partial-batch step is never added here.
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    # From here on ``avg_loss`` is a running float, no longer the loss
    # variable returned by build_model.
    avg_loss = 0.0
    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError((
            "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        py_reader.start()
        # Run steps until the reader signals the end of the epoch.
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process corresponds to expectation
                    # (``lr``, ``pred``, ``grts`` and ``masks`` are rebound
                    # to the fetched values here.)
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_tb:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  global_step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  global_step)
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  global_step)
                        sys.stdout.flush()
                        # Reset the accumulators for the next logging window.
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary log and calculate
                    loss, lr = exe.run(program=compiled_train_prog,
                                       fetch_list=fetch_list,
                                       return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    # Unlike the debug branch, only trainer 0 logs here.
                    if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        if args.use_tb:
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/speed', speed,
                                                  global_step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

            except fluid.core.EOFException:
                # The reader is exhausted: reset it and end the epoch.
                py_reader.reset()
                break
            except Exception as e:
                # NOTE(review): every other exception is only printed and the
                # loop keeps going; an error that repeats on each step would
                # keep this ``while True`` spinning.
                print(e)

        if epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(exe, train_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(cfg=cfg,
                                                    ckpt_dir=ckpt_dir,
                                                    use_gpu=args.use_gpu,
                                                    use_mpio=args.use_mpio)
                if args.use_tb:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
                                          global_step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
                                          global_step)

            # Use Tensorboard to visualize results
            if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(cfg=cfg,
                          use_gpu=args.use_gpu,
                          vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                          vis_dir="visual",
                          ckpt_dir=ckpt_dir,
                          log_writer=log_writer)

    # save final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(exe, train_prog, 'final')
def main():
    """Prune a detection model and train the pruned network.

    Steps, in order:

    1. Detect distributed mode from ``PADDLE_TRAINER_ID`` /
       ``PADDLE_TRAINERS_NUM`` and seed ``random`` / ``numpy`` per trainer.
    2. Load and check the config, then build the training program (with
       optional mixed precision) and, when ``FLAGS.eval`` is set, the
       evaluation program.
    3. With ``FLAGS.print_params`` set, print every parameter name and shape
       and return without training.
    4. Run the startup program, load ``cfg.pretrain_weights`` when set, and
       prune the parameters listed in ``FLAGS.pruned_params`` by the ratios
       in ``FLAGS.pruned_ratios``.
    5. Train for ``cfg.max_iters`` iterations, logging, snapshotting and
       (optionally) evaluating; the best-scoring model is saved as
       ``best_model``.

    Raises:
        AssertionError: If ``--pruned_params`` is missing, the parameter and
            ratio lists differ in length, any ratio lies outside ``(0, 1)``,
            or the prune criterion is unsupported.
    """
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        # Give each trainer its own deterministic seed.
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                # Scale the loss before minimize() and undo it afterwards.
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    # The learning rate is fetched last; the loop below reads it as outs[-1].
    train_values.append(lr)

    if FLAGS.print_params:
        param_delimit_str = '-' * 20 + "All parameters in current graph" + '-' * 20
        print(param_delimit_str)
        for block in train_prog.blocks:
            for param in block.all_parameters():
                print("parameter name: {}\tshape: {}".format(param.name,
                                                             param.shape))
        print('-' * len(param_delimit_str))
        return

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)

    start_iter = 0

    if cfg.pretrain_weights:
        checkpoint.load_params(exe, train_prog, cfg.pretrain_weights)

    assert FLAGS.pruned_params is not None, \
        "FLAGS.pruned_params is empty!!! Please set it by '--pruned_params' option."
    pruned_params = FLAGS.pruned_params.strip().split(",")
    logger.info("pruned params: {}".format(pruned_params))
    pruned_ratios = [float(n) for n in FLAGS.pruned_ratios.strip().split(",")]
    logger.info("pruned ratios: {}".format(pruned_ratios))
    assert len(pruned_params) == len(pruned_ratios), \
        "The length of pruned params and pruned ratios should be equal."
    # Check every ratio on its own. Comparing the whole list against
    # ``[0] * n`` / ``[1] * n`` is lexicographic, so only the first differing
    # element decided the result and e.g. ``[0.5, 2.0]`` was accepted.
    assert all(0 < ratio < 1 for ratio in pruned_ratios), \
        "The elements of pruned ratios should be in range (0, 1)."

    assert FLAGS.prune_criterion in ['l1_norm', 'geometry_median'], \
        "unsupported prune criterion {}".format(FLAGS.prune_criterion)
    pruner = Pruner(criterion=FLAGS.prune_criterion)
    train_prog = pruner.prune(
        train_prog,
        fluid.global_scope(),
        params=pruned_params,
        ratios=pruned_ratios,
        place=place,
        only_graph=False)[0]

    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        # Measure FLOPs before and after pruning the evaluation graph.
        base_flops = flops(eval_prog)
        eval_prog = pruner.prune(
            eval_prog,
            fluid.global_scope(),
            params=pruned_params,
            ratios=pruned_ratios,
            place=place,
            only_graph=True)[0]
        pruned_flops = flops(eval_prog)
        logger.info("FLOPs -{}; total FLOPs: {}; pruned FLOPs: {}".format(
            float(base_flops - pruned_flops) / base_flops, base_flops,
            pruned_flops))
        compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  # [map, iter]

    # use tb-paddle to log data
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_loss_step = 0
        tb_mAP_step = 0

    if FLAGS.eval:
        # evaluation
        # Evaluate once before training; ``dataset`` bound here is reused by
        # the in-loop evaluation below.
        results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys,
                           eval_values, eval_cls, cfg)
        resolution = None
        if 'mask' in results[0]:
            resolution = model.mask_head.resolution
        dataset = cfg['EvalReader']['dataset']
        box_ap_stats = eval_results(
            results,
            cfg.metric,
            cfg.num_classes,
            resolution,
            is_bbox_normalized,
            FLAGS.output_eval,
            map_type,
            dataset=dataset)

    for it in range(start_iter, cfg.max_iters):
        # Smooth the per-iteration time over the last log_smooth_window
        # iterations to estimate the remaining time.
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use tb-paddle to log loss
        if FLAGS.use_tb:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    tb_writer.add_scalar(loss_name, loss_value, tb_loss_step)
                tb_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # Snapshot (and optionally evaluate) on trainer 0 only.
        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                results = eval_run(
                    exe,
                    compiled_eval_prog,
                    eval_loader,
                    eval_keys,
                    eval_values,
                    eval_cls,
                    cfg=cfg)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(
                    results,
                    cfg.metric,
                    cfg.num_classes,
                    resolution,
                    is_bbox_normalized,
                    FLAGS.output_eval,
                    map_type,
                    dataset=dataset)

                # use tb_paddle to log mAP
                if FLAGS.use_tb:
                    tb_writer.add_scalar("mAP", box_ap_stats[0], tb_mAP_step)
                    tb_mAP_step += 1

                # Keep a separate copy of the best-scoring model so far.
                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_loader.reset()
def __init__(self,
             feed_list,
             data_reader,
             main_program=None,
             startup_program=None,
             config=None,
             metrics_choices="default"):
    """Store the base programs, run configuration and logging setup.

    Args:
        feed_list: Kept as ``self._base_feed_list``.
        data_reader: Kept as ``self._base_data_reader``.
        main_program: Program to clone as the base main program; the fluid
            default main program is cloned when ``None``.
        startup_program: Program to clone as the base startup program; the
            fluid default startup program is cloned when ``None``.
        config: Run configuration; a fresh ``RunConfig()`` is used when
            this is falsy.
        metrics_choices: ``"default"`` selects ``["acc"]``, ``None`` selects
            no metrics, a list is used as given, and any other value is
            wrapped in a one-element list.
    """
    # base item
    self._base_data_reader = data_reader
    self._base_feed_list = feed_list

    # metrics item
    self.best_score = -999
    if metrics_choices == "default":
        metrics_choices = ["acc"]
    elif metrics_choices is None:
        # Identity test: ``== None`` would dispatch to a custom ``__eq__``.
        metrics_choices = []
    if isinstance(metrics_choices, list):
        self.metrics_choices = metrics_choices
    else:
        self.metrics_choices = [metrics_choices]

    # Always clone, so later edits never touch the programs passed in.
    if main_program is None:
        main_program = fluid.default_main_program()
    self._base_main_program = clone_program(main_program, for_test=False)
    if startup_program is None:
        startup_program = fluid.default_startup_program()
    self._base_startup_program = clone_program(startup_program,
                                               for_test=False)
    self.is_checkpoint_loaded = False
    self._base_compiled_program = None

    # run config
    self.config = config if config else RunConfig()
    # ``self.places`` is not assigned in this method; it is read as an
    # attribute that must already be available on the instance.
    self.place = self.places[0]
    self.device_count = len(self.places)

    if self.config.use_data_parallel:
        if not self.config.use_pyreader and self.config.batch_size < self.device_count:
            logger.warning(
                "Batch size({}) is less than the count of devices({}), which is not allowed in current Paddle versions"
                .format(self.config.batch_size, self.device_count))
            logger.warning(
                "Batch size automatically adjusted to {}".format(
                    self.device_count))
            self.config._batch_size = self.device_count

    self.exe = fluid.Executor(place=self.place)
    self.build_strategy = fluid.BuildStrategy()

    # log item
    if not os.path.exists(self.config.checkpoint_dir):
        mkdir(self.config.checkpoint_dir)
    tb_log_dir = os.path.join(self.config.checkpoint_dir, "visualization")
    self.tb_writer = SummaryWriter(tb_log_dir)

    # run environment
    self._phases = []
    self._envs = {}
    self._predict_data = None

    # accelerate predict
    self.is_best_model_loaded = False

    # set default phase
    self.enter_phase("train")
class BasicTask(object):
    """Base class driving fine-tuning, evaluation and prediction.

    The task keeps a stack of phases ("train", "dev", "test", "predict") and
    one lazily built run environment per phase. Subclasses supply the network
    via ``_build_net``, ``_add_label``, ``_add_loss``, ``_add_metrics`` and
    ``_calculate_metrics``.
    """

    def __init__(self,
                 feed_list,
                 data_reader,
                 main_program=None,
                 startup_program=None,
                 config=None,
                 metrics_choices="default"):
        """Set up the programs, executor, logging and phase stack.

        Args:
            feed_list: names of the variables fed to the network.
            data_reader: reader object used later to generate batches.
            main_program: program to clone as the base main program.
                Defaults to ``fluid.default_main_program()``.
            startup_program: program to clone as the base startup program.
                Defaults to ``fluid.default_startup_program()``.
            config: run configuration; ``RunConfig()`` is used when falsy.
            metrics_choices: ``"default"`` (meaning ``["acc"]``), ``None``
                (no metrics), a single metric, or a list of metrics.
        """
        # base item
        self._base_data_reader = data_reader
        self._base_feed_list = feed_list

        # metrics item
        self.best_score = -999
        if metrics_choices == "default":
            metrics_choices = ["acc"]
        elif metrics_choices is None:
            metrics_choices = []
        if isinstance(metrics_choices, list):
            self.metrics_choices = metrics_choices
        else:
            self.metrics_choices = [metrics_choices]

        if main_program is None:
            self._base_main_program = clone_program(
                fluid.default_main_program(), for_test=False)
        else:
            self._base_main_program = clone_program(
                main_program, for_test=False)
        if startup_program is None:
            self._base_startup_program = clone_program(
                fluid.default_startup_program(), for_test=False)
        else:
            self._base_startup_program = clone_program(
                startup_program, for_test=False)
        self.is_checkpoint_loaded = False
        self._base_compiled_program = None

        # run config
        self.config = config if config else RunConfig()
        # `places` queries the framework on every access; read it once.
        places = self.places
        self.place = places[0]
        self.device_count = len(places)

        if self.config.use_data_parallel:
            if (not self.config.use_pyreader
                    and self.config.batch_size < self.device_count):
                logger.warning(
                    "Batch size({}) is less than the count of devices({}), which is not allowed in current Paddle versions"
                    .format(self.config.batch_size, self.device_count))
                logger.warning(
                    "Batch size automatically adjusted to {}".format(
                        self.device_count))
                self.config._batch_size = self.device_count

        self.exe = fluid.Executor(place=self.place)
        self.build_strategy = fluid.BuildStrategy()

        # log item
        if not os.path.exists(self.config.checkpoint_dir):
            mkdir(self.config.checkpoint_dir)
        tb_log_dir = os.path.join(self.config.checkpoint_dir, "visualization")
        self.tb_writer = SummaryWriter(tb_log_dir)

        # run environment
        self._phases = []
        self._envs = {}
        self._predict_data = None

        # accelerate predict
        self.is_best_model_loaded = False

        # set default phase
        self.enter_phase("train")

    @contextlib.contextmanager
    def phase_guard(self, phase):
        """Run the ``with`` body inside ``phase`` and always pop it after."""
        self.enter_phase(phase)
        try:
            yield
        finally:
            # Pop even when the body raises, otherwise the phase stack would
            # stay on the failed phase for every later call.
            self.exit_phase()

    def enter_phase(self, phase):
        """Push ``phase`` on the stack, normalising its aliases.

        "val" is stored as "dev" and "inference" as "predict".

        Raises:
            RuntimeError: if ``phase`` is not a known phase name.
        """
        if phase not in [
                "train", "val", "dev", "test", "predict", "inference"
        ]:
            raise RuntimeError("Unknown phase: {!r}".format(phase))
        if phase in ["val", "dev"]:
            phase = "dev"
        elif phase in ["predict", "inference"]:
            phase = "predict"
        self._phases.append(phase)

    def exit_phase(self):
        """Pop the most recently entered phase."""
        self._phases = self._phases[:-1]

    def init_if_necessary(self):
        """Load the checkpoint, or run the startup program if none loads."""
        if not self.is_checkpoint_loaded:
            if not self.load_checkpoint():
                self.exe.run(self._base_startup_program)
            self.is_checkpoint_loaded = True
            self.is_best_model_loaded = False

    def init_if_load_best_model(self):
        """Load ``<checkpoint_dir>/best_model`` if present, else fall back."""
        if not self.is_best_model_loaded:
            best_model_path = os.path.join(self.config.checkpoint_dir,
                                           "best_model")
            logger.info("Load the best model from %s" % best_model_path)
            if os.path.exists(best_model_path):
                self.load_parameters(best_model_path)
                self.is_checkpoint_loaded = False
                self.is_best_model_loaded = True
            else:
                self.init_if_necessary()
        else:
            logger.info("The best model has been loaded")

    def _build_env(self):
        """Build the program and related state for the current phase once."""
        if self.env.is_inititalized:
            return

        self._build_env_start_event()
        self.env.is_inititalized = True
        self.env.main_program = clone_program(
            self._base_main_program, for_test=False)

        self.env.startup_program = fluid.Program()
        with fluid.program_guard(self.env.main_program,
                                 self._base_startup_program):
            with fluid.unique_name.guard(self.env.UNG):
                self.env.outputs = self._build_net()
                if self.is_train_phase or self.is_test_phase:
                    self.env.labels = self._add_label()
                    self.env.loss = self._add_loss()
                    self.env.metrics = self._add_metrics()

        if self.is_predict_phase or self.is_test_phase:
            self.env.main_program = clone_program(
                self.env.main_program, for_test=True)
            hub.common.paddle_helper.set_op_attr(
                self.env.main_program, is_test=True)

        if self.config.use_pyreader:
            t_program = fluid.Program()
            with fluid.program_guard(t_program, self.env.startup_program):
                self.env.py_reader = fluid.layers.py_reader(
                    capacity=64,
                    shapes=[var.shape for var in self.feed_var_list],
                    dtypes=[
                        dtype_map[var.dtype] for var in self.feed_var_list
                    ],
                    lod_levels=[var.lod_level for var in self.feed_var_list],
                    use_double_buffer=False)

                feed_var_list = self.feed_var_list
                py_vars = fluid.layers.read_file(self.env.py_reader)
                py_vars = to_list(py_vars)
                input_dict = {
                    feed_var_list[index].name: py_var
                    for index, py_var in enumerate(py_vars)
                }

                hub.connect_program(
                    pre_program=t_program,
                    next_program=self.env.main_program,
                    input_dict=input_dict,
                    need_log=False)

            self.env.main_program = t_program
            # The variables now live in the connected program; look them up
            # again by name.
            if not self.is_predict_phase:
                self.env.loss = self.env.main_program.global_block().vars[
                    self.env.loss.name]
                metrics_name = [var.name for var in self.env.metrics]
                self.env.metrics = [
                    self.env.main_program.global_block().vars[name]
                    for name in metrics_name
                ]

            outputs_name = [var.name for var in self.env.outputs]
            self.env.outputs = [
                self.env.main_program.global_block().vars[name]
                for name in outputs_name
            ]

        if self.config.enable_memory_optim:
            for var_name in self.fetch_list:
                var = self.env.main_program.global_block().vars[var_name]
                var.persistable = True

        # to avoid to print logger two times in result of the logger usage of paddle-fluid 1.6
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)

        if self.is_train_phase:
            with fluid.program_guard(self.env.main_program,
                                     self._base_startup_program):
                with fluid.unique_name.guard(self.env.UNG):
                    self.scheduled_lr, self.max_train_steps = self.config.strategy.execute(
                        self.loss, self._base_data_reader, self.config,
                        self.device_count)

        if self.is_train_phase:
            loss_name = self.env.loss.name
        else:
            loss_name = None

        share_vars_from = self._base_compiled_program

        if not self.config.use_data_parallel:
            self.env.main_program_compiled = None
        else:
            self.env.main_program_compiled = fluid.CompiledProgram(
                self.env.main_program).with_data_parallel(
                    loss_name=loss_name,
                    share_vars_from=share_vars_from,
                    build_strategy=self.build_strategy)

        self.exe.run(self.env.startup_program)

        # to avoid to print logger two times in result of the logger usage of paddle-fluid 1.5
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)

        self._build_env_end_event()

    @property
    def places(self):
        """Devices to run on; only the first one without data parallelism."""
        if self.config.use_cuda:
            _places = fluid.framework.cuda_places()
        else:
            _places = fluid.framework.cpu_places()

        if not self.config.use_data_parallel:
            return [_places[0]]
        return _places

    @property
    def return_numpy(self):
        return True

    @property
    def is_train_phase(self):
        return self.phase in ["train"]

    @property
    def is_test_phase(self):
        return self.phase in ["val", "dev", "test"]

    @property
    def is_predict_phase(self):
        return self.phase in ["predict", "inference"]

    @property
    def phase(self):
        """The phase on top of the phase stack."""
        return self._phases[-1]

    @property
    def env(self):
        """Run environment of the current phase, created on first access.

        "val", "dev" and "test" all share the "dev" environment.
        """
        phase = self.phase
        if phase in ["val", "dev", "test"]:
            phase = "dev"
        if phase not in self._envs:
            self._envs[phase] = RunEnv()
        return self._envs[phase]

    @property
    def py_reader(self):
        if not self.env.is_inititalized:
            self._build_env()
        return self.env.py_reader

    @property
    def current_step(self):
        if not self.env.is_inititalized:
            self._build_env()
        return self.env.current_step

    @property
    def current_epoch(self):
        if not self.env.is_inititalized:
            self._build_env()
        return self.env.current_epoch

    @property
    def main_program(self):
        if not self.env.is_inititalized:
            self._build_env()
        return self.env.main_program

    @property
    def startup_program(self):
        if not self.env.is_inititalized:
            self._build_env()
        return self.env.startup_program

    @property
    def main_program_compiled(self):
        if not self.env.is_inititalized:
            self._build_env()
        return self.env.main_program_compiled

    @property
    def main_program_to_be_run(self):
        """Compiled program under data parallelism, plain program otherwise."""
        if self.config.use_data_parallel:
            if self._base_compiled_program is None:
                # Remember the first compiled program; later environments
                # pass it as `share_vars_from` in `_build_env`.
                self._base_compiled_program = self.env.main_program_compiled
            return self.main_program_compiled
        return self.main_program

    @property
    def reader(self):
        """Fresh batch generator for the current phase."""
        if self.is_predict_phase:
            data = self._predict_data
        else:
            data = None
        self.env.reader = self._base_data_reader.data_generator(
            batch_size=self.config.batch_size, phase=self.phase, data=data)
        return self.env.reader

    @property
    def loss(self):
        if self.is_predict_phase:
            raise RuntimeError("loss is not available in the predict phase")

        if not self.env.is_inititalized:
            self._build_env()
        return self.env.loss

    @property
    def labels(self):
        if self.is_predict_phase:
            raise RuntimeError("labels are not available in the predict phase")

        if not self.env.is_inititalized:
            self._build_env()
        return self.env.labels

    @property
    def outputs(self):
        if not self.env.is_inititalized:
            self._build_env()
        return self.env.outputs

    @property
    def metrics(self):
        if self.is_predict_phase:
            raise RuntimeError(
                "metrics are not available in the predict phase")

        if not self.env.is_inititalized:
            self._build_env()
        return self.env.metrics

    @property
    def unique_name_generator(self):
        return self.env.UNG

    @property
    def feed_list(self):
        """Feed variable names; label names are appended outside predict."""
        feed_list = [varname for varname in self._base_feed_list]
        if self.is_train_phase or self.is_test_phase:
            feed_list += [label.name for label in self.labels]
        return feed_list

    @property
    def feed_var_list(self):
        vars = self.main_program.global_block().vars
        return [vars[varname] for varname in self.feed_list]

    @property
    def fetch_list(self):
        """Names fetched per step: metrics + loss, or outputs in predict."""
        if self.is_train_phase or self.is_test_phase:
            return [metric.name for metric in self.metrics] + [self.loss.name]
        return [output.name for output in self.outputs]

    def _build_env_start_event(self):
        pass

    def _build_env_end_event(self):
        if not self.is_predict_phase:
            self.env.score_scalar = {}

    def _finetune_start_event(self):
        logger.info("PaddleHub finetune start")

    def _finetune_end_event(self, run_states):
        logger.info("PaddleHub finetune finished.")

    def _predict_start_event(self):
        logger.info("PaddleHub predict start")

    def _predict_end_event(self, run_states):
        logger.info("PaddleHub predict finished.")

    def _eval_start_event(self):
        logger.info("Evaluation on {} dataset start".format(self.phase))

    def _eval_end_event(self, run_states):
        """Log evaluation results and save the model when the score improves."""
        eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)
        # Scalars are indexed by the training step, so they are only written
        # once a train environment exists.
        has_train_env = 'train' in self._envs
        if has_train_env:
            self.tb_writer.add_scalar(
                tag="Loss_{}".format(self.phase),
                scalar_value=eval_loss,
                global_step=self._envs['train'].current_step)

        log_scores = ""
        for metric in eval_scores:
            if has_train_env:
                self.tb_writer.add_scalar(
                    tag="{}_{}".format(metric, self.phase),
                    scalar_value=eval_scores[metric],
                    global_step=self._envs['train'].current_step)
            log_scores += "%s=%.5f " % (metric, eval_scores[metric])
        logger.info(
            "[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" %
            (self.phase, eval_loss, log_scores, run_speed))

        if len(eval_scores):
            # The first metric is the one used to pick the best model
            main_metric, main_value = next(iter(eval_scores.items()))
        else:
            logger.warning(
                "None of metrics has been implemented, loss will be used to evaluate."
            )
            # The larger, the better
            main_metric, main_value = "negative loss", -eval_loss
        if self.phase in ["dev", "val"] and main_value > self.best_score:
            self.best_score = main_value
            model_saved_dir = os.path.join(self.config.checkpoint_dir,
                                           "best_model")
            logger.info("best model saved to %s [best %s=%.5f]" %
                        (model_saved_dir, main_metric, main_value))
            fluid.io.save_persistables(
                executor=self.exe,
                dirname=model_saved_dir,
                main_program=self.main_program)

    def _log_interval_event(self, run_states):
        """Write training loss and metrics to the writer and the logger."""
        scores, avg_loss, run_speed = self._calculate_metrics(run_states)
        self.tb_writer.add_scalar(
            tag="Loss_{}".format(self.phase),
            scalar_value=avg_loss,
            global_step=self._envs['train'].current_step)
        log_scores = ""
        for metric in scores:
            self.tb_writer.add_scalar(
                tag="{}_{}".format(metric, self.phase),
                scalar_value=scores[metric],
                global_step=self._envs['train'].current_step)
            log_scores += "%s=%.5f " % (metric, scores[metric])
        logger.info("step %d / %d: loss=%.5f %s[step/sec: %.2f]" %
                    (self.current_step, self.max_train_steps, avg_loss,
                     log_scores, run_speed))

    def _save_ckpt_interval_event(self):
        self.save_checkpoint()

    def _eval_interval_event(self):
        self.eval(phase="dev")

    def _run_step_event(self, run_state):
        # NOTE(review): the `yield` makes this a generator function. The run
        # loops in this class call it without iterating the result, so this
        # body never executes there. Confirm whether that is intended.
        if self.is_predict_phase:
            yield run_state.run_results

    def _build_net(self):
        raise NotImplementedError

    def _add_loss(self):
        raise NotImplementedError

    def _add_label(self):
        raise NotImplementedError

    def _add_metrics(self):
        # Some metrics like acc, auc can be calculated by fluid.layers
        # The others can be calculated in _calculate_metrics function
        raise NotImplementedError

    def _calculate_metrics(self, run_states):
        # NOTE: if you want to customize the metrics
        # you should make sure that the first parameter returned is a dict
        # The first key will be used as main metrics to update the best model
        raise NotImplementedError

    # NOTE: the current checkpoint mechanism is incomplete:
    # it cannot restore the dataset training status
    def save_checkpoint(self):
        save_checkpoint(
            checkpoint_dir=self.config.checkpoint_dir,
            current_epoch=self.current_epoch,
            global_step=self.current_step,
            best_score=self.best_score,
            exe=self.exe,
            main_program=self.main_program)

    def load_checkpoint(self):
        """Restore epoch, step and best score; return whether it succeeded."""
        is_load_successful, self.env.current_epoch, self.env.current_step, self.best_score = load_checkpoint(
            self.config.checkpoint_dir,
            self.exe,
            main_program=self.main_program)

        return is_load_successful

    def load_parameters(self, dirname):
        """Load every variable that has a file of its name in ``dirname``."""

        def if_exist(var):
            path = os.path.join(dirname, var.name)
            return os.path.exists(path)

        fluid.io.load_vars(
            self.exe, dirname, self.main_program, predicate=if_exist)

    def save_parameters(self, dirname):
        fluid.io.save_params(
            self.exe, dirname=dirname, main_program=self.main_program)

    def finetune_and_eval(self):
        return self.finetune(do_eval=True)

    def finetune(self, do_eval=False):
        """Train for the remaining epochs, then evaluate and checkpoint.

        Returns:
            The run states of the last epoch, or ``[]`` if no epoch ran.
        """
        # Start to finetune
        with self.phase_guard(phase="train"):
            self.init_if_necessary()
            self._finetune_start_event()
            run_states = []
            if self.current_epoch <= self.config.num_epoch:
                while self.current_epoch <= self.config.num_epoch:
                    self.config.strategy.step()
                    run_states = self._run(do_eval=do_eval)
                    self.env.current_epoch += 1

                # Final evaluation
                if self._base_data_reader.get_dev_examples() != []:
                    self.eval(phase="dev")
                if self._base_data_reader.get_test_examples() != []:
                    self.eval(phase="test", load_best_model=True)
                # Save checkpoint after finetune
                self.save_checkpoint()

            self._finetune_end_event(run_states)
            return run_states

    def eval(self, phase="dev", load_best_model=False):
        # Warning: DO NOT use eval(load_best_model=True) in finetune_and_eval
        # It will cause trainer unable to continue training from checkpoint after eval
        # More important, The model should evaluate current performance during training.
        with self.phase_guard(phase=phase):
            if load_best_model:
                self.init_if_load_best_model()
            else:
                self.init_if_necessary()
            self._eval_start_event()
            run_states = self._run()
            self._eval_end_event(run_states)
            return run_states

    def predict(self, data, load_best_model=True):
        """Run the network over ``data`` and return the run states."""
        with self.phase_guard(phase="predict"):
            if load_best_model:
                self.init_if_load_best_model()
            else:
                self.init_if_necessary()
            self._predict_data = data
            self._predict_start_event()
            run_states = self._run()
            self._predict_end_event(run_states)
            self._predict_data = None
            return run_states

    def _run(self, do_eval=False):
        with fluid.program_guard(self.main_program, self.startup_program):
            if self.config.use_pyreader:
                return self._run_with_py_reader(do_eval=do_eval)
            return self._run_with_data_feeder(do_eval=do_eval)

    def _run_with_data_feeder(self, do_eval=False):
        """Run one pass over the reader, feeding batches via a DataFeeder."""
        data_feeder = fluid.DataFeeder(
            feed_list=self.feed_list, place=self.place)

        global_run_states = []
        period_run_states = []

        for batch in self.reader():
            # Skip batches with fewer samples than devices under data
            # parallelism.
            if self.config.use_data_parallel and len(
                    batch) < self.device_count:
                continue
            step_run_state = RunState(len(self.fetch_list))
            step_run_state.run_step = 1
            num_batch_examples = len(batch)

            if self.return_numpy:
                fetch_result = self.exe.run(
                    self.main_program_to_be_run,
                    feed=data_feeder.feed(batch),
                    fetch_list=self.fetch_list)
            else:
                fetch_result = self.exe.run(
                    self.main_program_to_be_run,
                    feed=data_feeder.feed(batch),
                    fetch_list=self.fetch_list,
                    return_numpy=False)
                fetch_result = [np.array(x) for x in fetch_result]

            for index, result in enumerate(fetch_result):
                step_run_state.run_results[index] = result
            step_run_state.run_examples += num_batch_examples
            step_run_state.update()
            period_run_states += [step_run_state]
            self.env.current_step += 1
            if self.is_train_phase:
                if self.current_step % self.config.log_interval == 0:
                    self._log_interval_event(period_run_states)
                    global_run_states += period_run_states
                    period_run_states = []

                if self.config.save_ckpt_interval and self.current_step % self.config.save_ckpt_interval == 0:
                    self._save_ckpt_interval_event()

                if do_eval and self.current_step % self.config.eval_interval == 0:
                    self._eval_interval_event()

            self._run_step_event(step_run_state)

        global_run_states += period_run_states
        return global_run_states

    def _run_with_py_reader(self, do_eval=False):
        """Run one pass with the py_reader feeding the program directly."""
        flag = False
        use_data_parallel_backup = self.config.use_data_parallel
        while True:
            global_run_states = []
            period_run_states = []
            self.py_reader.decorate_paddle_reader(self.reader)
            self.py_reader.start()
            try:
                while True:
                    num_batch_examples = self.config.batch_size * self.device_count
                    step_run_state = RunState(len(self.fetch_list))
                    step_run_state.run_step = 1
                    if self.return_numpy:
                        fetch_result = self.exe.run(
                            self.main_program_to_be_run,
                            fetch_list=self.fetch_list)
                    else:
                        fetch_result = self.exe.run(
                            self.main_program_to_be_run,
                            fetch_list=self.fetch_list,
                            return_numpy=False)
                        fetch_result = [np.array(x) for x in fetch_result]

                    for index, result in enumerate(fetch_result):
                        step_run_state.run_results[index] = result
                    step_run_state.run_examples += num_batch_examples
                    step_run_state.update()
                    period_run_states += [step_run_state]
                    self.env.current_step += 1
                    if self.is_train_phase:
                        if self.current_step % self.config.log_interval == 0:
                            self._log_interval_event(period_run_states)
                            global_run_states += period_run_states
                            period_run_states = []

                        if self.config.save_ckpt_interval and self.current_step % self.config.save_ckpt_interval == 0:
                            self._save_ckpt_interval_event()

                        if do_eval and self.current_step % self.config.eval_interval == 0:
                            self._eval_interval_event()

                    self._run_step_event(step_run_state)
            except fluid.core.EOFException:
                global_run_states += period_run_states
                self.py_reader.reset()
                # When use_data_parallel and use_pyreader are both enabled
                # and the amount of data is too small, the reader throws
                # EOFException before any result has been fetched. In that
                # case, temporarily disable use_data_parallel and retry to
                # get the result.
                if flag:
                    self.config._use_data_parallel = use_data_parallel_backup
                elif len(global_run_states) == 0:
                    flag = True
                    self.config._use_data_parallel = False
                    continue
                break

        return global_run_states
def main():
    """Build the training (and optional eval) program and run the train loop.

    Configuration comes from the module-level ``FLAGS`` and the config file
    it points to. When ``FLAGS.use_tb`` is set, losses and mAP are logged
    through a tb-paddle ``SummaryWriter``, which is closed when the loop ends
    for any reason.
    """
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    # Only meaningful in distributed mode; bound here so it is always defined.
    trainer_id = 0
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    use_ema = 'use_ema' in cfg and cfg['use_ema']

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if FLAGS.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)

                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

            if use_ema:
                global_steps = _decay_step_counter()
                ema = ExponentialMovingAverage(
                    cfg['ema_decay'], thres_steps=global_steps)
                ema.update()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader, devices_num=1)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        elif cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        elif cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        else:
            extra_keys = ['gt_bbox', 'gt_class', 'im_id']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params)

    train_reader = create_reader(
        cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num,
        cfg,
        devices_num=devices_num)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use tb-paddle to log data
    tb_writer = None
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_loss_step = 0
        tb_mAP_step = 0

    try:
        for it in range(start_iter, cfg.max_iters):
            start_time = end_time
            end_time = time.time()
            time_stat.append(end_time - start_time)
            time_cost = np.mean(time_stat)
            eta_sec = (cfg.max_iters - it) * time_cost
            eta = str(datetime.timedelta(seconds=int(eta_sec)))
            outs = exe.run(compiled_train_prog, fetch_list=train_values)
            # The last fetched value is the learning rate appended above.
            stats = {
                k: np.array(v).mean()
                for k, v in zip(train_keys, outs[:-1])
            }

            # use tb-paddle to log loss
            if FLAGS.use_tb:
                if it % cfg.log_iter == 0:
                    for loss_name, loss_value in stats.items():
                        tb_writer.add_scalar(loss_name, loss_value,
                                             tb_loss_step)
                    tb_loss_step += 1

            train_stats.update(stats)
            logs = train_stats.log()
            if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
                strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                    it, np.mean(outs[-1]), logs, time_cost, eta)
                logger.info(strs)

            # NOTE : profiler tools, used for benchmark
            if FLAGS.is_profiler and it == 5:
                profiler.start_profiler("All")
            elif FLAGS.is_profiler and it == 10:
                profiler.stop_profiler("total", FLAGS.profiler_path)
                return

            is_snapshot_iter = (it > 0 and it % cfg.snapshot_iter == 0) \
                or it == cfg.max_iters - 1
            if is_snapshot_iter and (not FLAGS.dist or trainer_id == 0):
                save_name = str(
                    it) if it != cfg.max_iters - 1 else "model_final"
                if use_ema:
                    exe.run(ema.apply_program)
                checkpoint.save(exe, train_prog,
                                os.path.join(save_dir, save_name))

                if FLAGS.eval:
                    # evaluation
                    resolution = None
                    if 'Mask' in cfg.architecture:
                        resolution = model.mask_head.resolution
                    results = eval_run(
                        exe,
                        compiled_eval_prog,
                        eval_loader,
                        eval_keys,
                        eval_values,
                        eval_cls,
                        cfg,
                        resolution=resolution)
                    box_ap_stats = eval_results(
                        results, cfg.metric, cfg.num_classes, resolution,
                        is_bbox_normalized, FLAGS.output_eval, map_type,
                        cfg['EvalReader']['dataset'])

                    # use tb_paddle to log mAP
                    if FLAGS.use_tb:
                        tb_writer.add_scalar("mAP", box_ap_stats[0],
                                             tb_mAP_step)
                        tb_mAP_step += 1

                    if box_ap_stats[0] > best_box_ap_list[0]:
                        best_box_ap_list[0] = box_ap_stats[0]
                        best_box_ap_list[1] = it
                        checkpoint.save(exe, train_prog,
                                        os.path.join(save_dir, "best_model"))
                    logger.info("Best test box ap: {}, in iter: {}".format(
                        best_box_ap_list[0], best_box_ap_list[1]))

                if use_ema:
                    exe.run(ema.restore_program)

        train_loader.reset()
    finally:
        # Flush and close the event file on every exit path, including the
        # early profiler return and exceptions.
        if tb_writer is not None:
            tb_writer.close()
# Batched readers for this client's training and validation splits.
train_reader = paddle.batch(reader = dataset.train_data(client = trainer_id), batch_size = params["federated"]["batch_size"])
val_reader = paddle.batch(reader=dataset.val_data(client = trainer_id), batch_size = params["federated"]["batch_size"])
# Only trainer 0 gets a reader over the test split.
if trainer_id == 0:
    test_reader = paddle.batch(reader=dataset.test_data(), batch_size = params["federated"]["batch_size"])

# Input and label placeholders, and the feeder that maps batches onto them.
inp = paddle.fluid.layers.data(name ='inp', shape = params["federated"]["input_shape"], dtype = params["federated"]["input_dtype"])
label = paddle.fluid.layers.data(name ='label', shape = params["federated"]["label_shape"], dtype = params["federated"]["label_dtype"])
feeder = paddle.fluid.DataFeeder(feed_list = [inp, label], place = paddle.fluid.CPUPlace())

# Summary
###########
# One log directory per client: <logdir>/data/client_<trainer_id>
data_writer = SummaryWriter(logdir=join(join(params["federated"]["logdir"],"data"),f"client_{trainer_id}"))

# Run
#########
round_id = 0
while not trainer.stop():
    round_id += 1
    # Stop once the configured number of rounds has been run.
    if round_id > params["federated"]["num_round"]:
        break
    for e in range(params["federated"]["num_epoch"]):
        for data in train_reader():
            trainer.run(feeder.feed(data), fetch=job._target_names)
    # NOTE(review): the original indentation was lost; metrics are assumed to
    # be computed once per round, after the local epochs. Confirm.
    train_metrics = metrics(trainer.exe, test_program,feeder, train_reader, job._target_names)
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear, Conv2D, Pool2D
# from paddle.fluid.dygraph.nn import FC
import numpy as np
import os
import gzip
import json
import random
from tb_paddle import SummaryWriter
import matplotlib.pyplot as plt

# Writer for visualization data, logging to 'log/data'.
data_writer = SummaryWriter(logdir='log/data')

# Compute classification accuracy to observe the training results
# Inspect the training process to identify potential problems
# Add validation and testing to better evaluate the model
# Add regularization to avoid overfitting
# Visual analysis
def load_data(mode='train'):
    # Data file
    datafile = 'mnist.json.gz'
    print('loading mnist dataset from {} ......'.format(datafile))
    # NOTE(review): the handle returned by gzip.open is never closed here.
    data = json.load(gzip.open(datafile))
    # The loaded data is already split into training, validation and test sets
    train_set, val_set, eval_set = data

    # Dataset parameters: image height IMG_ROWS, image width IMG_COLS
    IMG_ROWS = 28
    IMG_COLS = 28
test_program = paddle.fluid.default_main_program().clone(for_test=True) # Define optimization method optimizer = paddle.fluid.optimizer.SGD( learning_rate=params["centralized"]["learning_rate"]) opts = optimizer.minimize(model.loss) # Define a parser that uses the CPU place = paddle.fluid.CPUPlace() exe = paddle.fluid.Executor(place) #Parameter initialization exe.run(paddle.fluid.default_startup_program()) # Summary ########### data_writer = SummaryWriter( logdir=join(params["centralized"]["logdir"], "data")) #Start training and testing for epoch in range(params["centralized"]["num_epoch"]): for data in train_reader(): exe.run(program=paddle.fluid.default_main_program(), feed=feeder.feed(data), fetch_list=model.fetch_list) train_metrics = metrics(exe, test_program, feeder, train_reader, model.fetch_list) val_metrics = metrics(exe, test_program, feeder, val_reader, model.fetch_list) test_metrics = metrics(exe, test_program, feeder, test_reader, model.fetch_list)
def main():
    """Run a saved inference model over test images and save visualizations.

    Configuration comes from the module-level ``FLAGS`` and the config file
    it points to. The first batch is also used to time inference (10 warm-up
    runs, then 100 timed runs). When ``FLAGS.use_tb`` is set, original and
    annotated images are logged through a tb-paddle ``SummaryWriter``, which
    is closed when the loop ends for any reason.

    Raises:
        ValueError: if the config file does not specify 'architecture'.
    """
    cfg = load_config(FLAGS.config)

    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # print_total_cfg(cfg)

    if 'test_feed' not in cfg:
        test_feed = create(main_arch + 'TestFeed')
    else:
        test_feed = create(cfg.test_feed)

    test_images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img)
    test_feed.dataset.add_images(test_images)

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    infer_prog, feed_var_names, fetch_list = fluid.io.load_inference_model(
        dirname=FLAGS.model_path,
        model_filename=FLAGS.model_name,
        params_filename=FLAGS.params_name,
        executor=exe)

    reader = create_reader(test_feed)
    feeder = fluid.DataFeeder(
        place=place, feed_list=feed_var_names, program=infer_prog)

    # parse infer fetches
    assert cfg.metric in ['COCO', 'VOC'], \
        "unknown metric type {}".format(cfg.metric)
    extra_keys = []
    if cfg['metric'] == 'COCO':
        extra_keys = ['im_info', 'im_id', 'im_shape']
    if cfg['metric'] == 'VOC':
        extra_keys = ['im_id', 'im_shape']
    # The returned keys/values are not used below; the call is kept in case
    # parse_fetches has side effects on `infer_prog`.
    parse_fetches({'bbox': fetch_list}, infer_prog, extra_keys)

    # parse dataset category
    if cfg.metric == 'COCO':
        from ppdet.utils.coco_eval import bbox2out, mask2out, get_category_info
    if cfg.metric == "VOC":
        from ppdet.utils.voc_eval import bbox2out, get_category_info

    anno_file = getattr(test_feed.dataset, 'annotation', None)
    with_background = getattr(test_feed, 'with_background', True)
    use_default_label = getattr(test_feed, 'use_default_label', False)
    clsid2catid, catid2name = get_category_info(anno_file, with_background,
                                                use_default_label)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False

    # use tb-paddle to log image
    tb_writer = None
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_image_step = 0
        tb_image_frame = 0  # each frame can display ten pictures at most.

    imid2path = reader.imid2path
    keys = ['bbox']
    infer_time = True
    compile_prog = fluid.compiler.CompiledProgram(infer_prog)

    try:
        for iter_id, data in enumerate(reader()):
            feed_data = [[d[0], d[1]] for d in data]
            # Build the feed once; it is shared by timing and the real run.
            feed_data_dict = feeder.feed(feed_data)
            # for infer time
            if infer_time:
                warmup_times = 10
                repeats_time = 100
                for i in range(warmup_times):
                    exe.run(compile_prog,
                            feed=feed_data_dict,
                            fetch_list=fetch_list,
                            return_numpy=False)
                start_time = time.time()
                for i in range(repeats_time):
                    exe.run(compile_prog,
                            feed=feed_data_dict,
                            fetch_list=fetch_list,
                            return_numpy=False)

                print("infer time: {} ms/sample".format((
                    time.time() - start_time) * 1000 / repeats_time))
                infer_time = False

            outs = exe.run(compile_prog,
                           feed=feed_data_dict,
                           fetch_list=fetch_list,
                           return_numpy=False)
            res = {
                k: (np.array(v), v.recursive_sequence_lengths())
                for k, v in zip(keys, outs)
            }
            res['im_id'] = [[d[2] for d in data]]
            logger.info('Infer iter {}'.format(iter_id))

            bbox_results = None
            mask_results = None
            if 'bbox' in res:
                bbox_results = bbox2out([res], clsid2catid, is_bbox_normalized)
            # NOTE(review): `keys` is fixed to ['bbox'], so 'mask' never
            # appears in `res` and this branch cannot run. It also refers to
            # `model`, which this function does not define, and to
            # `mask2out`, which is only imported for the COCO metric.
            if 'mask' in res:
                mask_results = mask2out([res], clsid2catid,
                                        model.mask_head.resolution)

            # visualize result
            im_ids = res['im_id'][0]
            for im_id in im_ids:
                image_path = imid2path[int(im_id)]
                image = Image.open(image_path).convert('RGB')

                # use tb-paddle to log original image
                if FLAGS.use_tb:
                    original_image_np = np.array(image)
                    tb_writer.add_image(
                        "original/frame_{}".format(tb_image_frame),
                        original_image_np,
                        tb_image_step,
                        dataformats='HWC')

                image = visualize_results(image,
                                          int(im_id), catid2name,
                                          FLAGS.draw_threshold, bbox_results,
                                          mask_results)

                # use tb-paddle to log image with bbox
                if FLAGS.use_tb:
                    infer_image_np = np.array(image)
                    tb_writer.add_image(
                        "bbox/frame_{}".format(tb_image_frame),
                        infer_image_np,
                        tb_image_step,
                        dataformats='HWC')
                    tb_image_step += 1
                    if tb_image_step % 10 == 0:
                        tb_image_step = 0
                        tb_image_frame += 1

                save_name = get_save_image_name(FLAGS.output_dir, image_path)
                logger.info(
                    "Detection bbox results save in {}".format(save_name))
                image.save(save_name, quality=95)
    finally:
        # Flush and close the event file on every exit path.
        if tb_writer is not None:
            tb_writer.close()