import os
import sys

import numpy as np
import paddle.fluid as fluid

# Repo-local helpers (LaneNetDataset, ModelPhase, build_model, Timer,
# calculate_eta, print_info, load_checkpoint, save_checkpoint,
# load_pretrained_weights, dist_utils, visualize) and the parsed `args`
# are assumed to be imported/defined elsewhere in this file.


def evaluate(cfg, ckpt_dir=None, use_gpu=False, use_mpio=False, **kwargs):
    np.set_printoptions(precision=5, suppress=True)

    startup_prog = fluid.Program()
    test_prog = fluid.Program()
    dataset = LaneNetDataset(
        file_list=cfg.DATASET.VAL_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        # TODO: check whether the batch reader is compatible with Windows
        if use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()
        for b in data_gen:
            yield b

    data_loader, pred, grts, masks, accuracy, fp, fn = build_model(
        test_prog, startup_prog, phase=ModelPhase.EVAL)
    data_loader.set_sample_generator(
        data_generator, drop_last=False, batch_size=cfg.BATCH_SIZE)

    # Get device environment
    places = fluid.cuda_places() if use_gpu else fluid.cpu_places()
    place = places[0]
    dev_count = len(places)
    print("#Device count: {}".format(dev_count))

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    test_prog = test_prog.clone(for_test=True)

    ckpt_dir = cfg.TEST.TEST_MODEL if not ckpt_dir else ckpt_dir
    if ckpt_dir is not None:
        print('load test model:', ckpt_dir)
        fluid.io.load_params(exe, ckpt_dir, main_program=test_prog)

    # Use a streaming confusion matrix to calculate mean IoU
    np.set_printoptions(
        precision=4, suppress=True, linewidth=160, floatmode="fixed")
    fetch_list = [
        pred.name, grts.name, masks.name, accuracy.name, fp.name, fn.name
    ]
    num_images = 0
    step = 0
    avg_acc = 0.0
    avg_fp = 0.0
    avg_fn = 0.0
    all_step = cfg.DATASET.TEST_TOTAL_IMAGES // cfg.BATCH_SIZE + 1
    timer = Timer()
    timer.start()
    data_loader.start()
    while True:
        try:
            step += 1
            pred, grts, masks, out_acc, out_fp, out_fn = exe.run(
                test_prog, fetch_list=fetch_list, return_numpy=True)
            # Weight per-batch metrics by batch size so the running
            # averages stay correct for a partial final batch
            avg_acc += np.mean(out_acc) * pred.shape[0]
            avg_fp += np.mean(out_fp) * pred.shape[0]
            avg_fn += np.mean(out_fn) * pred.shape[0]
            num_images += pred.shape[0]

            speed = 1.0 / timer.elapsed_time()
            print(
                "[EVAL]step={} accuracy={:.4f} fp={:.4f} fn={:.4f} step/sec={:.2f} | ETA {}"
                .format(step, avg_acc / num_images, avg_fp / num_images,
                        avg_fn / num_images, speed,
                        calculate_eta(all_step - step, speed)))
            timer.restart()
            sys.stdout.flush()
        except fluid.core.EOFException:
            break

    print("[EVAL]#image={} accuracy={:.4f} fp={:.4f} fn={:.4f}".format(
        num_images, avg_acc / num_images, avg_fp / num_images,
        avg_fn / num_images))

    return avg_acc / num_images, avg_fp / num_images, avg_fn / num_images
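
# Usage sketch (not from the original script): a minimal way to run `evaluate`
# standalone. `cfg` is assumed to be the repo's global SegConfig object and
# `update_from_file` its config loader; the YAML path is hypothetical.
#
#     from utils.config import cfg
#     cfg.update_from_file('configs/lanenet.yaml')  # hypothetical path
#     acc, fp, fn = evaluate(cfg, use_gpu=True, use_mpio=False)
#     print('accuracy={:.4f} fp={:.4f} fn={:.4f}'.format(acc, fp, fn))
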
def train(cfg):
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    drop_last = True

    dataset = LaneNetDataset(
        file_list=cfg.DATASET.TRAIN_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        # Group samples into whole per-trainer batches before yielding
        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item
                batch_data = []

    # Get device environment
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPUs
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE is divisible by the number of GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisible by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))

    # In multi-GPU training, the batch is split evenly across devices,
    # e.g. BATCH_SIZE=16 on 4 GPUs gives batch_size_per_dev=4
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    cfg.BATCH_SIZE_PER_DEV = batch_size_per_dev
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    data_loader, avg_loss, lr, pred, grts, masks, emb_loss, seg_loss, accuracy, fp, fn = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    data_loader.set_sample_generator(
        data_generator, batch_size=batch_size_per_dev, drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iterations
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply the sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info("Sync BatchNorm strategy will not be effective if GPU"
                       " device count <= 1")

    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load a pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, train_prog,
                                cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info('Pretrained model dir {} does not exist; training from'
                   ' scratch...'.format(cfg.TRAIN.PRETRAINED_MODEL_DIR))
    fetch_list = [
        avg_loss.name, lr.name, seg_loss.name, emb_loss.name, accuracy.name,
        fp.name, fn.name
    ]

    if args.debug:
        # Fetch more variables and use a streaming confusion matrix to
        # calculate IoU results when in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and not drop_last:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    # Running means; note this rebinds avg_loss from a fluid Variable to a
    # float, which is safe because fetch_list captured the names above
    avg_loss = 0.0
    avg_seg_loss = 0.0
    avg_emb_loss = 0.0
    avg_acc = 0.0
    avg_fp = 0.0
    avg_fn = 0.0
    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]".format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                # Unpack only the first seven fetches so the extra debug
                # fetches (pred/grts/masks) do not break the unpacking
                results = exe.run(
                    program=compiled_train_prog,
                    fetch_list=fetch_list,
                    return_numpy=True)
                loss, lr, out_seg_loss, out_emb_loss, out_acc, out_fp, out_fn = results[:7]
                avg_loss += np.mean(np.array(loss))
                avg_seg_loss += np.mean(np.array(out_seg_loss))
                avg_emb_loss += np.mean(np.array(out_emb_loss))
                avg_acc += np.mean(out_acc)
                avg_fp += np.mean(out_fp)
                avg_fn += np.mean(out_fn)
                step += 1

                if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                    avg_loss /= args.log_steps
                    avg_seg_loss /= args.log_steps
                    avg_emb_loss /= args.log_steps
                    avg_acc /= args.log_steps
                    avg_fp /= args.log_steps
                    avg_fn /= args.log_steps
                    speed = args.log_steps / timer.elapsed_time()
                    print((
                        "epoch={} step={} lr={:.5f} loss={:.4f} seg_loss={:.4f}"
                        " emb_loss={:.4f} accuracy={:.4f} fp={:.4f} fn={:.4f}"
                        " step/sec={:.3f} | ETA {}"
                    ).format(epoch, step, lr[0], avg_loss, avg_seg_loss,
                             avg_emb_loss, avg_acc, avg_fp, avg_fn, speed,
                             calculate_eta(all_step - step, speed)))
                    if args.use_vdl:
                        log_writer.add_scalar('Train/loss', avg_loss, step)
                        log_writer.add_scalar('Train/lr', lr[0], step)
                        log_writer.add_scalar('Train/speed', speed, step)
                    sys.stdout.flush()
                    avg_loss = 0.0
                    avg_seg_loss = 0.0
                    avg_emb_loss = 0.0
                    avg_acc = 0.0
                    avg_fp = 0.0
                    avg_fn = 0.0
                    timer.restart()
            except fluid.core.EOFException:
                data_loader.reset()
                break
            except Exception as e:
                print(e)

        if epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(exe, train_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                accuracy, fp, fn = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/accuracy', accuracy, step)
                    log_writer.add_scalar('Evaluate/fp', fp, step)
                    log_writer.add_scalar('Evaluate/fn', fn, step)

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)

    # Save the final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(exe, train_prog, 'final')
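
# Entry-point sketch (assumption: the original file defines `parse_args` and
# imports `cfg` elsewhere; this shows how a driver might wire them together,
# not the repo's confirmed main block).
#
# if __name__ == '__main__':
#     args = parse_args()                      # assumed argparse helper
#     if args.cfg_file is not None:
#         cfg.update_from_file(args.cfg_file)  # assumed SegConfig loader
#     cfg.check_and_infer()                    # assumed config validation
#     train(cfg)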