def __init__(self, cfg):
    use_mixed_precision = cfg.dtype == "float16"
    if use_mixed_precision:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
        tf.keras.mixed_precision.experimental.set_policy(policy)

    self.train_dataset = build_dataset(**cfg.train.dataset.as_dict())
    self.val_dataset = build_dataset(**cfg.val.dataset.as_dict())
    self.model = build_model(**cfg.model.as_dict())

    optimizer = build_optimizer(**cfg.optimizer.as_dict())
    if cfg.lookahead:
        optimizer = LookaheadOptimizer(optimizer, cfg.lookahead.steps,
                                       cfg.lookahead.alpha)
    if use_mixed_precision:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer=optimizer, loss_scale="dynamic")

    self.loss_fn = build_loss(**cfg.loss.as_dict())
    self.optimizer = optimizer
    self.use_mixed_precision = use_mixed_precision
    self.cfg = cfg

    self.total_train_steps = cfg.learning_rate_scheduler.train_steps
    self.learning_rate_scheduler = build_learning_rate_scheduler(
        **cfg.learning_rate_scheduler.as_dict())

    self.global_step = tf.Variable(initial_value=0,
                                   trainable=False,
                                   name="global_step",
                                   dtype=tf.int64)
    self.learning_rate = tf.Variable(initial_value=0,
                                     trainable=False,
                                     name="learning_rate",
                                     dtype=tf.float32)

    self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                          model=self.model)
    self.manager = tf.train.CheckpointManager(checkpoint=self.checkpoint,
                                              directory=cfg.checkpoint_dir,
                                              max_to_keep=10)

    # Optionally warm-start from pretrained weights before restoring a checkpoint.
    if os.path.exists(cfg.pretrained_weights_path):
        self.model.load_weights(cfg.pretrained_weights_path,
                                by_name=True,
                                skip_mismatch=True)

    # Resume from the latest checkpoint if one exists; the step count is
    # recovered from the checkpoint file name.
    latest_checkpoint = self.manager.latest_checkpoint
    if latest_checkpoint is not None:
        try:
            steps = int(latest_checkpoint.split("-")[-1])
            self.global_step.assign(steps)
        except ValueError:
            self.global_step.assign(0)
        self.checkpoint.restore(latest_checkpoint)
        tf.print(_time_to_string(),
                 "Restored weights from %s." % latest_checkpoint)
    else:
        self.global_step.assign(0)

    self.summary_writer = tf.summary.create_file_writer(logdir=cfg.summary_dir)
    self.log_every_n_steps = cfg.log_every_n_steps
    self.save_ckpt_steps = cfg.save_ckpt_steps
    self.use_jit = tf.config.optimizer.get_jit() is not None

    self.training_loss_metrics = {}
    self.val_loss_metrics = {}
    self.train_acc_metric = tf.keras.metrics.Accuracy()
    self.train_auc_metric = tf.keras.metrics.AUC()
    self.val_acc_metric = tf.keras.metrics.Accuracy()
    self.val_auc_metric = tf.keras.metrics.AUC()
    self._add_graph = True
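
# A dynamic-loss-scale optimizer only helps if the train step scales the loss
# before backprop and unscales the gradients before applying them. The method
# below is a minimal sketch of such a step, not part of the original trainer:
# the name `_train_step` and the (images, labels) batch layout are assumptions,
# and it relies on the LossScaleOptimizer's get_scaled_loss /
# get_unscaled_gradients methods.
def _train_step(self, images, labels):
    with tf.GradientTape() as tape:
        predictions = self.model(images, training=True)
        loss = self.loss_fn(labels, predictions)
        if self.use_mixed_precision:
            # Scale the loss so that float16 gradients do not underflow.
            loss_to_optimize = self.optimizer.get_scaled_loss(loss)
        else:
            loss_to_optimize = loss
    gradients = tape.gradient(loss_to_optimize, self.model.trainable_variables)
    if self.use_mixed_precision:
        # Undo the loss scaling before the weight update.
        gradients = self.optimizer.get_unscaled_gradients(gradients)
    self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
    self.global_step.assign_add(1)
    return loss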
def train(cfg_file=None,
          model_dir=None,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=False):
    model_dir = Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    shutil.copyfile(cfg_file, str(model_dir / config_file_bkp))
    config = cfg_from_yaml_file(cfg_file, cfg)
    input_cfg = config.TRAIN_INPUT_READER
    eval_input_cfg = config.EVAL_INPUT_READER
    model_cfg = config.MODEL
    train_cfg = config.TRAIN_CONFIG
    class_names = config.CLASS_NAMES

    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = core.build_voxel_generator(config.VOXEL_GENERATOR)

    ######################
    # BUILD TARGET ASSIGNER
    ######################
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = core.build_box_coder(config.BOX_CODER)
    target_assigner_cfg = config.TARGET_ASSIGNER
    target_assigner = core.build_target_assigner(target_assigner_cfg,
                                                 bv_range, box_coder)

    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.POST_PROCESSING.post_center_limit_range
    net = models.build_network(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)

    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    libs.tools.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.OPTIMIZER
    if train_cfg.ENABLE_MIXED_PRECISION:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = core.build_optimizer(optimizer_cfg, net.parameters())
    if train_cfg.ENABLE_MIXED_PRECISION:
        loss_scale = train_cfg.LOSS_SCALE_FACTOR
        mixed_optimizer = libs.tools.MixedPrecisionWrapper(optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER wrapping it with MixedPrecisionWrapper
    libs.tools.try_restore_latest_checkpoints(model_dir, [mixed_optimizer])
    lr_scheduler = core.build_lr_schedules(optimizer_cfg, optimizer, gstep)
    if train_cfg.ENABLE_MIXED_PRECISION:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    ######################
    # PREPARE INPUT
    ######################
    dataset = core.build_input_reader(input_cfg,
                                      model_cfg,
                                      training=True,
                                      voxel_generator=voxel_generator,
                                      target_assigner=target_assigner)
    eval_dataset = core.build_input_reader(eval_input_cfg,
                                           model_cfg,
                                           training=False,
                                           voxel_generator=voxel_generator,
                                           target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # Give each dataloader worker its own time-based seed.
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=input_cfg.NUM_WORKERS,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.BATCH_SIZE,
        shuffle=False,
        num_workers=eval_input_cfg.NUM_WORKERS,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)

    ######################
    # TRAINING
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    # logf.write(proto_str)
logf.write("\n") total_step_elapsed = 0 remain_steps = train_cfg.STEPS - net.get_global_step() t = time.time() ckpt_start_time = t #total_loop = train_cfg.STEPS // train_cfg.STEPS_PER_EVAL + 1 total_loop = remain_steps // train_cfg.STEPS_PER_EVAL + 1 clear_metrics_every_epoch = train_cfg.CLEAR_METRICS_EVERY_EPOCH if train_cfg.STEPS % train_cfg.STEPS_PER_EVAL == 0: total_loop -= 1 mixed_optimizer.zero_grad() try: for _ in range(total_loop): if total_step_elapsed + train_cfg.STEPS_PER_EVAL > train_cfg.STEPS: steps = train_cfg.STEPS % train_cfg.STEPS_PER_EVAL else: steps = train_cfg.STEPS_PER_EVAL for step in range(steps): lr_scheduler.step() try: example = next(data_iter) except StopIteration: print("end epoch") if clear_metrics_every_epoch: net.clear_metrics() data_iter = iter(dataloader) example = next(data_iter) example_torch = example_convert_to_torch(example, float_dtype, device="cuda:0") batch_size = example["anchors"].shape[0] ret_dict = net(example_torch) # box_preds = ret_dict["box_preds"] cls_preds = ret_dict["cls_preds"] loss = ret_dict["loss"].mean() cls_loss_reduced = ret_dict["cls_loss_reduced"].mean() loc_loss_reduced = ret_dict["loc_loss_reduced"].mean() cls_pos_loss = ret_dict["cls_pos_loss"] cls_neg_loss = ret_dict["cls_neg_loss"] loc_loss = ret_dict["loc_loss"] cls_loss = ret_dict["cls_loss"] dir_loss_reduced = ret_dict["dir_loss_reduced"] cared = ret_dict["cared"] labels = example_torch["labels"] if train_cfg.ENABLE_MIXED_PRECISION: loss *= loss_scale loss.backward() torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0) mixed_optimizer.step() mixed_optimizer.zero_grad() net.update_global_step() net_metrics = net.update_metrics(cls_loss_reduced, loc_loss_reduced, cls_preds, labels, cared) step_time = (time.time() - t) t = time.time() metrics = {} num_pos = int((labels > 0)[0].float().sum().cpu().numpy()) num_neg = int((labels == 0)[0].float().sum().cpu().numpy()) if 'anchors_mask' not in example_torch: num_anchors = example_torch['anchors'].shape[1] else: num_anchors = int(example_torch['anchors_mask'][0].sum()) global_step = net.get_global_step() if global_step % display_step == 0: loc_loss_elem = [ float(loc_loss[:, :, i].sum().detach().cpu().numpy() / batch_size) for i in range(loc_loss.shape[-1]) ] metrics["step"] = global_step metrics["steptime"] = step_time metrics.update(net_metrics) metrics["loss"] = {} metrics["loss"]["loc_elem"] = loc_loss_elem metrics["loss"]["cls_pos_rt"] = float( cls_pos_loss.detach().cpu().numpy()) metrics["loss"]["cls_neg_rt"] = float( cls_neg_loss.detach().cpu().numpy()) if model_cfg.BACKBONE.use_direction_classifier: metrics["loss"]["dir_rt"] = float( dir_loss_reduced.detach().cpu().numpy()) metrics["num_vox"] = int(example_torch["voxels"].shape[0]) metrics["num_pos"] = int(num_pos) metrics["num_neg"] = int(num_neg) metrics["num_anchors"] = int(num_anchors) metrics["lr"] = float( mixed_optimizer.param_groups[0]['lr']) metrics["image_idx"] = example['image_idx'][0] flatted_metrics = flat_nested_json_dict(metrics) flatted_summarys = flat_nested_json_dict(metrics, "/") # for k,v in flatted_summarys.items(): # if isinstance(v,(list,tuple)): # v = {str(i): e for i,e in enumerate(v)} # writer.add_scalars(k,v,global_step) # else: # writer.add_scalars(k,v,global_step) metrics_str_list = [] for k, v in flatted_metrics.items(): if isinstance(v, float): metrics_str_list.append(f"{k}={v:.3}") elif isinstance(v, (list, tuple)): if v and isinstance(v[0], float): v_str = ', '.join([f"{e:.3}" for e in v]) metrics_str_list.append(f"{k}=[{v_str}]") else: 
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)

                ckpt_elapsed_time = time.time() - ckpt_start_time
                if ckpt_elapsed_time > train_cfg.SAVE_CHECKPOINTS_SECS:
                    libs.tools.save_models(model_dir, [net, optimizer],
                                           net.get_global_step())
                    ckpt_start_time = time.time()

            total_step_elapsed += steps
            libs.tools.save_models(model_dir, [net, optimizer],
                                   net.get_global_step())
            # Ensure that all evaluation checkpoints are kept.
            libs.tools.save_models(eval_checkpoint_dir, [net, optimizer],
                                   net.get_global_step(),
                                   max_to_keep=100)

            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            prog_bar.start(len(eval_dataset) // eval_input_cfg.BATCH_SIZE + 1)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:
                    dt_annos += predict_kitti_to_anno(net, example, class_names,
                                                      center_limit_range,
                                                      model_cfg.LIDAR_INPUT)
                else:
                    _predict_kitti_to_file(net, example, result_path_step,
                                           class_names, center_limit_range,
                                           model_cfg.LIDAR_INPUT)
                prog_bar.print_bar()

            sec_per_ex = len(eval_dataset) / (time.time() - t)
            print(f"avg forward time per example: {net.avg_forward_time:.3f}")
            print(f"avg postprocess time per example: {net.avg_postprocess_time:.3f}")
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            result, mAPbbox, mAPbev, mAP3d, mAPaos = get_official_eval_result(
                gt_annos, dt_annos, class_names, return_data=True)
            print(result, file=logf)
            print(result)
            result = get_coco_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            net.train()
    except Exception as e:
        # Save the latest state before propagating the error.
        libs.tools.save_models(model_dir, [net, optimizer],
                               net.get_global_step())
        raise e
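
# Hypothetical usage sketch (not part of the original script): how train()
# might be invoked from an entry point. The config path and output directory
# below are placeholders, not values taken from this repository.
if __name__ == "__main__":
    train(cfg_file="configs/example_pipeline.yaml",
          model_dir="outputs/example_model",
          display_step=50,
          pickle_result=True)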
cfg.test.nms_type = args.nms_type
if args.nms_type in ["soft_nms", "matrix_nms"]:
    cfg.test.sigma = args.nms_sigma
else:
    cfg.override(args.config)

detector = build_detector(cfg.detector, return_loss=False, cfg=cfg)
images = tf.random.uniform(
    [1, cfg.train.input_size[0], cfg.train.input_size[1], 3])
images = tf.cast(images, tf.uint8)
detector(images)

if args.ckpt is not None and ".h5" in args.ckpt:
    detector.load_weights(args.ckpt)
else:
    optimizer = build_optimizer(**cfg.train.optimizer.as_dict())
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, detector=detector)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=cfg.train.checkpoint_dir,
                                         max_to_keep=10)
    latest_checkpoint = manager.latest_checkpoint
    checkpoint.restore(latest_checkpoint)

saved_model_dir = args.saved_model_dir or "./saved_model/" + args.detector
tf.saved_model.save(detector, saved_model_dir)
print("saved model to %s" % saved_model_dir)

# images = tf.random.uniform([1, cfg.train.input_size[0], cfg.train.input_size[1], 3])
# image_info = {"valid_size": tf.constant([[cfg.train.input_size[0], cfg.train.input_size[1]]]),
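
# A quick sanity check for the export above could look like the following
# sketch: reload the SavedModel and run it on a dummy uint8 image. Illustrative
# only; whether the loaded object is directly callable and what it returns
# depends on the detector's serving signature.
loaded = tf.saved_model.load(saved_model_dir)
dummy_images = tf.cast(
    tf.random.uniform(
        [1, cfg.train.input_size[0], cfg.train.input_size[1], 3], maxval=255),
    tf.uint8)
outputs = loaded(dummy_images)
print("SavedModel outputs:", outputs)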
def __init__(self, cfg, logger):
    self.logger = logger
    use_mixed_precision = cfg.dtype in ["float16", "FP16"]
    if use_mixed_precision:
        tf.keras.mixed_precision.set_global_policy("mixed_float16")
        print("Using mixed precision training.")

    self.train_dataset = build_dataset(
        dtype=tf.float16 if use_mixed_precision else tf.float32,
        **cfg.train.dataset.as_dict())
    self.val_dataset = build_dataset(
        dtype=tf.float16 if use_mixed_precision else tf.float32,
        **cfg.val.dataset.as_dict())

    if cfg.train.get("proposal_layer"):
        self.detector = build_detector(cfg.detector,
                                       cfg=cfg,
                                       proposal_cfg=cfg.train.proposal_layer)
    else:
        self.detector = build_detector(cfg.detector, cfg=cfg)
    self.detector.load_pretrained_weights(cfg.train.pretrained_weights_path)

    train_steps_per_epoch = (cfg.train.dataset.num_samples //
                             cfg.train.dataset.batch_size)
    self.total_train_steps = cfg.train.scheduler.train_epochs * train_steps_per_epoch
    self.warmup_steps = cfg.train.scheduler.warmup.steps
    self.warmup_learning_rate = cfg.train.scheduler.warmup.warmup_learning_rate
    self.learning_rate_scheduler = build_learning_rate_scheduler(
        **cfg.train.scheduler.learning_rate_scheduler.as_dict(),
        train_steps=self.total_train_steps,
        warmup_steps=self.warmup_steps,
        train_steps_per_epoch=train_steps_per_epoch)

    optimizer = build_optimizer(learning_rate=self.learning_rate_scheduler,
                                **cfg.train.optimizer.as_dict())
    if use_mixed_precision:
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer,
                                                                dynamic=True)
        self.logger.info("Using mixed precision training.")

    self.optimizer = optimizer
    self.use_mixed_precision = use_mixed_precision
    self.cfg = cfg

    self.global_step = tf.Variable(initial_value=0,
                                   trainable=False,
                                   name="global_step",
                                   dtype=tf.int64)
    self.val_steps = tf.Variable(0,
                                 trainable=False,
                                 name="val_steps",
                                 dtype=tf.int64)

    self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                          detector=self.detector.detector,
                                          global_step=self.global_step,
                                          val_steps=self.val_steps)
    self.manager = tf.train.CheckpointManager(checkpoint=self.checkpoint,
                                              directory=cfg.train.checkpoint_dir,
                                              max_to_keep=10)

    self.epochs = 0
    latest_checkpoint = self.manager.latest_checkpoint
    if latest_checkpoint is not None:
        # try:
        #     steps = int(latest_checkpoint.split("-")[-1])
        #     self.global_step.assign(steps)
        #     self.epochs = steps // train_steps_per_epoch
        # except:
        #     self.global_step.assign(0)
        self.checkpoint.restore(latest_checkpoint)
        self.logger.info("Restored weights from %s.", latest_checkpoint)
    else:
        self.global_step.assign(0)

    self.summary_writer = tf.summary.create_file_writer(
        logdir=cfg.train.summary_dir)
    self.log_every_n_steps = cfg.train.log_every_n_steps
    self.use_jit = tf.config.optimizer.get_jit() is not None

    self.train_loss_metrics = {
        "l2_loss": tf.keras.metrics.Mean(),
        "loss": tf.keras.metrics.Mean()
    }
    self.val_loss_metrics = {
        "l2_loss": tf.keras.metrics.Mean(),
        "loss": tf.keras.metrics.Mean()
    }
    self.ap_metric = None
    self._add_graph = True
    self.ap_metric = metrics.mAP(self.cfg.num_classes)
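
# Checkpointing sketch (an assumed helper, not in the original class): since
# global_step and val_steps live inside the tf.train.Checkpoint above, a
# periodic save via the CheckpointManager is enough to make training resumable.
def _save_checkpoint(self):
    path = self.manager.save(checkpoint_number=self.global_step.numpy())
    self.logger.info("Saved checkpoint to %s.", path)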