    def __init__(self, cfg):
        use_mixed_precision = cfg.dtype == "float16"
        if use_mixed_precision:
            policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
            tf.keras.mixed_precision.experimental.set_policy(policy)

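        # Build the train/val datasets, model, optimizer, and loss from the config.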
        self.train_dataset = build_dataset(**cfg.train.dataset.as_dict())
        self.val_dataset = build_dataset(**cfg.val.dataset.as_dict())

        self.model = build_model(**cfg.model.as_dict())

        optimizer = build_optimizer(**cfg.optimizer.as_dict())
    
        if cfg.lookahead:
            optimizer = LookaheadOptimizer(optimizer, cfg.lookahead.steps, cfg.lookahead.alpha) 

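        # Wrap the optimizer with dynamic loss scaling so float16 gradients do not underflow.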
        if use_mixed_precision:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer=optimizer, loss_scale="dynamic")
        
        self.loss_fn = build_loss(**cfg.loss.as_dict())

        self.optimizer = optimizer
        self.use_mixed_precision = use_mixed_precision
        self.cfg = cfg

        self.total_train_steps = cfg.learning_rate_scheduler.train_steps
        self.learning_rate_scheduler = build_learning_rate_scheduler(
            **cfg.learning_rate_scheduler.as_dict())

        self.global_step = tf.Variable(initial_value=0,
                                       trainable=False,
                                       name="global_step",
                                       dtype=tf.int64)

        self.learning_rate = tf.Variable(initial_value=0,
                                         trainable=False,
                                         name="learning_rate",
                                         dtype=tf.float32)
       
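        # Checkpointing: load pretrained weights first, then resume from the latest checkpoint if present.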
        self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model)
        self.manager = tf.train.CheckpointManager(checkpoint=self.checkpoint,
                                                  directory=cfg.checkpoint_dir,
                                                  max_to_keep=10)
        if os.path.exists(cfg.pretrained_weights_path):
            self.model.load_weights(cfg.pretrained_weights_path, by_name=True, skip_mismatch=True)

        latest_checkpoint = self.manager.latest_checkpoint
        if latest_checkpoint is not None:
            try:
                steps = int(latest_checkpoint.split("-")[-1])
                self.global_step.assign(steps)
            except ValueError:  # checkpoint name did not end with a step number
                self.global_step.assign(0)
            self.checkpoint.restore(latest_checkpoint)
            tf.print(_time_to_string(), "Restored weights from %s." % latest_checkpoint)
        else:
            self.global_step.assign(0)

        self.summary_writer = tf.summary.create_file_writer(logdir=cfg.summary_dir)
        self.log_every_n_steps = cfg.log_every_n_steps
        self.save_ckpt_steps = cfg.save_ckpt_steps
        self.use_jit = tf.config.optimizer.get_jit() is not None

        self.training_loss_metrics = {}
        self.val_loss_metrics = {}

        self.train_acc_metric = tf.keras.metrics.Accuracy() 
        self.train_auc_metric = tf.keras.metrics.AUC()
        self.val_acc_metric = tf.keras.metrics.Accuracy() 
        self.val_auc_metric = tf.keras.metrics.AUC()
        self._add_graph = True
Example #2
def train(cfg_file=None,
          model_dir=None,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=False):
    model_dir = Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    shutil.copyfile(cfg_file, str(model_dir / config_file_bkp))

    config = cfg_from_yaml_file(cfg_file, cfg)
    input_cfg = config.TRAIN_INPUT_READER
    eval_input_cfg = config.EVAL_INPUT_READER
    model_cfg = config.MODEL
    train_cfg = config.TRAIN_CONFIG
    class_names = config.CLASS_NAMES
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = core.build_voxel_generator(config.VOXEL_GENERATOR)
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = core.build_box_coder(config.BOX_CODER)
    target_assigner_cfg = config.TARGET_ASSIGNER
    target_assigner = core.build_target_assigner(target_assigner_cfg, bv_range,
                                                 box_coder)
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.POST_PROCESSING.post_center_limit_range
    net = models.build_network(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)

    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    libs.tools.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.OPTIMIZER
    if train_cfg.ENABLE_MIXED_PRECISION:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = core.build_optimizer(optimizer_cfg, net.parameters())
    if train_cfg.ENABLE_MIXED_PRECISION:
        loss_scale = train_cfg.LOSS_SCALE_FACTOR
        mixed_optimizer = libs.tools.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer

    # must restore the optimizer AFTER wrapping it with MixedPrecisionWrapper
    libs.tools.try_restore_latest_checkpoints(model_dir, [mixed_optimizer])
    lr_scheduler = core.build_lr_schedules(optimizer_cfg, optimizer, gstep)
    if train_cfg.ENABLE_MIXED_PRECISION:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = core.build_input_reader(input_cfg,
                                      model_cfg,
                                      training=True,
                                      voxel_generator=voxel_generator,
                                      target_assigner=target_assigner)
    eval_dataset = core.build_input_reader(input_cfg,
                                           model_cfg,
                                           training=False,
                                           voxel_generator=voxel_generator,
                                           target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=input_cfg.NUM_WORKERS,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.BATCH_SIZE,
        shuffle=False,
        num_workers=eval_input_cfg.NUM_WORKERS,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)

    ######################
    # TRAINING
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    # logf.write(proto_str)
    logf.write("\n")

    total_step_elapsed = 0
    remain_steps = train_cfg.STEPS - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    #total_loop = train_cfg.STEPS // train_cfg.STEPS_PER_EVAL + 1
    total_loop = remain_steps // train_cfg.STEPS_PER_EVAL + 1
    clear_metrics_every_epoch = train_cfg.CLEAR_METRICS_EVERY_EPOCH

    if train_cfg.STEPS % train_cfg.STEPS_PER_EVAL == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
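    # Outer loop: train for STEPS_PER_EVAL steps, then run a full evaluation pass.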
    try:

        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.STEPS_PER_EVAL > train_cfg.STEPS:
                steps = train_cfg.STEPS % train_cfg.STEPS_PER_EVAL
            else:
                steps = train_cfg.STEPS_PER_EVAL

            for step in range(steps):

                lr_scheduler.step()
                try:
                    example = next(data_iter)
                except StopIteration:
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example,
                                                         float_dtype,
                                                         device="cuda:0")
                batch_size = example["anchors"].shape[0]

                ret_dict = net(example_torch)

                # box_preds = ret_dict["box_preds"]
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"]
                cls_neg_loss = ret_dict["cls_neg_loss"]
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                dir_loss_reduced = ret_dict["dir_loss_reduced"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
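                # Scale the loss for float16 training; the MixedPrecisionWrapper is
                # expected to undo this scaling on the gradients before the update.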
                if train_cfg.ENABLE_MIXED_PRECISION:
                    loss *= loss_scale

                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())

                    if model_cfg.BACKBONE.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])

                    metrics["image_idx"] = example['image_idx'][0]
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    # for k,v in flatted_summarys.items():
                    #     if isinstance(v,(list,tuple)):
                    #         v = {str(i): e for i,e in enumerate(v)}
                    #         writer.add_scalars(k,v,global_step)
                    #     else:
                    #         writer.add_scalars(k,v,global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                ckpt_elapsed_time = time.time() - ckpt_start_time
                if ckpt_elapsed_time > train_cfg.SAVE_CHECKPOINTS_SECS:
                    libs.tools.save_models(model_dir, [net, optimizer],
                                           net.get_global_step())
                    ckpt_start_time = time.time()

            total_step_elapsed += steps
            libs.tools.save_models(model_dir, [net, optimizer],
                                   net.get_global_step())
            # Ensure that all evaluation points are saved forever
            libs.tools.save_models(eval_checkpoint_dir, [net, optimizer],
                                   net.get_global_step(),
                                   max_to_keep=100)

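            # Evaluation: switch to eval mode, dump predictions, and compute KITTI metrics.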
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            prog_bar.start(len(eval_dataset) // eval_input_cfg.BATCH_SIZE + 1)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:
                    dt_annos += predict_kitti_to_anno(net, example,
                                                      class_names,
                                                      center_limit_range,
                                                      model_cfg.LIDAR_INPUT)
                else:
                    _predict_kitti_to_file(net, example, result_path_step,
                                           class_names, center_limit_range,
                                           model_cfg.LIDAR_INPUT)
                prog_bar.print_bar()
            examples_per_sec = len(eval_dataset) / (time.time() - t)
            print(f"avg forward time per example: {net.avg_forward_time:.3f}")
            print(
                f"avg postprocess time per example: {net.avg_postprocess_time:.3f}"
            )
            print(f'generate label finished({examples_per_sec:.2f}/s). start eval:')
            print(f'generate label finished({examples_per_sec:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            # `result` must exist whether predictions were kept in memory
            # (pickle_result) or read back from disk, so evaluate outside the branch.
            result, mAPbbox, mAPbev, mAP3d, mAPaos = get_official_eval_result(
                gt_annos, dt_annos, class_names, return_data=True)
            print(result, file=logf)
            print(result)

            result = get_coco_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            net.train()

    except Exception:
        # save a checkpoint before re-raising so progress is not lost
        libs.tools.save_models(model_dir, [net, optimizer],
                               net.get_global_step())
        raise
Example #3
        cfg.test.nms_type = args.nms_type
        if args.nms_type in ["soft_nms", "matrix_nms"]:
            cfg.test.sigma = args.nms_sigma
else:
    cfg.override(args.config)

detector = build_detector(cfg.detector, return_loss=False, cfg=cfg)
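# Run a dummy forward pass so the detector's variables are created before loading weights.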
images = tf.random.uniform(
    [1, cfg.train.input_size[0], cfg.train.input_size[1], 3])
images = tf.cast(images, tf.uint8)
detector(images)

if args.ckpt is not None and ".h5" in args.ckpt:
    detector.load_weights(args.ckpt)
else:
    optimizer = build_optimizer(**cfg.train.optimizer.as_dict())

    checkpoint = tf.train.Checkpoint(optimizer=optimizer, detector=detector)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=cfg.train.checkpoint_dir,
                                         max_to_keep=10)
    latest_checkpoint = manager.latest_checkpoint
    checkpoint.restore(latest_checkpoint)

saved_model_dir = args.saved_model_dir or "./saved_model/" + args.detector

tf.saved_model.save(detector, saved_model_dir)
print("saved model to %s" % saved_model_dir)

# images = tf.random.uniform([1, cfg.train.input_size[0], cfg.train.input_size[1], 3])
# image_info = {"valid_size": tf.constant([[cfg.train.input_size[0], cfg.train.input_size[1]]]),
Example #4
    def __init__(self, cfg, logger):
        self.logger = logger
        use_mixed_precision = cfg.dtype in ["float16", "FP16"]
        if use_mixed_precision:
            tf.keras.mixed_precision.set_global_policy("mixed_float16")
            print("Using mixed precision training.")

        self.train_dataset = build_dataset(
            dtype=tf.float16 if use_mixed_precision else tf.float32,
            **cfg.train.dataset.as_dict())
        self.val_dataset = build_dataset(
            dtype=tf.float16 if use_mixed_precision else tf.float32,
            **cfg.val.dataset.as_dict())

        if cfg.train.get("proposal_layer"):
            self.detector = build_detector(
                cfg.detector, cfg=cfg, proposal_cfg=cfg.train.proposal_layer)
        else:
            self.detector = build_detector(cfg.detector, cfg=cfg)

        self.detector.load_pretrained_weights(
            cfg.train.pretrained_weights_path)

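        # Derive the training schedule (total steps, warmup) from the dataset size and epoch count.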
        train_steps_per_epoch = cfg.train.dataset.num_samples // cfg.train.dataset.batch_size
        self.total_train_steps = cfg.train.scheduler.train_epochs * train_steps_per_epoch
        self.warmup_steps = cfg.train.scheduler.warmup.steps
        self.warmup_learning_rate = cfg.train.scheduler.warmup.warmup_learning_rate
        self.learning_rate_scheduler = build_learning_rate_scheduler(
            **cfg.train.scheduler.learning_rate_scheduler.as_dict(),
            train_steps=self.total_train_steps,
            warmup_steps=self.warmup_steps,
            train_steps_per_epoch=train_steps_per_epoch)

        optimizer = build_optimizer(learning_rate=self.learning_rate_scheduler,
                                    **cfg.train.optimizer.as_dict())

        if use_mixed_precision:
            optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
                optimizer, dynamic=True)
            self.logger.info("Using mixed precision training.")

        self.optimizer = optimizer
        self.use_mixed_precision = use_mixed_precision
        self.cfg = cfg

        self.global_step = tf.Variable(initial_value=0,
                                       trainable=False,
                                       name="global_step",
                                       dtype=tf.int64)

        self.val_steps = tf.Variable(0,
                                     trainable=False,
                                     name="val_steps",
                                     dtype=tf.int64)

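        # Track the optimizer, detector weights, and step counters in a single checkpoint.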
        self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                              detector=self.detector.detector,
                                              global_step=self.global_step,
                                              val_steps=self.val_steps)
        self.manager = tf.train.CheckpointManager(
            checkpoint=self.checkpoint,
            directory=cfg.train.checkpoint_dir,
            max_to_keep=10)

        self.epochs = 0
        latest_checkpoint = self.manager.latest_checkpoint
        if latest_checkpoint is not None:
            # try:
            #     steps = int(latest_checkpoint.split("-")[-1])
            #     self.global_step.assign(steps)
            #     self.epochs = steps // train_steps_per_epoch
            # except:
            #     self.global_step.assign(0)
            self.checkpoint.restore(latest_checkpoint)
            self.logger.info("Restored weights from %s.", latest_checkpoint)
        else:
            self.global_step.assign(0)

        self.summary_writer = tf.summary.create_file_writer(
            logdir=cfg.train.summary_dir)
        self.log_every_n_steps = cfg.train.log_every_n_steps
        self.use_jit = tf.config.optimizer.get_jit() is not None

        self.train_loss_metrics = {
            "l2_loss": tf.keras.metrics.Mean(),
            "loss": tf.keras.metrics.Mean()
        }
        self.val_loss_metrics = {
            "l2_loss": tf.keras.metrics.Mean(),
            "loss": tf.keras.metrics.Mean()
        }
        self._add_graph = True
        self.ap_metric = metrics.mAP(self.cfg.num_classes)