def _init_net(self):
    """Parse the pipeline config, build the SECOND network, and restore
    the latest checkpoint found in ``self.model_dir``."""
    # Load and parse the protobuf pipeline configuration.
    self.config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(self.config_f, "r") as cfg_file:
        text_format.Merge(cfg_file.read(), self.config)

    # Cache the sub-configs used throughout inference.
    self.input_cfg = self.config.eval_input_reader
    self.model_cfg = self.config.model.second
    self.train_cfg = self.config.train_config
    self.class_names = list(self.input_cfg.class_names)
    self.center_limit_range = self.model_cfg.post_center_limit_range

    # Voxel generator + target assigner; anchors are generated over the
    # BEV x/y extent of the point-cloud range (indices 0, 1, 3, 4).
    vox_gen = voxel_builder.build(self.model_cfg.voxel_generator)
    bev_range = vox_gen.point_cloud_range[[0, 1, 3, 4]]
    coder = box_coder_builder.build(self.model_cfg.box_coder)
    self.target_assigner = target_assigner_builder.build(
        self.model_cfg.target_assigner, bev_range, coder)

    # Build the network and optionally convert it to fp16.
    self.net = second_builder.build(self.model_cfg, vox_gen,
                                    self.target_assigner)
    self.net.cuda()
    if self.train_cfg.enable_mixed_precision:
        self.net.half()
        self.net.metrics_to_float()
        self.net.convert_norm_to_float(self.net)

    torchplus.train.try_restore_latest_checkpoints(self.model_dir,
                                                   [self.net])
    print('Success load latest checkpoint in {}'.format(self.model_dir))
def _build(self):
    """Construct the eval-mode network from ``self.config`` using the
    frontal-view (FV) generator variant."""
    config = self.config
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    batch_size = 1

    # FV generator replaces the plain voxel generator in this variant;
    # keep the BEV extent of its cartesian range and its FV dimensions.
    fv_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = fv_generator.cartesian_coord_range[[0, 1, 3, 4]]
    fv_dim = fv_generator.fv_dim
    self.fv_generator = fv_generator
    vfe_num_filters = list(model_cfg.voxel_feature_extractor.num_filters)

    # Overall stride of the RPN head output relative to its input.
    out_size_factor = (model_cfg.rpn.layer_strides[0]
                       // model_cfg.rpn.upsample_strides[0])

    self.net = second_builder.build(model_cfg, fv_generator,
                                    RGB_embedding=self.RGB_embedding)
    self.net.cuda().eval()
    if train_cfg.enable_mixed_precision:
        self.net.half()
        self.net.metrics_to_float()
        self.net.convert_norm_to_float(self.net)

    # Head output size with a trailing singleton dim, reversed to
    # match the anchor generator's expected axis order.
    feature_map_size = fv_dim[:2] // out_size_factor
    feature_map_size = [*feature_map_size, 1][::-1]
def build_network(model_cfg, measure_time=False):
    """Assemble a SECOND detection network from a model config.

    Args:
        model_cfg: the ``model.second`` section of the pipeline config.
        measure_time: enable timing instrumentation in the network.

    Returns:
        The constructed network (not moved to GPU here).
    """
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # BEV x/y extent of the point-cloud range, used for anchor generation.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner = target_assigner_builder.build(
        model_cfg.target_assigner, bv_range, box_coder)
    return second_builder.build(
        model_cfg, voxel_generator, target_assigner,
        measure_time=measure_time)
def build_network(self):
    """Build the SECOND network described by ``self.config``."""
    model_cfg = self.config.model.second
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # BEV x/y extent of the point-cloud range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner = target_assigner_builder.build(
        model_cfg.target_assigner, bv_range, box_coder)
    # Propagate the anchors' extra (custom) dims into the box coder.
    box_coder.custom_ndim = target_assigner._anchor_generators[0].custom_ndim
    return second_builder.build(model_cfg, voxel_generator, target_assigner,
                                measure_time=False)
def build_network(model_cfg, measure_time=False):
    """Build a SECOND network from *model_cfg*."""
    # Voxelization setup.
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    # NOTE(review): the full model_cfg (not model_cfg.target_assigner) is
    # passed to the assigner builder here, unlike the sibling builders —
    # confirm this builder variant expects the whole config.
    target_assigner = target_assigner_builder.build(model_cfg, bv_range,
                                                    box_coder)
    # Model assembly.
    return second_builder.build(model_cfg, voxel_generator, target_assigner,
                                measure_time=measure_time)
def _build(self):
    """Build the eval-mode network from ``self.config`` and precompute
    the anchor cache for the full feature map."""
    config = self.config
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    batch_size = 1
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # BEV x/y extent of the point-cloud range, used for anchor generation.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    grid_size = voxel_generator.grid_size
    self.voxel_generator = voxel_generator
    vfe_num_filters = list(model_cfg.voxel_feature_extractor.num_filters)
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(
        target_assigner_cfg, bv_range, box_coder)
    self.target_assigner = target_assigner
    # Net output stride = RPN stride / upsample stride, scaled by the
    # middle extractor's own downsampling; must end up a positive int.
    out_size_factor = model_cfg.rpn.layer_strides[
        0] / model_cfg.rpn.upsample_strides[0]
    out_size_factor *= model_cfg.middle_feature_extractor.downsample_factor
    out_size_factor = int(out_size_factor)
    assert out_size_factor > 0
    self.net = second_builder.build(model_cfg, voxel_generator,
                                    target_assigner)
    self.net.cuda().eval()
    if train_cfg.enable_mixed_precision:
        self.net.half()
        self.net.metrics_to_float()
        self.net.convert_norm_to_float(self.net)
    # Feature-map size with a trailing singleton dim, reversed to match
    # the anchor generator's expected axis order.
    feature_map_size = grid_size[:2] // out_size_factor
    feature_map_size = [*feature_map_size, 1][::-1]
    ret = target_assigner.generate_anchors(feature_map_size)
    anchors_dict = target_assigner.generate_anchors_dict(feature_map_size)
    anchors = ret["anchors"]
    # Flatten to (N, 7) boxes — presumably [x, y, z, w, l, h, yaw];
    # confirm against the anchor generator's box format.
    anchors = anchors.reshape([-1, 7])
    matched_thresholds = ret["matched_thresholds"]
    unmatched_thresholds = ret["unmatched_thresholds"]
    # Axis-aligned BEV boxes (from BEV center/dims/yaw columns) used for
    # fast anchor masking at inference time.
    anchors_bv = box_np_ops.rbbox2d_to_near_bbox(
        anchors[:, [0, 1, 3, 4, 6]])
    self.anchor_cache = {
        "anchors": anchors,
        "anchors_bv": anchors_bv,
        "matched_thresholds": matched_thresholds,
        "unmatched_thresholds": unmatched_thresholds,
        "anchors_dict": anchors_dict,
    }
def build_network(model_cfg, measure_time=False):
    """Build a SECOND network, wiring the box coder's custom dims."""
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # BEV x/y extent of the point-cloud range, e.g. [-50, 50, -50, 50].
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    # Assigner config carries the per-class anchor sizes/thresholds.
    assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(assigner_cfg, bv_range,
                                                    box_coder)
    box_coder.custom_ndim = target_assigner._anchor_generators[0].custom_ndim
    return second_builder.build(model_cfg, voxel_generator, target_assigner,
                                measure_time=measure_time)
def build_network(model_cfg, measure_time=False, KL=False):
    """Build a SECOND network (KL-variant aware).

    Args:
        model_cfg: the ``model.second`` section of the pipeline config.
        measure_time: enable timing instrumentation in the network.
        KL: forwarded to the builder (selects the KL variant).

    Returns:
        The constructed network.
    """
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # BEV x/y extent of the point-cloud range, used for anchor generation.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    # Propagate the anchors' extra (custom) dims into the box coder.
    box_coder.custom_ndim = target_assigner._anchor_generators[0].custom_ndim
    # Removed leftover debug `print(KL)` that cluttered stdout.
    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               measure_time=measure_time, KL=KL)
    return net
def build_inference_net(config_path,
                        model_dir,
                        result_path=None,
                        predict_test=False,
                        ckpt_path=None,
                        ref_detfile=None,
                        pickle_result=True,
                        measure_time=False,
                        batch_size=1):
    """Build a SECOND network for inference and restore its weights.

    Args:
        config_path: path to the text-protobuf pipeline config.
        model_dir: directory holding checkpoints (latest is restored when
            no explicit ``ckpt_path`` is given).
        result_path: output directory; defaults to a subdir of model_dir.
        predict_test: selects 'predict_test' vs 'eval_results' subdir.
        ckpt_path: explicit checkpoint to restore instead of the latest.
        ref_detfile: unused here; kept for interface compatibility.
        pickle_result: unused here; kept for interface compatibility.
        measure_time: enable timing instrumentation in the network.
        batch_size: falls back to the eval reader's batch size if falsy.

    Returns:
        The network in eval mode, on GPU.
    """
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # FIX: input_cfg was never defined, so `batch_size or input_cfg.batch_size`
    # raised NameError whenever callers passed a falsy batch_size (0/None).
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    detection_2d_path = config.train_config.detection_2d_path
    center_limit_range = model_cfg.post_center_limit_range
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # BEV x/y extent of the point-cloud range, used for anchor generation.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    class_names = target_assigner.classes
    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               measure_time=measure_time)
    net.cuda()
    if ckpt_path is None:
        print("load existing model")
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    batch_size = batch_size or input_cfg.batch_size
    net.eval()
    return net
def build_network(model_cfg, measure_time=False):
    """Build voxel generator, box coder, target assigner and network
    from a model config."""
    vox_gen = voxel_builder.build(model_cfg.voxel_generator)
    # BEV x/y extent of the point-cloud range.
    bev = vox_gen.point_cloud_range[[0, 1, 3, 4]]
    coder = box_coder_builder.build(model_cfg.box_coder)
    assigner = target_assigner_builder.build(model_cfg.target_assigner,
                                             bev, coder)
    # Propagate the anchors' extra (custom) dims into the coder.
    coder.custom_ndim = assigner._anchor_generators[0].custom_ndim
    return second_builder.build(model_cfg, vox_gen, assigner,
                                measure_time=measure_time)
def set_model(config_path, model_dir, ckpt_path=None, ref_detfile=None):
    """Load a pipeline config, build the network, and restore weights.

    Returns:
        Tuple of (net, input_cfg, model_cfg, train_cfg, class_names,
        voxel_generator, target_assigner).
    """
    model_dir = pathlib.Path(model_dir)
    result_name = 'predict_test'
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as cfg_file:
        proto_str = cfg_file.read()
        text_format.Merge(proto_str, config)
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range

    # Voxel generator + target assigner over the BEV x/y extent.
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner = target_assigner_builder.build(
        model_cfg.target_assigner, bv_range, box_coder)

    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)

    # Restore either an explicit checkpoint or the latest one in model_dir.
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    return (net, input_cfg, model_cfg, train_cfg, class_names,
            voxel_generator, target_assigner)
def train(config_path,
          model_dir,
          use_fusion=True,
          use_ft=False,
          use_second_stage=True,
          use_endtoend=True,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          local_rank=0,
          pickle_result=True,
          patchs=None):
    """Train a VoxelNet model specified by a config file, with the mmMOT
    tracking criterion wired into the end-to-end network."""
    # ---------------- tracking setup ----------------
    # NOTE(review): hard-coded absolute paths to the tracking config and
    # results — these will only resolve on the original author's machine.
    config_tr_path = '/mnt/new_iou/second.pytorch/second/mmMOT/experiments/second/spatio_test/config.yaml'
    load_tr_path = '/mnt/new_iou/second.pytorch/second/mmMOT/experiments/second/spatio_test/results'
    with open(config_tr_path) as f:
        config_tr = yaml.load(f, Loader=yaml.FullLoader)
    result_path_tr = load_tr_path
    config_tr = EasyDict(config_tr['common'])
    config_tr.save_path = os.path.dirname(config_tr_path)
    # Only the tracking loss criterion is built here; the tracking model,
    # its optimizer, and state restoration are disabled (removed
    # commented-out build_model/build_optim/load_state code).
    criterion_tr = build_criterion(config_tr.loss)
    last_iter = -1
    best_mota = 0
    cudnn.benchmark = True
    # Data-loading transforms for the tracking datasets (the tracking
    # datasets/loaders themselves are disabled; only the transforms are
    # reused by the detection input readers below).
    train_transform, valid_transform = build_augmentation(config_tr.augmentation)
    tb_logger = SummaryWriter(config_tr.save_path + '/events')
    logger = create_logger('global_logger', config_tr.save_path + '/log.txt')
    logger.info('config: {}'.format(pprint.pformat(config_tr)))
    # ---------------- tracking setup done ----------------

    # Model directory / result directory bookkeeping.
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    patchs = patchs or []
    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # Apply caller-supplied config patches, e.g. "train_config.steps=100".
    # NOTE(review): exec on caller strings — only safe for trusted input.
    for patch in patchs:
        patch = "config." + patch
        exec(patch)
    # Keep a backup of the config alongside the checkpoints.
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config

    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)

    ######################
    # BUILD TARGET ASSIGNER
    ######################
    # BEV x/y extent of the point-cloud range, used for anchor generation.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    class_names = target_assigner.classes

    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    # NOTE(review): use_second_stage is accepted but ignored here (the
    # second_2stage_builder branch is commented out); only use_endtoend
    # selects the builder.
    if use_endtoend:
        net = second_endtoend_builder_spatio.build(
            model_cfg, voxel_generator, target_assigner, criterion_tr,
            config_tr.det_type)
    else:
        net = second_builder.build(model_cfg, voxel_generator,
                                   target_assigner)
    net.cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    for n, p in net.named_parameters():
        print(n, p.shape)

    # Warm-start from a hard-coded pretrained checkpoint, skipping the
    # stored global_step so training resumes its own counter.
    pth_name = './pre_weight/second_stage_gating_det/voxelnet-35000.tckpt'
    res_pre_weights = torch.load(pth_name)
    new_res_state_dict = OrderedDict()
    model_dict = net.state_dict()
    for k, v in res_pre_weights.items():
        if 'global_step' not in k:
            new_res_state_dict[k] = v
    model_dict.update(new_res_state_dict)
    net.load_state_dict(model_dict)

    ######################
    # BUILD OPTIMIZER
    ######################
    # We need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    loss_scale = train_cfg.loss_scale_factor
    mixed_optimizer = optimizer_builder.build(
        optimizer_cfg, net, mixed=train_cfg.enable_mixed_precision,
        loss_scale=loss_scale)
    optimizer = mixed_optimizer
    # Must restore optimizer AFTER using MixedPrecisionWrapper.
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer,
                                              train_cfg.steps)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder_tr_vid_spatio.build(
        input_cfg,
        model_cfg,
        training=True,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner,
        config_tr=config_tr,
        set_source='train',
        evaluate=False,
        train_transform=train_transform)
    eval_dataset = input_reader_builder_tr_vid_spatio.build(
        eval_input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner,
        config_tr=config_tr,
        set_source='val',
        evaluate=True,
        valid_transform=valid_transform)

    def _worker_init_fn(worker_id):
        # Re-seed each DataLoader worker so augmentations differ per worker.
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size,
        shuffle=True,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch_tr_vid_spatio,
        worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch_tr_vid_spatio)
    data_iter = iter(dataloader)

    ######################
    # TRAINING
    ######################
    training_detail = []
    log_path = model_dir / 'log.txt'
    training_detail_path = model_dir / 'log.json'
    if training_detail_path.exists():
        with open(training_detail_path, 'r') as f:
            training_detail = json.load(f)
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    logger = logging.getLogger('global_logger')
    best_mota = 0
    losses = AverageMeter(config_tr.print_freq)
    total_steps = train_cfg.steps
    # NOTE(review): total_loop is recomputed here from the dataloader
    # length, overriding the steps_per_eval value computed above.
    total_loop = total_steps // len(dataloader)
    kkkk = 0  # global batch counter across epochs, drives eval frequency
    for step in range(total_loop):
        for i, (example) in enumerate(dataloader):
            curr_step = 0 + i
            kkkk += 1
            lr_scheduler.step(net.get_global_step())
            example_torch = example_convert_to_torch(example, float_dtype)
            batch_size = example["anchors"].shape[0]
            ret_dict = net(example_torch, train_param=True)
            cls_preds = ret_dict["cls_preds"]
            loss = ret_dict["loss"].mean()
            cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
            loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
            cls_pos_loss = ret_dict["cls_pos_loss"]
            cls_neg_loss = ret_dict["cls_neg_loss"]
            loc_loss = ret_dict["loc_loss"]
            cls_loss = ret_dict["cls_loss"]
            dir_loss_reduced = ret_dict["dir_loss_reduced"]
            cared = ret_dict["cared"]
            # The two-stage / end-to-end nets re-label internally.
            if use_second_stage or use_endtoend:
                labels = ret_dict["labels"]
            else:
                labels = example_torch["labels"]
            if train_cfg.enable_mixed_precision:
                loss *= loss_scale
            # NOTE(review): bare except silently swallows backward()
            # failures — the step then proceeds with stale gradients.
            try:
                loss.backward()
            except:
                abc = 1
            mixed_optimizer.step()
            mixed_optimizer.zero_grad()
            net.update_global_step()
            net_metrics = net.update_metrics(cls_loss_reduced,
                                             loc_loss_reduced, cls_preds,
                                             labels, cared)
            step_time = (time.time() - t)
            t = time.time()
            metrics = {}
            # Positive/negative anchor counts for the first sample only.
            num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
            num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
            if 'anchors_mask' not in example_torch:
                num_anchors = example_torch['anchors'].shape[1]
            else:
                num_anchors = int(example_torch['anchors_mask'][0].sum())
            global_step = net.get_global_step()
            if global_step % display_step == 0:
                # Per-regression-target localization loss, batch-averaged.
                loc_loss_elem = [
                    float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                          batch_size) for i in range(loc_loss.shape[-1])
                ]
                metrics["type"] = "step_info"
                metrics["step"] = global_step
                metrics["steptime"] = step_time
                metrics.update(net_metrics)
                metrics["loss"] = {}
                metrics["loss"]["loc_elem"] = loc_loss_elem
                metrics["loss"]["cls_pos_rt"] = float(
                    cls_pos_loss.detach().cpu().numpy())
                metrics["loss"]["cls_neg_rt"] = float(
                    cls_neg_loss.detach().cpu().numpy())
                if model_cfg.use_direction_classifier:
                    metrics["loss"]["dir_rt"] = float(
                        dir_loss_reduced.detach().cpu().numpy())
                metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                metrics["num_pos"] = int(num_pos)
                metrics["num_neg"] = int(num_neg)
                metrics["num_anchors"] = int(num_anchors)
                metrics["lr"] = float(optimizer.lr)
                metrics["image_idx"] = example['image_idx'][0][7:]
                training_detail.append(metrics)
                flatted_metrics = flat_nested_json_dict(metrics)
                flatted_summarys = flat_nested_json_dict(metrics, "/")
                # Mirror everything except loc_elem into tensorboard.
                for k, v in flatted_summarys.items():
                    if isinstance(v, (list, tuple)):
                        v = {str(i): e for i, e in enumerate(v)}
                        if type(v) != str and ('loc_elem' not in k):
                            writer.add_scalars(k, v, global_step)
                    else:
                        if (type(v) != str) and ('loc_elem' not in k):
                            writer.add_scalar(k, v, global_step)
                # Human-readable one-line summary for console + log file.
                metrics_str_list = []
                for k, v in flatted_metrics.items():
                    if isinstance(v, float):
                        metrics_str_list.append(f"{k}={v:.3}")
                    elif isinstance(v, (list, tuple)):
                        if v and isinstance(v[0], float):
                            v_str = ', '.join([f"{e:.3}" for e in v])
                            metrics_str_list.append(f"{k}=[{v_str}]")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    else:
                        metrics_str_list.append(f"{k}={v}")
                log_str = ', '.join(metrics_str_list)
                print(log_str, file=logf)
                print(log_str)
            # Wall-clock-based checkpointing.
            ckpt_elasped_time = time.time() - ckpt_start_time
            if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                torchplus.train.save_models(model_dir, [net, optimizer],
                                            net.get_global_step())
                ckpt_start_time = time.time()
            # Periodic evaluation, driven by the tracking config's
            # val_freq in units of training batches.
            if kkkk > 0 and (kkkk) % config_tr.val_freq == 0:
                torchplus.train.save_models(model_dir, [net, optimizer],
                                            net.get_global_step())
                net.eval()
                result_path_step = result_path / f"step_{net.get_global_step()}"
                result_path_step.mkdir(parents=True, exist_ok=True)
                print("#################################")
                print("#################################", file=logf)
                print("# EVAL")
                print("# EVAL", file=logf)
                print("#################################")
                print("#################################", file=logf)
                print("Generate output labels...")
                print("Generate output labels...", file=logf)
                t = time.time()
                dt_annos = []
                prog_bar = ProgressBar()
                net.clear_timer()
                prog_bar.start((len(eval_dataset) +
                                eval_input_cfg.batch_size - 1) //
                               eval_input_cfg.batch_size)
                for example in iter(eval_dataloader):
                    example = example_convert_to_torch(example, float_dtype)
                    if pickle_result:
                        results = predict_kitti_to_anno(
                            net, example, class_names, center_limit_range,
                            model_cfg.lidar_input)
                        dt_annos += results
                    else:
                        _predict_kitti_to_file(net, example,
                                               result_path_step, class_names,
                                               center_limit_range,
                                               model_cfg.lidar_input)
                    prog_bar.print_bar()
                sec_per_ex = len(eval_dataset) / (time.time() - t)
                print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
                print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                      file=logf)
                gt_annos = [
                    info["annos"] for info in eval_dataset.dataset.kitti_infos
                ]
                if not pickle_result:
                    dt_annos = kitti.get_label_annos(result_path_step)
                result = get_official_eval_result(gt_annos, dt_annos,
                                                  class_names)
                print(result, file=logf)
                print(result)
                # Parse the text report: blocks of 5 lines per class; only
                # the first class block (result_1) is pushed to tensorboard.
                result_1 = result.split("\n")[:5]
                result_2 = result.split("\n")[10:15]
                result_3 = result.split("\n")[20:25]
                emh = ['0_easy', '1_mod', '2_hard']
                result_save = result_1
                for i in range(len(result_save) - 1):
                    save_targ = result_save[i + 1]
                    name_val = save_targ.split(':')[0].split(' ')[0]
                    value_val = save_targ.split(':')[1:]
                    for ev in range(3):
                        each_val = value_val[0].split(',')[ev]
                        merge_txt = 'AP_kitti/car_70/' + name_val + '/' + emh[ev]
                        # NOTE(review): on a parse failure this drops into
                        # pdb — remove for unattended training runs.
                        try:
                            writer.add_scalar(merge_txt, float(each_val),
                                              global_step)
                        except:
                            abc = 1
                            import pdb; pdb.set_trace()
                            abc = 1
                if pickle_result:
                    with open(result_path_step / "result.pkl", 'wb') as f:
                        pickle.dump(dt_annos, f)
                writer.add_text('eval_result', result, global_step)
                logger.info('Evaluation on validation set:')
                # (Removed large commented-out mmMOT validate()/MOTA
                # logging and save_checkpoint block.)
                # NOTE(review): `net.train()` is commented out in the
                # original, so the net stays in eval mode after the first
                # validation pass — likely unintended; confirm.
    # Save model before exit.
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def train(config_path,
          model_dir,
          use_fusion=False,
          use_ft=False,
          use_second_stage=False,
          use_endtoend=False,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          local_rank=0,
          pickle_result=True,
          patchs=None):
    """Train a VoxelNet/SECOND model specified by a config file.

    Builds the voxel generator, target assigner and network from a
    TrainEvalPipelineConfig text-protobuf, optionally warm-starts from
    first-stage / FPN pre-weights, then runs the training loop with periodic
    checkpointing and KITTI-style evaluation after every eval interval.

    Args:
        config_path: path to the TrainEvalPipelineConfig text-protobuf.
        model_dir: directory for checkpoints, logs and summaries.
        use_fusion: load the camera/LiDAR fusion first-stage pre-weights
            (and, for the plain first stage, FPN18 image-branch weights).
        use_ft: accepted for caller compatibility; not used in this body.
        use_second_stage: build the two-stage network variant.
        use_endtoend: build the end-to-end network variant; ignored when
            ``use_second_stage`` is set (two-stage takes precedence).
        result_path: eval output dir; defaults to ``model_dir/results``.
        create_folder: create a fresh numbered model dir if one exists.
        display_step: log training metrics every N global steps.
        summary_step: accepted for caller compatibility; not used here.
        local_rank: accepted for caller compatibility; not used here.
        pickle_result: keep eval detections in memory and pickle them,
            instead of writing per-frame KITTI label files.
        patchs: optional list of config-override statements such as
            ``"train_config.steps = 100"``; each is executed via ``exec``
            with a ``config.`` prefix, so only pass trusted strings.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    patchs = patchs or []
    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # Apply command-line config patches. NOTE(review): exec on caller-supplied
    # strings — acceptable for a research CLI, never for untrusted input.
    for patch in patchs:
        patch = "config." + patch
        exec(patch)
    # Keep a backup of the config actually used for this run.
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    # Bird's-eye-view XY range: elements [xmin, ymin, xmax, ymax] of the
    # 6-element point cloud range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                   bv_range, box_coder)
    class_names = target_assigner.classes
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    # BUGFIX: was `if use_second_stage: ... if use_endtoend: ... else: ...`,
    # so use_second_stage=True with use_endtoend=False fell into the final
    # `else` and overwrote the two-stage net with the baseline builder.
    # Now mirrors the if/elif/else chain used by evaluate().
    if use_second_stage:
        net = second_2stage_builder.build(model_cfg, voxel_generator,
                                          target_assigner)
    elif use_endtoend:
        net = second_endtoend_builder.build(model_cfg, voxel_generator,
                                            target_assigner)
    else:
        net = second_builder.build(model_cfg, voxel_generator,
                                   target_assigner)
    net.cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    ######################
    # Warm-start the two-stage / end-to-end variants from first-stage
    # weights, dropping the global step and direction-classifier entries.
    if use_second_stage or use_endtoend:
        if use_fusion:
            pth_name = 'pre_weight/first_stage/fusion_split/voxelnet-35210.tckpt'
            for i in range(30):
                print(
                    '################## load Fusion First stage weight complete #######################'
                )
        else:
            pth_name = 'pre_weight/first_stage/lidaronly/voxelnet-30950.tckpt'
            for i in range(30):
                print(
                    '################## load LiDAR Only First stage weight complete #######################'
                )
        res_pre_weights = torch.load(pth_name)
        new_res_state_dict = OrderedDict()
        model_dict = net.state_dict()
        for k, v in res_pre_weights.items():
            if 'global_step' not in k:
                if 'dir' not in k:
                    new_res_state_dict[k] = v
        model_dict.update(new_res_state_dict)
        net.load_state_dict(model_dict)
    ############ load FPN18 pre-weight #############
    # Plain first-stage fusion model: seed the image branch from a
    # RetinaNet-FPN checkpoint, skipping its detection heads and remapping
    # 'module.*' keys onto the RPN namespace.
    if (use_fusion and not use_second_stage and not use_endtoend):
        fpn_depth = 18
        pth_name = 'pre_weight/FPN' + str(fpn_depth) + '_retinanet_968.pth'
        res_pre_weights = torch.load(pth_name)
        new_res_state_dict = OrderedDict()
        model_dict = net.state_dict()
        for k, v in res_pre_weights['state_dict'].items():
            if ('regressionModel' not in k) and ('classificationModel' not in k):
                name = k.replace('module', 'rpn')
                new_res_state_dict[name] = v
        model_dict.update(new_res_state_dict)
        net.load_state_dict(model_dict)
        for i in range(30):
            print('!!!!!!!!!!!!!!!!!! load FPN' + str(fpn_depth) +
                  ' weight complete !!!!!!!!!!!!!!!!!!')
    ################################################
    # BUILD OPTIMIZER
    ################################################
    # We need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    loss_scale = train_cfg.loss_scale_factor
    mixed_optimizer = optimizer_builder.build(
        optimizer_cfg,
        net,
        mixed=train_cfg.enable_mixed_precision,
        loss_scale=loss_scale)
    optimizer = mixed_optimizer
    # Must restore optimizer AFTER wrapping it for mixed precision.
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer,
                                              train_cfg.steps)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # Give each dataloader worker a time-derived seed so augmentation
        # differs across workers and across runs.
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.batch_size,
                                             shuffle=True,
                                             num_workers=input_cfg.num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # TRAINING
    ######################
    training_detail = []
    log_path = model_dir / 'log.txt'
    training_detail_path = model_dir / 'log.json'
    if training_detail_path.exists():
        with open(training_detail_path, 'r') as f:
            training_detail = json.load(f)
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    t = time.time()
    ckpt_start_time = t
    # One outer loop per eval interval; the last interval may be shorter.
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step(net.get_global_step())
                try:
                    example = next(data_iter)
                except StopIteration:
                    # Dataset exhausted: restart the epoch.
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                ret_dict = net(example_torch)
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"]
                cls_neg_loss = ret_dict["cls_neg_loss"]
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                dir_loss_reduced = ret_dict["dir_loss_reduced"]
                cared = ret_dict["cared"]
                # Two-stage/end-to-end nets re-assign labels internally.
                if use_second_stage or use_endtoend:
                    labels = ret_dict["labels"]
                else:
                    labels = example_torch["labels"]
                if train_cfg.enable_mixed_precision:
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                # Counts are taken from the first sample of the batch only.
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["type"] = "step_info"
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(optimizer.lr)
                    metrics["image_idx"] = example['image_idx'][0]
                    training_detail.append(metrics)
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    # Scalars go to TensorBoard; loc_elem lists are skipped.
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            if type(v) != str and ('loc_elem' not in k):
                                writer.add_scalars(k, v, global_step)
                        else:
                            if (type(v) != str) and ('loc_elem' not in k):
                                writer.add_scalar(k, v, global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                # Wall-clock-based checkpointing.
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            net.clear_timer()
            prog_bar.start(
                (len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                eval_input_cfg.batch_size)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:
                    dt_annos += predict_kitti_to_anno(net, example,
                                                      class_names,
                                                      center_limit_range,
                                                      model_cfg.lidar_input)
                else:
                    _predict_kitti_to_file(net, example, result_path_step,
                                           class_names, center_limit_range,
                                           model_cfg.lidar_input)
                prog_bar.print_bar()
            sec_per_ex = len(eval_dataset) / (time.time() - t)
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            result = get_official_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            # Parse the car-class AP lines out of the textual eval report and
            # push easy/moderate/hard APs to TensorBoard.
            result_1 = result.split("\n")[:5]
            emh = ['0_easy', '1_mod', '2_hard']
            result_save = result_1
            for i in range(len(result_save) - 1):
                save_targ = result_save[i + 1]
                name_val = save_targ.split(':')[0].split(' ')[0]
                value_val = save_targ.split(':')[1:]
                for ev in range(3):
                    each_val = value_val[0].split(',')[ev]
                    merge_txt = 'AP_kitti/car_70/' + name_val + '/' + emh[ev]
                    writer.add_scalar(merge_txt, float(each_val), global_step)
            if pickle_result:
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(dt_annos, f)
            writer.add_text('eval_result', result, global_step)
            net.train()
    except Exception as e:
        # Persist progress before propagating the failure.
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # Save model before exit.
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def predict(config_path,
            model_dir,
            result_path=None,
            predict_test=False,
            ckpt_path=None,
            ref_detfile=None,
            pickle_result=True,
            bb_save_dir=None,
            pub_bb=None,
            pub_lidar=None):
    '''Set up the network from a config file and run inference over the eval
    dataset, optionally exporting predicted 3D bounding boxes to CSV.

    Args:
        config_path: path to a TrainEvalPipelineConfig text-protobuf file.
        model_dir: directory holding checkpoints (and default result dir).
        result_path: output directory; defaults under model_dir.
        predict_test: switches the default result subdirectory name.
        ckpt_path: explicit checkpoint to restore; latest in model_dir if None.
        ref_detfile: unused in this body; kept for caller compatibility.
        pickle_result: unused in this body; kept for caller compatibility.
        bb_save_dir: if set, write one CSV of predicted boxes per frame.
        pub_bb, pub_lidar: ROS-style publishers; currently placeholders only
            (referenced in commented-out code, never called).
    '''
    ####################
    # SETUP PARAMETERS #
    ####################
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # TODO: include this program as a function call in the localization/mapping
    #   code as needed; use whole pointcloud data instead of reduced pointcloud.
    input_cfg = config.eval_input_reader  # Read the config file data into useful structures
    model_cfg = config.model.second  # Read the config file data into useful structures
    train_cfg = config.train_config  # Read the config file data into useful structures
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range
    #########################
    # BUILD VOXEL GENERATOR #
    #########################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # Bird's-eye-view XY range: [xmin, ymin, xmax, ymax] slice of the range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    #####################
    # NETWORK GENERATOR #
    #####################
    # Build the NN in GPU mode
    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # Standard conversion approach if using FloatingPoint16 instead of
    # FloatingPoint32 type of tensor.
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    # Restore old checkpoint if possible
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    # Setup network for evaluation mode
    net.eval()
    #####################
    # DATASET GENERATOR #
    #####################
    # Dataset build for easy usage
    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=input_cfg.batch_size,
        shuffle=False,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    # Further variable setup
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    dt_annos = []
    global_set = None
    print()
    print("Generate output labels...")
    bar = ProgressBar()
    bar.start(len(eval_dataset) // input_cfg.batch_size + 1)
    #################
    # NETWORK USAGE #
    #################
    # Predict one batch at a time, reformatting/exporting data as needed.
    for example in iter(eval_dataloader):
        example = example_convert_to_torch(example, float_dtype)
        print(example['image_idx'])
        # NOTE: Predict network output. One prediction dict per frame;
        # assumes keys 'box3d_lidar' and 'image_idx' — TODO confirm against
        # the network's predict implementation.
        predictions_dicts = net(example)
        # NOTE(review): publishing via pub_lidar/pub_bb was planned here but
        # is not implemented (original commented-out FIXME block removed).
        if bb_save_dir:
            save_path = pathlib.Path(bb_save_dir)
            save_path.mkdir(
                parents=True, exist_ok=True
            )  # create directory (and its parents) if non-existent
            for pred_dict in predictions_dicts:
                if pred_dict['box3d_lidar'] is not None:
                    bb_lidar = pred_dict['box3d_lidar'].detach().cpu().numpy()
                else:
                    # Placeholder row so the CSV still has the 7 columns.
                    bb_lidar = [[
                        'temp', 'temp', 'temp', 'temp', 'temp', 'temp', 'temp'
                    ]]
                df = pd.DataFrame(bb_lidar)
                # Columns: center x/y/z, width, length, height, heading.
                df.columns = ['x', 'y', 'z', 'w', 'l', 'h', 't']
                filename = save_path.joinpath(
                    str(pred_dict['image_idx']) + '.csv')
                filename.write_text(df.to_csv(index=False))
def evaluate(config_path,
             model_dir,
             result_path=None,
             predict_test=False,
             ckpt_path=None,
             ref_detfile=None,
             pickle_result=True):
    """Evaluate a trained SECOND model on the eval dataset.

    Restores the latest (or given) checkpoint, runs inference over the eval
    split, then — unless ``predict_test`` — scores against KITTI ground truth
    with both the official and COCO-style metrics.

    Args:
        config_path: path to a TrainEvalPipelineConfig text-protobuf file.
        model_dir: directory holding checkpoints (and default result dir).
        result_path: output directory; defaults under model_dir.
        predict_test: skip ground-truth scoring (test split has no labels).
        ckpt_path: explicit checkpoint to restore; latest in model_dir if None.
        ref_detfile: unused in this body; kept for caller compatibility.
        pickle_result: keep detections in memory and pickle them, instead of
            writing per-frame KITTI label files.
    """
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # Bird's-eye-view XY range: [xmin, ymin, xmax, ymax] slice of the range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=input_cfg.batch_size,
        shuffle=False,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    net.eval()
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    dt_annos = []
    global_set = None
    print("Generate output labels...")
    bar = ProgressBar()
    # FIX: ceil division for the batch count; the previous
    # `len(eval_dataset) // batch_size + 1` overcounted by one whenever the
    # dataset size was an exact multiple of the batch size. Matches the
    # progress-bar setup used by the other eval loops in this file.
    bar.start((len(eval_dataset) + input_cfg.batch_size - 1) //
              input_cfg.batch_size)
    for example in iter(eval_dataloader):
        example = example_convert_to_torch(example, float_dtype)
        if pickle_result:
            dt_annos += predict_kitti_to_anno(net, example, class_names,
                                              center_limit_range,
                                              model_cfg.lidar_input,
                                              global_set)
        else:
            _predict_kitti_to_file(net, example, result_path_step,
                                   class_names, center_limit_range,
                                   model_cfg.lidar_input)
        bar.print_bar()
    sec_per_example = len(eval_dataset) / (time.time() - t)
    print(f'generate label finished({sec_per_example:.2f}/s). start eval:')
    print(f"avg forward time per example: {net.avg_forward_time:.3f}")
    print(f"avg postprocess time per example: {net.avg_postprocess_time:.3f}")
    if not predict_test:
        gt_annos = [info["annos"] for info in eval_dataset.dataset.kitti_infos]
        if not pickle_result:
            dt_annos = kitti.get_label_annos(result_path_step)
        result = get_official_eval_result(gt_annos, dt_annos, class_names)
        print(result)
        result = get_coco_eval_result(gt_annos, dt_annos, class_names)
        print(result)
        if pickle_result:
            with open(result_path_step / "result.pkl", 'wb') as f:
                pickle.dump(dt_annos, f)
def evaluate(config_path,
             model_dir,
             use_second_stage=False,
             use_endtoend=False,
             result_path=None,
             predict_test=False,
             ckpt_path=None,
             ref_detfile=None,
             pickle_result=True,
             measure_time=False,
             batch_size=None):
    """Evaluate a first-stage / two-stage / end-to-end SECOND model.

    Runs inference over the eval split (optionally timing each phase),
    scores against KITTI ground truth unless ``predict_test``, and always
    dumps detections as per-frame KITTI-format txt files.

    NOTE(review): this shadows the earlier ``evaluate`` definition in the
    same module; only this version is reachable after import.

    Args:
        config_path: path to a TrainEvalPipelineConfig text-protobuf file.
        model_dir: directory holding checkpoints (and default result dir).
        use_second_stage / use_endtoend: select the network variant.
        result_path: output directory; defaults under model_dir.
        predict_test: skip ground-truth scoring (test split has no labels).
        ckpt_path: explicit checkpoint to restore; latest in model_dir if None.
        ref_detfile: unused in this body; kept for caller compatibility.
        pickle_result: keep detections in memory and pickle them, instead of
            writing per-frame KITTI label files during the loop.
        measure_time: print per-phase average timings.
        batch_size: override the batch size from the config.
    """
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test_0095'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    center_limit_range = model_cfg.post_center_limit_range
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # Bird's-eye-view XY range: [xmin, ymin, xmax, ymax] slice of the range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    class_names = target_assigner.classes
    # Variant selection: two-stage takes precedence over end-to-end.
    if use_second_stage:
        net = second_2stage_builder.build(model_cfg, voxel_generator,
                                          target_assigner,
                                          measure_time=measure_time)
    elif use_endtoend:
        net = second_endtoend_builder.build(model_cfg, voxel_generator,
                                            target_assigner,
                                            measure_time=measure_time)
    else:
        net = second_builder.build(model_cfg, voxel_generator,
                                   target_assigner,
                                   measure_time=measure_time)
    net.cuda()
    #########################################
    # net = torch.nn.DataParallel(net)
    #########################################
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    if train_cfg.enable_mixed_precision:
        net.half()
        print("half inference!")
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    batch_size = batch_size or input_cfg.batch_size
    eval_dataset = input_reader_builder_tr.build(
        input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,  # input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    net.eval()
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    dt_annos = []
    global_set = None
    print("Generate output labels...")
    bar = ProgressBar()
    # Ceil division: total number of batches.
    bar.start((len(eval_dataset) + batch_size - 1) // batch_size)
    prep_example_times = []  # host->device conversion times per batch
    prep_times = []          # dataloader wait times per batch
    t2 = time.time()
    for example in iter(eval_dataloader):
        if measure_time:
            prep_times.append(time.time() - t2)
            t1 = time.time()
            torch.cuda.synchronize()
        example = example_convert_to_torch(example, float_dtype)
        if measure_time:
            # Synchronize so the measured span covers the actual transfer.
            torch.cuda.synchronize()
            prep_example_times.append(time.time() - t1)
        if pickle_result:
            dt_annos += predict_kitti_to_anno(
                net, example, class_names, center_limit_range,
                model_cfg.lidar_input, global_set)
        else:
            _predict_kitti_to_file(net, example, result_path_step,
                                   class_names, center_limit_range,
                                   model_cfg.lidar_input)
        bar.print_bar()
        if measure_time:
            t2 = time.time()
    sec_per_example = len(eval_dataset) / (time.time() - t)
    print(f'generate label finished({sec_per_example:.2f}/s). start eval:')
    if measure_time:
        print(
            f"avg example to torch time: {np.mean(prep_example_times) * 1000:.3f} ms"
        )
        print(f"avg prep time: {np.mean(prep_times) * 1000:.3f} ms")
        for name, val in net.get_avg_time_dict().items():
            print(f"avg {name} time = {val * 1000:.3f} ms")
    if not predict_test:
        gt_annos = [info["annos"] for info in eval_dataset.dataset.kitti_infos]
        img_idx = [
            info["image_idx"] for info in eval_dataset.dataset.kitti_infos
        ]
        if not pickle_result:
            dt_annos = kitti.get_label_annos(result_path_step)
        result = get_official_eval_result(gt_annos, dt_annos, class_names)
        print(result)
        result = get_coco_eval_result(gt_annos, dt_annos, class_names)
        print(result)
        if pickle_result:
            with open(result_path_step / "result.pkl", 'wb') as f:
                pickle.dump(dt_annos, f)
        # Dump annos to per-frame KITTI txt files.
        # NOTE(review): `if True:` is a leftover toggle — the branch always
        # runs. Dimensions are permuted in place ([1, 2, 0]), presumably to
        # KITTI's h/w/l column order — TODO confirm against
        # kitti.annos_to_kitti_label.
        if True:
            os.makedirs(str(result_path_step) + '/txt', exist_ok=True)
            for i in range(len(dt_annos)):
                dt_annos[i]['dimensions'] = dt_annos[i][
                    'dimensions'][:, [1, 2, 0]]
                result_lines = kitti.annos_to_kitti_label(dt_annos[i])
                image_idx = img_idx[i]
                with open(
                        str(result_path_step) + '/txt/%06d.txt' % image_idx,
                        'w') as f:
                    for result_line in result_lines:
                        f.write(result_line + '\n')
            abcd = 1  # leftover debug no-op
    else:
        # Test split: no ground truth, still export txt detections.
        os.makedirs(str(result_path_step) + '/txt', exist_ok=True)
        img_idx = [
            info["image_idx"] for info in eval_dataset.dataset.kitti_infos
        ]
        for i in range(len(dt_annos)):
            dt_annos[i]['dimensions'] = dt_annos[i]['dimensions'][:, [1, 2, 0]]
            result_lines = kitti.annos_to_kitti_label(dt_annos[i])
            image_idx = img_idx[i]
            with open(
                    str(result_path_step) + '/txt/%06d.txt' % image_idx,
                    'w') as f:
                for result_line in result_lines:
                    f.write(result_line + '\n')
# voxels, coordinates, num_points = voxel_generator.generate( # points, 20000) # ret = target_assigner.generate_anchors(feature_map_size) # anchors = ret["anchors"] # anchors = anchors.reshape([-1, 7]) # matched_thresholds = ret["matched_thresholds"] # unmatched_thresholds = ret["unmatched_thresholds"] # gt_classes = np.array( # [class_names.index(n) + 1 for n in gt_names], dtype=np.int32) # target_dict = target_assigner.assign(anchors,gt_boxes,gt_classes=gt_classes, # matched_thresholds=matched_thresholds, # unmatched_thresholds=unmatched_thresholds) net = second_builder.build(model_cfg, voxel_generator, target_assigner) net.cuda() net.eval() torchplus.train.restore(ckpt_path, net) eval_dataset = ikg_input_reader_builder.build(input_cfg, model_cfg, training=False, voxel_generator=voxel_generator, target_assigner=target_assigner) eval_dataloader = torch.utils.data.DataLoader( eval_dataset, batch_size=input_cfg.batch_size, shuffle=False, num_workers=input_cfg.num_workers,
def test(config_path=args.config_path,
         model_dir=args.model_dir,
         result_path=None,
         create_folder=False,
         pickle_result=True,
         include_roadmap=False,
         device=1):
    """Run inference with a trained SECOND/VoxelNet model on an Argoverse
    tracking split and optionally pickle the detections.

    Args:
        config_path: path to a TrainEvalPipelineConfig protobuf text file.
        model_dir: directory holding model checkpoints; also receives a
            backup copy of the config.
        result_path: result directory; defaults to ``model_dir / 'results'``.
        create_folder: when True and ``model_dir`` exists, create a fresh
            folder via ``torchplus.train.create_folder`` instead of reusing it.
        pickle_result: when True, dump accumulated annotations to
            ``args.save_path`` with pickle.
        include_roadmap: forwarded to ``second_builder.build``.
        device: unused here; the torch device comes from ``args.device``.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # Keep a copy of the config next to the checkpoints for reproducibility.
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    grid_size = voxel_generator.grid_size
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    # point_cloud_range is [xmin, ymin, zmin, xmax, ymax, zmax]; indices
    # [0, 1, 3, 4] select the bird's-eye-view x/y extent.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               include_roadmap)
    net.cuda().eval()
    print("num_trainable parameters:", len(list(net.parameters())))
    torchplus.train.restore(args.model_path, net)
    out_size_factor = model_cfg.rpn.layer_strides[
        0] / model_cfg.rpn.upsample_strides[0]
    print(out_size_factor)
    out_size_factor = int(out_size_factor)
    feature_map_size = grid_size[:2] // out_size_factor
    feature_map_size = [*feature_map_size, 1][::-1]
    print(feature_map_size)
    # Generate anchors once and cache them; prep_pointcloud reuses the cache
    # for every frame instead of regenerating per example.
    ret = target_assigner.generate_anchors(feature_map_size)
    anchors = ret["anchors"]
    anchors = anchors.reshape([-1, 7])
    matched_thresholds = ret["matched_thresholds"]
    unmatched_thresholds = ret["unmatched_thresholds"]
    anchors_bv = box_np_ops.rbbox2d_to_near_bbox(anchors[:, [0, 1, 3, 4, 6]])
    anchor_cache = {
        "anchors": anchors,
        "anchors_bv": anchors_bv,
        "matched_thresholds": matched_thresholds,
        "unmatched_thresholds": unmatched_thresholds,
    }
    am = ArgoverseMap()
    dt_annos = []
    root_dir = os.path.join('./../../argodataset/argoverse-tracking/', args.set)
    argoverse_loader = ArgoverseTrackingLoader(root_dir)
    prog_cnt = 0
    for seq in range(len(argoverse_loader)):
        argoverse_data = argoverse_loader[seq]
        nlf = argoverse_data.num_lidar_frame
        for frame in range(nlf):
            prog_cnt += 1
            if prog_cnt % 50 == 0:
                print(prog_cnt)
            points = argoverse_data.get_lidar(frame)
            roi_pts = copy.deepcopy(points)
            city_name = argoverse_data.city_name
            city_to_egovehicle_se3 = argoverse_data.get_pose(frame)
            # Map-based filtering works in city coordinates, so transform
            # only when at least one filter is enabled.
            if args.include_roi or args.dr_area or not args.include_road_points:
                roi_pts = city_to_egovehicle_se3.transform_point_cloud(
                    roi_pts)  # put into city coords
            if args.include_roi:
                roi_pts_flag = am.remove_non_roi_points(
                    roi_pts, city_name)  # remove non-driveable region
                roi_pts = roi_pts[roi_pts_flag]
            if not args.include_roi and args.dr_area:
                roi_pts_flag = am.remove_non_driveable_area_points(
                    roi_pts, city_name)  # remove non-driveable region
                roi_pts = roi_pts[roi_pts_flag]
            if not args.include_road_points:
                roi_pts = am.remove_ground_surface(
                    roi_pts, city_name)  # remove ground surface
            # convert city back to lidar (ego-vehicle) co-ordinates
            if args.include_roi or args.dr_area or not args.include_road_points:
                roi_pts = city_to_egovehicle_se3.inverse_transform_point_cloud(
                    roi_pts)
            # Shift z down by 1.73 — presumably the sensor height above ground
            # assumed by the KITTI-style pipeline; TODO confirm against config.
            roi_pts[:, 2] = roi_pts[:, 2] - 1.73
            input_dict = {
                'points': roi_pts,
                'pointcloud_num_features': 3,
            }
            out_size_factor = model_cfg.rpn.layer_strides[
                0] // model_cfg.rpn.upsample_strides[0]
            example = prep_pointcloud(
                input_dict=input_dict,
                root_path=None,
                voxel_generator=voxel_generator,
                target_assigner=target_assigner,
                max_voxels=input_cfg.max_number_of_voxels,
                class_names=list(input_cfg.class_names),
                training=False,
                create_targets=False,
                shuffle_points=input_cfg.shuffle_points,
                generate_bev=False,
                without_reflectivity=model_cfg.without_reflectivity,
                num_point_features=model_cfg.num_point_features,
                anchor_area_threshold=input_cfg.anchor_area_threshold,
                anchor_cache=anchor_cache,
                out_size_factor=out_size_factor,
                out_dtype=np.float32)
            if "anchors_mask" in example:
                example["anchors_mask"] = example["anchors_mask"].astype(
                    np.uint8)
            example["image_idx"] = str(seq) + "_" + str(frame)
            example["image_shape"] = np.array([400, 400], dtype=np.int32)
            example["road_map"] = None
            example["include_roadmap"] = False
            example["points"] = roi_pts
            example = merge_second_batch([example])
            example_torch = example_convert_to_torch(example,
                                                     device=args.device)
            try:
                result_annos = predict_kitti_to_anno(
                    net, example_torch, input_cfg.class_names,
                    model_cfg.post_center_limit_range, model_cfg.lidar_input)
            except Exception:
                # Was a bare `except:`, which also swallowed KeyboardInterrupt
                # and SystemExit; narrowed so Ctrl-C still stops the run while
                # per-frame prediction failures are still skipped.
                print(seq, frame)
                continue
            dt_annos += result_annos
    if pickle_result:
        # os.path.dirname handles save paths without any '/' (the old
        # rfind-based slice mangled such paths), and makedirs creates any
        # missing intermediate directories race-free.
        save_dir = os.path.dirname(args.save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        with open(args.save_path, 'wb') as f:
            pickle.dump(dt_annos, f)
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True):
    """train a VoxelNet model specified by a config file. """
    # Resolve/prepare the model directory and result/checkpoint sub-folders.
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    # Parse the protobuf-text pipeline config and keep a backup copy
    # alongside the checkpoints for reproducibility.
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    # point_cloud_range is [xmin, ymin, zmin, xmax, ymax, zmax]; indices
    # [0, 1, 3, 4] select the bird's-eye-view x/y extent.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    # net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               input_cfg.batch_size)
    net.cuda()
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = optimizer_builder.build(optimizer_cfg, net.parameters())
    if train_cfg.enable_mixed_precision:
        loss_scale = train_cfg.loss_scale_factor
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer, gstep)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(
        input_cfg,
        model_cfg,
        training=True,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(
        eval_input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # Seed each DataLoader worker differently so augmentation RNG differs.
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size,
        shuffle=True,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch,
        worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # TRAINING
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    # Outer loop runs once per evaluation interval; the final (possibly
    # shorter) chunk is handled by the modulo below.
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step()
                try:
                    example = next(data_iter)
                except StopIteration:
                    # Dataset exhausted: restart the iterator for a new epoch.
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                # The example dict is flattened to a positional list; indices
                # 11/12 (image_idx/image_shape) stay numpy and are converted
                # here. NOTE(review): relies on dict insertion order — the
                # asserts below guard the expected layout.
                example_tuple = list(example_torch.values())
                example_tuple[11] = torch.from_numpy(example_tuple[11])
                example_tuple[12] = torch.from_numpy(example_tuple[12])
                assert 13 == len(
                    example_tuple), "something wring with training input size!"
                # training example:[0:'voxels', 1:'num_points',
                # 2:'coordinates', 3:'rect', 4:'Trv2c', 5:'P2', 6:'anchors',
                # 7:'anchors_mask', 8:'labels', 9:'reg_targets',
                # 10:'reg_weights', 11:'image_idx', 12:'image_shape']
                # Split per-pillar point features into separate channel
                # tensors shaped for the network's fixed-signature input.
                pillar_x = example_tuple[0][:, :, 0].unsqueeze(0).unsqueeze(0)
                pillar_y = example_tuple[0][:, :, 1].unsqueeze(0).unsqueeze(0)
                pillar_z = example_tuple[0][:, :, 2].unsqueeze(0).unsqueeze(0)
                pillar_i = example_tuple[0][:, :, 3].unsqueeze(0).unsqueeze(0)
                num_points_per_pillar = example_tuple[1].float().unsqueeze(0)
                # Find distance of x, y from pillar center; the 0.16 voxel
                # size and 0.08/-39.6 offsets assume the xyres_16.proto
                # config — TODO confirm they match model_cfg.voxel_generator.
                coors_x = example_tuple[2][:, 3].float()
                coors_y = example_tuple[2][:, 2].float()
                # self.x_offset = self.vx / 2 + pc_range[0]
                # self.y_offset = self.vy / 2 + pc_range[1]
                x_sub = coors_x.unsqueeze(1) * 0.16 + 0.08
                y_sub = coors_y.unsqueeze(1) * 0.16 + -39.6
                # Broadcast the per-pillar offsets across the (up to) 100
                # points per pillar via an outer product with a ones row.
                ones = torch.ones([1, 100],
                                  dtype=torch.float32,
                                  device=pillar_x.device)
                x_sub_shaped = torch.mm(x_sub, ones).unsqueeze(0).unsqueeze(0)
                y_sub_shaped = torch.mm(y_sub, ones).unsqueeze(0).unsqueeze(0)
                num_points_for_a_pillar = pillar_x.size()[3]
                # Mask marking real (vs zero-padded) points in each pillar.
                mask = get_paddings_indicator(num_points_per_pillar,
                                              num_points_for_a_pillar,
                                              axis=0)
                mask = mask.permute(0, 2, 1)
                mask = mask.unsqueeze(1)
                mask = mask.type_as(pillar_x)
                coors = example_tuple[2]
                anchors = example_tuple[6]
                labels = example_tuple[8]
                reg_targets = example_tuple[9]
                input = [pillar_x, pillar_y, pillar_z, pillar_i,
                         num_points_per_pillar, x_sub_shaped, y_sub_shaped,
                         mask, coors, anchors, labels, reg_targets]
                ret_dict = net(input)
                assert 10 == len(
                    ret_dict), "something wring with training output size!"
                # ret_dict positional layout: 0:loss, 1:cls_loss, 2:loc_loss,
                # 3:cls_pos_loss, 4:cls_neg_loss, 5:cls_preds,
                # 6:dir_loss_reduced, 7:cls_loss_reduced, 8:loc_loss_reduced,
                # 9:cared
                cls_preds = ret_dict[5]
                loss = ret_dict[0].mean()
                cls_loss_reduced = ret_dict[7].mean()
                loc_loss_reduced = ret_dict[8].mean()
                cls_pos_loss = ret_dict[3]
                cls_neg_loss = ret_dict[4]
                loc_loss = ret_dict[2]
                cls_loss = ret_dict[1]
                dir_loss_reduced = ret_dict[6]
                cared = ret_dict[9]
                labels = example_tuple[8]
                if train_cfg.enable_mixed_precision:
                    # Scale the loss before backward; MixedPrecisionWrapper
                    # un-scales gradients at step time.
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                num_anchors = int(example_tuple[7][0].sum())
                global_step = net.get_global_step()
                # Periodic console/TensorBoard logging.
                if global_step % display_step == 0:
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_tuple[0].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])
                    metrics["image_idx"] = example_tuple[11][0]
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                # Time-based checkpointing.
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            # Ensure that all evaluation points are saved forever
            torchplus.train.save_models(eval_checkpoint_dir, [net, optimizer],
                                        net.get_global_step(),
                                        max_to_keep=100)
            # NOTE: a commented-out evaluation pass (run the net on
            # eval_dataloader, compute KITTI/COCO mAP, log to TensorBoard)
            # used to live here; it is disabled in this revision.
    except Exception as e:
        # Always checkpoint before propagating a training failure.
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def evaluate(config_path,
             model_dir,
             result_path=None,
             predict_test=False,
             ckpt_path=None,
             ref_detfile=None,
             pickle_result=True):
    """Restore a trained model and run prediction on one generated example.

    Loads the pipeline config (from a path or an already-parsed config
    object), rebuilds the network, restores weights from ``ckpt_path`` or
    the latest checkpoint in ``model_dir``, then runs
    ``predict_kitti_to_anno`` on the example produced by
    ``generate_example()`` and prints the resulting annotations.
    ``ref_detfile`` and ``pickle_result`` are currently unused here.
    """
    model_dir = str(Path(model_dir).resolve())
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        model_dir = Path(model_dir)
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    # config_path may be a filename or a pre-parsed config object.
    if isinstance(config_path, str):
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range
    #########################
    # Build Voxel Generator
    #########################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # Bird's-eye-view x/y extent of the point cloud range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               input_cfg.batch_size)
    net.cuda()
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    # Explicit checkpoint wins over "latest in model_dir".
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    net.eval()
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    # dt_annos = []
    global_set = None
    print("Generate output labels...")
    # generate_example() presumably builds a single pre-processed input
    # tuple for the network — see its definition for the exact layout.
    example_tuple = generate_example()
    dt_annos = predict_kitti_to_anno(net, example_tuple, class_names,
                                     center_limit_range,
                                     model_cfg.lidar_input, global_set)
    print(dt_annos)
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True):
    """train a VoxelNet model specified by a config file. """
    # NOTE(review): this redefines `train` from earlier in the module and
    # shadows it; only this later definition is callable at import time.
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    # Parse the protobuf-text pipeline config and back it up next to the
    # checkpoints for reproducibility.
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    #########################
    # Build Voxel Generator
    #########################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    #########################
    # Build Target Assigner
    #########################
    # Bird's-eye-view x/y extent of the point cloud range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # Build NetWork
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    # net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               input_cfg.batch_size)
    net.cuda()
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    ######################
    # Build Optimizer
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = optimizer_builder.build(optimizer_cfg, net.parameters())
    if train_cfg.enable_mixed_precision:
        loss_scale = train_cfg.loss_scale_factor
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer, gstep)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # Prepare Input
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # Seed each DataLoader worker differently so augmentation RNG differs.
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.batch_size,
                                             shuffle=True,
                                             num_workers=input_cfg.num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # Training
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    # Outer loop runs once per evaluation interval; the final (possibly
    # shorter) chunk is handled by the modulo below.
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step()
                try:
                    example = next(data_iter)
                except StopIteration:
                    # Dataset exhausted: restart the iterator for a new epoch.
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                # Flatten the example dict to a positional list; indices
                # 11/12 (image_idx/image_shape) stay numpy and are converted
                # here. NOTE(review): relies on dict insertion order — the
                # asserts below guard the expected layout.
                example_tuple = list(example_torch.values())
                example_tuple[11] = torch.from_numpy(example_tuple[11])
                example_tuple[12] = torch.from_numpy(example_tuple[12])
                assert 13 == len(
                    example_tuple), "something write with training input size!"
                # ret_dict = net(example_torch)
                # Training Input form example: split per-pillar point
                # features into separate channel tensors.
                pillar_x = example_tuple[0][:, :, 0].unsqueeze(0).unsqueeze(0)
                pillar_y = example_tuple[0][:, :, 1].unsqueeze(0).unsqueeze(0)
                pillar_z = example_tuple[0][:, :, 2].unsqueeze(0).unsqueeze(0)
                pillar_i = example_tuple[0][:, :, 3].unsqueeze(0).unsqueeze(0)
                num_points_per_pillar = example_tuple[1].float().unsqueeze(0)
                ################################################################
                # Find distance of x, y, z from pillar center
                # assume config_file xyres_16.proto: the 0.16 voxel size and
                # 0.08/-39.6 offsets must match model_cfg.voxel_generator —
                # TODO confirm.
                coors_x = example_tuple[2][:, 3].float()
                coors_y = example_tuple[2][:, 2].float()
                # self.x_offset = self.vx / 2 + pc_range[0]
                # self.y_offset = self.vy / 2 + pc_range[1]
                ################################################################
                # assumes xyres_16
                x_sub = coors_x.unsqueeze(1) * 0.16 + 0.08
                y_sub = coors_y.unsqueeze(1) * 0.16 - 39.6
                # Broadcast per-pillar offsets over the (up to) 100 points
                # per pillar via an outer product with a ones row.
                ones = torch.ones([1, 100],
                                  dtype=torch.float32,
                                  device=pillar_x.device)
                x_sub_shaped = torch.mm(x_sub, ones).unsqueeze(0).unsqueeze(0)
                y_sub_shaped = torch.mm(y_sub, ones).unsqueeze(0).unsqueeze(0)
                num_points_for_a_pillar = pillar_x.size()[3]
                # Mask marking real (vs zero-padded) points in each pillar.
                mask = get_paddings_indicator(num_points_per_pillar,
                                              num_points_for_a_pillar,
                                              axis=0)
                mask = mask.permute(0, 2, 1)
                mask = mask.unsqueeze(1)
                mask = mask.type_as(pillar_x)
                coors = example_tuple[2]
                anchors = example_tuple[6]
                labels = example_tuple[8]
                reg_targets = example_tuple[9]
                input = [
                    pillar_x, pillar_y, pillar_z, pillar_i,
                    num_points_per_pillar, x_sub_shaped, y_sub_shaped, mask,
                    coors, anchors, labels, reg_targets
                ]
                ret_dict = net(input)
                assert 10 == len(
                    ret_dict), "something write with training output size!"
                # ret_dict positional layout: 0:loss, 1:cls_loss, 2:loc_loss,
                # 3:cls_pos_loss, 4:cls_neg_loss, 5:cls_preds,
                # 6:dir_loss_reduced, 7:cls_loss_reduced, 8:loc_loss_reduced,
                # 9:cared
                cls_preds = ret_dict[5]
                loss = ret_dict[0].mean()
                cls_loss_reduced = ret_dict[7].mean()
                loc_loss_reduced = ret_dict[8].mean()
                cls_pos_loss = ret_dict[3]
                cls_neg_loss = ret_dict[4]
                loc_loss = ret_dict[2]
                cls_loss = ret_dict[1]
                dir_loss_reduced = ret_dict[6]
                cared = ret_dict[9]
                labels = example_tuple[8]
                if train_cfg.enable_mixed_precision:
                    # Scale the loss before backward; MixedPrecisionWrapper
                    # un-scales gradients at step time.
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                # if 'anchors_mask' not in example_torch:
                #     num_anchors = example_torch['anchors'].shape[1]
                # else:
                #     num_anchors = int(example_torch['anchors_mask'][0].sum())
                num_anchors = int(example_tuple[7][0].sum())
                global_step = net.get_global_step()
                # Periodic console/TensorBoard logging.
                if global_step % display_step == 0:
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_tuple[0].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])
                    metrics["image_idx"] = example_tuple[11][0]
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                # Time-based checkpointing.
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            # Ensure that all evaluation points are saved forever
            torchplus.train.save_models(eval_checkpoint_dir, [net, optimizer],
                                        net.get_global_step(),
                                        max_to_keep=100)
    except Exception as e:
        # Always checkpoint before propagating a training failure.
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True):
    """Train a VoxelNet/SECOND model specified by a config file.

    Restores the latest checkpoint from ``model_dir`` (if any), trains for
    ``train_cfg.steps`` total steps, and runs a full KITTI-style evaluation
    every ``train_cfg.steps_per_eval`` steps.  Checkpoints are written to
    ``model_dir`` (rolling) and to ``model_dir/eval_checkpoints`` (kept
    forever, one per evaluation point).  Training/eval progress is logged to
    ``model_dir/log.txt`` and to TensorBoard summaries under
    ``model_dir/summary``.

    Args:
        config_path: path to a TrainEvalPipelineConfig text-format protobuf.
        model_dir: checkpoint / log / summary directory (created if missing).
        result_path: directory for evaluation detections; defaults to
            ``model_dir/results``.
        create_folder: if True and ``model_dir`` exists, create a fresh
            sibling folder instead of reusing it.
        display_step: log metrics every this many global steps.
        summary_step: unused here; kept for interface compatibility.
        pickle_result: if True, keep detections in memory and pickle them;
            otherwise write per-frame KITTI label files and re-read them.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)

    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    # Back up the pipeline config next to the checkpoints for reproducibility.
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    # Bird's-eye-view XY range: elements [xmin, ymin, xmax, ymax] of the
    # 6-element point cloud range (fancy indexing -> presumably a numpy array).
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)
    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        # Half-precision weights; metrics stay float to avoid fp16 overflow.
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = optimizer_builder.build(optimizer_cfg, net.parameters())
    if train_cfg.enable_mixed_precision:
        # Loss-scaling wrapper for fp16 training.
        loss_scale = train_cfg.loss_scale_factor
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer, gstep)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # Give each dataloader worker a time-derived seed so augmentation
        # differs across workers and across runs.
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.batch_size,
                                             shuffle=True,
                                             num_workers=input_cfg.num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # TRAINING
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    # NOTE(review): remain_steps is currently unused (see the commented
    # total_loop alternative below).
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    # One outer loop iteration per evaluation interval; the final (possibly
    # shorter) interval is handled inside the loop.
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            # Last interval may be a partial one.
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step()
                try:
                    example = next(data_iter)
                except StopIteration:
                    # Dataloader exhausted: start a new epoch.
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                ret_dict = net(example_torch)
                # box_preds = ret_dict["box_preds"]
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"]
                cls_neg_loss = ret_dict["cls_neg_loss"]
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                dir_loss_reduced = ret_dict["dir_loss_reduced"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                if train_cfg.enable_mixed_precision:
                    # Scale the loss so fp16 gradients don't underflow; the
                    # MixedPrecisionWrapper unscales before the update.
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                # Positive/negative anchor counts for the first sample only.
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    # Per-regression-channel localization loss, averaged
                    # over the batch.
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    # if unlabeled_training:
                    #     metrics["loss"]["diff_rt"] = float(
                    #         diff_loc_loss_reduced.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])
                    # presumably the KITTI frame index of the first sample;
                    # verify against merge_second_batch.
                    metrics["image_idx"] = example['image_idx'][0]
                    # Flatten the nested dict once for text logging and once
                    # (with '/' separators) for TensorBoard tags.
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                # Time-based checkpointing, independent of the eval cadence.
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())

            # Ensure that all evaluation points are saved forever
            torchplus.train.save_models(eval_checkpoint_dir, [net, optimizer],
                                        net.get_global_step(), max_to_keep=100)

            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            prog_bar.start(len(eval_dataset) // eval_input_cfg.batch_size + 1)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:
                    # Keep detections in memory as KITTI-style annos.
                    dt_annos += predict_kitti_to_anno(net, example,
                                                      class_names,
                                                      center_limit_range,
                                                      model_cfg.lidar_input)
                else:
                    # Write per-frame KITTI label files instead.
                    _predict_kitti_to_file(net, example, result_path_step,
                                           class_names, center_limit_range,
                                           model_cfg.lidar_input)
                prog_bar.print_bar()
            sec_per_ex = len(eval_dataset) / (time.time() - t)
            print(f"avg forward time per example: {net.avg_forward_time:.3f}")
            print(
                f"avg postprocess time per example: {net.avg_postprocess_time:.3f}"
            )
            net.clear_time_metrics()
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            result, mAPbbox, mAPbev, mAP3d, mAPaos = get_official_eval_result(
                gt_annos, dt_annos, class_names, return_data=True)
            print(result, file=logf)
            print(result)
            writer.add_text('eval_result', result, global_step)
            # Index [i, 1, 0] selects one difficulty/overlap setting per
            # class -- presumably "moderate"; confirm against the eval code.
            for i, class_name in enumerate(class_names):
                writer.add_scalar('bev_ap:{}'.format(class_name),
                                  mAPbev[i, 1, 0], global_step)
                writer.add_scalar('3d_ap:{}'.format(class_name),
                                  mAP3d[i, 1, 0], global_step)
                writer.add_scalar('aos_ap:{}'.format(class_name),
                                  mAPaos[i, 1, 0], global_step)
            writer.add_scalar('bev_map', np.mean(mAPbev[:, 1, 0]), global_step)
            writer.add_scalar('3d_map', np.mean(mAP3d[:, 1, 0]), global_step)
            writer.add_scalar('aos_map', np.mean(mAPaos[:, 1, 0]), global_step)
            result = get_coco_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            if pickle_result:
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(dt_annos, f)
            writer.add_text('eval_result', result, global_step)
            net.train()
    except Exception as e:
        # Save a checkpoint even on failure so training can resume.
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e

    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def onnx_model_generate(config_path,
                        model_dir,
                        result_path=None,
                        predict_test=False,
                        ckpt_path=None):
    """Build a trained SECOND network and export it to ONNX.

    Restores the latest (or an explicitly given) checkpoint, pulls a single
    batch from the evaluation dataloader, and hands it to ``export_onnx``.
    One traced batch is all the export needs, so the function returns from
    the first loop iteration.

    Args:
        config_path: path to a TrainEvalPipelineConfig text-format protobuf.
        model_dir: directory holding checkpoints (and the default results dir).
        result_path: optional override for the results directory.
        predict_test: selects the 'predict_test' result dir name instead of
            'eval_results'.
        ckpt_path: explicit checkpoint to restore; if None, the latest
            checkpoint in ``model_dir`` is used.

    Returns:
        0 after exporting from the first batch; implicitly None if the
        evaluation dataloader yields nothing.
    """
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)

    # Parse the pipeline config (text-format protobuf).
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)

    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range
    ##########################
    ## Build Voxel Generator
    ##########################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # Bird's-eye-view XY range: [xmin, ymin, xmax, ymax] of the point cloud range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    net = second_builder.build(model_cfg, voxel_generator, target_assigner, 1)
    net.cuda()
    if train_cfg.enable_mixed_precision:
        # Half-precision weights; metrics stay float to avoid fp16 overflow.
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)

    # Restore weights before exporting.
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)

    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=False,
        collate_fn=merge_second_batch)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    net.eval()
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    dt_annos = []
    global_set = None
    print("Generate output labels...")
    bar = ProgressBar()
    bar.start(len(eval_dataset) // input_cfg.batch_size + 1)

    # A single batch suffices for ONNX tracing, so we deliberately return
    # from the first iteration.  (An unreachable ``bar.print_bar()`` that
    # followed the return in the original code has been removed.)
    for example in eval_dataloader:
        example = example_convert_to_torch(example, float_dtype)
        example_tuple = list(example.values())
        # NOTE(review): indices 8/9 are assumed to be the image-shape and
        # image-index numpy arrays produced by merge_second_batch -- confirm
        # if the collate output ordering ever changes.
        batch_image_shape = example_tuple[8]
        example_tuple[8] = torch.from_numpy(example_tuple[8])
        example_tuple[9] = torch.from_numpy(example_tuple[9])
        dt_annos = export_onnx(net, example_tuple, class_names,
                               batch_image_shape, center_limit_range,
                               model_cfg.lidar_input, global_set)
        return 0