def export(originpath="weights", newpath="jsonparamweights", load_name="512_512_ADAM_PRES_18", load_period=1, decode_number=-1, multiperclass=True, nms_thresh=0.45, nms_topk=200, except_class_thresh=0.01): try: _, test_dataset = testdataloader() except Exception: logging.info("The dataset does not exist") exit(0) prediction = Prediction( from_sigmoid=False, num_classes=test_dataset.num_class, decode_number=decode_number, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) origin_weight_path = os.path.join(originpath, load_name) sym_path = os.path.join(origin_weight_path, f'{load_name}-symbol.json') param_path = os.path.join(origin_weight_path, f'{load_name}-{load_period:04d}.params') temp = load_name.split("_") new_weight_path = os.path.join(newpath, load_name) if not os.path.exists(new_weight_path): os.makedirs(new_weight_path) if os.path.exists(sym_path) and os.path.exists(param_path): logging.info(f"loading {os.path.basename(param_path)} weights\n") net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path) else: raise FileExistsError # prepost postnet = PostNet(net=net, auxnet=prediction) try: export_block_for_cplusplus(path=os.path.join(new_weight_path, f"{load_name}_prepost"), block=postnet, data_shape=tuple((int(temp[0]), int(temp[1]))) + tuple((3,)), epoch=load_period, preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공")
def export(originpath="weights", newpath="jsonparamweights", load_name="480_640_ADAM_PCENTER_RES18", load_period=1, topk=200, nms=False, except_class_thresh=0.01, nms_thresh=0.5): scale_factor = 4 # 고정 prediction = Prediction(topk=topk, scale=scale_factor, nms=nms, except_class_thresh=except_class_thresh, nms_thresh=nms_thresh) origin_weight_path = os.path.join(originpath, load_name) sym_path = os.path.join(origin_weight_path, f'{load_name}-symbol.json') param_path = os.path.join(origin_weight_path, f'{load_name}-{load_period:04d}.params') temp = load_name.split("_") new_weight_path = os.path.join(newpath, load_name) if not os.path.exists(new_weight_path): os.makedirs(new_weight_path) if os.path.exists(sym_path) and os.path.exists(param_path): logging.info(f"loading {os.path.basename(param_path)} weights\n") net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path) else: raise FileExistsError # prepost postnet = PostNet(net=net, auxnet=prediction) try: export_block_for_cplusplus( path=os.path.join(new_weight_path, f"{load_name}_prepost"), block=postnet, data_shape=tuple((int(temp[0]), int(temp[1]))) + tuple((3, )), epoch=load_period, preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공")
def run( image_list=True, # True일 때, 폴더에 있는 이미지(jpg)들 전부다 평가 / False일 때, 한장 평가 # image_path='Dataset/test/2. csm_meng_meng_baby_1_88cad0f74f.jpg', image_path='Dataset/test', weight_path="weights", load_name="480_640_ADAM_PCENTER_RES18", load_period=200, GPU_COUNT=0, topk=100, plot_class_thresh=0.5, image_save_path="result_image", image_show=False, image_save=True): if GPU_COUNT <= 0: ctx = mx.cpu(0) elif GPU_COUNT > 0: ctx = mx.gpu(0) # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if GPU_COUNT > 0: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: logging.info(f'Running on {ctx}') logging.info(f"test {load_name}") scale_factor = 4 # 고정 logging.info(f"scale factor {scale_factor}") netheight = int(load_name.split("_")[0]) netwidth = int(load_name.split("_")[1]) if not isinstance(netheight, int) and not isinstance(netwidth, int): logging.info("height is not int") logging.info("width is not int") raise ValueError else: logging.info(f"network input size : {(netheight, netwidth)}") try: _, test_dataset = testdataloader() except Exception: logging.info("The dataset does not exist") exit(0) weight_path = os.path.join(weight_path, load_name) sym = os.path.join(weight_path, f'{load_name}-symbol.json') params = os.path.join(weight_path, f'{load_name}-{load_period:04d}.params') logging.info("symbol model test") if os.path.exists(sym) and os.path.exists(params): logging.info(f"loading {os.path.basename(params)} weights\n") net = gluon.SymbolBlock.imports(sym, ['data'], params, ctx=ctx) else: raise FileExistsError try: net = export_block_for_cplusplus( block=net, data_shape=tuple((netheight, netwidth)) + tuple((3, )), preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=ctx) except Exception as E: logging.error(f"adding preprocessing layer 실패 : {E}") else: logging.info(f"adding preprocessing layer 성공 ") net.hybridize(active=True, static_alloc=True, static_shape=True) prediction = Prediction(topk=topk, scale=scale_factor) if image_list: types = ('*.jpg', '*.png') images_path = [] for type in types: images_path.extend(glob.glob(os.path.join(image_path, type))) if images_path: for img_path in tqdm(images_path): name = os.path.splitext(os.path.basename(img_path))[0] image = cv2.imread(img_path, cv2.IMREAD_COLOR) height, width, _ = image.shape logging.info(f"real input size : {(height, width)}") origin_image = image.copy() image = cv2.resize(image, (netwidth, netheight), interpolation=3) image[:, :, (0, 1, 2)] = image[:, :, (2, 1, 0)] # BGR to RGB image = mx.nd.array(image, ctx=ctx) image = image.expand_dims(axis=0) heatmap_pred, offset_pred, wh_pred = net(image) ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred) bbox = box_resize(bboxes[0], (netwidth, netheight), (width, height)) plot_bbox(origin_image, bbox, scores=scores[0], labels=ids[0], thresh=plot_class_thresh, reverse_rgb=False, class_names=test_dataset.classes, image_show=image_show, image_save=image_save, image_save_path=image_save_path, image_name=name) else: raise FileNotFoundError else: name = os.path.splitext(os.path.basename(image_path))[0] image = cv2.imread(image_path, cv2.IMREAD_COLOR) height, width, _ = image.shape logging.info(f"real input size : {(height, width)}") origin_image = image.copy() image = cv2.resize(image, (netwidth, netheight), interpolation=3) image[:, :, (0, 1, 2)] = image[:, :, (2, 1, 0)] # BGR to RGB image = mx.nd.array(image, ctx=ctx) image = image.expand_dims(axis=0) heatmap_pred, offset_pred, wh_pred = net(image) ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred) bbox = box_resize(bboxes[0], (netwidth, netheight), (width, height)) plot_bbox(origin_image, bbox, scores=scores[0], labels=ids[0], thresh=plot_class_thresh, reverse_rgb=False, class_names=test_dataset.classes, image_show=image_show, image_save=image_save, image_save_path=image_save_path, image_name=name) cv2.destroyAllWindows()
def run(weight_path="weights", load_name="512_512_ADAM_PEFF_0", load_period=100, GPU_COUNT=0, decode_number=5000, multiperclass=True, nms_thresh=0.5, nms_topk=500, except_class_thresh=0.05, plot_class_thresh=0.5, video_name="webcam", video_save_path="result_video", video_show=True, video_save=True): if video_save: if not os.path.exists(video_save_path): os.makedirs(video_save_path) if GPU_COUNT <= 0: ctx = mx.cpu(0) elif GPU_COUNT > 0: ctx = mx.gpu(0) # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if GPU_COUNT > 0: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: logging.info(f'Running on {ctx}') logging.info(f"test {load_name}") netheight = int(load_name.split("_")[0]) netwidth = int(load_name.split("_")[1]) if not isinstance(netheight, int) and not isinstance(netwidth, int): logging.info("height is not int") logging.info("width is not int") raise ValueError else: logging.info(f"network input size : {(netheight, netwidth)}") try: _, test_dataset = testdataloader() except Exception: logging.info("The dataset does not exist") exit(0) weight_path = os.path.join(weight_path, load_name) sym = os.path.join(weight_path, f'{load_name}-symbol.json') params = os.path.join(weight_path, f'{load_name}-{load_period:04d}.params') logging.info("symbol model test") if os.path.exists(sym) and os.path.exists(params): logging.info(f"loading {os.path.basename(params)} weights\n") net = gluon.SymbolBlock.imports(sym, ['data'], params, ctx=ctx) else: raise FileExistsError try: net = export_block_for_cplusplus( block=net, data_shape=tuple((netheight, netwidth)) + tuple((3, )), preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=ctx) except Exception as E: logging.error(f"adding preprocessing layer 실패 : {E}") else: logging.info(f"adding preprocessing layer 성공 ") net.hybridize(active=True, static_alloc=True, static_shape=True) # BoxEncoder, BoxDecoder 에서 같은 값을 가져야함 prediction = Prediction(from_sigmoid=False, num_classes=test_dataset.num_class, decode_number=decode_number, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) fourcc = cv2.VideoWriter_fourcc(*'DIVX') cap = cv2.VideoCapture(0) fps = cap.get(cv2.CAP_PROP_FPS) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) out = cv2.VideoWriter(os.path.join(video_save_path, f'{video_name}.avi'), fourcc, fps, (width, height)) logging.info(f"real input size : {(height, width)}") while True: ret, image = cap.read() if ret: origin_image = image.copy() image = cv2.resize(image, (netwidth, netheight), interpolation=3) image[:, :, (0, 1, 2)] = image[:, :, (2, 1, 0)] # BGR to RGB image = mx.nd.array(image, ctx=ctx) image = image.expand_dims(axis=0) cls_preds, box_preds, anchors = net(image) ids, scores, bboxes = prediction(cls_preds, box_preds, anchors) bbox = box_resize(bboxes[0], (netwidth, netheight), (width, height)) result = plot_bbox(origin_image, bbox, scores=scores[0], labels=ids[0], thresh=plot_class_thresh, reverse_rgb=False, class_names=test_dataset.classes) if video_save: out.write(result) if video_show: cv2.imshow(video_name, result) cv2.waitKey(1) cap.release() out.release() cv2.destroyAllWindows()
def onnx_export(path="weights", newpath="exportweights", load_name="608_608_SGD_PDark_53", load_period=1, target_size=(768, 768), anchors={"shallow": [(10, 13), (16, 30), (33, 23)], "middle": [(30, 61), (62, 45), (59, 119)], "deep": [(116, 90), (156, 198), (373, 326)]}, dtype=np.float32): try: _, test_dataset = testdataloader() except Exception: logging.info("The dataset does not exist") exit(0) weight_path = os.path.join(path, load_name) if not os.path.exists(weight_path): raise FileExistsError params = os.path.join(weight_path, f'{load_period}.params') temp = load_name.split("_") version = int(temp[-1]) net = YoloV3output(Darknetlayer=version, anchors=anchors, num_classes=test_dataset.num_class) net.load_parameters(params, allow_missing=True, ignore_extra=True) anchoroffstnet = AnchorOffstNet(net=net, version=version, anchors=anchors, target_size=target_size) new_weight_path = os.path.join(newpath, load_name) if not os.path.exists(new_weight_path): os.makedirs(new_weight_path) newname = str(target_size[0]) + "_" + str(target_size[1]) + "_" + temp[2] + "_" + temp[3] + "_" + temp[4] sym_pre = os.path.join(new_weight_path, f'{newname}_pre-symbol.json') params_pre = os.path.join(new_weight_path, f'{newname}_pre-{load_period:04d}.params') onnx_pre_file_path = os.path.join(new_weight_path, f"{newname}_pre.onnx") export_block_for_cplusplus(path=os.path.join(new_weight_path, f"{newname}_pre"), block=anchoroffstnet, data_shape=tuple(target_size) + tuple((3,)), epoch=load_period, preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', remove_amp_cast=True) try: export_block_for_cplusplus(path=os.path.join(new_weight_path, f"{newname}_pre"), block=anchoroffstnet, data_shape=tuple(target_size) + tuple((3,)), epoch=load_period, preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공") try: onnx_mxnet.export_model(sym=sym_pre, params=params_pre, input_shape=[tuple((1,)) + tuple(target_size) + tuple((3,))], input_type=dtype, onnx_file_path=onnx_pre_file_path, verbose=False) except Exception as E: logging.error(f"ONNX model export 예외 발생 : {E}") else: logging.info(f"ONNX model export 성공") try: check_onnx(onnx_pre_file_path) logging.info(f"{os.path.basename(onnx_pre_file_path)} saved completed") except Exception as E: logging.error(f"ONNX model check 예외 발생 : {E}") else: logging.info("ONNX model check completed")
def run(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], graphviz=True, epoch=100, input_size=[512, 512], batch_size=16, batch_log=100, batch_interval=10, subdivision=4, train_dataset_path="Dataset/train", valid_dataset_path="Dataset/valid", multiscale=True, factor_scale=[8, 5], data_augmentation=True, num_workers=4, optimizer="ADAM", lambda_off=1, lambda_size=0.1, save_period=5, load_period=10, learning_rate=0.001, decay_lr=0.999, decay_step=10, GPU_COUNT=0, base=18, pretrained_base=True, pretrained_path="modelparam", AMP=True, valid_size=8, eval_period=5, tensorboard=True, valid_graph_path="valid_Graph", using_mlflow=True, topk=100, plot_class_thresh=0.5): ''' AMP 가 모든 연산을 지원하지는 않는다. modulated convolution을 지원하지 않음 ''' if GPU_COUNT == 0: ctx = mx.cpu(0) AMP = False elif GPU_COUNT == 1: ctx = mx.gpu(0) else: ctx = [mx.gpu(i) for i in range(GPU_COUNT)] # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if isinstance(ctx, (list, tuple)): for i, c in enumerate(ctx): free_memory, total_memory = mx.context.gpu_memory_info(i) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: if GPU_COUNT == 1: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: logging.info(f'Running on {ctx}') if GPU_COUNT > 0 and batch_size < GPU_COUNT: logging.info("batch size must be greater than gpu number") exit(0) if AMP: amp.init() if multiscale: logging.info("Using MultiScale") if data_augmentation: logging.info("Using Data Augmentation") logging.info("training Center Detector") input_shape = (1, 3) + tuple(input_size) scale_factor = 4 # 고정 logging.info(f"scale factor {scale_factor}") try: train_dataloader, train_dataset = traindataloader( multiscale=multiscale, factor_scale=factor_scale, augmentation=data_augmentation, path=train_dataset_path, input_size=input_size, batch_size=batch_size, batch_interval=batch_interval, num_workers=num_workers, shuffle=True, mean=mean, std=std, scale_factor=scale_factor, make_target=True) valid_dataloader, valid_dataset = validdataloader( path=valid_dataset_path, input_size=input_size, batch_size=valid_size, num_workers=num_workers, shuffle=True, mean=mean, std=std, scale_factor=scale_factor, make_target=True) except Exception as E: logging.info(E) exit(0) train_update_number_per_epoch = len(train_dataloader) if train_update_number_per_epoch < 1: logging.warning("train batch size가 데이터 수보다 큼") exit(0) valid_list = glob.glob(os.path.join(valid_dataset_path, "*")) if valid_list: valid_update_number_per_epoch = len(valid_dataloader) if valid_update_number_per_epoch < 1: logging.warning("valid batch size가 데이터 수보다 큼") exit(0) num_classes = train_dataset.num_class # 클래스 수 name_classes = train_dataset.classes optimizer = optimizer.upper() if pretrained_base: model = str(input_size[0]) + "_" + str( input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base) else: model = str(input_size[0]) + "_" + str( input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base) weight_path = f"weights/{model}" sym_path = os.path.join(weight_path, f'{model}-symbol.json') param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params') if os.path.exists(param_path) and os.path.exists(sym_path): start_epoch = load_period logging.info(f"loading {os.path.basename(param_path)} weights\n") net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx) else: start_epoch = 0 net = CenterNet(base=base, heads=OrderedDict([('heatmap', { 'num_output': num_classes, 'bias': -2.19 }), ('offset', { 'num_output': 2 }), ('wh', { 'num_output': 2 })]), head_conv_channel=64, pretrained=pretrained_base, root=pretrained_path, use_dcnv2=False, ctx=ctx) if isinstance(ctx, (list, tuple)): net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.summary(mx.nd.ones(shape=input_shape, ctx=ctx)) ''' active (bool, default True) – Whether to turn hybrid on or off. static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase. static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower. ''' if multiscale: net.hybridize(active=True, static_alloc=True, static_shape=False) else: net.hybridize(active=True, static_alloc=True, static_shape=True) if start_epoch + 1 >= epoch + 1: logging.info("this model has already been optimized") exit(0) if tensorboard: summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False) if isinstance(ctx, (list, tuple)): net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.forward(mx.nd.ones(shape=input_shape, ctx=ctx)) summary.add_graph(net) if graphviz: gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model) # optimizer unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size step = unit * decay_step lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate) for p in net.collect_params().values(): if p.grad_req != "null": p.grad_req = 'add' if AMP: ''' update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. ''' if optimizer.upper() == "ADAM": trainer = gluon.Trainer( net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False }, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer( net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False }, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "SGD": trainer = gluon.Trainer( net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0001, "momentum": 0.9, 'multi_precision': False }, update_on_kvstore=False) # for Dynamic loss scaling else: logging.error("optimizer not selected") exit(0) amp.init_trainer(trainer) else: if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False }) elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False }) elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0001, "momentum": 0.9, 'multi_precision': False }) else: logging.error("optimizer not selected") exit(0) heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4) normedl1loss = NormedL1Loss() prediction = Prediction(batch_size=valid_size, topk=topk, scale=scale_factor) precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes) start_time = time.time() for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch): heatmap_loss_sum = 0 offset_loss_sum = 0 wh_loss_sum = 0 time_stamp = time.time() ''' target generator를 train_dataloader에서 만들어 버리는게 학습 속도가 훨씬 빠르다. ''' for batch_count, (image, _, heatmap, offset_target, wh_target, mask_target, _) in enumerate(train_dataloader, start=1): td_batch_size = image.shape[0] image_split = mx.nd.split(data=image, num_outputs=subdivision, axis=0) heatmap_split = mx.nd.split(data=heatmap, num_outputs=subdivision, axis=0) offset_target_split = mx.nd.split(data=offset_target, num_outputs=subdivision, axis=0) wh_target_split = mx.nd.split(data=wh_target, num_outputs=subdivision, axis=0) mask_target_split = mx.nd.split(data=mask_target, num_outputs=subdivision, axis=0) if subdivision == 1: image_split = [image_split] heatmap_split = [heatmap_split] offset_target_split = [offset_target_split] wh_target_split = [wh_target_split] mask_target_split = [mask_target_split] ''' autograd 설명 https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html ''' with autograd.record(train_mode=True): heatmap_all_losses = [] offset_all_losses = [] wh_all_losses = [] for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip( image_split, heatmap_split, offset_target_split, wh_target_split, mask_target_split): if GPU_COUNT <= 1: image_part = gluon.utils.split_and_load( image_part, [ctx], even_split=False) heatmap_part = gluon.utils.split_and_load( heatmap_part, [ctx], even_split=False) offset_target_part = gluon.utils.split_and_load( offset_target_part, [ctx], even_split=False) wh_target_part = gluon.utils.split_and_load( wh_target_part, [ctx], even_split=False) mask_target_part = gluon.utils.split_and_load( mask_target_part, [ctx], even_split=False) else: image_part = gluon.utils.split_and_load( image_part, ctx, even_split=False) heatmap_part = gluon.utils.split_and_load( heatmap_part, ctx, even_split=False) offset_target_part = gluon.utils.split_and_load( offset_target_part, ctx, even_split=False) wh_target_part = gluon.utils.split_and_load( wh_target_part, ctx, even_split=False) mask_target_part = gluon.utils.split_and_load( mask_target_part, ctx, even_split=False) # prediction, target space for Data Parallelism heatmap_losses = [] offset_losses = [] wh_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, heatmap_target, offset_target, wh_target, mask_target in zip( image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part): heatmap_pred, offset_pred, wh_pred = net(img) heatmap_loss = heatmapfocalloss( heatmap_pred, heatmap_target) offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size heatmap_losses.append(heatmap_loss.asscalar()) offset_losses.append(offset_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) total_loss.append(heatmap_loss + offset_loss + wh_loss) if AMP: with amp.scale_loss(total_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(total_loss) heatmap_all_losses.append(sum(heatmap_losses)) offset_all_losses.append(sum(offset_losses)) wh_all_losses.append(sum(wh_losses)) trainer.step(batch_size=td_batch_size, ignore_stale_grad=False) # 비우기 for p in net.collect_params().values(): p.zero_grad() heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size offset_loss_sum += sum(offset_all_losses) / td_batch_size wh_loss_sum += sum(wh_all_losses) / td_batch_size if batch_count % batch_log == 0: logging.info( f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],' f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],' f'[Lr = {trainer.learning_rate}]' f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]' f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]' f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]') time_stamp = time.time() train_heatmap_loss_mean = np.divide(heatmap_loss_sum, train_update_number_per_epoch) train_offset_loss_mean = np.divide(offset_loss_sum, train_update_number_per_epoch) train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch) train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean logging.info( f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}" ) if i % eval_period == 0 and valid_list: heatmap_loss_sum = 0 offset_loss_sum = 0 wh_loss_sum = 0 # loss 구하기 for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader: vd_batch_size = image.shape[0] if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) heatmap_split = gluon.utils.split_and_load( heatmap_all, [ctx], even_split=False) offset_target_split = gluon.utils.split_and_load( offset_target_all, [ctx], even_split=False) wh_target_split = gluon.utils.split_and_load( wh_target_all, [ctx], even_split=False) mask_target_split = gluon.utils.split_and_load( mask_target_all, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) heatmap_split = gluon.utils.split_and_load( heatmap_all, ctx, even_split=False) offset_target_split = gluon.utils.split_and_load( offset_target_all, ctx, even_split=False) wh_target_split = gluon.utils.split_and_load( wh_target_all, ctx, even_split=False) mask_target_split = gluon.utils.split_and_load( mask_target_all, ctx, even_split=False) # prediction, target space for Data Parallelism heatmap_losses = [] offset_losses = [] wh_losses = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip( image, label, heatmap_split, offset_target_split, wh_target_split, mask_target_split): gt_box = lb[:, :, :4] gt_id = lb[:, :, 4:5] heatmap_pred, offset_pred, wh_pred = net(img) id, score, bbox = prediction(heatmap_pred, offset_pred, wh_pred) precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score, gt_boxes=gt_box * scale_factor, gt_labels=gt_id) heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target) offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size heatmap_losses.append(heatmap_loss.asscalar()) offset_losses.append(offset_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size offset_loss_sum += sum(offset_losses) / vd_batch_size wh_loss_sum += sum(wh_losses) / vd_batch_size valid_heatmap_loss_mean = np.divide(heatmap_loss_sum, valid_update_number_per_epoch) valid_offset_loss_mean = np.divide(offset_loss_sum, valid_update_number_per_epoch) valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch) valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean logging.info( f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}" ) AP_appender = [] round_position = 2 class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list( ) for j, c, p, r in zip(range(len(recall)), class_name, precision, recall): name, AP = precision_recall.get_AP(c, p, r) logging.info( f"class {j}'s {name} AP : {round(AP * 100, round_position)}%" ) AP_appender.append(AP) mAP_result = np.mean(AP_appender) logging.info(f"mAP : {round(mAP_result * 100, round_position)}%") precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall, threshold=threshold, AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i) precision_recall.reset() if tensorboard: # gpu N 개를 대비한 코드 (Data Parallelism) dataloader_iter = iter(valid_dataloader) image, label, _, _, _, _, _ = next(dataloader_iter) if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) ground_truth_colors = {} for k in range(num_classes): ground_truth_colors[k] = (0, 0, 1) batch_image = [] heatmap_image = [] for img, lb in zip(image, label): gt_boxes = lb[:, :, :4] gt_ids = lb[:, :, 4:5] heatmap_pred, offset_pred, wh_pred = net(img) ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred) for ig, gt_id, gt_box, heatmap, id, score, bbox in zip( img, gt_ids, gt_boxes, heatmap_pred, ids, scores, bboxes): ig = ig.transpose((1, 2, 0)) * mx.nd.array( std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context) ig = (ig * 255).clip(0, 255) # heatmap 그리기 heatmap = mx.nd.multiply(heatmap, 255.0) # 0 ~ 255 범위로 바꾸기 heatmap = mx.nd.max( heatmap, axis=0, keepdims=True) # channel 축으로 가장 큰것 뽑기 heatmap = mx.nd.transpose( heatmap, axes=(1, 2, 0)) # (height, width, channel=1) heatmap = mx.nd.repeat( heatmap, repeats=3, axis=-1) # (height, width, channel=3) heatmap = heatmap.asnumpy( ) # mxnet.ndarray -> numpy.ndarray heatmap = cv2.resize(heatmap, dsize=(input_size[1], input_size[0])) # 사이즈 원복 heatmap = heatmap.astype("uint8") # float32 -> uint8 heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET) heatmap[:, :, (0, 1, 2)] = heatmap[:, :, (2, 1, 0)] # BGR -> RGB heatmap = np.transpose( heatmap, axes=(2, 0, 1)) # (channel=3, height, width) # ground truth box 그리기 ground_truth = plot_bbox( ig, gt_box * scale_factor, scores=None, labels=gt_id, thresh=None, reverse_rgb=True, class_names=valid_dataset.classes, absolute_coordinates=True, colors=ground_truth_colors) # prediction box 그리기 prediction_box = plot_bbox( ground_truth, bbox, scores=score, labels=id, thresh=plot_class_thresh, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True) # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다. prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB) prediction_box = np.transpose(prediction_box, axes=(2, 0, 1)) batch_image.append( prediction_box) # (batch, channel, height, width) heatmap_image.append(heatmap) all_image = np.concatenate( [np.array(batch_image), np.array(heatmap_image)], axis=-1) summary.add_image(tag="valid_result", image=all_image, global_step=i) summary.add_scalar(tag="heatmap_loss", value={ "train_heatmap_loss_mean": train_heatmap_loss_mean, "valid_heatmap_loss_mean": valid_heatmap_loss_mean }, global_step=i) summary.add_scalar(tag="offset_loss", value={ "train_offset_loss_mean": train_offset_loss_mean, "valid_offset_loss_mean": valid_offset_loss_mean }, global_step=i) summary.add_scalar(tag="wh_loss", value={ "train_wh_loss_mean": train_wh_loss_mean, "valid_wh_loss_mean": valid_wh_loss_mean }, global_step=i) summary.add_scalar(tag="total_loss", value={ "train_total_loss": train_total_loss_mean, "valid_total_loss": valid_total_loss_mean }, global_step=i) params = net.collect_params().values() if GPU_COUNT > 1: for c in ctx: for p in params: summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default') else: for p in params: summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default') if i % save_period == 0: if not os.path.exists(weight_path): os.makedirs(weight_path) ''' Hybrid models can be serialized as JSON files using the export function Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface. When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc. ''' if GPU_COUNT >= 1: context = mx.gpu(0) else: context = mx.cpu(0) postnet = PostNet(net=net, auxnet=prediction) # 새로운 객체가 생성 try: net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True) net.save_parameters(os.path.join(weight_path, f"{i}.params")) # onnx 추출용 # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 export_block_for_cplusplus( path=os.path.join(weight_path, f"{model}_prepost"), block=postnet, data_shape=tuple(input_size) + tuple((3, )), epoch=i, preprocess= True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=context, remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공") net.collect_params().reset_ctx(ctx) end_time = time.time() learning_time = end_time - start_time logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H") logging.info("optimization completed") if using_mlflow: ml.log_metric("learning time", round(learning_time / 3600, 2))
def export( originpath="weights", newpath="exportweights", load_name="512_512_ADAM_PVGG16_512", load_period=1, target_size=(768, 768), box_sizes300=[21, 45, 101.25, 157.5, 213.75, 270, 326.25], box_ratios300=[[1, 2, 0.5]] + # conv4_3 [[1, 2, 0.5, 3, 1.0 / 3]] * 3 + # conv7, conv8_2, conv9_2, conv10_2 [[1, 2, 0.5]] * 2, # conv11_2, conv12_2 box_sizes512=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72], box_ratios512=[[1, 2, 0.5]] + # conv4_3 [[1, 2, 0.5, 3, 1.0 / 3]] * 4 + # conv7, conv8_2, conv9_2, conv10_2 [[1, 2, 0.5]] * 2, # conv11_2, conv12_2 anchor_box_offset=(0.5, 0.5), anchor_box_clip=False, dtype=np.float32): _, test_dataset = testdataloader() weight_path = os.path.join(originpath, load_name) if not os.path.exists(weight_path): raise FileExistsError params = os.path.join(weight_path, f'{load_period}.params') temp = load_name.split("_") version = int(temp[-1]) if version == 300: box_sizes = box_sizes300 box_ratios = box_ratios300 elif version == 512: box_sizes = box_sizes512 box_ratios = box_ratios512 net = SSD_VGG16_Except_Anchor(version=int(load_name.split("_")[-1]), input_size=target_size, box_sizes=box_sizes, box_ratios=box_ratios, num_classes=test_dataset.num_class) net.load_parameters(params, allow_missing=True, ignore_extra=True) ''' vgg16을 AnchorNet에서 선언한 순간 내부의 auxiliary param으로 인식하기 때문에 반드시 initialization을 해줘야 하는데, (외부에서 선언해서 보내면 이럴 필요없고 net.export만 수정해주면 된다. defer init을 쓴 상태라서 실제 forward를 한번 해줘야 한다.(anchornet.forward(mx.nd.ones(shape=(1, 3) + target_size, ctx=ctx))) 현재는 밖에서 보내면, 순간 내부의 auxiliary param으로 인식하지는 않아서, forward를 할 필요는 없다. 어쨌든 두 방법 모두 사용하지 않는 변수가 생기기 때문에, net.export(내부 코드)의 assert name in aux_names 부분을 주석 처리 해야한다. 단 위와 같이 저장 한 경우, json, param을 불러 올 때, 아래와 같이 allow_missing=True, ignore_extra=True 인자를 gluon.SymbolBlock.imports(내부코드)를 수정해야 한다. ret.collect_params().load 함수에 allow_missing=True, ignore_extra=True 를 주면 된다 net = gluon.SymbolBlock.imports(sym, ['data'], params, ctx=ctx, allow_missing=True, ignore_extra=True) --> 인자로 주게 만들어 놓지... < 상세 설명 > target_size에 맞는 anchor를 얻기 위해서는 아래의 AnchorNet에서 VGG16 네트워크를 또 한번 호출해야 한다. -> 이렇게 되면 새롭게 호출하는 VGG16가 새로 저장하는 json에 추가적으로 써지고, 파라미터도 저장이 되기 때문에 비효율적이다.(해결책? 아직은...) 이에 따라 또다른 문제가 생기는데(정확히 말해서 net.export에 내가 말하고자 하는 기능이 고려가 안되있다.), 현재 net.export는 할당한 파라미터들을 forward 에서 다 사용하지 않으면, AssertionError를 발생시킨다. 실제 연산에 사용되는 파라미터만 저장하려고 하기 떄문에 이러한 오류가 발생하는데, 그냥 다 저장하게 net.export의 내부 코드 한줄을 수정하면 된다. 이렇게 저장한 json, param을 불러 올때는 아래와 같이 불러와야 하는데, allow_missing=True, ignore_extra=True 는 본인이 직접 추가한 것이다.(gluon.SymbolBlock.imports 에는 allow_missing, ignore_extra 인자가 고려 되어 있지 않다.) gluon.SymbolBlock.imports(sym, ['data'], params, ctx=ctx, allow_missing=True, ignore_extra=True) 아래와 같이 gluon.SymbolBlock.imports을 수정하면 된다. def imports(symbol_file, input_names, param_file=None, ctx=None, allow_missing=True, ignore_extra=True): sym = symbol.load(symbol_file) if isinstance(input_names, str): input_names = [input_names] if param_file is None: # Get a valid type inference by using fp32 inputs = [symbol.var(i, dtype=mx_real_t) for i in input_names] else: # Do not specify type, rely on saved params type instead inputs = [symbol.var(i) for i in input_names] ret = SymbolBlock(sym, inputs) if param_file is not None: # allow_missing=True, ignore_extra=True 추가 함. ret.collect_params().load(param_file, ctx=ctx, cast_dtype=True, dtype_source='saved', allow_missing=allow_missing, ignore_extra=ignore_extra) return ret 따라서 안쓰는 파라미터를 저장하지 않으면 net.export 내부를 수정해야 한다.(아래 설명) ''' anchornet = AnchorNet(net=net, version=version, box_sizes300=box_sizes300, box_ratios300=box_ratios300, box_sizes512=box_sizes512, box_ratios512=box_ratios512, anchor_box_clip=anchor_box_clip, anchor_box_offset=anchor_box_offset, target_size=target_size) new_weight_path = os.path.join(newpath, load_name) if not os.path.exists(new_weight_path): os.makedirs(new_weight_path) newname = str(target_size[0]) + "_" + str( target_size[1]) + "_" + temp[2] + "_" + temp[3] + "_" + temp[4] sym_pre = os.path.join(new_weight_path, f'{newname}_pre-symbol.json') params_pre = os.path.join(new_weight_path, f'{newname}_pre-{load_period:04d}.params') onnx_pre_file_path = os.path.join(new_weight_path, f"{newname}_pre.onnx") try: ''' def export(self, path, epoch=0, remove_amp_cast=True): """Export HybridBlock to json format that can be loaded by `SymbolBlock.imports`, `mxnet.mod.Module` or the C++ interface. .. note:: When there are only one input, it will have name `data`. When there Are more than one inputs, they will be named as `data0`, `data1`, etc. Parameters ---------- path : str Path to save model. Two files `path-symbol.json` and `path-xxxx.params` will be created, where xxxx is the 4 digits epoch number. epoch : int Epoch number of saved model. """ if not self._cached_graph: raise RuntimeError( "Please first call block.hybridize() and then run forward with " "this block at least once before calling export.") sym = self._cached_graph[1] sym.save('%s-symbol.json'%path, remove_amp_cast=remove_amp_cast) arg_names = set(sym.list_arguments()) aux_names = set(sym.list_auxiliary_states()) arg_dict = {} for name, param in self.collect_params().items(): if name in arg_names: arg_dict['arg:%s'%name] = param._reduce() else: #assert name in aux_names # 여기 주석 처리 해야함 arg_dict['aux:%s'%name] = param._reduce() ndarray.save('%s-%04d.params'%(path, epoch), arg_dict) ''' export_block_for_cplusplus( path=os.path.join(new_weight_path, f"{newname}_pre"), block=anchornet, data_shape=tuple(target_size) + tuple((3, )), epoch=load_period, preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공") try: onnx_mxnet.export_model( sym=sym_pre, params=params_pre, input_shape=[tuple((1, )) + tuple(target_size) + tuple((3, ))], input_type=dtype, onnx_file_path=onnx_pre_file_path, verbose=False) except Exception as E: logging.error(f"ONNX model export 예외 발생 : {E}") else: logging.info(f"ONNX model export 성공") try: check_onnx(onnx_pre_file_path) logging.info(f"{os.path.basename(onnx_pre_file_path)} saved completed") except Exception as E: logging.error(f"ONNX model check 예외 발생 : {E}") else: logging.info("ONNX model check completed")
def run(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], offset_alloc_size=(64, 64), anchors={"shallow": [(10, 13), (16, 30), (33, 23)], "middle": [(30, 61), (62, 45), (59, 119)], "deep": [(116, 90), (156, 198), (373, 326)]}, graphviz=False, epoch=100, input_size=[416, 416], batch_log=100, batch_size=16, batch_interval=10, subdivision=4, train_dataset_path="Dataset/train", valid_dataset_path="Dataset/valid", multiscale=False, factor_scale=[13, 5], ignore_threshold=0.5, dynamic=False, data_augmentation=True, num_workers=4, optimizer="ADAM", save_period=5, load_period=10, learning_rate=0.001, decay_lr=0.999, decay_step=10, GPU_COUNT=0, Darknetlayer=53, pretrained_base=True, pretrained_path="modelparam", AMP=True, valid_size=8, eval_period=5, tensorboard=True, valid_graph_path="valid_Graph", using_mlflow=True, multiperclass=True, nms_thresh=0.5, nms_topk=500, iou_thresh=0.5, except_class_thresh=0.05, plot_class_thresh=0.5): if GPU_COUNT == 0: ctx = mx.cpu(0) AMP = False elif GPU_COUNT == 1: ctx = mx.gpu(0) else: ctx = [mx.gpu(i) for i in range(GPU_COUNT)] # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if isinstance(ctx, (list, tuple)): for i, c in enumerate(ctx): free_memory, total_memory = mx.context.gpu_memory_info(i) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB') else: if GPU_COUNT == 1: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB') else: logging.info(f'Running on {ctx}') # 입력 사이즈를 32의 배수로 지정해 버리기 - stride가 일그러지는 것을 막기 위함 if input_size[0] % 32 != 0 and input_size[1] % 32 != 0: logging.info("The input size must be a multiple of 32") exit(0) if GPU_COUNT > 0 and batch_size < GPU_COUNT: logging.info("batch size must be greater than gpu number") exit(0) if AMP: amp.init() if multiscale: logging.info("Using MultiScale") if data_augmentation: logging.info("Using Data Augmentation") logging.info("training YoloV3 Detector") input_shape = (1, 3) + tuple(input_size) try: net = Yolov3(Darknetlayer=Darknetlayer, anchors=anchors, pretrained=False, ctx=mx.cpu()) train_dataloader, train_dataset = traindataloader(multiscale=multiscale, factor_scale=factor_scale, augmentation=data_augmentation, path=train_dataset_path, input_size=input_size, batch_size=batch_size, batch_interval=batch_interval, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, ignore_threshold=ignore_threshold, dynamic=dynamic, from_sigmoid=False, make_target=True) valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path, input_size=input_size, batch_size=valid_size, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, ignore_threshold=ignore_threshold, dynamic=dynamic, from_sigmoid=False, make_target=True) except Exception: logging.info("dataset 없음") exit(0) train_update_number_per_epoch = len(train_dataloader) if train_update_number_per_epoch < 1: logging.warning("train batch size가 데이터 수보다 큼") exit(0) valid_list = glob.glob(os.path.join(valid_dataset_path, "*")) if valid_list: valid_update_number_per_epoch = len(valid_dataloader) if valid_update_number_per_epoch < 1: logging.warning("valid batch size가 데이터 수보다 큼") exit(0) num_classes = train_dataset.num_class # 클래스 수 name_classes = train_dataset.classes optimizer = optimizer.upper() if pretrained_base: model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "Dark_" + str(Darknetlayer) else: model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_Dark_" + str(Darknetlayer) weight_path = f"weights/{model}" sym_path = os.path.join(weight_path, f'{model}-symbol.json') param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params') if os.path.exists(param_path) and os.path.exists(sym_path): start_epoch = load_period logging.info(f"loading {os.path.basename(param_path)} weights\n") net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx) else: start_epoch = 0 ''' mxnet c++에서 arbitrary input image 를 받기 위한 전략 alloc_size : tuple of int, default is (128, 128) For advanced users. Define `alloc_size` to generate large enough offset maps, which will later saved in parameters. During inference, we support arbitrary input image by cropping corresponding area of the anchor map. This allow us to export to symbol so we can run it in c++, Scalar, etc. ''' net = Yolov3(Darknetlayer=Darknetlayer, input_size=input_size, anchors=anchors, num_classes=num_classes, # foreground만 pretrained=pretrained_base, pretrained_path=pretrained_path, alloc_size=offset_alloc_size, ctx=ctx) if isinstance(ctx, (list, tuple)): net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.summary(mx.nd.ones(shape=input_shape, ctx=ctx)) ''' active (bool, default True) – Whether to turn hybrid on or off. static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase. static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower. ''' if multiscale: net.hybridize(active=True, static_alloc=True, static_shape=False) else: net.hybridize(active=True, static_alloc=True, static_shape=True) if start_epoch + 1 >= epoch + 1: logging.info("this model has already been optimized") exit(0) if tensorboard: summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False) if isinstance(ctx, (list, tuple)): net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.forward(mx.nd.ones(shape=input_shape, ctx=ctx)) summary.add_graph(net) if graphviz: gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model) # optimizer unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size step = unit * decay_step lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate) for p in net.collect_params().values(): if p.grad_req != "null": p.grad_req = 'add' if AMP: ''' update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. ''' if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False}, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False}, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False}, update_on_kvstore=False) # for Dynamic loss scaling else: logging.error("optimizer not selected") exit(0) amp.init_trainer(trainer) else: if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False}) elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False}) elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False}) else: logging.error("optimizer not selected") exit(0) loss = Yolov3Loss(sparse_label=True, from_sigmoid=False, batch_axis=None, num_classes=num_classes, reduction="sum", exclude=False) prediction = Prediction( from_sigmoid=False, num_classes=num_classes, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes) start_time = time.time() for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch): xcyc_loss_sum = 0 wh_loss_sum = 0 object_loss_sum = 0 class_loss_sum = 0 time_stamp = time.time() for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate( train_dataloader, start=1): td_batch_size = image.shape[0] image = mx.nd.split(data=image, num_outputs=subdivision, axis=0) xcyc_all = mx.nd.split(data=xcyc_all, num_outputs=subdivision, axis=0) wh_all = mx.nd.split(data=wh_all, num_outputs=subdivision, axis=0) objectness_all = mx.nd.split(data=objectness_all, num_outputs=subdivision, axis=0) class_all = mx.nd.split(data=class_all, num_outputs=subdivision, axis=0) weights_all = mx.nd.split(data=weights_all, num_outputs=subdivision, axis=0) if subdivision == 1: image = [image] xcyc_all = [xcyc_all] wh_all = [wh_all] objectness_all = [objectness_all] class_all = [class_all] weights_all = [weights_all] ''' autograd 설명 https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html ''' with autograd.record(train_mode=True): xcyc_all_losses = [] wh_all_losses = [] object_all_losses = [] class_all_losses = [] for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(image, xcyc_all, wh_all, objectness_all, class_all, weights_all): if GPU_COUNT <= 1: image_split = gluon.utils.split_and_load(image_split, [ctx], even_split=False) xcyc_split = gluon.utils.split_and_load(xcyc_split, [ctx], even_split=False) wh_split = gluon.utils.split_and_load(wh_split, [ctx], even_split=False) objectness_split = gluon.utils.split_and_load(objectness_split, [ctx], even_split=False) class_split = gluon.utils.split_and_load(class_split, [ctx], even_split=False) weights_split = gluon.utils.split_and_load(weights_split, [ctx], even_split=False) else: image_split = gluon.utils.split_and_load(image_split, ctx, even_split=False) xcyc_split = gluon.utils.split_and_load(xcyc_split, ctx, even_split=False) wh_split = gluon.utils.split_and_load(wh_split, ctx, even_split=False) objectness_split = gluon.utils.split_and_load(objectness_split, ctx, even_split=False) class_split = gluon.utils.split_and_load(class_split, ctx, even_split=False) weights_split = gluon.utils.split_and_load(weights_split, ctx, even_split=False) xcyc_losses = [] wh_losses = [] object_losses = [] class_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, xcyc_target, wh_target, objectness, class_target, weights in zip(image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split): output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target, wh_target, objectness, class_target, weights) xcyc_losses.append(xcyc_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) object_losses.append(object_loss.asscalar()) class_losses.append(class_loss.asscalar()) total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss) if AMP: with amp.scale_loss(total_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(total_loss) xcyc_all_losses.append(sum(xcyc_losses)) wh_all_losses.append(sum(wh_losses)) object_all_losses.append(sum(object_losses)) class_all_losses.append(sum(class_losses)) trainer.step(batch_size=td_batch_size, ignore_stale_grad=False) # 비우기 for p in net.collect_params().values(): p.zero_grad() xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size wh_loss_sum += sum(wh_all_losses) / td_batch_size object_loss_sum += sum(object_all_losses) / td_batch_size class_loss_sum += sum(class_all_losses) / td_batch_size if batch_count % batch_log == 0: logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],' f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],' f'[Lr = {trainer.learning_rate}]' f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]' f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]' f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]' f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]') time_stamp = time.time() train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch) train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch) train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch) train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch) train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean logging.info( f"train xcyc loss : {train_xcyc_loss_mean} / " f"train wh loss : {train_wh_loss_mean} / " f"train object loss : {train_object_loss_mean} / " f"train class loss : {train_class_loss_mean} / " f"train total loss : {train_total_loss_mean}" ) if i % eval_period == 0 and valid_list: xcyc_loss_sum = 0 wh_loss_sum = 0 object_loss_sum = 0 class_loss_sum = 0 # loss 구하기 for image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader: vd_batch_size, _, height, width = image.shape if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) xcyc_all = gluon.utils.split_and_load(xcyc_all, [ctx], even_split=False) wh_all = gluon.utils.split_and_load(wh_all, [ctx], even_split=False) objectness_all = gluon.utils.split_and_load(objectness_all, [ctx], even_split=False) class_all = gluon.utils.split_and_load(class_all, [ctx], even_split=False) weights_all = gluon.utils.split_and_load(weights_all, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) xcyc_all = gluon.utils.split_and_load(xcyc_all, ctx, even_split=False) wh_all = gluon.utils.split_and_load(wh_all, ctx, even_split=False) objectness_all = gluon.utils.split_and_load(objectness_all, ctx, even_split=False) class_all = gluon.utils.split_and_load(class_all, ctx, even_split=False) weights_all = gluon.utils.split_and_load(weights_all, ctx, even_split=False) xcyc_losses = [] wh_losses = [] object_losses = [] class_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all): gt_box = lb[:, :, :4] gt_id = lb[:, :, 4:5] output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3) precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score, gt_boxes=gt_box, gt_labels=gt_id) xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target, wh_target, objectness, class_target, weights) xcyc_losses.append(xcyc_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) object_losses.append(object_loss.asscalar()) class_losses.append(class_loss.asscalar()) total_loss.append(xcyc_losses + wh_losses + object_losses + class_losses) xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size wh_loss_sum += sum(wh_losses) / vd_batch_size object_loss_sum += sum(object_losses) / vd_batch_size class_loss_sum += sum(class_losses) / vd_batch_size valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch) valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch) valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch) valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch) valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean logging.info( f"valid xcyc loss : {valid_xcyc_loss_mean} / " f"valid wh loss : {valid_wh_loss_mean} / " f"valid object loss : {valid_object_loss_mean} / " f"valid class loss : {valid_class_loss_mean} / " f"valid total loss : {valid_total_loss_mean}" ) AP_appender = [] round_position = 2 class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list() for j, c, p, r in zip(range(len(recall)), class_name, precision, recall): name, AP = precision_recall.get_AP(c, p, r) logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%") AP_appender.append(AP) mAP_result = np.mean(AP_appender) logging.info(f"mAP : {round(mAP_result * 100, round_position)}%") precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall, threshold=threshold, AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i) precision_recall.reset() if tensorboard: # gpu N 개를 대비한 코드 (Data Parallelism) dataloader_iter = iter(valid_dataloader) image, label, _, _, _, _, _, _ = next(dataloader_iter) if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) ground_truth_colors = {} for k in range(num_classes): ground_truth_colors[k] = (0, 0, 1) batch_image = [] for img, lb in zip(image, label): gt_boxes = lb[:, :, :4] gt_ids = lb[:, :, 4:5] output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3) for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes): ig = ig.transpose( (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context) ig = (ig * 255).clip(0, 255) # ground truth box 그리기 ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None, reverse_rgb=True, class_names=valid_dataset.classes, absolute_coordinates=True, colors=ground_truth_colors) # prediction box 그리기 prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id, thresh=plot_class_thresh, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True) # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다. prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB) prediction_box = np.transpose(prediction_box, axes=(2, 0, 1)) batch_image.append(prediction_box) # (batch, channel, height, width) summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i) summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean, "valid_xcyc_loss": valid_xcyc_loss_mean}, global_step=i) summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean, "valid_wh_loss": valid_wh_loss_mean}, global_step=i) summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean, "valid_object_loss": valid_object_loss_mean}, global_step=i) summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean, "valid_class_loss": valid_class_loss_mean}, global_step=i) summary.add_scalar(tag="total_loss", value={ "train_total_loss": train_total_loss_mean, "valid_total_loss": valid_total_loss_mean}, global_step=i) params = net.collect_params().values() if GPU_COUNT > 1: for c in ctx: for p in params: summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default') else: for p in params: summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default') if i % save_period == 0: weight_epoch_path = os.path.join(weight_path, str(i)) if not os.path.exists(weight_epoch_path): os.makedirs(weight_epoch_path) ''' Hybrid models can be serialized as JSON files using the export function Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface. When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc. ''' if GPU_COUNT >= 1: context = mx.gpu(0) else: context = mx.cpu(0) postnet = PostNet(net=net, auxnet=prediction) try: net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True) # for onnx net.save_parameters(os.path.join(weight_path, f"{i}.params")) # onnx 추출용 # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함. export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"), block=postnet, data_shape=tuple(input_size) + tuple((3,)), epoch=i, preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=context, remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공") net.collect_params().reset_ctx(ctx) end_time = time.time() learning_time = end_time - start_time logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H") logging.info("optimization completed") if using_mlflow: ml.log_metric("learning time", round(learning_time / 3600, 2))
def run(image_list=False, image_path="", weight_path="weights", load_name="512_512_ADAM_PRES_18", load_period=10, GPU_COUNT=1, decode_number=5000, multiperclass=True, nms_thresh=0.5, nms_topk=500, except_class_thresh=0.05, plot_class_thresh=0.5, image_save_path="result_image", image_show=True, image_save=True): if GPU_COUNT <= 0: ctx = mx.cpu(0) elif GPU_COUNT > 0: ctx = mx.gpu(0) # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if GPU_COUNT > 0: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: logging.info(f'Running on {ctx}') logging.info(f"test {load_name}") netheight = int(load_name.split("_")[0]) netwidth = int(load_name.split("_")[1]) if not isinstance(netheight, int) and not isinstance(netwidth, int): logging.info("height is not int") logging.info("width is not int") raise ValueError else: logging.info(f"network input size : {(netheight, netwidth)}") try: _, test_dataset = testdataloader() except Exception: logging.info("The dataset does not exist") exit(0) weight_path = os.path.join(weight_path, load_name) sym = os.path.join(weight_path, f'{load_name}-symbol.json') params = os.path.join(weight_path, f'{load_name}-{load_period:04d}.params') logging.info("symbol model test") if os.path.exists(sym) and os.path.exists(params): logging.info(f"loading {os.path.basename(params)} weights\n") net = gluon.SymbolBlock.imports(sym, ['data'], params, ctx=ctx) else: raise FileExistsError try: net = export_block_for_cplusplus( block=net, data_shape=tuple((netheight, netwidth)) + tuple((3, )), preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=ctx) except Exception as E: logging.error(f"adding preprocessing layer 실패 : {E}") else: logging.info(f"adding preprocessing layer 성공 ") net.hybridize(active=True, static_alloc=True, static_shape=True) # BoxEncoder, BoxDecoder 에서 같은 값을 가져야함 prediction = Prediction(from_sigmoid=False, num_classes=test_dataset.num_class, decode_number=decode_number, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) if image_list: types = ('*.jpg', '*.png') images_path = [] for type in types: images_path.extend(glob.glob(os.path.join(image_path, type))) if images_path: for img_path in tqdm(images_path): name = os.path.splitext(os.path.basename(img_path))[0] image = cv2.imread(img_path, cv2.IMREAD_COLOR) height, width, _ = image.shape logging.info(f"real input size : {(height, width)}") origin_image = image.copy() image = cv2.resize(image, (netwidth, netheight), interpolation=3) image[:, :, (0, 1, 2)] = image[:, :, (2, 1, 0)] # BGR to RGB image = mx.nd.array(image, ctx=ctx) image = image.expand_dims(axis=0) cls_preds, box_preds, anchors = net(image) ids, scores, bboxes = prediction(cls_preds, box_preds, anchors) bbox = box_resize(bboxes[0], (netwidth, netheight), (width, height)) plot_bbox(origin_image, bbox, scores=scores[0], labels=ids[0], thresh=plot_class_thresh, reverse_rgb=False, class_names=test_dataset.classes, image_show=image_show, image_save=image_save, image_save_path=image_save_path, image_name=name) else: raise FileNotFoundError else: name = os.path.splitext(os.path.basename(image_path))[0] image = cv2.imread(image_path, cv2.IMREAD_COLOR) height, width, _ = image.shape logging.info(f"real input size : {(height, width)}") origin_image = image.copy() image = cv2.resize(image, (netwidth, netheight), interpolation=3) image[:, :, (0, 1, 2)] = image[:, :, (2, 1, 0)] # BGR to RGB image = mx.nd.array(image, ctx=ctx) image = image.expand_dims(axis=0) cls_preds, box_preds, anchors = net(image) ids, scores, bboxes = prediction(cls_preds, box_preds, anchors) bbox = box_resize(bboxes[0], (netwidth, netheight), (width, height)) plot_bbox(origin_image, bbox, scores=scores[0], labels=ids[0], thresh=plot_class_thresh, reverse_rgb=False, class_names=test_dataset.classes, image_show=image_show, image_save=image_save, image_save_path=image_save_path, image_name=name) cv2.destroyAllWindows()
def run(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], anchor_alloc_size=[256, 256], anchor_sizes=[32, 64, 128, 256, 512], anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)], anchor_aspect_ratios=[0.5, 1, 2], anchor_box_clip=True, graphviz=True, epoch=100, input_size=[512, 512], batch_log=100, batch_size=16, batch_interval=10, subdivision=4, train_dataset_path="Dataset/train", valid_dataset_path="Dataset/valid", multiscale=True, factor_scale=[8, 5], foreground_iou_thresh=0.5, background_iou_thresh=0.4, data_augmentation=True, num_workers=4, optimizer="ADAM", weight_decay=0.000001, save_period=5, load_period=10, learning_rate=0.001, decay_lr=0.999, decay_step=10, GPU_COUNT=0, base=0, AMP=True, valid_size=8, eval_period=5, tensorboard=True, valid_graph_path="valid_Graph", valid_html_auto_open=True, using_mlflow=True, decode_number=5000, multiperclass=True, nms_thresh=0.5, nms_topk=500, iou_thresh=0.5, except_class_thresh=0.05, plot_class_thresh=0.5): if GPU_COUNT == 0: ctx = mx.cpu(0) AMP = False elif GPU_COUNT == 1: ctx = mx.gpu(0) else: ctx = [mx.gpu(i) for i in range(GPU_COUNT)] # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if isinstance(ctx, (list, tuple)): for i, c in enumerate(ctx): free_memory, total_memory = mx.context.gpu_memory_info(i) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: if GPU_COUNT == 1: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: logging.info(f'Running on {ctx}') if GPU_COUNT > 0 and batch_size < GPU_COUNT: logging.info("batch size must be greater than gpu number") exit(0) if AMP: amp.init() if multiscale: logging.info("Using MultiScale") if data_augmentation: logging.info("Using Data Augmentation") logging.info("training Efficient Detector") input_shape = (1, 3) + tuple(input_size) net = Efficient(version=base, anchor_sizes=anchor_sizes, anchor_size_ratios=anchor_size_ratios, anchor_aspect_ratios=anchor_aspect_ratios, anchor_box_clip=anchor_box_clip, alloc_size=anchor_alloc_size, ctx=mx.cpu()) train_dataloader, train_dataset = traindataloader( multiscale=multiscale, factor_scale=factor_scale, augmentation=data_augmentation, path=train_dataset_path, input_size=input_size, batch_size=batch_size, batch_interval=batch_interval, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, foreground_iou_thresh=foreground_iou_thresh, background_iou_thresh=background_iou_thresh, make_target=True) train_update_number_per_epoch = len(train_dataloader) if train_update_number_per_epoch < 1: logging.warning("train batch size가 데이터 수보다 큼") exit(0) valid_list = glob.glob(os.path.join(valid_dataset_path, "*")) if valid_list: valid_dataloader, valid_dataset = validdataloader( path=valid_dataset_path, input_size=input_size, batch_size=valid_size, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, foreground_iou_thresh=foreground_iou_thresh, background_iou_thresh=background_iou_thresh, make_target=True) valid_update_number_per_epoch = len(valid_dataloader) if valid_update_number_per_epoch < 1: logging.warning("valid batch size가 데이터 수보다 큼") exit(0) num_classes = train_dataset.num_class # 클래스 수 name_classes = train_dataset.classes optimizer = optimizer.upper() model = str(input_size[0]) + "_" + str( input_size[1]) + "_" + optimizer + "_EFF_" + str(base) weight_path = os.path.join("weights", f"{model}") sym_path = os.path.join(weight_path, f'{model}-symbol.json') param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params') optimizer_path = os.path.join(weight_path, f'{model}-{load_period:04d}.opt') if os.path.exists(param_path) and os.path.exists(sym_path): start_epoch = load_period logging.info(f"loading {os.path.basename(param_path)}\n") net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx) else: start_epoch = 0 net = Efficient( version=base, input_size=input_size, anchor_sizes=anchor_sizes, anchor_size_ratios=anchor_size_ratios, anchor_aspect_ratios=anchor_aspect_ratios, num_classes=num_classes, # foreground만 anchor_box_clip=anchor_box_clip, alloc_size=anchor_alloc_size, ctx=ctx) if isinstance(ctx, (list, tuple)): net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.summary(mx.nd.ones(shape=input_shape, ctx=ctx)) ''' active (bool, default True) – Whether to turn hybrid on or off. static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase. static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower. ''' if multiscale: net.hybridize(active=True, static_alloc=True, static_shape=False) else: net.hybridize(active=True, static_alloc=True, static_shape=True) if start_epoch + 1 >= epoch + 1: logging.info("this model has already been optimized") exit(0) if tensorboard: summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False) if isinstance(ctx, (list, tuple)): net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.forward(mx.nd.ones(shape=input_shape, ctx=ctx)) summary.add_graph(net) if graphviz: gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model) # optimizer unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size step = unit * decay_step lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate) for p in net.collect_params().values(): if p.grad_req != "null": p.grad_req = 'add' ''' update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. ''' if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": weight_decay, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False }, update_on_kvstore=False if AMP else None) # for Dynamic loss scaling elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": weight_decay, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False }, update_on_kvstore=False if AMP else None) # for Dynamic loss scaling elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": weight_decay, "momentum": 0.9, 'multi_precision': False }, update_on_kvstore=False if AMP else None) # for Dynamic loss scaling else: logging.error("optimizer not selected") exit(0) if AMP: amp.init_trainer(trainer) # optimizer weight 불러오기 if os.path.exists(optimizer_path): try: trainer.load_states(optimizer_path) except Exception as E: logging.info(E) else: logging.info(f"loading {os.path.basename(optimizer_path)}\n") ''' localization loss -> Smooth L1 loss confidence loss -> Focal ''' confidence_loss = FocalLoss(alpha=0.25, gamma=2, sparse_label=True, from_sigmoid=False, batch_axis=None, num_class=num_classes, reduction="sum", exclude=False) localization_loss = HuberLoss(rho=1, batch_axis=None, reduction="sum", exclude=False) prediction = Prediction(batch_size=batch_size, from_sigmoid=False, num_classes=num_classes, decode_number=decode_number, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes) ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx] start_time = time.time() for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch): conf_loss_sum = 0 loc_loss_sum = 0 time_stamp = time.time() for batch_count, (image, _, cls_all, box_all, _) in enumerate(train_dataloader, start=1): td_batch_size = image.shape[0] image = mx.nd.split(data=image, num_outputs=subdivision, axis=0) cls_all = mx.nd.split(data=cls_all, num_outputs=subdivision, axis=0) box_all = mx.nd.split(data=box_all, num_outputs=subdivision, axis=0) if subdivision == 1: image = [image] cls_all = [cls_all] box_all = [box_all] ''' autograd 설명 https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html ''' with autograd.record(train_mode=True): cls_all_losses = [] box_all_losses = [] for image_split, cls_split, box_split in zip( image, cls_all, box_all): image_split = gluon.utils.split_and_load(image_split, ctx_list, even_split=False) cls_split = gluon.utils.split_and_load(cls_split, ctx_list, even_split=False) box_split = gluon.utils.split_and_load(box_split, ctx_list, even_split=False) # prediction, target space for Data Parallelism cls_losses = [] box_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, cls_target, box_target in zip( image_split, cls_split, box_split): cls_pred, box_pred, anchor = net(img) except_ignore_samples = cls_target > -1 positive_samples = cls_target > 0 positive_numbers = positive_samples.sum() conf_loss = confidence_loss( cls_pred, cls_target, except_ignore_samples.expand_dims(axis=-1)) conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1) cls_losses.append(conf_loss.asscalar()) loc_loss = localization_loss( box_pred, box_target, positive_samples.expand_dims(axis=-1)) box_losses.append(loc_loss.asscalar()) total_loss.append(conf_loss + loc_loss) if AMP: with amp.scale_loss(total_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(total_loss) cls_all_losses.append(sum(cls_losses)) box_all_losses.append(sum(box_losses)) trainer.step(batch_size=td_batch_size, ignore_stale_grad=False) # 비우기 for p in net.collect_params().values(): p.zero_grad() conf_loss_sum += sum(cls_all_losses) / td_batch_size loc_loss_sum += sum(box_all_losses) / td_batch_size if batch_count % batch_log == 0: logging.info( f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],' f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],' f'[Lr = {trainer.learning_rate}]' f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]' f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]' ) time_stamp = time.time() train_conf_loss_mean = np.divide(conf_loss_sum, train_update_number_per_epoch) train_loc_loss_mean = np.divide(loc_loss_sum, train_update_number_per_epoch) train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean logging.info( f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}" ) if i % save_period == 0: weight_epoch_path = os.path.join(weight_path, str(i)) if not os.path.exists(weight_epoch_path): os.makedirs(weight_epoch_path) # optimizer weight 저장하기 try: trainer.save_states( os.path.join(weight_path, f'{model}-{i:04d}.opt')) except Exception as E: logging.error(f"optimizer weight export 예외 발생 : {E}") else: logging.info("optimizer weight export 성공") ''' Hybrid models can be serialized as JSON files using the export function Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface. When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc. ''' if GPU_COUNT >= 1: context = mx.gpu(0) else: context = mx.cpu(0) ''' mxnet1.6.0 버전 에서 AMP 사용시 위에 미리 선언한 prediction을 사용하면 문제가 될 수 있다. -yolo v3, gaussian yolo v3 에서는 문제가 발생한다. mxnet 1.5.x 버전에서는 아래와 같이 새로 선언하지 않아도 정상 동작한다. block들은 함수 인자로 보낼 경우 자기 자신이 보내진다.(복사되는 것이 아님) export_block_for_cplusplus 에서 prediction 이 hybridize 되면서 미리 선언한 prediction도 hybridize화 되면서 symbol 형태가 된다. 이런 현상을 보면 아래와같이 다시 선언해 주는게 맞는 것 같다. ''' auxnet = Prediction(from_sigmoid=False, num_classes=num_classes, decode_number=decode_number, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) postnet = PostNet(net=net, auxnet=auxnet) try: net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True) net.save_parameters(os.path.join(weight_path, f"{i}.params")) # onnx 추출용 # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 export_block_for_cplusplus( path=os.path.join(weight_epoch_path, f"{model}_prepost"), block=postnet, data_shape=tuple(input_size) + tuple((3, )), epoch=i, preprocess= True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=context, remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공") net.collect_params().reset_ctx(ctx) if i % eval_period == 0 and valid_list: conf_loss_sum = 0 loc_loss_sum = 0 # loss 구하기 for image, label, cls_all, box_all, _ in valid_dataloader: vd_batch_size = image.shape[0] image = gluon.utils.split_and_load(image, ctx_list, even_split=False) label = gluon.utils.split_and_load(label, ctx_list, even_split=False) cls_all = gluon.utils.split_and_load(cls_all, ctx_list, even_split=False) box_all = gluon.utils.split_and_load(box_all, ctx_list, even_split=False) # prediction, target space for Data Parallelism cls_losses = [] box_losses = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, lb, cls_target, box_target in zip( image, label, cls_all, box_all): gt_box = lb[:, :, :4] gt_id = lb[:, :, 4:5] cls_pred, box_pred, anchor = net(img) id, score, bbox = prediction(cls_pred, box_pred, anchor) precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score, gt_boxes=gt_box, gt_labels=gt_id) except_ignore_samples = cls_target > -1 positive_samples = cls_target > 0 positive_numbers = positive_samples.sum() conf_loss = confidence_loss( cls_pred, cls_target, except_ignore_samples.expand_dims(axis=-1)) conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1) cls_losses.append(conf_loss.asscalar()) loc_loss = localization_loss( box_pred, box_target, positive_samples.expand_dims(axis=-1)) box_losses.append(loc_loss.asscalar()) conf_loss_sum += sum(cls_losses) / vd_batch_size loc_loss_sum += sum(box_losses) / vd_batch_size valid_conf_loss_mean = np.divide(conf_loss_sum, valid_update_number_per_epoch) valid_loc_loss_mean = np.divide(loc_loss_sum, valid_update_number_per_epoch) valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean logging.info( f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}" ) AP_appender = [] round_position = 2 class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list( ) for j, c, p, r in zip(range(len(recall)), class_name, precision, recall): name, AP = precision_recall.get_AP(c, p, r) logging.info( f"class {j}'s {name} AP : {round(AP * 100, round_position)}%" ) AP_appender.append(AP) AP_appender = np.nan_to_num(AP_appender) mAP_result = np.mean(AP_appender) logging.info(f"mAP : {round(mAP_result * 100, round_position)}%") precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall, threshold=threshold, AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i, auto_open=valid_html_auto_open) precision_recall.reset() if tensorboard: # gpu N 개를 대비한 코드 (Data Parallelism) dataloader_iter = iter(valid_dataloader) image, label, _, _, _ = next(dataloader_iter) image = gluon.utils.split_and_load(image, ctx_list, even_split=False) label = gluon.utils.split_and_load(label, ctx_list, even_split=False) ground_truth_colors = {} for k in range(num_classes): ground_truth_colors[k] = (0, 1, 0) batch_image = [] for img, lb in zip(image, label): gt_boxes = lb[:, :, :4] gt_ids = lb[:, :, 4:5] cls_pred, box_pred, anchor = net(img) ids, scores, bboxes = prediction(cls_pred, box_pred, anchor) for ig, gt_id, gt_box, id, score, bbox in zip( img, gt_ids, gt_boxes, ids, scores, bboxes): ig = ig.transpose((1, 2, 0)) * mx.nd.array( std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context) ig = (ig * 255).clip(0, 255) ig = ig.astype(np.uint8) # ground truth box 그리기 ground_truth = plot_bbox( ig, gt_box, scores=None, labels=gt_id, thresh=None, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True, colors=ground_truth_colors) # prediction box 그리기 prediction_box = plot_bbox( ground_truth, bbox, scores=score, labels=id, thresh=plot_class_thresh, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True) # Tensorboard에 그리기 (height, width, channel) -> (channel, height, width) 를한다. prediction_box = np.transpose(prediction_box, axes=(2, 0, 1)) batch_image.append( prediction_box) # (batch, channel, height, width) summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i) summary.add_scalar(tag="conf_loss", value={ "train_conf_loss": train_conf_loss_mean, "valid_conf_loss": valid_conf_loss_mean }, global_step=i) summary.add_scalar(tag="loc_loss", value={ "train_loc_loss": train_loc_loss_mean, "valid_loc_loss": valid_loc_loss_mean }, global_step=i) summary.add_scalar(tag="total_loss", value={ "train_total_loss": train_total_loss_mean, "valid_total_loss": valid_total_loss_mean }, global_step=i) for p in net.collect_params().values(): summary.add_histogram(tag=p.name, values=p.data(ctx=ctx_list[0]), global_step=i, bins='default') end_time = time.time() learning_time = end_time - start_time logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H") logging.info("optimization completed") if using_mlflow: ml.log_metric("learning time", round(learning_time / 3600, 2))
def run( video_list=True, # True일 때, 폴더에 있는 비디오(mp4)들 전부다 평가 / False일 때, 비디오 한개 평가 # video_path='test_video/C050105_001.mp4', video_path='test_video', weight_path="weights", load_name="480_640_ADAM_PCENTER_RES18", load_period=100, topk=500, nms=False, except_class_thresh=0.01, nms_thresh=0.5, plot_class_thresh=0.5, video_save_path="result_video", video_show=True, video_save=True): if video_save: if not os.path.exists(video_save_path): os.makedirs(video_save_path) if mx.context.num_gpus() > 0: GPU_COUNT = mx.context.num_gpus() else: GPU_COUNT = 0 if GPU_COUNT <= 0: ctx = mx.cpu(0) elif GPU_COUNT > 0: ctx = mx.gpu(0) # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if GPU_COUNT > 0: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: logging.info(f'Running on {ctx}') logging.info(f"test {load_name}") scale_factor = 4 # 고정 logging.info(f"scale factor {scale_factor}") netheight = int(load_name.split("_")[0]) netwidth = int(load_name.split("_")[1]) if not isinstance(netheight, int) and not isinstance(netwidth, int): logging.info("height is not int") logging.info("width is not int") raise ValueError else: logging.info(f"network input size : {(netheight, netwidth)}") _, test_dataset = testdataloader() weight_path = os.path.join(weight_path, load_name) sym = os.path.join(weight_path, f'{load_name}-symbol.json') params = os.path.join(weight_path, f'{load_name}-{load_period:04d}.params') logging.info("symbol model test") if os.path.exists(sym) and os.path.exists(params): logging.info(f"loading {os.path.basename(params)} weights\n") net = gluon.SymbolBlock.imports(sym, ['data'], params, ctx=ctx) else: raise FileExistsError try: net = export_block_for_cplusplus( block=net, data_shape=tuple((netheight, netwidth)) + tuple((3, )), preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=ctx) except Exception as E: logging.error(f"adding preprocessing layer 실패 : {E}") else: logging.info(f"adding preprocessing layer 성공 ") net.hybridize(active=True, static_alloc=True, static_shape=True) prediction = Prediction(topk=topk, scale=scale_factor, nms=nms, except_class_thresh=except_class_thresh, nms_thresh=nms_thresh) fourcc = cv2.VideoWriter_fourcc(*'DIVX') if video_list: types = ('*.mp4', '*.avi') videos_path = [] for type in types: videos_path.extend(glob.glob(os.path.join(video_path, type))) if videos_path: for v_path in tqdm(videos_path): cap = cv2.VideoCapture(v_path) nframe = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) fps = cap.get(cv2.CAP_PROP_FPS) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) name = os.path.splitext(os.path.basename(v_path))[0] out = cv2.VideoWriter( os.path.join(video_save_path, f'{name}.avi'), fourcc, fps, (width, height)) logging.info(f"real input size : {(height, width)}") # while(cap.isOpened()): for _ in tqdm(range(0, nframe)): ret, image = cap.read() if ret: origin_image = image.copy() image = cv2.resize(image, (netwidth, netheight), interpolation=3) image[:, :, (0, 1, 2)] = image[:, :, (2, 1, 0)] # BGR to RGB image = mx.nd.array(image, ctx=ctx) image = image.expand_dims(axis=0) heatmap_pred, offset_pred, wh_pred = net(image) ids, scores, bboxes = prediction( heatmap_pred, offset_pred, wh_pred) bbox = box_resize(bboxes[0], (netwidth, netheight), (width, height)) result = plot_bbox(origin_image, bbox, scores=scores[0], labels=ids[0], thresh=plot_class_thresh, reverse_rgb=False, class_names=test_dataset.classes) if video_save: out.write(result) if video_show: cv2.imshow(name, result) cv2.waitKey(1) cap.release() out.release() cv2.destroyAllWindows() else: raise FileNotFoundError else: cap = cv2.VideoCapture(video_path) nframe = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) fps = cap.get(cv2.CAP_PROP_FPS) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) name = os.path.splitext(os.path.basename(video_path))[0] out = cv2.VideoWriter(os.path.join(video_save_path, f'{name}.avi'), fourcc, fps, (width, height)) logging.info(f"real input size : {(height, width)}") # while(cap.isOpened()): for _ in tqdm(range(0, nframe)): ret, image = cap.read() if ret: origin_image = image.copy() image = cv2.resize(image, (netwidth, netheight), interpolation=3) image[:, :, (0, 1, 2)] = image[:, :, (2, 1, 0)] # BGR to RGB image = mx.nd.array(image, ctx=ctx) image = image.expand_dims(axis=0) heatmap_pred, offset_pred, wh_pred = net(image) ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred) bbox = box_resize(bboxes[0], (netwidth, netheight), (width, height)) result = plot_bbox(origin_image, bbox, scores=scores[0], labels=ids[0], thresh=plot_class_thresh, reverse_rgb=False, class_names=test_dataset.classes) if video_save: out.write(result) if video_show: cv2.imshow(name, result) cv2.waitKey(1) cap.release() out.release() cv2.destroyAllWindows()
def run(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], anchor_alloc_size=[256, 256], box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72], box_ratios=[[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0 / 3]] * 4 + [[1, 2, 0.5]] * 2, anchor_box_clip=True, graphviz=True, epoch=100, input_size=[400, 600], batch_log=100, batch_size=16, batch_interval=10, subdivision=4, train_dataset_path="Dataset/train", valid_dataset_path="Dataset/valid", multiscale=True, factor_scale=[8, 5], foreground_iou_thresh=0.5, data_augmentation=True, num_workers=4, optimizer="ADAM", save_period=10, load_period=10, learning_rate=0.001, decay_lr=0.999, decay_step=10, GPU_COUNT=0, base="VGG16_512", pretrained_base=True, pretrained_path="modelparam", classHardNegativeMining=True, boxHardNegativeMining=True, AMP=True, valid_size=8, eval_period=5, tensorboard=True, valid_graph_path="valid_Graph", using_mlflow=True, decode_number=-1, multiperclass=True, nms_thresh=0.45, nms_topk=500, iou_thresh=0.5, except_class_thresh=0.01, plot_class_thresh=0.5): if GPU_COUNT == 0: ctx = mx.cpu(0) AMP = False elif GPU_COUNT == 1: ctx = mx.gpu(0) else: ctx = [mx.gpu(i) for i in range(GPU_COUNT)] # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if isinstance(ctx, (list, tuple)): for i, c in enumerate(ctx): free_memory, total_memory = mx.context.gpu_memory_info(i) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: if GPU_COUNT == 1: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: logging.info(f'Running on {ctx}') if GPU_COUNT > 0 and batch_size < GPU_COUNT: logging.info("batch size must be greater than gpu number") exit(0) if AMP: amp.init() if multiscale: logging.info("Using MultiScale") if data_augmentation: logging.info("Using Data Augmentation") logging.info("training SSD Detector") input_shape = (1, 3) + tuple(input_size) try: if base.upper() == "VGG16_300": # 입력 사이즈 300 x 300 추천 net = SSD_VGG16(version=300, input_size=input_size, box_sizes=box_sizes, box_ratios=box_ratios, anchor_box_clip=anchor_box_clip, alloc_size=anchor_alloc_size, ctx=mx.cpu()) elif base.upper() == "VGG16_512": # 입력 사이즈 512 x 512 추천 net = SSD_VGG16(version=512, input_size=input_size, box_sizes=box_sizes, box_ratios=box_ratios, anchor_box_clip=anchor_box_clip, ctx=mx.cpu()) train_dataloader, train_dataset = traindataloader( multiscale=multiscale, factor_scale=factor_scale, augmentation=data_augmentation, path=train_dataset_path, input_size=input_size, batch_size=batch_size, batch_interval=batch_interval, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, foreground_iou_thresh=foreground_iou_thresh, make_target=True) valid_dataloader, valid_dataset = validdataloader( path=valid_dataset_path, input_size=input_size, batch_size=valid_size, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, foreground_iou_thresh=foreground_iou_thresh, make_target=True) except Exception: logging.info("dataset 없음") exit(0) train_update_number_per_epoch = len(train_dataloader) if train_update_number_per_epoch < 1: logging.warning("train batch size가 데이터 수보다 큼") exit(0) valid_list = glob.glob(os.path.join(valid_dataset_path, "*")) if valid_list: valid_update_number_per_epoch = len(valid_dataloader) if valid_update_number_per_epoch < 1: logging.warning("valid batch size가 데이터 수보다 큼") exit(0) num_classes = train_dataset.num_class # 클래스 수 name_classes = train_dataset.classes # 이름 다시 붙이기 optimizer = optimizer.upper() base = base.upper() if pretrained_base: model = str(input_size[0]) + "_" + str( input_size[1]) + "_" + optimizer + "_P" + base else: model = str(input_size[0]) + "_" + str( input_size[1]) + "_" + optimizer + "_" + base weight_path = f"weights/{model}" sym_path = os.path.join(weight_path, f'{model}-symbol.json') param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params') if os.path.exists(param_path) and os.path.exists(sym_path): start_epoch = load_period logging.info(f"loading {os.path.basename(param_path)} weights\n") net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx) else: start_epoch = 0 if base.upper() == "VGG16_300": # 입력 사이즈 300 x 300 추천 net = SSD_VGG16( version=300, input_size=input_size, # box_sizes=[21, 45, 101.25, 157.5, 213.75, 270, 326.25], # box_ratios=[[1, 2, 0.5]] + # conv4_3 # [[1, 2, 0.5, 3, 1.0 / 3]] * 3 + # conv7, conv8_2, conv9_2, conv10_2 # [[1, 2, 0.5]] * 2, # conv11_2, conv12_2 box_sizes=box_sizes, box_ratios=box_ratios, num_classes=num_classes, pretrained=pretrained_base, pretrained_path=pretrained_path, anchor_box_clip=anchor_box_clip, alloc_size=anchor_alloc_size, ctx=ctx) elif base.upper() == "VGG16_512": # 입력 사이즈 512 x 512 추천 net = SSD_VGG16( version=512, input_size=input_size, # box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72], # box_ratios=[[1, 2, 0.5]] + # conv4_3 # [[1, 2, 0.5, 3, 1.0 / 3]] * 4 + # conv7, conv8_2, conv9_2, conv10_2 # [[1, 2, 0.5]] * 2, # conv11_2, conv12_2 box_sizes=box_sizes, box_ratios=box_ratios, num_classes=num_classes, pretrained=pretrained_base, pretrained_path=pretrained_path, anchor_box_clip=anchor_box_clip, ctx=ctx) else: logging.warning("backbone 없음") exit(0) if isinstance(ctx, (list, tuple)): net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.summary(mx.nd.ones(shape=input_shape, ctx=ctx)) ''' active (bool, default True) – Whether to turn hybrid on or off. static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase. static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower. ''' if multiscale: net.hybridize(active=True, static_alloc=True, static_shape=False) else: net.hybridize(active=True, static_alloc=True, static_shape=True) if start_epoch + 1 >= epoch + 1: logging.info("this model has already been optimized") exit(0) if tensorboard: summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False) if isinstance(ctx, (list, tuple)): net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.forward(mx.nd.ones(shape=input_shape, ctx=ctx)) summary.add_graph(net) if graphviz: gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model) # optimizer unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size step = unit * decay_step lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate) for p in net.collect_params().values(): if p.grad_req != "null": p.grad_req = 'add' if AMP: ''' update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. ''' if optimizer.upper() == "ADAM": trainer = gluon.Trainer( net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False }, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer( net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False }, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "SGD": trainer = gluon.Trainer( net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False }, update_on_kvstore=False) # for Dynamic loss scaling else: logging.error("optimizer not selected") exit(0) amp.init_trainer(trainer) else: if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False }) elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False }) elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False }) else: logging.error("optimizer not selected") exit(0) ''' localization loss -> Smooth L1 loss confidence loss -> Softmax ''' if not classHardNegativeMining: confidence_loss = SoftmaxCrossEntropyLoss(axis=-1, sparse_label=True, from_log_softmax=False, batch_axis=None, reduction="sum", exclude=False) if not boxHardNegativeMining: localization_loss = HuberLoss(rho=1, batch_axis=None, reduction="sum", exclude=False) prediction = Prediction(from_softmax=False, num_classes=num_classes, decode_number=decode_number, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes) start_time = time.time() for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch): conf_loss_sum = 0 loc_loss_sum = 0 time_stamp = time.time() for batch_count, (image, _, cls_all, box_all, _) in enumerate(train_dataloader, start=1): td_batch_size = image.shape[0] image = mx.nd.split(data=image, num_outputs=subdivision, axis=0) cls_all = mx.nd.split(data=cls_all, num_outputs=subdivision, axis=0) box_all = mx.nd.split(data=box_all, num_outputs=subdivision, axis=0) if subdivision == 1: image = [image] cls_t_all = [cls_t_all] box_t_all = [box_t_all] with autograd.record(train_mode=True): cls_all_losses = [] box_all_losses = [] for image_split, cls_split, box_split in zip( image, cls_all, box_all): if GPU_COUNT <= 1: image_split = gluon.utils.split_and_load( image_split, [ctx], even_split=False) cls_split = gluon.utils.split_and_load( cls_split, [ctx], even_split=False) box_split = gluon.utils.split_and_load( box_split, [ctx], even_split=False) else: image_split = gluon.utils.split_and_load( image_split, ctx, even_split=False) cls_split = gluon.utils.split_and_load( cls_split, ctx, even_split=False) box_split = gluon.utils.split_and_load( box_split, ctx, even_split=False) # prediction, target space for Data Parallelism cls_losses = [] box_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, cls_target, box_target in zip( image_split, cls_split, box_split): # 1. SSD network Inference cls_pred, box_pred, anchor = net(img) ''' 4. Hard negative mining (class에만 loss 계산) Hard negative mining After the matching step, most of the default boxes are negatives, especially when the number of possible default boxes is large. This introduces a significant imbalance between the positive and negative training examples. Instead of using all the negative examples, we sort them using the highest confidence loss for each default box and pick the top ones so that the ratio between the negatives and positives is at most 3:1. We found that this leads to faster optimization and a more stable training ''' weight_term_alpha = 1 negative_mining_ratio = 3 positive_samples = cls_target > 0 # True or False positive_numbers = positive_samples.sum() if classHardNegativeMining: pred = mx.nd.log_softmax(cls_pred, axis=-1) negative_samples = 1 - positive_samples conf_loss = -mx.nd.pick( pred, cls_target, axis=-1) # (batch, all feature number) ''' we sort them using the highest confidence loss for each default box and pick the top ones so that the ratio between the negatives and positives is at most 3:1. ''' negative_samples_conf_loss = (conf_loss * negative_samples) # 아래 3줄의 코드 출처 : from gluoncv.loss import SSDMultiBoxLoss negative_samples_index = mx.nd.argsort( negative_samples_conf_loss, axis=-1, is_ascend=False) selection = mx.nd.argsort(negative_samples_index, axis=-1, is_ascend=True) hard_negative_samples = selection <= mx.nd.multiply( positive_numbers, negative_mining_ratio).expand_dims(-1) pos_hardnega = positive_samples + hard_negative_samples conf_loss = mx.nd.where( pos_hardnega > 0, conf_loss, mx.nd.zeros_like(conf_loss)) conf_loss = mx.nd.sum(conf_loss) if positive_numbers: conf_loss = mx.nd.divide( conf_loss, positive_numbers) else: conf_loss = mx.nd.multiply(conf_loss, 0) cls_losses.append(conf_loss.asscalar()) else: conf_loss = confidence_loss( cls_pred, cls_target, positive_samples.expand_dims(axis=-1)) if positive_numbers: conf_loss = mx.nd.divide( conf_loss, positive_numbers) else: conf_loss = mx.nd.multiply(conf_loss, 0) cls_losses.append(conf_loss.asscalar()) if boxHardNegativeMining: # loc loss에도 hard HardNegativeMining 적용해보자. pred = mx.nd.log_softmax(cls_pred, axis=-1) negative_samples = 1 - positive_samples conf_loss_for_box = -mx.nd.pick( pred, cls_target, axis=-1) # (batch, all feature number) negative_samples_conf_loss = (conf_loss_for_box * negative_samples) negative_samples_index = mx.nd.argsort( negative_samples_conf_loss, axis=-1, is_ascend=False) selection = mx.nd.argsort(negative_samples_index, axis=-1, is_ascend=True) hard_negative_samples = selection <= mx.nd.multiply( positive_numbers, negative_mining_ratio).expand_dims(-1) pos_hardnega = positive_samples + hard_negative_samples pos_hardnega = mx.nd.repeat( pos_hardnega.reshape(shape=(0, 0, 1)), repeats=4, axis=-1) loc_loss = mx.nd.abs(box_pred - box_target) loc_loss = mx.nd.where(loc_loss > 1, loc_loss - 0.5, (0.5 / 1) * mx.nd.square(loc_loss)) loc_loss = mx.nd.where(pos_hardnega > 0, loc_loss, mx.nd.zeros_like(loc_loss)) loc_loss = mx.nd.sum(loc_loss) if positive_numbers: loc_loss = mx.nd.divide( loc_loss, positive_numbers) else: loc_loss = mx.nd.multiply(loc_loss, 0) box_losses.append(loc_loss.asscalar()) else: loc_loss = localization_loss( box_pred, box_target, positive_samples.expand_dims(axis=-1)) if positive_numbers: loc_loss = mx.nd.divide( loc_loss, positive_numbers) else: loc_loss = mx.nd.multiply(loc_loss, 0) box_losses.append(loc_loss.asscalar()) total_loss.append(conf_loss + weight_term_alpha * loc_loss) if AMP: with amp.scale_loss(total_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(total_loss) cls_all_losses.append(sum(cls_losses)) box_all_losses.append(sum(box_losses)) trainer.step(batch_size=td_batch_size, ignore_stale_grad=False) # 비우기 for p in net.collect_params().values(): p.zero_grad() conf_loss_sum += sum(cls_all_losses) / td_batch_size loc_loss_sum += sum(box_all_losses) / td_batch_size if batch_count % batch_log == 0: logging.info( f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],' f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],' f'[Lr = {trainer.learning_rate}]' f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]' f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]' ) time_stamp = time.time() train_conf_loss_mean = np.divide(conf_loss_sum, train_update_number_per_epoch) train_loc_loss_mean = np.divide(loc_loss_sum, train_update_number_per_epoch) train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean logging.info( f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}" ) if i % eval_period == 0 and valid_list: if classHardNegativeMining: confidence_loss = SoftmaxCrossEntropyLoss( axis=-1, sparse_label=True, from_log_softmax=False, batch_axis=None, reduction="sum", exclude=False) if boxHardNegativeMining: localization_loss = HuberLoss(rho=1, batch_axis=None, reduction="sum", exclude=False) conf_loss_sum = 0 loc_loss_sum = 0 for image, label, cls_all, box_all, _ in valid_dataloader: vd_batch_size = image.shape[0] if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) cls_all = gluon.utils.split_and_load(cls_all, [ctx], even_split=False) box_all = gluon.utils.split_and_load(box_all, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) cls_all = gluon.utils.split_and_load(cls_all, [ctx], even_split=False) box_all = gluon.utils.split_and_load(box_all, [ctx], even_split=False) # prediction, target space for Data Parallelism cls_losses = [] box_losses = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, lb, cls_target, box_target in zip( image, label, cls_all, box_all): gt_box = lb[:, :, :4] gt_id = lb[:, :, 4:5] cls_pred, box_pred, anchor = net(img) id, score, bbox = prediction(cls_pred, box_pred, anchor) precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score, gt_boxes=gt_box, gt_labels=gt_id) positive_samples = cls_target > 0 positive_numbers = positive_samples.sum() conf_loss = confidence_loss( cls_pred, cls_target, positive_samples.expand_dims(axis=-1)) if positive_numbers: conf_loss = mx.nd.divide(conf_loss, positive_numbers) else: conf_loss = mx.nd.multiply(conf_loss, 0) cls_losses.append(conf_loss.asscalar()) loc_loss = localization_loss( box_pred, box_target, positive_samples.expand_dims(axis=-1)) if positive_numbers: loc_loss = mx.nd.divide(loc_loss, positive_numbers) else: loc_loss = mx.nd.multiply(loc_loss, 0) box_losses.append(loc_loss.asscalar()) conf_loss_sum += sum(cls_losses) / vd_batch_size loc_loss_sum += sum(box_losses) / vd_batch_size valid_conf_loss_mean = np.divide(conf_loss_sum, valid_update_number_per_epoch) valid_loc_loss_mean = np.divide(loc_loss_sum, valid_update_number_per_epoch) valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean logging.info( f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}" ) AP_appender = [] round_position = 2 class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list( ) for j, c, p, r in zip(range(len(recall)), class_name, precision, recall): name, AP = precision_recall.get_AP(c, p, r) logging.info( f"class {j}'s {name} AP : {round(AP * 100, round_position)}%" ) AP_appender.append(AP) mAP_result = np.mean(AP_appender) logging.info(f"mAP : {round(mAP_result * 100, round_position)}%") precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall, threshold=threshold, AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i) precision_recall.reset() if tensorboard: # gpu N 개를 대비한 코드 (Data Parallelism) dataloader_iter = iter(valid_dataloader) image, label, _, _, _ = next(dataloader_iter) if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) ground_truth_colors = {} for k in range(num_classes): ground_truth_colors[k] = (0, 0, 1) batch_image = [] for img, lb in zip(image, label): gt_boxes = lb[:, :, :4] gt_ids = lb[:, :, 4:5] cls_pred, box_pred, anchor = net(img) ids, scores, bboxes = prediction(cls_pred, box_pred, anchor) for ig, gt_id, gt_box, id, score, bbox in zip( img, gt_ids, gt_boxes, ids, scores, bboxes): ig = ig.transpose((1, 2, 0)) * mx.nd.array( std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context) ig = (ig * 255).clip(0, 255) # ground truth box 그리기 ground_truth = plot_bbox( ig, gt_box, scores=None, labels=gt_id, thresh=None, reverse_rgb=True, class_names=valid_dataset.classes, absolute_coordinates=True, colors=ground_truth_colors) # prediction box 그리기 prediction_box = plot_bbox( ground_truth, bbox, scores=score, labels=id, thresh=plot_class_thresh, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True) # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다. prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB) prediction_box = np.transpose(prediction_box, axes=(2, 0, 1)) batch_image.append( prediction_box) # (batch, channel, height, width) summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i) summary.add_scalar(tag="conf_loss", value={ "train_conf_loss": train_conf_loss_mean, "valid_conf_loss": valid_conf_loss_mean }, global_step=i) summary.add_scalar(tag="loc_loss", value={ "train_loc_loss": train_loc_loss_mean, "valid_loc_loss": valid_loc_loss_mean }, global_step=i) summary.add_scalar(tag="total_loss", value={ "train_total_loss": train_total_loss_mean, "valid_total_loss": valid_total_loss_mean }, global_step=i) params = net.collect_params().values() if GPU_COUNT > 1: for c in ctx: for p in params: summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default') else: for p in params: summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default') if i % save_period == 0: weight_epoch_path = os.path.join(weight_path, str(i)) if not os.path.exists(weight_epoch_path): os.makedirs(weight_epoch_path) ''' Hybrid models can be serialized as JSON files using the export function Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface. When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc. ''' if GPU_COUNT >= 1: context = mx.gpu(0) else: context = mx.cpu(0) postnet = PostNet(net=net, auxnet=prediction) try: net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True) net.save_parameters(os.path.join(weight_path, f"{i}.params")) # onnx 추출용 # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함. export_block_for_cplusplus( path=os.path.join(weight_epoch_path, f"{model}_prepost"), block=postnet, data_shape=tuple(input_size) + tuple((3, )), epoch=i, preprocess= True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=context, remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공") net.collect_params().reset_ctx(ctx) end_time = time.time() learning_time = end_time - start_time logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H") logging.info("optimization completed") if using_mlflow: ml.log_metric("learning time", round(learning_time / 3600, 2))