import glob
import logging
import os
import platform
import time
from collections import OrderedDict

import cv2
import gluoncv
import mxnet as mx
import numpy as np
from mxboard import SummaryWriter
from mxnet import autograd, gluon
from mxnet.contrib import amp
from tqdm import tqdm

# Repo-local helpers (CenterNet, traindataloader, validdataloader, HeatmapFocalLoss,
# NormedL1Loss, Prediction, Voc_2007_AP, plot_bbox, PostNet,
# export_block_for_cplusplus, ml) are assumed to be imported from this repository.


def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    '''
    AMP does not support every operator.
    (modulated convolution is not supported)
    '''
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # log the operating system
    logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)

    scale_factor = 4  # fixed
    logging.info(f"scale factor {scale_factor}")

    try:
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          scale_factor=scale_factor,
                                                          make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          scale_factor=scale_factor,
                                                          make_target=True)
    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size is greater than the number of train data")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size is greater than the number of valid data")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes
    optimizer = optimizer.upper()

    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([('heatmap', {'num_output': num_classes, 'bias': -2.19}),
                                           ('offset', {'num_output': 2}),
                                           ('wh', {'num_output': 2})]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)

    if isinstance(ctx, (list, tuple)):
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
    else:
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

    '''
    active (bool, default True) - Whether to turn hybrid on or off.
    static_alloc (bool, default False) - Statically allocate memory to improve speed.
        Memory usage may increase.
    static_shape (bool, default False) - Optimize for invariant input shapes between
        iterations. Must also set static_alloc to True. Change of input shapes is
        still allowed but slower.
    '''
    if multiscale:
        net.hybridize(active=True, static_alloc=True, static_shape=False)
    else:
        net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)

    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate)

    # accumulate gradients across `subdivision` sub-batches
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will
        choose the more suitable option depending on the type of kvstore. If the
        `update_on_kvstore` argument is provided, environment variable
        `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "beta1": 0.9, "beta2": 0.999,
                                                      'multi_precision': False},
                                    update_on_kvstore=False)  # for dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "gamma1": 0.9, "gamma2": 0.999,
                                                      'multi_precision': False},
                                    update_on_kvstore=False)  # for dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "wd": 0.0001, "momentum": 0.9,
                                                      'multi_precision': False},
                                    update_on_kvstore=False)  # for dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)
        amp.init_trainer(trainer)
    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "beta1": 0.9, "beta2": 0.999,
                                                      'multi_precision': False})
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "gamma1": 0.9, "gamma2": 0.999,
                                                      'multi_precision': False})
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "wd": 0.0001, "momentum": 0.9,
                                                      'multi_precision': False})
        else:
            logging.error("optimizer not selected")
            exit(0)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size, topk=topk, scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):
        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()

        '''
        Building the targets inside train_dataloader (the target generator)
        makes training much faster.
        '''
        for batch_count, (image, _, heatmap, offset_target, wh_target, mask_target, _) in enumerate(
                train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image_split = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            heatmap_split = mx.nd.split(data=heatmap, num_outputs=subdivision, axis=0)
            offset_target_split = mx.nd.split(data=offset_target, num_outputs=subdivision, axis=0)
            wh_target_split = mx.nd.split(data=wh_target, num_outputs=subdivision, axis=0)
            mask_target_split = mx.nd.split(data=mask_target, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]

            '''
            autograd explanation:
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):
                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []
                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split, wh_target_split, mask_target_split):

                    if GPU_COUNT <= 1:
                        image_part = gluon.utils.split_and_load(image_part, [ctx], even_split=False)
                        heatmap_part = gluon.utils.split_and_load(heatmap_part, [ctx], even_split=False)
                        offset_target_part = gluon.utils.split_and_load(offset_target_part, [ctx], even_split=False)
                        wh_target_part = gluon.utils.split_and_load(wh_target_part, [ctx], even_split=False)
                        mask_target_part = gluon.utils.split_and_load(mask_target_part, [ctx], even_split=False)
                    else:
                        image_part = gluon.utils.split_and_load(image_part, ctx, even_split=False)
                        heatmap_part = gluon.utils.split_and_load(heatmap_part, ctx, even_split=False)
                        offset_target_part = gluon.utils.split_and_load(offset_target_part, ctx, even_split=False)
                        wh_target_part = gluon.utils.split_and_load(wh_target_part, ctx, even_split=False)
                        mask_target_part = gluon.utils.split_and_load(mask_target_part, ctx, even_split=False)

                    # prediction, target space for data parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # code to handle N GPUs (data parallelism)
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size

                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())

                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # clear accumulated gradients
            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                    f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                    f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum, train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean
        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / "
            f"train offset loss : {train_offset_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train total loss : {train_total_loss_mean}")

        if i % eval_period == 0 and valid_list:
            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # compute losses
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                    heatmap_split = gluon.utils.split_and_load(heatmap_all, [ctx], even_split=False)
                    offset_target_split = gluon.utils.split_and_load(offset_target_all, [ctx], even_split=False)
                    wh_target_split = gluon.utils.split_and_load(wh_target_all, [ctx], even_split=False)
                    mask_target_split = gluon.utils.split_and_load(mask_target_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)
                    heatmap_split = gluon.utils.split_and_load(heatmap_all, ctx, even_split=False)
                    offset_target_split = gluon.utils.split_and_load(offset_target_all, ctx, even_split=False)
                    wh_target_split = gluon.utils.split_and_load(wh_target_all, ctx, even_split=False)
                    mask_target_split = gluon.utils.split_and_load(mask_target_all, ctx, even_split=False)

                # prediction, target space for data parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # code to handle N GPUs (data parallelism)
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split, wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    id, score, bbox = prediction(heatmap_pred, offset_pred, wh_pred)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box * scale_factor,
                                            gt_labels=gt_id)

                    heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size

                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum, valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / "
                f"valid offset loss : {valid_offset_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}")
            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # code to handle N GPUs (data parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred)

                    for ig, gt_id, gt_box, heatmap, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, heatmap_pred, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # draw the heatmap
                        heatmap = mx.nd.multiply(heatmap, 255.0)  # rescale to 0 ~ 255
                        heatmap = mx.nd.max(heatmap, axis=0, keepdims=True)  # max over the channel axis
                        heatmap = mx.nd.transpose(heatmap, axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(heatmap, repeats=3, axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy()  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(heatmap, dsize=(input_size[1], input_size[0]))  # restore original size
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :, (0, 1, 2)] = heatmap[:, :, (2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(heatmap, axes=(2, 0, 1))  # (channel=3, height, width)

                        # draw ground-truth boxes
                        ground_truth = plot_bbox(ig, gt_box * scale_factor,
                                                 scores=None,
                                                 labels=gt_id,
                                                 thresh=None,
                                                 reverse_rgb=True,
                                                 class_names=valid_dataset.classes,
                                                 absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # draw prediction boxes
                        prediction_box = plot_bbox(ground_truth, bbox,
                                                   scores=score,
                                                   labels=id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes,
                                                   absolute_coordinates=True)

                        # for TensorBoard: BGR -> RGB and (height, width, channel) -> (channel, height, width)
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box, axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                all_image = np.concatenate([np.array(batch_image), np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result", image=all_image, global_step=i)

                summary.add_scalar(tag="heatmap_loss",
                                   value={"train_heatmap_loss_mean": train_heatmap_loss_mean,
                                          "valid_heatmap_loss_mean": valid_heatmap_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={"train_offset_loss_mean": train_offset_loss_mean,
                                          "valid_offset_loss_mean": valid_offset_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={"train_wh_loss_mean": train_wh_loss_mean,
                                          "valid_wh_loss_mean": valid_wh_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={"train_total_loss": train_total_loss_mean,
                                          "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default')

        if i % save_period == 0:
            if not os.path.exists(weight_path):
                os.makedirs(weight_path)

            '''
            Hybrid models can be serialized as JSON files using the export function.
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports,
            mxnet.mod.Module or the C++ interface.
            When there is only one input, it will be named data. When there are more
            than one inputs, they will be named data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # creates a new object
            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for ONNX export
                # handles network inference, decoding and NMS - convenient for MXNet C++
                export_block_for_cplusplus(path=os.path.join(weight_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # in C++, feed images read by OpenCV as-is at inference
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export failed : {E}")
            else:
                logging.info("json, param model export succeeded")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : approx. {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
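
# For reference, a minimal sketch of the penalty-reduced focal loss that a block like
# HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4) typically computes, following
# Zhou et al.'s CenterNet paper. This is an assumption about the repo's implementation,
# not a copy of it; `heatmap_focal_loss_sketch` is a hypothetical helper.
import mxnet as mx


def heatmap_focal_loss_sketch(pred, target, alpha=2, beta=4, eps=1e-12):
    # pred, target: (batch, class, height, width); pred is already sigmoid output
    pos = target == 1  # peak positions
    neg = target < 1
    pos_loss = ((1 - pred) ** alpha) * mx.nd.log(pred + eps) * pos
    neg_loss = ((1 - target) ** beta) * (pred ** alpha) * mx.nd.log(1 - pred + eps) * neg
    num_pos = mx.nd.sum(pos).asscalar()
    # normalize by the number of peaks (at least 1 to avoid division by zero)
    return -(mx.nd.sum(pos_loss) + mx.nd.sum(neg_loss)) / max(num_pos, 1.0)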
root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
input_size = (512, 512)
scale_factor = 4  # fixed
'''
Why is the heatmap bias -2.19? It is the same expression as in RetinaNet... hmm.

"For the final conv layer of the classification subnet, we set the bias initialization
to b = -log((1 - pi) / pi), where pi specifies that at the start of training every
anchor should be labeled as foreground with confidence of ~pi. We use pi = .01 in all
experiments, although results are robust to the exact value. As explained in §3.3,
this initialization prevents the large number of background anchors from generating
a large, destabilizing loss value in the first iteration of training."
'''
net = CenterNet(base=18,
                input_frame_number=2,
                heads=OrderedDict([('heatmap', {'num_output': 1, 'bias': -2.19}),
                                   ('offset', {'num_output': 2}),
                                   ('wh', {'num_output': 2})]),
                head_conv_channel=64,
                pretrained=False)

prediction = Prediction(batch_size=10,
                        unique_ids=["smoke"],
                        topk=100,
                        scale=scale_factor,
                        nms=True,
                        except_class_thresh=0.1,
                        nms_thresh=0.5)

heatmap, offset, wh = net(torch.rand(2, 6, input_size[0], input_size[1]))
ids, scores, bboxes = prediction(heatmap, offset, wh)
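
# Worked check of the -2.19 bias used above (a note, not repo code):
# b = log(pi / (1 - pi)) puts the initial sigmoid output at pi. The RetinaNet quote
# uses pi = 0.01 (b ~= -4.6); CenterNet's -2.19 corresponds to pi ~= 0.1,
# i.e. sigmoid(-2.19) ~= 0.1.
import math

pi = 0.1
print(math.log(pi / (1 - pi)))       # -2.197...
print(1.0 / (1.0 + math.exp(2.19)))  # ~= 0.1007 (sigmoid of -2.19)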
transform = CenterValidTransform(input_size,
                                 mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225),
                                 make_target=False)
dataset = DetectionDataset(path=os.path.join(root, 'valid'), transform=transform)
num_classes = dataset.num_class
name_classes = dataset.classes
length = len(dataset)
image, label, _, _, _ = dataset[random.randint(0, length - 1)]

net = CenterNet(base=18,
                heads=OrderedDict([('heatmap', {'num_output': num_classes, 'bias': -2.19}),
                                   ('offset', {'num_output': 2}),
                                   ('wh', {'num_output': 2})]),
                head_conv_channel=64,
                pretrained=False)

prediction = Prediction(unique_ids=name_classes, topk=100, scale=4)
precision_recall_2007 = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)
precision_recall_2010 = Voc_2010_AP(iou_thresh=0.5, class_names=name_classes)

# make a batch
data = image[None, :, :, :]
label = label[None, :, :]
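
# Minimal sketch of the decoding a Prediction block typically performs (standard
# CenterNet decoding per the paper; this is an assumption, not the repo's actual
# class - `decode_sketch` is a hypothetical helper, and NMS / class thresholds
# are omitted).
import torch
import torch.nn.functional as F


def decode_sketch(heatmap, offset, wh, topk=100, scale=4):
    b, c, h, w = heatmap.shape
    # a 3x3 max-pool keeps only local peaks (acts as NMS on the heatmap)
    peaks = (F.max_pool2d(heatmap, 3, stride=1, padding=1) == heatmap) * heatmap
    scores, idx = torch.topk(peaks.reshape(b, -1), topk)
    ids = torch.div(idx, h * w, rounding_mode='floor').float()  # class index
    pix = idx % (h * w)  # flat pixel index within the feature map
    ys = torch.div(pix, w, rounding_mode='floor').float()
    xs = (pix % w).float()
    # gather the offset and size predicted at each peak
    index = pix.unsqueeze(1).expand(-1, 2, -1)
    off = offset.reshape(b, 2, -1).gather(2, index)
    size = wh.reshape(b, 2, -1).gather(2, index)
    xs, ys = xs + off[:, 0, :], ys + off[:, 1, :]
    boxes = torch.stack([xs - size[:, 0, :] / 2, ys - size[:, 1, :] / 2,
                         xs + size[:, 0, :] / 2, ys + size[:, 1, :] / 2], dim=-1)
    return ids, scores, boxes * scale  # map back to input-image coordinates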
root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
transform = CenterTrainTransform(input_size,
                                 mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225))
dataset = DetectionDataset(path=os.path.join(root, 'Dataset', 'train'), transform=transform)
num_classes = dataset.num_class
name_classes = dataset.classes
length = len(dataset)
image, label, _ = dataset[random.randint(0, length - 1)]

net = CenterNet(base=18,
                heads=OrderedDict([('heatmap', {'num_output': num_classes, 'bias': -2.19}),
                                   ('offset', {'num_output': 2}),
                                   ('wh', {'num_output': 2})]),
                head_conv_channel=64)
net.hybridize(active=True, static_alloc=True, static_shape=True)

prediction = Prediction(topk=100, scale=4)
precision_recall_2007 = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)
precision_recall_2010 = Voc_2010_AP(iou_thresh=0.5, class_names=name_classes)

# make a batch
data = image.expand_dims(axis=0)
label = np.expand_dims(label, axis=0)
import glob
import logging
import os
import platform
import time
from collections import OrderedDict

import cv2
import numpy as np
import torch
import torchvision
from torch import autograd
from torch.nn import DataParallel
from torch.optim import SGD, Adam, RMSprop, lr_scheduler
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary as modelsummary
from tqdm import tqdm

# Repo-local helpers (CenterNet, traindataloader, validdataloader, HeatmapFocalLoss,
# NormedL1Loss, Prediction, Voc_2007_AP, plot_bbox, PrePostNet, ml) are assumed
# to be imported from this repository.


def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        epoch=100,
        input_size=[512, 512],
        input_frame_number=2,
        batch_size=16,
        batch_log=100,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        topk=100,
        iou_thresh=0.5,
        nms=False,
        except_class_thresh=0.01,
        nms_thresh=0.5,
        plot_class_thresh=0.5):

    if GPU_COUNT == 0:
        device = torch.device("cpu")
    elif GPU_COUNT == 1:
        device = torch.device("cuda")
    else:
        device = [torch.device(f"cuda:{i}") for i in range(GPU_COUNT)]

    if isinstance(device, (list, tuple)):
        context = device[0]
    else:
        context = device

    # log the operating system
    logging.info(f"{platform.system()} OS")

    # free memory does not seem accurate; how exactly does torch.cuda.max_memory_allocated() behave?
    if isinstance(device, (list, tuple)):
        for i, d in enumerate(device):
            total_memory = torch.cuda.get_device_properties(d).total_memory
            free_memory = total_memory - torch.cuda.max_memory_allocated(d)
            free_memory = round(free_memory / (1024 ** 3), 2)
            total_memory = round(total_memory / (1024 ** 3), 2)
            logging.info(f'{torch.cuda.get_device_name(d)}')
            logging.info(f'Running on {d} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            total_memory = torch.cuda.get_device_properties(device).total_memory
            free_memory = total_memory - torch.cuda.max_memory_allocated(device)
            free_memory = round(free_memory / (1024 ** 3), 2)
            total_memory = round(total_memory / (1024 ** 3), 2)
            logging.info(f'{torch.cuda.get_device_name(device)}')
            logging.info(f'Running on {device} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {device}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3 * input_frame_number) + tuple(input_size)

    scale_factor = 4  # fixed
    logging.info(f"scale factor {scale_factor}")

    train_dataloader, train_dataset = traindataloader(augmentation=data_augmentation,
                                                      path=train_dataset_path,
                                                      input_size=input_size,
                                                      input_frame_number=input_frame_number,
                                                      batch_size=batch_size,
                                                      pin_memory=True,
                                                      num_workers=num_workers,
                                                      shuffle=True, mean=mean, std=std,
                                                      scale_factor=scale_factor,
                                                      make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size is greater than the number of train data")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          input_frame_number=input_frame_number,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          pin_memory=True,
                                                          shuffle=True, mean=mean, std=std,
                                                          scale_factor=scale_factor,
                                                          make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size is greater than the number of valid data")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes
    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    # https://discuss.pytorch.org/t/how-to-save-the-optimizer-setting-in-a-log-in-pytorch/17187
    weight_path = os.path.join("weights", f"{model}")
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.pt')

    start_epoch = 0
    net = CenterNet(base=base,
                    input_frame_number=input_frame_number,
                    heads=OrderedDict([('heatmap', {'num_output': num_classes, 'bias': -2.19}),
                                       ('offset', {'num_output': 2}),
                                       ('wh', {'num_output': 2})]),
                    head_conv_channel=64,
                    pretrained=pretrained_base)

    # https://github.com/sksq96/pytorch-summary
    modelsummary(net.to(context), input_shape[1:])

    if tensorboard:
        summary = SummaryWriter(log_dir=os.path.join("torchboard", model), max_queue=10, flush_secs=10)
        summary.add_graph(net.to(context), input_to_model=torch.ones(input_shape, device=context), verbose=False)

    if os.path.exists(param_path):
        start_epoch = load_period
        checkpoint = torch.load(param_path)
        if 'model_state_dict' in checkpoint:
            try:
                net.load_state_dict(checkpoint['model_state_dict'])
            except Exception as E:
                logging.info(E)
            else:
                logging.info("loading model_state_dict")

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    net.to(context)

    if optimizer.upper() == "ADAM":
        trainer = Adam(net.parameters(), lr=learning_rate, betas=(0.9, 0.999), weight_decay=0.000001)
    elif optimizer.upper() == "RMSPROP":
        trainer = RMSprop(net.parameters(), lr=learning_rate, alpha=0.99, weight_decay=0.000001, momentum=0)
    elif optimizer.upper() == "SGD":
        trainer = SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.000001)
    else:
        logging.error("optimizer not selected")
        exit(0)

    if os.path.exists(param_path):
        # load the optimizer weights
        checkpoint = torch.load(param_path)
        if 'optimizer_state_dict' in checkpoint:
            try:
                trainer.load_state_dict(checkpoint['optimizer_state_dict'])
            except Exception as E:
                logging.info(E)
            else:
                logging.info("loading optimizer_state_dict")

    if isinstance(device, (list, tuple)):
        net = DataParallel(net, device_ids=device, output_device=context, dim=0)

    # optimizer
    # https://pytorch.org/docs/master/optim.html?highlight=lr%20sche#torch.optim.lr_scheduler.CosineAnnealingLR
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = lr_scheduler.StepLR(trainer, step, gamma=decay_lr, last_epoch=-1)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size,
                            unique_ids=name_classes,
                            topk=topk,
                            scale=scale_factor,
                            nms=nms,
                            except_class_thresh=except_class_thresh,
                            nms_thresh=nms_thresh)
    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    # torch.split works differently from numpy / mxnet split, hence the following
    if batch_size % subdivision == 0:
        chunk = int(batch_size) // int(subdivision)
    else:
        logging.info("batch_size is not divisible by subdivision.")
        logging.info("Set subdivision again before training.")
        exit(0)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):
        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()

        # if using multiscale, it would be better to rebuild train_dataloader here.
        for batch_count, (image, _, heatmap_target, offset_target, wh_target, mask_target, _) in enumerate(
                train_dataloader, start=1):
            trainer.zero_grad()

            image = image.to(context)
            '''
            Why do this? net was moved with net.to(context) above, and for GPU_COUNT > 1,
            net = DataParallel(net, device_ids=device, output_device=context, dim=0)
            computes gradients on output_device=context, so the targets below must also
            live on context.
            '''
            heatmap_target = heatmap_target.to(context)
            offset_target = offset_target.to(context)
            wh_target = wh_target.to(context)
            mask_target = mask_target.to(context)

            image_split = torch.split(image, chunk, dim=0)
            heatmap_target_split = torch.split(heatmap_target, chunk, dim=0)
            offset_target_split = torch.split(offset_target, chunk, dim=0)
            wh_target_split = torch.split(wh_target, chunk, dim=0)
            mask_target_split = torch.split(mask_target, chunk, dim=0)

            heatmap_losses = []
            offset_losses = []
            wh_losses = []
            total_loss = []

            for image_part, heatmap_target_part, offset_target_part, wh_target_part, mask_target_part in zip(
                    image_split, heatmap_target_split, offset_target_split, wh_target_split, mask_target_split):
                heatmap_pred, offset_pred, wh_pred = net(image_part)

                '''
                PyTorch has no batch_size argument in trainer.step();
                this must be considered in the loss implementation (mean mode).
                '''
                heatmap_loss = torch.div(heatmapfocalloss(heatmap_pred, heatmap_target_part), subdivision)
                offset_loss = torch.div(normedl1loss(offset_pred, offset_target_part, mask_target_part) * lambda_off,
                                        subdivision)
                wh_loss = torch.div(normedl1loss(wh_pred, wh_target_part, mask_target_part) * lambda_size,
                                    subdivision)

                heatmap_losses.append(heatmap_loss.item())
                offset_losses.append(offset_loss.item())
                wh_losses.append(wh_loss.item())

                total_loss.append(heatmap_loss + offset_loss + wh_loss)

            # shouldn't this be divided by batch size?
            autograd.backward(total_loss)
            trainer.step()
            lr_sch.step()

            heatmap_loss_sum += sum(heatmap_losses)
            offset_loss_sum += sum(offset_losses)
            wh_loss_sum += sum(wh_losses)

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {image.shape[0] / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {lr_sch.get_last_lr()}]'
                    f'[heatmap loss = {sum(heatmap_losses):.3f}]'
                    f'[offset loss = {sum(offset_losses):.3f}]'
                    f'[wh loss = {sum(wh_losses):.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum, train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean
        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / "
            f"train offset loss : {train_offset_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train total loss : {train_total_loss_mean}")

        if i % save_period == 0:
            if not os.path.exists(weight_path):
                os.makedirs(weight_path)

            module = net.module if isinstance(device, (list, tuple)) else net
            auxnet = Prediction(unique_ids=name_classes,
                                topk=topk,
                                scale=scale_factor,
                                nms=nms,
                                except_class_thresh=except_class_thresh,
                                nms_thresh=nms_thresh)
            prepostnet = PrePostNet(net=module, auxnet=auxnet,
                                    input_frame_number=input_frame_number)  # creates a new object

            try:
                torch.save({'model_state_dict': net.state_dict(),
                            'optimizer_state_dict': trainer.state_dict()},
                           os.path.join(weight_path, f'{model}-{i:04d}.pt'))

                # prefer torch.jit.script(), which supports control flow, over torch.jit.trace()
                script = torch.jit.script(module)
                script.save(os.path.join(weight_path, f'{model}-{i:04d}.jit'))
                script = torch.jit.script(prepostnet)
                script.save(os.path.join(weight_path, f'{model}-prepost-{i:04d}.jit'))

                # torch.jit.trace - does not work
                # error : Expected object of device type cuda but got device type cpu
                #         for argument #2 'other' in call to _th_fmod
                # trace = torch.jit.trace(prepostnet, torch.rand(input_shape[0], input_shape[1],
                #                                                input_shape[2], input_shape[3], device=context))
                # trace.save(os.path.join(weight_path, f'{model}-{i:04d}.jit'))
            except Exception as E:
                logging.error(f"pt, jit export failed : {E}")
            else:
                logging.info("pt, jit export succeeded")

        if i % eval_period == 0 and valid_list:
            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # compute losses
            for image, label, heatmap_target, offset_target, wh_target, mask_target, _ in valid_dataloader:
                image = image.to(context)
                label = label.to(context)
                gt_box = label[:, :, :4]
                gt_id = label[:, :, 4:5]
                heatmap_target = heatmap_target.to(context)
                offset_target = offset_target.to(context)
                wh_target = wh_target.to(context)
                mask_target = mask_target.to(context)

                heatmap_pred, offset_pred, wh_pred = net(image)
                id, score, bbox = prediction(heatmap_pred, offset_pred, wh_pred)

                precision_recall.update(pred_bboxes=bbox,
                                        pred_labels=id,
                                        pred_scores=score,
                                        gt_boxes=gt_box * scale_factor,
                                        gt_labels=gt_id)

                heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off
                wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size

                heatmap_loss_sum += heatmap_loss.item()
                offset_loss_sum += offset_loss.item()
                wh_loss_sum += wh_loss.item()

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum, valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / "
                f"valid offset loss : {valid_offset_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}")

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                batch_image = []
                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)  # RGB

                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)
                image = image.to(context)
                label = label.to(context)
                gt_boxes = label[:, :, :4]
                gt_ids = label[:, :, 4:5]
                heatmap_pred, offset_pred, wh_pred = net(image)
                ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred)

                for img, gt_id, gt_box, heatmap, id, score, bbox in zip(
                        image, gt_ids, gt_boxes, heatmap_pred, ids, scores, bboxes):
                    split_img = torch.split(img, 3, dim=0)  # different from numpy split...
                    hconcat_image_list = []
                    for j, ig in enumerate(split_img):
                        ig = ig.permute((1, 2, 0)) * torch.tensor(std, device=ig.device) + torch.tensor(mean, device=ig.device)
                        ig = (ig * 255).clamp(0, 255)
                        ig = ig.to(torch.uint8)
                        ig = ig.detach().cpu().numpy().copy()

                        if j == len(split_img) - 1:  # last frame
                            # draw the heatmap
                            heatmap = heatmap.detach().cpu().numpy().copy()
                            heatmap = np.multiply(heatmap, 255.0)  # rescale to 0 ~ 255
                            heatmap = np.amax(heatmap, axis=0, keepdims=True)  # max over the channel axis
                            heatmap = np.transpose(heatmap, axes=(1, 2, 0))  # (height, width, channel=1)
                            heatmap = np.repeat(heatmap, 3, axis=-1)  # (height, width, channel=3)
                            heatmap = heatmap.astype("uint8")  # float32 -> uint8
                            heatmap = cv2.resize(heatmap, dsize=(input_size[1], input_size[0]))  # restore original size
                            heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                            heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

                            # draw ground-truth boxes
                            ground_truth = plot_bbox(ig, gt_box * scale_factor,
                                                     scores=None,
                                                     labels=gt_id,
                                                     thresh=None,
                                                     reverse_rgb=False,
                                                     class_names=valid_dataset.classes,
                                                     absolute_coordinates=True,
                                                     colors=ground_truth_colors)
                            # draw prediction boxes
                            prediction_box = plot_bbox(ground_truth, bbox,
                                                       scores=score,
                                                       labels=id,
                                                       thresh=plot_class_thresh,
                                                       reverse_rgb=False,
                                                       class_names=valid_dataset.classes,
                                                       absolute_coordinates=True,
                                                       heatmap=heatmap)
                            hconcat_image_list.append(prediction_box)
                        else:
                            hconcat_image_list.append(ig)

                    hconcat_images = np.concatenate(hconcat_image_list, axis=1)
                    # for TensorBoard: (height, width, channel) -> (channel, height, width)
                    hconcat_images = np.transpose(hconcat_images, axes=(2, 0, 1))
                    batch_image.append(hconcat_images)  # (batch, channel, height, width)

                img_grid = torchvision.utils.make_grid(torch.as_tensor(batch_image), nrow=1)
                summary.add_image(tag="valid_result", img_tensor=img_grid, global_step=i)

                summary.add_scalar(tag="heatmap_loss/train_heatmap_loss_mean",
                                   scalar_value=train_heatmap_loss_mean, global_step=i)
                summary.add_scalar(tag="heatmap_loss/valid_heatmap_loss_mean",
                                   scalar_value=valid_heatmap_loss_mean, global_step=i)
                summary.add_scalar(tag="offset_loss/train_offset_loss_mean",
                                   scalar_value=train_offset_loss_mean, global_step=i)
                summary.add_scalar(tag="offset_loss/valid_offset_loss_mean",
                                   scalar_value=valid_offset_loss_mean, global_step=i)
                summary.add_scalar(tag="wh_loss/train_wh_loss_mean",
                                   scalar_value=train_wh_loss_mean, global_step=i)
                summary.add_scalar(tag="wh_loss/valid_wh_loss_mean",
                                   scalar_value=valid_wh_loss_mean, global_step=i)
                summary.add_scalar(tag="total_loss/train_total_loss",
                                   scalar_value=train_total_loss_mean, global_step=i)
                summary.add_scalar(tag="total_loss/valid_total_loss",
                                   scalar_value=valid_total_loss_mean, global_step=i)

                for name, param in net.named_parameters():
                    summary.add_histogram(tag=name, values=param, global_step=i)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : approx. {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
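
# The subdivision logic in the training loop above is gradient accumulation. In
# isolation it looks like the sketch below (names `model`, `criterion`, `optimizer`,
# `chunks` are placeholders, not repo code): each sub-batch loss is scaled by
# 1/len(chunks) so the accumulated gradient matches a single full-batch step,
# then one optimizer update is applied.
import torch


def accumulated_step(model, criterion, optimizer, chunks):
    optimizer.zero_grad()
    for x, y in chunks:  # `subdivision` sub-batches of one batch
        loss = criterion(model(x), y) / len(chunks)  # scale so gradients average
        loss.backward()  # gradients add up across sub-batches
    optimizer.step()  # a single update for the whole batch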
input_size = (512, 512)
root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
transform = CenterValidTransform(input_size,
                                 mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225),
                                 make_target=False)
dataset = DetectionDataset(path=os.path.join(root, 'Dataset', 'train'), transform=transform)
num_classes = dataset.num_class
name_classes = dataset.classes
length = len(dataset)
image, label, _, _, _ = dataset[random.randint(0, length - 1)]

net = CenterNet(base=18,
                heads=OrderedDict([('heatmap', {'num_output': num_classes, 'bias': -2.19}),
                                   ('offset', {'num_output': 2}),
                                   ('wh', {'num_output': 2})]),
                head_conv_channel=64,
                pretrained=False,
                root=os.path.join(root, "modelparam"),
                use_dcnv2=False,
                ctx=mx.cpu())
net.hybridize(active=True, static_alloc=True, static_shape=True)

prediction = Prediction(topk=100, scale=4)
precision_recall_2007 = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)
precision_recall_2010 = Voc_2010_AP(iou_thresh=0.5, class_names=name_classes)

# make a batch
data = image.expand_dims(axis=0)
label = np.expand_dims(label, axis=0)
label = mx.nd.array(label)
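
# A plausible continuation (sketch, not verbatim repo code): run the network and
# decoder on the prepared batch and feed the VOC metrics, mirroring the usage in
# the validation loop above (gt boxes are multiplied by the scale factor, 4).
heatmap_pred, offset_pred, wh_pred = net(data)
ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred)
for pr in [precision_recall_2007, precision_recall_2010]:
    pr.update(pred_bboxes=bboxes, pred_labels=ids, pred_scores=scores,
              gt_boxes=label[:, :, :4] * 4, gt_labels=label[:, :, 4:5])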