def test_add_image():
    """Write one random image batch with ``add_image`` and check the event file.

    A random 4-D shape is drawn and its axis 1 is forced to 3 before the
    normally-distributed image tensor is generated and logged.
    """
    # Same as list(...) / shape[1] = 3 / tuple(...): keep axes 0, 2, 3 and pin axis 1.
    dim0, _, dim2, dim3 = rand_shape_nd(4)
    image_shape = (dim0, 3, dim2, dim3)

    writer = SummaryWriter(logdir=_LOGDIR)
    random_image = mx.nd.random.normal(shape=image_shape)
    writer.add_image(tag='test_add_image', image=random_image, global_step=0)
    writer.close()

    check_event_file_and_remove_logdir()
def __call__(self, mxb_writer: mxboard.SummaryWriter, samples_processed: int, *args, **kwargs):
    """Log a batch of generated images when enough new samples were processed.

    The call is a no-op unless ``samples_processed`` exceeds the value
    recorded at the last logging call by more than ``self._freq``.

    Args:
        mxb_writer: Writer whose ``add_image`` receives the generated batch.
        samples_processed: Running sample counter; also used as the global
            step of the logged image.
        *args: Accepted and ignored.
        **kwargs: Accepted and ignored.
    """
    samples_since_last_log = samples_processed - self._last_call
    if not (samples_since_last_log > self._freq):
        return

    self._last_call = samples_processed

    # generate image from model
    generated = self._nn.generate(*self._conditioning_variables)
    generated = generated.asnumpy()

    # Keep the leading axis, reshape the rest to the configured image shape.
    target_shape = (generated.shape[0],) + tuple(self._image_shape)
    mxb_writer.add_image('Generated_image',
                         generated.reshape(target_shape),
                         samples_processed)
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        anchor_sizes=[32, 64, 128, 256, 512],
        anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
        anchor_aspect_ratios=[0.5, 1, 2],
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        background_iou_thresh=0.4,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=0,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=5000,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    """Train the ``Efficient`` detector, checkpointing and evaluating periodically.

    The function builds the data loaders, creates or resumes the network,
    trains it from epoch ``start_epoch + 1`` through ``epoch`` and, on the
    configured periods, exports weights and evaluates on the validation set.

    Args:
        mean, std: Forwarded to both data loaders; also used to undo the
            normalisation of images drawn for TensorBoard.
        anchor_alloc_size, anchor_sizes, anchor_size_ratios,
        anchor_aspect_ratios, anchor_box_clip: Forwarded to ``Efficient``.
        graphviz: When true, ``gluoncv.utils.viz.plot_network`` is called.
        epoch: Last epoch index of the training loop.
        input_size: Two-element size; combined into ``(1, 3, *input_size)``
            for the dummy input and used in the model name.
        batch_log: A progress line is logged every ``batch_log`` batches.
        batch_size: Training batch size (must not be smaller than
            ``GPU_COUNT`` when GPUs are used).
        batch_interval, factor_scale, multiscale, data_augmentation:
            Forwarded to ``traindataloader``.
        subdivision: Number of chunks each training batch is split into
            along axis 0; gradients are accumulated over the chunks
            (``grad_req='add'``) before a single ``trainer.step``.
        train_dataset_path, valid_dataset_path: Dataset locations.
            Validation is skipped when the validation glob is empty.
        foreground_iou_thresh, background_iou_thresh: Forwarded to the
            data loaders.
        num_workers: Forwarded to the data loaders.
        optimizer: ``"ADAM"``, ``"RMSPROP"`` or ``"SGD"`` (case-insensitive).
        weight_decay: Passed to the optimizer as ``wd``.
        save_period: Weights/optimizer state are exported when
            ``i % save_period == 0``.
        load_period: Epoch number of the checkpoint that is resumed from
            when its ``.params`` and ``-symbol.json`` files exist.
        learning_rate, decay_lr, decay_step: Configure the
            ``FactorScheduler``.
        GPU_COUNT: ``0`` selects the CPU and forces ``AMP`` off, ``1``
            selects ``gpu(0)``, larger values select a list of GPUs.
        base: Passed to ``Efficient`` as ``version``; part of the model name.
        AMP: Enables ``amp`` initialisation and loss scaling.
        valid_size: Validation batch size.
        eval_period: Validation runs when ``i % eval_period == 0``.
        tensorboard: Enables mxboard logging.
        valid_graph_path, valid_html_auto_open: Forwarded to
            ``precision_recall.get_PR_curve``.
        using_mlflow: When true, the learning time is logged through ``ml``.
        decode_number, multiperclass, nms_thresh, nms_topk,
        except_class_thresh: Forwarded to ``Prediction``.
        iou_thresh: Forwarded to ``Voc_2007_AP``.
        plot_class_thresh: Score threshold passed to ``plot_bbox`` for
            predicted boxes.
    """
    # NOTE(review): the list defaults above are shared between calls; they are
    # only read in this function, never mutated.
    # NOTE(review): several failure paths below call exit(0), i.e. they
    # terminate the process with a success status.
    # NOTE(review): the block structure of this function was recovered from a
    # source whose line breaks were lost — confirm nesting against the
    # original file.
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False  # AMP is disabled whenever training runs on the CPU
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # Check the operating system (every branch logs the same message)
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    # Report free/total memory (converted from bytes to GB) for every GPU in use
    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Efficient Detector")
    input_shape = (1, 3) + tuple(input_size)

    # A CPU-side network instance that is handed to the data loaders (net=net)
    net = Efficient(version=base,
                    anchor_sizes=anchor_sizes,
                    anchor_size_ratios=anchor_size_ratios,
                    anchor_aspect_ratios=anchor_aspect_ratios,
                    anchor_box_clip=anchor_box_clip,
                    alloc_size=anchor_alloc_size,
                    ctx=mx.cpu())
    train_dataloader, train_dataset = traindataloader(
        multiscale=multiscale,
        factor_scale=factor_scale,
        augmentation=data_augmentation,
        path=train_dataset_path,
        input_size=input_size,
        batch_size=batch_size,
        batch_interval=batch_interval,
        num_workers=num_workers,
        shuffle=True,
        mean=mean,
        std=std,
        net=net,
        foreground_iou_thresh=foreground_iou_thresh,
        background_iou_thresh=background_iou_thresh,
        make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        # Message (Korean): "train batch size is larger than the number of samples"
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    # Validation is only set up when the validation directory has entries
    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            background_iou_thresh=background_iou_thresh,
            make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            # Message (Korean): "valid batch size is larger than the number of samples"
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    # Model name: "<height>_<width>_<OPTIMIZER>_EFF_<base>"; used for all paths below
    model = str(input_size[0]) + "_" + str(
        input_size[1]) + "_" + optimizer + "_EFF_" + str(base)

    weight_path = os.path.join("weights", f"{model}")
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path,
                              f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path,
                                  f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        # Resume from the exported symbol/params of epoch `load_period`
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = Efficient(
            version=base,
            input_size=input_size,
            anchor_sizes=anchor_sizes,
            anchor_size_ratios=anchor_size_ratios,
            anchor_aspect_ratios=anchor_aspect_ratios,
            num_classes=num_classes,  # foreground only
            anchor_box_clip=anchor_box_clip,
            alloc_size=anchor_alloc_size,
            ctx=ctx)

        # NOTE(review): summary is assumed to belong to the fresh-network
        # branch — confirm against the original layout.
        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
    '''
    active (bool, default True) – Whether to turn hybrid on or off.
    static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
    static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
    '''
    # static_shape is only enabled when input shapes do not vary (no multiscale)
    if multiscale:
        net.hybridize(active=True, static_alloc=True, static_shape=False)
    else:
        net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        # NOTE(review): this writer is never closed in this function.
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        # One forward pass with a dummy input before the graph is logged
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    # `unit` is the number of whole batches per epoch (at least 1); the
    # learning rate is multiplied by `decay_lr` every `unit * decay_step` updates.
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    # Accumulate gradients across the `subdivision` chunks of a batch
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'
    '''
    update_on_kvstore : bool, default None
    Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
    suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
    provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
    '''
    if optimizer.upper() == "ADAM":
        trainer = gluon.Trainer(
            net.collect_params(),
            optimizer,
            optimizer_params={
                "learning_rate": learning_rate,
                "lr_scheduler": lr_sch,
                "wd": weight_decay,
                "beta1": 0.9,
                "beta2": 0.999,
                'multi_precision': False
            },
            update_on_kvstore=False if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "RMSPROP":
        trainer = gluon.Trainer(
            net.collect_params(),
            optimizer,
            optimizer_params={
                "learning_rate": learning_rate,
                "lr_scheduler": lr_sch,
                "wd": weight_decay,
                "gamma1": 0.9,
                "gamma2": 0.999,
                'multi_precision': False
            },
            update_on_kvstore=False if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "SGD":
        trainer = gluon.Trainer(
            net.collect_params(),
            optimizer,
            optimizer_params={
                "learning_rate": learning_rate,
                "lr_scheduler": lr_sch,
                "wd": weight_decay,
                "momentum": 0.9,
                'multi_precision': False
            },
            update_on_kvstore=False if AMP else None)  # for Dynamic loss scaling
    else:
        logging.error("optimizer not selected")
        exit(0)

    if AMP:
        amp.init_trainer(trainer)

    # Load the saved optimizer state, if present (failure is only logged)
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")
    '''
    localization loss -> Smooth L1 loss
    confidence loss -> Focal
    '''
    confidence_loss = FocalLoss(alpha=0.25,
                                gamma=2,
                                sparse_label=True,
                                from_sigmoid=False,
                                batch_axis=None,
                                num_class=num_classes,
                                reduction="sum",
                                exclude=False)

    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(batch_size=batch_size,
                            from_sigmoid=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    # Always work with a list of contexts so CPU / single GPU / multi GPU share one code path
    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            # Split the batch into `subdivision` chunks along axis 0
            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            # With a single output the results are wrapped so the zip below still iterates chunks
            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            # (the string above is the author's pointer to the autograd tutorial)
            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    image_split = gluon.utils.split_and_load(image_split,
                                                             ctx_list,
                                                             even_split=False)
                    cls_split = gluon.utils.split_and_load(cls_split,
                                                           ctx_list,
                                                           even_split=False)
                    box_split = gluon.utils.split_and_load(box_split,
                                                           ctx_list,
                                                           even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # Code prepared for N GPUs (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        cls_pred, box_pred, anchor = net(img)
                        # Targets > -1 take part in the confidence loss,
                        # targets > 0 count as positives
                        except_ignore_samples = cls_target > -1
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        conf_loss = confidence_loss(
                            cls_pred, cls_target,
                            except_ignore_samples.expand_dims(axis=-1))
                        # +1 keeps the divisor non-zero when there are no positives
                        conf_loss = mx.nd.divide(conf_loss,
                                                 positive_numbers + 1)
                        cls_losses.append(conf_loss.asscalar())

                        loc_loss = localization_loss(
                            box_pred, box_target,
                            positive_samples.expand_dims(axis=-1))
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss + loc_loss)
                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            # One optimizer update per full batch, after all chunks were back-propagated
            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # Clear the accumulated gradients
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            # Save the optimizer state
            try:
                trainer.save_states(
                    os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                # Message (Korean): "optimizer weight export raised an exception"
                logging.error(f"optimizer weight export 예외 발생 : {E}")
            else:
                # Message (Korean): "optimizer weight export succeeded"
                logging.info("optimizer weight export 성공")
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)
            # Author's note below, translated from Korean:
            #   With mxnet 1.6.0 and AMP, reusing the `prediction` block declared
            #   earlier can cause problems (it does for YOLOv3 and Gaussian YOLOv3);
            #   with mxnet 1.5.x it works without re-declaring it.
            #   Blocks passed as function arguments are passed by reference (not
            #   copied), so when export_block_for_cplusplus hybridizes `prediction`
            #   the earlier instance becomes hybridized (symbolic) too. Re-declaring
            #   a fresh block here therefore seems to be the right thing to do.
            '''
                mxnet1.6.0 버전 에서 AMP 사용시 위에 미리 선언한 prediction을 사용하면 문제가 될 수 있다. 
                -yolo v3, gaussian yolo v3 에서는 문제가 발생한다.
                mxnet 1.5.x 버전에서는 아래와 같이 새로 선언하지 않아도 정상 동작한다.

                block들은 함수 인자로 보낼 경우 자기 자신이 보내진다.(복사되는 것이 아님)
                export_block_for_cplusplus 에서 prediction 이 hybridize 되면서
                미리 선언한 prediction도 hybridize화 되면서 symbol 형태가 된다.
                이런 현상을 보면 아래와같이 다시 선언해 주는게 맞는 것 같다.
            '''
            auxnet = Prediction(from_sigmoid=False,
                                num_classes=num_classes,
                                decode_number=decode_number,
                                nms_thresh=nms_thresh,
                                nms_topk=nms_topk,
                                except_class_thresh=except_class_thresh,
                                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # for ONNX export
                # Network inference, decoding and NMS are all included - convenient for MXNet C++
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # for C++ inference, an image read with OpenCV can be fed in as-is
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                # Message (Korean): "json, param model export raised an exception"
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                # Message (Korean): "json, param model export succeeded"
                logging.info("json, param model export 성공")
                # NOTE(review): presumably restores the training context after
                # the export ran on `context` — confirm.
                net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0

            # Compute the validation loss
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all,
                                                     ctx_list,
                                                     even_split=False)
                box_all = gluon.utils.split_and_load(box_all,
                                                     ctx_list,
                                                     even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # Code prepared for N GPUs (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    # The last label axis holds 4 box values followed by the class id
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    # NOTE(review): `id` shadows the builtin of the same name.
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    except_ignore_samples = cls_target > -1
                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        except_ignore_samples.expand_dims(axis=-1))
                    conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            # Per-class AP and mAP (NaN APs are turned into 0 before averaging)
            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)

            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # Code prepared for N GPUs (Data Parallelism)
                # Only the first validation batch is drawn
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)

                # Every ground-truth class is drawn with the same colour (0, 1, 0)
                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        # Undo the mean/std normalisation and convert to uint8 in [0, 255]
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # Draw the ground-truth boxes
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # Draw the predicted boxes on top
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # For TensorBoard: (height, width, channel) -> (channel, height, width)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                # Histogram of every parameter, read from the first context
                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name,
                                          values=p.data(ctx=ctx_list[0]),
                                          global_step=i,
                                          bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    # "약" in the message below is Korean for "approximately"
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def train():
    """Run the adversarial training loop for ``netG`` / ``netD`` with mxboard logging.

    All configuration is read from module-level names (``epochs``,
    ``train_data``, ``val_data``, ``netG``, ``netD``, ``trainerG``,
    ``trainerD``, ``ctx``, ``lambda1``, ``dir_out_sw``,
    ``dir_out_checkpoints``, ...). For every batch the discriminator is
    updated first, then the generator; per-step losses, first-minibatch
    images, the per-epoch accuracy and a validation loss are written to the
    summary writer, and parameters are saved every ``check_point_interval``
    epochs and on the last epoch.

    The summary writer is closed even when training raises.
    """
    image_pool = ImagePool(pool_size)
    metric = mx.metric.CustomMetric(facc)

    logging.basicConfig(level=logging.DEBUG)

    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='%s' % dir_out_sw, flush_secs=5, verbose=False)
    try:
        global_step = 0
        for epoch in range(epochs):
            if epoch == 0:
                netG.hybridize()
                netD.hybridize()

            tic = time.time()
            btic = time.time()
            train_data.reset()
            val_data.reset()
            for local_step, batch in enumerate(train_data):
                ############################
                # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z)))
                ###########################
                tmp = mx.nd.concat(batch.data[0], batch.data[1], batch.data[2], dim=1)
                tmp = augmenter(tmp,
                                patch_size=128,
                                offset=offset,
                                aug_type=1,
                                aug_methods=aug_methods,
                                random_crop=False)
                # After augmentation, axis 1 holds input, target and mask in that order.
                real_in = tmp[:, :1].as_in_context(ctx)
                real_out = tmp[:, 1:2].as_in_context(ctx)
                m = tmp[:, 2:3].as_in_context(ctx)  # mask
                fake_out = netG(real_in) * m

                # loss weight based on mask, applied on L1 loss
                if no_loss_weights:
                    loss_weight = m
                else:
                    # Zero mask entries get a weight of 0.1 instead of 0.
                    loss_weight = m.asnumpy()
                    loss_weight[loss_weight == 0] = .1
                    loss_weight = mx.nd.array(loss_weight, ctx=m.context)

                fake_concat = image_pool.query(nd.concat(real_in, fake_out, dim=1))
                with autograd.record():
                    # Train with fake image
                    # Use image pooling to utilize history images
                    output = netD(fake_concat)
                    fake_label = nd.zeros(output.shape, ctx=ctx)
                    errD_fake = GAN_loss(output, fake_label)
                    metric.update([fake_label], [output])

                    # Train with real image
                    real_concat = nd.concat(real_in, real_out, dim=1)
                    output = netD(real_concat)
                    real_label = nd.ones(output.shape, ctx=ctx)
                    errD_real = GAN_loss(output, real_label)
                    errD = (errD_real + errD_fake) * 0.5
                    errD.backward()
                    metric.update([real_label], [output])

                trainerD.step(batch.data[0].shape[0])

                ############################
                # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z))
                ###########################
                with autograd.record():
                    fake_out = netG(real_in)
                    fake_concat = nd.concat(real_in, fake_out, dim=1)
                    output = netD(fake_concat)
                    real_label = nd.ones(output.shape, ctx=ctx)
                    errG = GAN_loss(output, real_label) + loss_2nd(
                        real_out, fake_out, loss_weight) * lambda1
                    errG.backward()

                trainerG.step(batch.data[0].shape[0])

                sw.add_scalar(tag='loss',
                              value=('d_loss', errD.mean().asscalar()),
                              global_step=global_step)
                sw.add_scalar(tag='loss',
                              value=('g_loss', errG.mean().asscalar()),
                              global_step=global_step)
                global_step += 1

                if epoch + local_step == 0:
                    # Very first batch of the run: log the graph and the real
                    # images once, and fetch the validation batch reused below.
                    sw.add_graph(netG)
                    img_in_list, img_out_list, m_val = val_data.next().data
                    m_val = m_val.as_in_context(ctx)
                    sw.add_image('first_minibatch_train_real', norm3(real_out))
                    sw.add_image('first_minibatch_val_real',
                                 norm3(img_out_list.as_in_context(ctx)))
                    netG.export('%snetG' % dir_out_checkpoints)
                if local_step == 0:
                    # Log the first batch of images of each epoch (training)
                    sw.add_image('first_minibatch_train_fake',
                                 norm3(fake_out * m) * m, epoch)
                    sw.add_image(
                        'first_minibatch_val_fake',
                        norm3(netG(img_in_list.as_in_context(ctx)) * m_val) * m_val,
                        epoch)

                # Progress line every 10th batch of the epoch.
                if (local_step + 1) % 10 == 0:
                    name, acc = metric.get()
                    logging.info('speed: {} samples/s'.format(
                        batch_size / (time.time() - btic)))
                    logging.info(
                        'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d'
                        % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(),
                           acc, local_step, epoch))
                btic = time.time()

            # Read the metric BEFORE logging it, so the scalar is the accuracy
            # over the whole epoch and is defined even for short epochs.
            name, acc = metric.get()
            sw.add_scalar(tag='binary_training_acc',
                          value=('acc', acc),
                          global_step=epoch)
            metric.reset()

            fake_val = netG(val_data.data[0][1].as_in_context(ctx))
            loss_val = loss_2nd(val_data.data[1][1].as_in_context(ctx), fake_val,
                                val_data.data[2][1].as_in_context(ctx)) * lambda1
            sw.add_scalar(tag='loss_val',
                          value=('g_loss', loss_val.mean().asscalar()),
                          global_step=epoch)

            if (epoch % check_point_interval == 0) or (epoch == epochs - 1):
                netD.save_params('%snetD-%04d' % (dir_out_checkpoints, epoch))
                netG.save_params('%snetG-%04d' % (dir_out_checkpoints, epoch))

            logging.info('\nbinary training acc at epoch %d: %s=%f' %
                         (epoch, name, acc))
            logging.info('time: %f' % (time.time() - tic))

        sw.export_scalars('scalar_dict.json')
    finally:
        # Flush and release the event file even if training failed part-way.
        sw.close()
for yhat, y in zip(outputs, label)]) train_loss += sum([l.sum().asscalar() for l in losses]) trainer.step(batch_size) n += batch_size m += sum([y.size for y in label]) if print_batches and (i + 1) % print_batches == 0: print("Batch %d. Loss: %f, Train acc %f" % (n, train_loss / n, train_acc / m)) ### 可視化 ### if i == 0: data_image = data[0] data_image = (data_image - data_image.min()) / (data_image.max() - data_image.min()) sw.add_image('sperm_image', data_image, epoch) if epoch == 0: sw.add_graph(net) ### 可視化 ### test_acc = evaluate_accuracy(test_data, net, ctx) print( "Epoch %d. Loss: %.3f, Train acc %.2f, Test acc %.2f, Time %.1f sec" % (epoch, train_loss / n, train_acc / m, test_acc, time.time() - start)) if train_loss / n <= 0.0008 and epoch >= 20: break ### 可視化 ### pn = list(net.collect_params().keys()) param_names, grads = [], [] for n, i in enumerate(net.collect_params().values()):
def run_experiment(fraction_train,
                   load_model=False,
                   old_runname=None,
                   start_epoch=None):
    """Train and evaluate the generator/discriminator pair for 50 epochs.

    Each epoch trains on ``train_iter``, saves both networks, evaluates on
    ``test_iter``, writes images and mean losses to the summary writer, and
    stores the per-batch losses of that epoch as ``.npy`` files.

    Args:
        fraction_train: Forwarded to the training iterator as
            ``fraction_train``; also part of the run name
            ``splitted_data_<fraction_train>``.
        load_model: When true, parameters are loaded from
            ``saved_models/<old_runname>/net{G,D}_<start_epoch>.model`` and
            logs/checkpoints continue under ``old_runname``.
        old_runname: Name of the run to resume; required when ``load_model``
            is true.
        start_epoch: Epoch index of the checkpoint to resume from; required
            when ``load_model`` is true. New checkpoints are numbered
            ``epoch + start_epoch + 1``.
    """
    runname = f'splitted_data_{str(fraction_train)}'
    device = Device.GPU1
    epochs = 50
    batch_size = 4
    in_chan = 15
    context = cpu() if device.value == -1 else gpu(device.value)

    # ----------------------------------------------------
    # A resumed run keeps logging and checkpointing under the old run's name.
    if load_model:
        summaryWriter = SummaryWriter('logs/' + old_runname, flush_secs=5)
        checkpoint_dir = 'saved_models/' + old_runname
    else:
        summaryWriter = SummaryWriter('logs/' + runname, flush_secs=5)
        checkpoint_dir = 'saved_models/' + runname
    # The loss arrays are always written under the current run name, which on
    # a resumed run is a different directory than the checkpoints.
    loss_dir = f'saved_models/{runname}'

    try:
        train_iter = modules.make_iterator_preprocessed(
            'training',
            'V1',
            'V2',
            'V3',
            batch_size=batch_size,
            shuffle=True,
            fraction_train=fraction_train)
        test_iter = modules.make_iterator_preprocessed('testing',
                                                       'V1',
                                                       'V2',
                                                       'V3',
                                                       batch_size=batch_size,
                                                       shuffle=True)
        RFlocs_V1_overlapped_avg = modules.get_RFs('V1', context)
        RFlocs_V2_overlapped_avg = modules.get_RFs('V2', context)
        RFlocs_V3_overlapped_avg = modules.get_RFs('V3', context)

        # Create both output directories up front so neither the checkpoint
        # writes nor the np.save calls can fail on a missing directory.
        os.makedirs(checkpoint_dir, exist_ok=True)
        os.makedirs(loss_dir, exist_ok=True)

        # NOTE(review): the extent of this `with` block was reconstructed from
        # a source without line breaks; the whole training is assumed to run
        # under the default context — confirm.
        with Context(context):
            discriminator = Discriminator(in_chan)
            generator = Generator(in_chan, context)
            if load_model:
                generator.network.load_parameters(
                    f'saved_models/{old_runname}/netG_{start_epoch}.model',
                    ctx=context)
                discriminator.network.load_parameters(
                    f'saved_models/{old_runname}/netD_{start_epoch}.model')

            gen_lossfun = gen.Lossfun(1, 100, 1, context)
            d = discriminator.network
            dis_lossfun = dis.Lossfun(1)
            g = generator.network

            print('train_dataset_length:', len(train_iter._dataset))

            for epoch in range(epochs):
                loss_discriminator_train = []
                loss_generator_train = []

                # ====================
                # T R AI N I N G
                # ====================
                for RFsignalsV1, RFsignalsV2, RFsignalsV3, targets in tqdm(
                        train_iter, total=len(train_iter)):
                    # -------
                    # Inputs
                    # -------
                    inputs1 = modules.get_inputsROI(RFsignalsV1,
                                                    RFlocs_V1_overlapped_avg,
                                                    context)
                    inputs2 = modules.get_inputsROI(RFsignalsV2,
                                                    RFlocs_V2_overlapped_avg,
                                                    context)
                    inputs3 = modules.get_inputsROI(RFsignalsV3,
                                                    RFlocs_V3_overlapped_avg,
                                                    context)
                    inputs = concat(inputs1, inputs2, inputs3, dim=1)

                    # ------------------------------------
                    # T R A I N  D i s c r i m i n a t o r
                    # ------------------------------------
                    # The last two target axes are swapped before use.
                    targets = targets.as_in_context(context).transpose(
                        (0, 1, 3, 2))
                    loss_discriminator_train.append(
                        discriminator.train(g, inputs, targets))

                    # ----------------------------
                    # T R A I N  G e n e r a t o r
                    # ----------------------------
                    loss_generator_train.append(
                        generator.train(d, inputs, targets))

                # Checkpoint both networks after every training epoch.
                if load_model:
                    checkpoint_index = epoch + start_epoch + 1
                else:
                    checkpoint_index = epoch
                generator.network.save_parameters(
                    f'{checkpoint_dir}/netG_{checkpoint_index}.model')
                discriminator.network.save_parameters(
                    f'{checkpoint_dir}/netD_{checkpoint_index}.model')

                # ====================
                # T E S T I N G
                # ====================
                loss_discriminator_test = []
                loss_generator_test = []

                for RFsignalsV1, RFsignalsV2, RFsignalsV3, targets in test_iter:
                    # -------
                    # Inputs
                    # -------
                    inputs1 = modules.get_inputsROI(RFsignalsV1,
                                                    RFlocs_V1_overlapped_avg,
                                                    context)
                    inputs2 = modules.get_inputsROI(RFsignalsV2,
                                                    RFlocs_V2_overlapped_avg,
                                                    context)
                    inputs3 = modules.get_inputsROI(RFsignalsV3,
                                                    RFlocs_V3_overlapped_avg,
                                                    context)
                    inputs = concat(inputs1, inputs2, inputs3, dim=1)

                    # -----
                    # Targets
                    # -----
                    targets = targets.as_in_context(context).transpose(
                        (0, 1, 3, 2))

                    # Discriminator loss: generated pair labelled 0, real pair labelled 1.
                    z = concat(inputs, g(inputs), dim=1)
                    dis_loss_test = 0.5 * (dis_lossfun(0, d(z)) + dis_lossfun(
                        1, d(concat(inputs, targets, dim=1))))
                    loss_discriminator_test.append(
                        float(dis_loss_test.asscalar()))

                    # Generator loss on a fresh forward pass.
                    y_hat = generator.network(inputs)
                    gen_loss_test = gen_lossfun(
                        1, d(concat(inputs, y_hat, dim=1)), targets, y_hat)
                    loss_generator_test.append(float(gen_loss_test.asscalar()))

                # Images come from the last test batch of the epoch.
                summaryWriter.add_image(
                    "input", modules.leclip(inputs.expand_dims(2).sum(1)),
                    epoch)
                summaryWriter.add_image("target", modules.leclip(targets),
                                        epoch)
                summaryWriter.add_image("pred", modules.leclip(g(inputs)),
                                        epoch)
                summaryWriter.add_scalar(
                    "dis/loss_discriminator_train",
                    sum(loss_discriminator_train) /
                    len(loss_discriminator_train), epoch)
                summaryWriter.add_scalar(
                    "gen/loss_generator_train",
                    sum(loss_generator_train) / len(loss_generator_train),
                    epoch)
                summaryWriter.add_scalar(
                    "dis/loss_discriminator_test",
                    sum(loss_discriminator_test) /
                    len(loss_discriminator_test), epoch)
                summaryWriter.add_scalar(
                    "gen/loss_generator_test",
                    sum(loss_generator_test) / len(loss_generator_test),
                    epoch)

                # NOTE(review): the saves are assumed to sit inside the epoch
                # loop (files overwritten each epoch) — confirm.
                # ------------------------------------------------------------------
                # T R A I N I N G Losses
                # ------------------------------------------------------------------
                np.save(f'{loss_dir}/Gloss_train',
                        np.array(loss_generator_train))
                np.save(f'{loss_dir}/Dloss_train',
                        np.array(loss_discriminator_train))

                # ------------------------------------------------------------------
                # T E S T I N G Losses
                # ------------------------------------------------------------------
                np.save(f'{loss_dir}/Gloss_test',
                        np.array(loss_generator_test))
                np.save(f'{loss_dir}/Dloss_test',
                        np.array(loss_discriminator_test))
    finally:
        # Flush pending events and release the log file, also on failure.
        summaryWriter.close()
import cv2
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    """Train a CenterNet detector, periodically validating, logging and exporting it.

    The function builds the train/valid data loaders, creates (or reloads
    from ``weights/<model>``) the network, then runs epochs
    ``start_epoch + 1 .. epoch``.  Every ``eval_period`` epochs it computes
    validation losses and VOC-2007 AP, and (when ``tensorboard`` is true)
    writes images, scalars and parameter histograms.  Every ``save_period``
    epochs it exports the model.

    Note: AMP does not support every operator (modulated convolution is not
    supported).  AMP is forced off when ``GPU_COUNT == 0``.

    Parameters (only what this function itself does with them)
    ----------
    mean, std : passed to both data loaders; also used to de-normalise
        images before drawing them for TensorBoard.
    graphviz : if true, plot the network with ``gluoncv.utils.viz.plot_network``.
    epoch : last epoch number to run.
    input_size : ``[height, width]``; the dummy input is ``(1, 3, height, width)``.
    batch_size, batch_interval, num_workers, multiscale, factor_scale,
    data_augmentation, train_dataset_path, valid_dataset_path, valid_size :
        forwarded to ``traindataloader`` / ``validdataloader``.
    batch_log : log training progress every ``batch_log`` batches.
    subdivision : number of chunks each batch is split into along axis 0;
        gradients are accumulated over the chunks (``grad_req='add'``).
    optimizer : ``"ADAM"``, ``"RMSPROP"`` or ``"SGD"`` (case-insensitive).
    lambda_off, lambda_size : weights of the offset and size losses.
    save_period, load_period : export every ``save_period`` epochs; resume
        from the ``load_period`` checkpoint if it exists on disk.
    learning_rate, decay_lr, decay_step : base LR and ``FactorScheduler`` setup.
    GPU_COUNT : 0 -> CPU, 1 -> ``gpu(0)``, N -> ``gpu(0..N-1)``.
    base, pretrained_base, pretrained_path : forwarded to ``CenterNet``.
    valid_graph_path : folder name given to ``get_PR_curve``.
    using_mlflow : if true, log the total learning time with ``ml.log_metric``.
    topk : forwarded to ``Prediction``.
    plot_class_thresh : score threshold for drawing predicted boxes.

    The process is terminated with ``exit(0)`` on several configuration
    errors (bad batch size, data-loader failure, unknown optimizer, model
    already trained).
    """
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # Report the operating system (the message is the same for every OS).
    logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)

    scale_factor = 4  # fixed
    logging.info(f"scale factor {scale_factor}")

    # Device list handed to split_and_load, and the device used for dummy inputs.
    ctx_list = [ctx] if GPU_COUNT <= 1 else ctx
    first_ctx = ctx[0] if isinstance(ctx, (list, tuple)) else ctx

    try:
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = f"{input_size[0]}_{input_size[1]}_{optimizer}_PCENTER_RES{base}"
    else:
        model = f"{input_size[0]}_{input_size[1]}_{optimizer}_CENTER_RES{base}"

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([('heatmap', {
                            'num_output': num_classes,
                            'bias': -2.19
                        }), ('offset', {
                            'num_output': 2
                        }), ('wh', {
                            'num_output': 2
                        })]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)

        net.summary(mx.nd.ones(shape=input_shape, ctx=first_ctx))

        # hybridize() options:
        #   active       - whether to turn hybrid on or off.
        #   static_alloc - statically allocate memory to improve speed
        #                  (memory usage may increase).
        #   static_shape - optimize for invariant input shapes between
        #                  iterations; requires static_alloc=True.  Changing
        #                  input shapes is still allowed but slower.
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        # One forward pass before add_graph, as in the original code.
        net.forward(mx.nd.ones(shape=input_shape, ctx=first_ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    # Accumulate gradients across the sub-batches of one batch.
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    optimizer_specific = {
        "ADAM": {"beta1": 0.9, "beta2": 0.999},
        "RMSPROP": {"gamma1": 0.9, "gamma2": 0.999},
        "SGD": {"wd": 0.0001, "momentum": 0.9},
    }
    if optimizer not in optimizer_specific:
        logging.error("optimizer not selected")
        exit(0)

    optimizer_params = {
        "learning_rate": learning_rate,
        "lr_scheduler": lr_sch,
        'multi_precision': False,
    }
    optimizer_params.update(optimizer_specific[optimizer])

    if AMP:
        # update_on_kvstore=False is required for dynamic loss scaling.
        # (When the argument is given, the MXNET_UPDATE_ON_KVSTORE
        # environment variable is ignored.)
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params=optimizer_params,
                                update_on_kvstore=False)
        amp.init_trainer(trainer)
    else:
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params=optimizer_params)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size, topk=topk, scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()

        # Building the targets inside train_dataloader makes training much faster.
        for batch_count, (image, _, heatmap, offset_target, wh_target,
                          mask_target, _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image_split = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            heatmap_split = mx.nd.split(data=heatmap, num_outputs=subdivision, axis=0)
            offset_target_split = mx.nd.split(data=offset_target, num_outputs=subdivision, axis=0)
            wh_target_split = mx.nd.split(data=wh_target, num_outputs=subdivision, axis=0)
            mask_target_split = mx.nd.split(data=mask_target, num_outputs=subdivision, axis=0)

            # With a single output the split results are wrapped in a list so
            # the zip() below iterates once over the whole batch.
            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]

            # autograd reference:
            # https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            with autograd.record(train_mode=True):

                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []

                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):

                    image_part = gluon.utils.split_and_load(
                        image_part, ctx_list, even_split=False)
                    heatmap_part = gluon.utils.split_and_load(
                        heatmap_part, ctx_list, even_split=False)
                    offset_target_part = gluon.utils.split_and_load(
                        offset_target_part, ctx_list, even_split=False)
                    wh_target_part = gluon.utils.split_and_load(
                        wh_target_part, ctx_list, even_split=False)
                    mask_target_part = gluon.utils.split_and_load(
                        mask_target_part, ctx_list, even_split=False)

                    # prediction, target space for Data Parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # One iteration per device (data parallelism over N GPUs).
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part,
                            wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target,
                                                   mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target,
                                               mask_target) * lambda_size

                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())

                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # Clear the accumulated gradients (grad_req is 'add').
            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                    f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                    f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
            # Reset per batch: the speed above divides one batch by the elapsed time.
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum, train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean

        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # Compute the validation losses.
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]

                image = gluon.utils.split_and_load(image, ctx_list, even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list, even_split=False)
                heatmap_split = gluon.utils.split_and_load(
                    heatmap_all, ctx_list, even_split=False)
                offset_target_split = gluon.utils.split_and_load(
                    offset_target_all, ctx_list, even_split=False)
                wh_target_split = gluon.utils.split_and_load(
                    wh_target_all, ctx_list, even_split=False)
                mask_target_split = gluon.utils.split_and_load(
                    mask_target_all, ctx_list, even_split=False)

                # prediction, target space for Data Parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # One iteration per device (data parallelism over N GPUs).
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    pred_id, score, bbox = prediction(heatmap_pred, offset_pred, wh_pred)
                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=pred_id,
                                            pred_scores=score,
                                            gt_boxes=gt_box * scale_factor,
                                            gt_labels=gt_id)

                    heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target,
                                               mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target,
                                           mask_target) * lambda_size

                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum, valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # Draw the first validation batch (data parallelism over N GPUs).
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)
                image = gluon.utils.split_and_load(image, ctx_list, even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred)

                    for ig, gt_id, gt_box, heatmap, pred_id, score, bbox in zip(
                            img, gt_ids, gt_boxes, heatmap_pred, ids, scores, bboxes):
                        # Undo the mean/std normalisation and go back to 0..255.
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # Draw the heatmap.
                        heatmap = mx.nd.multiply(heatmap, 255.0)  # scale to the 0 ~ 255 range
                        heatmap = mx.nd.max(heatmap, axis=0, keepdims=True)  # max over the channel axis
                        heatmap = mx.nd.transpose(heatmap, axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(heatmap, repeats=3, axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy()  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(heatmap,
                                             dsize=(input_size[1], input_size[0]))  # back to the input size
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :, (0, 1, 2)] = heatmap[:, :, (2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(heatmap, axes=(2, 0, 1))  # (channel=3, height, width)

                        # Draw the ground-truth boxes.
                        ground_truth = plot_bbox(
                            ig,
                            gt_box * scale_factor,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # Draw the predicted boxes on top.
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=pred_id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # For TensorBoard: BGR -> RGB and
                        # (height, width, channel) -> (channel, height, width).
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box, axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                # Place the detection image and the heatmap side by side (last axis).
                all_image = np.concatenate(
                    [np.array(batch_image), np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result", image=all_image, global_step=i)
                summary.add_scalar(tag="heatmap_loss",
                                   value={
                                       "train_heatmap_loss_mean": train_heatmap_loss_mean,
                                       "valid_heatmap_loss_mean": valid_heatmap_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={
                                       "train_offset_loss_mean": train_offset_loss_mean,
                                       "valid_offset_loss_mean": valid_offset_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={
                                       "train_wh_loss_mean": train_wh_loss_mean,
                                       "valid_wh_loss_mean": valid_wh_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss": train_total_loss_mean,
                                       "valid_total_loss": valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            os.makedirs(weight_path, exist_ok=True)

            # Hybrid models can be serialized as JSON files with export().
            # The exported HybridBlock can be loaded by SymbolBlock.imports,
            # mxnet.mod.Module or the C++ interface.  With a single input it
            # is named "data"; with several inputs "data0", "data1", ...
            context = mx.gpu(0) if GPU_COUNT >= 1 else mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # a new object is created

            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for ONNX export

                # Covers network inference, decoding and NMS - convenient for MXNet C++.
                export_block_for_cplusplus(
                    path=os.path.join(weight_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=True,  # in C++ the image read by OpenCV can be fed as-is
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
            # Put the parameters back on the training context(s) whether or
            # not the export succeeded, so the next epoch trains on `ctx`.
            net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if tensorboard:
        # Flush queued events and release the event file.
        summary.close()

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def rescale_per_image(x):
    """Return a copy of ``x`` with each leading-axis slice rescaled on its own.

    ``x`` must be 4-dimensional.  For every ``i`` the slice ``x[i]`` is passed
    to ``rescale`` together with that slice's own minimum and maximum, and the
    result is written into the copy.  The input array is not modified.
    """
    assert x.ndim == 4
    x = x.copy()
    for i in range(x.shape[0]):
        min_val = x[i].min().asscalar()
        max_val = x[i].max().asscalar()
        x[i] = rescale(x[i], min_val, max_val)
    return x


sw = SummaryWriter(logdir='./logs')

# Log the input image; a leading batch axis is added first.
swan = mx.nd.load('./data/imagenet_swan.ndarray')[0]
swan = swan.reshape((1, ) + swan.shape).astype('float32')
sw.add_image(tag='swan', image=swan.astype('uint8'))

# plot conv filter and output of inception-bn
weight = mx.nd.load('./data/inception_bn_conv_1_weight.param')[0]
bias = mx.nd.load('./data/inception_bn_conv_1_bias.param')[0]
mean_rgb = mx.nd.array([123.68, 116.779, 103.939])
mean_rgb = mean_rgb.reshape((1, 3, 1, 1))  # broadcastable against swan's 4 axes
out = mx.nd.Convolution(swan - mean_rgb,
                        weight=weight,
                        bias=bias,
                        kernel=weight.shape[2:],
                        num_filter=weight.shape[0])
# Swap the first two axes so that each output channel becomes its own image.
out = out.transpose((1, 0, 2, 3))
tag = 'test_weight'  # not used by the statements below
sw.add_image(tag='inception_bn_conv_1_weight', image=rescale_per_image(weight))
sw.add_image(tag='inception_bn_conv_1_output', image=rescale_per_image(out))
# Flush pending events and release the event file.
sw.close()
def train(cfg):
    """Train a Glow model on MNIST and log progress with a ``SummaryWriter``.

    Attributes read from ``cfg``: ``log_dir``, ``batch_size``, ``K``, ``L``,
    ``affine``, ``filter_size``, ``temp``, ``n_bits``, ``use_gpu``, ``lr``,
    ``n_epochs`` and ``plot_interval``.

    The ``bpd`` scalar is written roughly twice per epoch, and every
    ``cfg.plot_interval`` epochs an image made of a real sample, a generated
    sample and a reconstruction is written.  The writer is closed when the
    function exits, including when training raises.
    """
    date_today = date.today().strftime("%b-%d-%Y")
    summary_writer = SummaryWriter(cfg.log_dir,
                                   flush_secs=5,
                                   filename_suffix=date_today)
    try:
        train_data = mx.gluon.data.vision.MNIST(
            train=True).transform_first(data_xform)
        train_loader = mx.gluon.data.DataLoader(train_data,
                                                shuffle=True,
                                                batch_size=cfg.batch_size)
        image_shape = train_data[0][0].shape

        # No initialization. Custom blocks encapsulate initialization and setting of data.
        net = Glow(image_shape, cfg.K, cfg.L, cfg.affine, cfg.filter_size,
                   cfg.temp, cfg.n_bits)
        ctx = get_context(cfg.use_gpu)
        net = set_context(net, ctx)

        trainer = mx.gluon.Trainer(net.collect_params(), 'adam',
                                   {'learning_rate': cfg.lr})

        n_samples = len(train_loader)
        # Store the loss with the summary writer twice per epoch.  Clamp to 1
        # so a loader with a single batch does not cause a modulo-by-zero.
        update_interval = max(1, n_samples // 2)
        loss_buffer = LossBuffer()
        global_step = 1

        for epoch in range(1, cfg.n_epochs + 1):
            for idx, (batch, label) in enumerate(train_loader):
                print(f'Epoch {epoch} - Batch {idx}/{n_samples}', end='\r')

                data = mx.gluon.utils.split_and_load(batch, ctx)
                with mx.autograd.record():
                    for X in data:
                        z_list, nll, bpd = net(X)
                        prev_loss = loss_buffer.new_loss(bpd.mean())

                loss_buffer.loss.backward()
                trainer.step(1)

                if prev_loss is not None and global_step % update_interval == 0:
                    loss = prev_loss.asscalar()
                    summary_writer.add_scalar(tag='bpd',
                                              value=loss,
                                              global_step=global_step)

                global_step += 1

            # Sample from latent space to generate random digit and reverse from latent
            if (epoch % cfg.plot_interval) == 0:
                x_generate = net.reverse()[0]
                x_generate = x_generate.reshape(1, *x_generate.shape)

                x_recon = net.reverse(z_list[-1])[0]
                x_recon = x_recon.reshape(1, *x_recon.shape)

                x_real = data[0][0].reshape(1, *data[0][0].shape)

                # Clip generated/reconstructed images to [-0.5, 0.5], then
                # shift all three images up by 0.5.
                minim = -0.5
                maxim = 0.5
                x_generate = x_generate.clip(minim, maxim)
                x_generate += -minim
                x_recon = x_recon.clip(minim, maxim)
                x_recon += -minim
                x_real += -minim

                img = mx.nd.concatenate([x_real, x_generate, x_recon],
                                        axis=0).asnumpy()
                summary_writer.add_image(tag='generations',
                                         image=img,
                                         global_step=global_step)
    finally:
        summary_writer.close()
class GluonLearner():
    """Train and run inference on a Gluon model, logging to file and optionally TensorBoard."""

    def __init__(self, model, run_id, gpu_idxs=None, hybridize=False, tensorboard_logging=False):
        """
        Parameters
        ----------
        model: HybridBlock
        run_id: str
            Used as the TensorBoard sub-folder name and the checkpoint file name.
        gpu_idxs: None or list of ints
            If None will set context to CPU.
            If list of ints, will set context to given GPUs.
        hybridize: bool
            If True, ``model.hybridize()`` is called.
        tensorboard_logging: bool
            If True, an mxboard ``SummaryWriter`` is created under
            ``../logs/tensorboard/<run_id>`` relative to this file.
        """
        logging.info("Using Gluon Learner.")
        self.model = model
        self.run_id = run_id
        if hybridize:
            self.model.hybridize()
            logging.info("Hybridized model.")
        self.context = get_context(gpu_idxs)
        self.tensorboard_logging = tensorboard_logging
        if self.tensorboard_logging:
            # Imported lazily so mxboard is only needed when logging is on.
            from mxboard import SummaryWriter
            current_folder = os.path.dirname(os.path.realpath(__file__))
            tensorboard_folder = os.path.join(current_folder, "..", "logs", "tensorboard")
            summary_filepath = os.path.join(tensorboard_folder, self.run_id)
            self.writer = SummaryWriter(logdir=summary_filepath)

    def _checkpoint_filepath(self):
        """Return the path of this run's checkpoint file (``../logs/checkpoints/<run_id>.params``)."""
        current_folder = os.path.dirname(os.path.realpath(__file__))
        checkpoint_folder = os.path.join(current_folder, "..", "logs", "checkpoints")
        return os.path.join(checkpoint_folder, self.run_id + '.params')

    def fit(self, train_data, valid_data,
            epochs=300,
            lr=None, lr_schedule=None,
            initializer=mx.init.Xavier(),
            optimizer=None,
            kvstore='device',
            log_frequency=10000,
            early_stopping_criteria=None
        ):
        """
        Uses accuracy as training and validation metric.

        Parameters
        ----------
        train_data : iterable of (data, label) batches
            Contains training data
        valid_data : passed to ``evaluate_accuracy``
            Contains validation data
        epochs: int
            Number of epochs to run, unless stopped early by early_stopping_criteria.
        lr: float or int
            Learning rate. Exactly one of ``lr`` and ``lr_schedule`` must be given.
        lr_schedule : dict
            Contains change points of learning rate.
            Key is the epoch and value is the learning rate.
            Must contain epoch 0.
        initializer : mxnet.initializer.Initializer
        optimizer: mxnet.optimizer.Optimizer
            Defaults to `mx.optimizer.SGD(learning_rate=lr_schedule[0], momentum=0.9)`
        kvstore : str, optional
        log_frequency : int, optional
            Number of samples between logs
        early_stopping_criteria: function (float -> boolean)
            Given validation accuracy, should return True if training should be stopped early.

        Returns
        -------
        None
            Output is logged to file. The final model is saved to the
            checkpoint file of this run.
        """

        if lr_schedule is None:
            assert lr is not None, "lr must be defined if not using lr_schedule"
            lr_schedule = {0: lr}
        else:
            assert lr is None, "lr should not be defined if using lr_schedule"
            assert 0 in lr_schedule.keys(), "lr for epoch 0 must be defined in lr_schedule"

        self.model.initialize(initializer, ctx=self.context)
        if optimizer is None:
            optimizer = mx.optimizer.SGD(learning_rate=lr_schedule[0], momentum=0.9)
        trainer = mx.gluon.Trainer(params=self.model.collect_params(), optimizer=optimizer, kvstore=kvstore)
        train_metric = mx.metric.Accuracy()
        criterion = mx.gluon.loss.SoftmaxCrossEntropyLoss()
        max_val_acc = {'val_acc': 0, 'trn_acc': 0, 'epoch': 0}

        for epoch in range(epochs):
            epoch_tick = time.time()

            # update learning rate
            if epoch in lr_schedule:
                trainer.set_learning_rate(lr_schedule[epoch])
                logging.info("Epoch {}, Changed learning rate.".format(epoch))
            logging.info('Epoch {}, Learning rate={}'.format(epoch, trainer.learning_rate))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='learning_rate', value=trainer.learning_rate, global_step=epoch+1)

            train_metric.reset()
            samples_processed = 0
            for batch_idx, (data, label) in enumerate(train_data):
                batch_tick = time.time()
                batch_size = data.shape[0]

                # partition data across all devices in context
                data = mx.gluon.utils.split_and_load(data, ctx_list=self.context, batch_axis=0)
                label = mx.gluon.utils.split_and_load(label, ctx_list=self.context, batch_axis=0)

                y_pred = []
                losses = []
                with mx.autograd.record():
                    # calculate loss on each partition of data
                    for x_part, y_true_part in zip(data, label):
                        y_pred_part = self.model(x_part)
                        loss = criterion(y_pred_part, y_true_part)
                        # store the losses and do backward after we have done forward on all GPUs.
                        # for better performance on multiple GPUs.
                        losses.append(loss)
                        y_pred.append(y_pred_part)
                for loss in losses:
                    loss.backward()
                trainer.step(batch_size)
                train_metric.update(label, y_pred)

                if self.tensorboard_logging:
                    # log to tensorboard (on first batch); uses the last
                    # partition processed in the loop above
                    if batch_idx == 0:
                        self.writer.add_histogram(tag='input', values=x_part, global_step=epoch + 1, bins=100)
                        self.writer.add_histogram(tag='output', values=y_pred_part, global_step=epoch + 1, bins=100)
                        self.writer.add_histogram(tag='loss', values=loss, global_step=epoch + 1, bins=100)
                        self.writer.add_image(tag="batch", image=x_part, global_step=epoch + 1)

                # log batch speed (if a multiple of log_frequency is contained in the last batch)
                log_batch = (samples_processed // log_frequency) != ((samples_processed + batch_size) // log_frequency)
                if batch_idx >= 1 and log_batch:
                    # batch estimate, not averaged over multiple batches
                    speed = batch_size / (time.time() - batch_tick)
                    logging.info('Epoch {}, Batch {}, Speed={:.2f} images/second'.format(epoch, batch_idx, speed))
                samples_processed += batch_size

            # log training accuracy
            _, trn_acc = train_metric.get()
            logging.info('Epoch {}, Training accuracy={}'.format(epoch, trn_acc))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='accuracy/training', value=trn_acc*100, global_step=epoch+1)

            # log validation accuracy
            val_acc = evaluate_accuracy(valid_data, self.model, ctx=self.context)
            logging.info('Epoch {}, Validation accuracy={}'.format(epoch, val_acc))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='accuracy/validation', value=val_acc * 100, global_step=epoch + 1)
            # log maximum validation accuracy
            if val_acc > max_val_acc['val_acc']:
                max_val_acc = {'val_acc': val_acc, 'trn_acc': trn_acc, 'epoch': epoch}
            logging.info(("Epoch {}, Max validation accuracy={} @ "
                          "Epoch {} (with training accuracy={})").format(epoch, max_val_acc['val_acc'],
                                                                         max_val_acc['epoch'],
                                                                         max_val_acc['trn_acc']))

            # log duration of epoch
            logging.info('Epoch {}, Duration={}'.format(epoch, time.time() - epoch_tick))

            if early_stopping_criteria and early_stopping_criteria(val_acc):
                logging.info("Epoch {}, Reached early stopping target, stopping training.".format(epoch))
                break

        if self.tensorboard_logging:
            # Make sure queued events reach disk; the writer stays usable.
            self.writer.flush()

        # checkpoint final model
        checkpoint_filepath = self._checkpoint_filepath()
        # The checkpoint folder may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)
        self.model.save_params(checkpoint_filepath)

    def predict(self, test_data, log_frequency=10000):
        """Load this run's checkpoint and run the model over ``test_data``, logging latency and speed.

        Parameters
        ----------
        test_data : iterable of (data, label) batches
        log_frequency : int, optional
            Number of samples between logs. The first 5 batches are never
            logged (warm-up).

        Returns
        -------
        None
            Predictions are computed but not returned; only timing is logged.
        """
        logging.info('Starting inference.')

        self.model.load_params(self._checkpoint_filepath(), ctx=self.context)

        samples_processed = 0
        for batch_idx, (data, label) in enumerate(test_data):
            batch_tick = time.time()
            batch_size = data.shape[0]

            # partition data across all devices in context
            data = mx.gluon.utils.split_and_load(data, ctx_list=self.context, batch_axis=0)
            label = mx.gluon.utils.split_and_load(label, ctx_list=self.context, batch_axis=0)

            # run the forward pass on each partition of data
            y_pred = []
            for x_part, y_true_part in zip(data, label):
                y_pred_part = self.model(x_part)
                y_pred.append(y_pred_part)

            # wait for all pending operations so the timing below is meaningful
            mx.nd.waitall()
            batch_tock = time.time()

            # log batch speed (if a multiple of log_frequency is contained in the last batch)
            log_batch = (samples_processed // log_frequency) != ((samples_processed + batch_size) // log_frequency)
            warm_up_period = 5
            if batch_idx >= warm_up_period and log_batch:
                # batch estimate, not averaged over multiple batches
                latency = (batch_tock - batch_tick)  # seconds
                speed = batch_size / latency
                logging.info('Inference. Batch {}, Latency={:.5f} ms, Speed={:.2f} images/second'.format(batch_idx, latency * 1000, speed))
            samples_processed += batch_size

        logging.info('Completed inference.')
def train(epochs):
    """Adversarially train the generator ``netG`` against the discriminator ``netD``.

    For every batch the discriminator is updated first (generated pairs drawn
    through an image pool are labelled 0, real pairs 1), then the generator is
    updated with the GAN loss plus ``l1_lambda`` times the L1 loss against the
    label image. Scalars and one generated image per epoch are written to an
    mxboard ``SummaryWriter``; both networks are checkpointed after each epoch.

    Relies on module-level state: ``netG``, ``netD``, ``train_iter``, ``ctx``,
    ``log_dir``, ``l1_lambda``, ``log_iter_intervals``, ``ckpt_fmt``, ``utils``
    and the module's own ``eval`` function.

    Args:
        epochs: number of passes over ``train_iter``.
    """
    gan_loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
    l1_loss = gluon.loss.L1Loss()

    adam_params = {'learning_rate': 0.0002, 'beta1': 0.5, 'beta2': 0.999}
    trainer_G = gluon.Trainer(netG.collect_params(), 'adam',
                              optimizer_params=dict(adam_params))
    trainer_D = gluon.Trainer(netD.collect_params(), 'adam',
                              optimizer_params=dict(adam_params))

    # Configure the log file.
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(os.path.join(log_dir, 'train.log'))
    logger.addHandler(fh)

    sw = SummaryWriter(logdir=os.path.join(log_dir, 'train_sw'))
    image_pool = utils.ImagePool(50)
    global_step = 0

    # The writer buffers events; close it even if training aborts so that
    # everything recorded so far reaches disk.
    try:
        for epch in range(epochs):
            train_iter.reset()
            epch_time = time.time()
            batch_time = time.time()
            samples_since_log = 0
            pred = None  # stays None when the iterator yields no batch

            for iter_step, databatch in enumerate(train_iter):
                data = databatch.data[0].as_in_context(ctx)
                label = databatch.label[0].as_in_context(ctx)

                # ---- train netD ----
                # The fake pair is built outside autograd.record(), so no
                # gradient flows back into netG during the discriminator step.
                pred = netG(data)
                fake_data = image_pool.fetch_img(nd.concat(data, pred, dim=1))
                with autograd.record():
                    # fake
                    pred_fake = netD(fake_data)
                    fake_label = nd.zeros_like(pred_fake)
                    loss_fake = gan_loss(pred_fake, fake_label).sum()
                    # real
                    real_data = nd.concat(data, label, dim=1)
                    pred_real = netD(real_data)
                    real_label = nd.ones_like(pred_real)
                    loss_real = gan_loss(pred_real, real_label).sum()
                    loss_D = (loss_real + loss_fake) * 0.5
                loss_D.backward()
                trainer_D.step(data.shape[0])
                sw.add_scalar('lossD', loss_D.asscalar(), global_step)

                # ---- train netG ----
                with autograd.record():
                    pred = netG(data)
                    in_data = nd.concat(data, pred, dim=1)
                    pred_real = netD(in_data)
                    pred_label = nd.ones_like(pred_real)
                    ganloss_g = gan_loss(pred_real, pred_label)
                    l1loss_g = l1_loss(pred, label)
                    loss_G = ganloss_g + l1loss_g * l1_lambda
                    loss_G = loss_G.sum()
                loss_G.backward()
                trainer_G.step(data.shape[0])
                sw.add_scalar('lossG', loss_G.asscalar(), global_step)

                # Periodic progress log. Throughput is computed over every
                # sample processed since the previous log line, so it matches
                # the span the timer measures.
                samples_since_log += data.shape[0]
                if (iter_step + 1) % log_iter_intervals == 0:
                    logger.info(
                        '[Epoch {}][Iter {}] Done., Speed: {:.4f} sample / s'.
                        format(str(epch), str(iter_step),
                               samples_since_log / (time.time() - batch_time)))
                    batch_time = time.time()
                    samples_since_log = 0
                global_step += 1

            # ---- evaluation after every epoch ----
            if pred is not None:
                fake_img = pred[0]
                # Min-max rescale the first generated image before logging it.
                img_arr = (fake_img - mx.nd.min(fake_img)) / (
                    mx.nd.max(fake_img) - mx.nd.min(fake_img))
                # Tag each image with its epoch so successive epochs show up
                # as distinct steps instead of all sharing the default step.
                sw.add_image('generated image', img_arr, global_step=epch)
            eval(epch)  # module-level evaluation routine, not the builtin

            # ---- checkpoints between epochs ----
            netG.save_parameters(ckpt_fmt.format('netG', str(epch)))
            netD.save_parameters(ckpt_fmt.format('netD', str(epch)))
            logger.info('[Epoch {}] Done. Cost: {:.4f} s'.format(
                str(epch), time.time() - epch_time))
    finally:
        sw.close()
def _fit_select_initializer(args):
    """Return the MXNet initializer selected by ``args.initializer``.

    Raises:
        ValueError: if ``args.initializer`` is not a recognised name.
    """
    if args.initializer == 'default':
        if args.network == 'alexnet':
            # AlexNet will not converge using Xavier
            return mx.init.Normal()
        if args.network and 'vgg' in args.network:
            # VGG will not trend to converge using Xavier-Gaussian
            return mx.init.Xavier()
        return mx.init.Xavier(rnd_type='gaussian', factor_type="in",
                              magnitude=2)

    named_initializers = {
        'xavier': mx.init.Xavier,
        'msra': mx.init.MSRAPrelu,
        'orthogonal': mx.init.Orthogonal,
        'normal': mx.init.Normal,
        'uniform': mx.init.Uniform,
        'one': mx.init.One,
        'zero': mx.init.Zero,
    }
    try:
        return named_initializers[args.initializer]()
    except KeyError:
        raise ValueError(
            'unknown initializer: %r' % (args.initializer,)) from None


def _fit_build_eval_metrics(args, network):
    """Build the evaluation-metric list: accuracy, optional top-k, optional losses."""
    eval_metrics = ['accuracy']
    if args.top_k > 0:
        eval_metrics.append(
            mx.metric.create('top_k_accuracy', top_k=args.top_k))

    if not args.loss:
        return eval_metrics

    # ce or nll loss is only applicable to softmax output
    if 'softmax_output' not in network.list_outputs():
        logging.warning(
            "The output is not softmax_output, loss argument will be skipped!"
        )
        return eval_metrics

    supported_loss = ['ce', 'nll_loss']
    for loss_type in args.loss.split(','):
        loss_type = loss_type.strip()
        if loss_type == 'nll':
            loss_type = 'nll_loss'
        if loss_type not in supported_loss:
            logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \
                            'negative likelihood loss is supported!')
        else:
            eval_metrics.append(mx.metric.create(loss_type))
    return eval_metrics


def fit(args, network, data_loader, **kwargs):
    """Train a symbolic model with ``mx.mod.Module``.

    Args:
        args: argparse namespace holding the training options read below
            (``kv_store``, ``gc_type``, ``optimizer``, ``initializer``,
            ``summarywriter``, ...).
        network: the symbol definition of the neural network.
        data_loader: callable ``(args, kv) -> (train_iter, val_iter)``.
        **kwargs: optional ``arg_params`` / ``aux_params`` (both must be given
            to bypass checkpoint loading) and ``batch_end_callback`` (a
            callback or list of callbacks appended to the built-in ones).

    Returns:
        None. When ``args.test_io`` is set, only the data pipeline is timed and
        the function returns before any model is built.

    Raises:
        ValueError: if ``args.initializer`` names an unknown initializer.
    """
    # kvstore
    kv = mx.kvstore.create(args.kv_store)
    if args.gc_type != 'none':
        kv.set_gradient_compression({
            'type': args.gc_type,
            'threshold': args.gc_threshold
        })

    # logging
    head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    logging.info('start with arguments %s', args)

    epoch_size = get_epoch_size(args, kv)

    # data iterators
    (train, val) = data_loader(args, kv)
    if 'dist' in args.kv_store and 'async' not in args.kv_store:
        logging.info('Resizing training data to %d batches per machine',
                     epoch_size)
        # resize train iter to ensure each machine has same number of batches per epoch
        # if not, dist_sync can hang at the end with one machine waiting for other machines
        train = mx.io.ResizeIter(train, epoch_size)

    if args.test_io:
        # I/O benchmark only: force every batch to be read and report speed.
        tic = time.time()
        for i, batch in enumerate(train):
            if isinstance(batch, list):
                for b in batch:
                    for j in b.data:
                        j.wait_to_read()
            else:
                for j in batch.data:
                    j.wait_to_read()
            if (i + 1) % args.disp_batches == 0:
                logging.info(
                    'Batch [%d]\tSpeed: %.2f samples/sec', i,
                    args.disp_batches * args.batch_size / (time.time() - tic))
                tic = time.time()
        return

    # define a summary writer that logs data and flushes to the file every
    # args.flush_secs seconds
    sw = None
    if args.summarywriter:
        log_dir = '/opt/incubator-mxnet/logs'
        # Clear the previous logs. The directory may not exist on a first run,
        # so only remove it when present and create it (with parents) again.
        if os.path.isdir(log_dir):
            shutil.rmtree(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        sw = SummaryWriter(logdir=log_dir, flush_secs=args.flush_secs)

    # From here on the writer (if any) must be closed on every exit path so
    # buffered events are flushed.
    try:
        # load model
        if 'arg_params' in kwargs and 'aux_params' in kwargs:
            arg_params = kwargs['arg_params']
            aux_params = kwargs['aux_params']
        else:
            sym, arg_params, aux_params = _load_model(args, kv.rank)
            if sym is not None:
                assert sym.tojson() == network.tojson()
                network = sym

        # log the network
        if args.summarywriter:
            sw.add_graph(network)

        # save model
        checkpoint = _save_model(args, kv.rank)

        # convert mean.bin to mean.npy
        _convert_mean_numpy(args, kv.rank)

        # devices for training
        devs = mx.cpu() if args.gpus is None or args.gpus == "" else [
            mx.gpu(int(i)) for i in args.gpus.split(',')
        ]

        # learning rate
        lr, lr_scheduler = _get_lr_scheduler(args, kv)

        # create model
        model = mx.mod.Module(context=devs, symbol=network)

        optimizer_params = {
            'learning_rate': lr,
            'wd': args.wd,
            'lr_scheduler': lr_scheduler,
            'multi_precision': True
        }

        # Only a limited number of optimizers have 'momentum' property
        has_momentum = {'sgd', 'dcasgd', 'nag'}
        if args.optimizer in has_momentum:
            optimizer_params['momentum'] = args.mom

        monitor = mx.mon.Monitor(
            args.monitor, pattern=".*") if args.monitor > 0 else None

        # A limited number of optimizers have a warmup period
        has_warmup = {'lbsgd', 'lbnag'}
        if args.optimizer in has_warmup:
            nworkers = kv.num_workers
            if epoch_size < 1:
                epoch_size = 1
            macrobatch_size = args.macrobatch_size
            if macrobatch_size < args.batch_size * nworkers:
                macrobatch_size = args.batch_size * nworkers
            batch_scale = math.ceil(
                float(macrobatch_size) / args.batch_size / nworkers)
            optimizer_params['updates_per_epoch'] = epoch_size
            optimizer_params[
                'begin_epoch'] = args.load_epoch if args.load_epoch else 0
            optimizer_params['batch_scale'] = batch_scale
            optimizer_params['warmup_strategy'] = args.warmup_strategy
            optimizer_params['warmup_epochs'] = args.warmup_epochs
            optimizer_params['num_epochs'] = args.num_epochs

        initializer = _fit_select_initializer(args)

        # evaluation metrics
        eval_metrics = _fit_build_eval_metrics(args, network)

        # callbacks that run after each batch
        if args.summarywriter:
            # Add the visualisation callback. When several callbacks are
            # chained, only the last one may reset the metric, so the
            # Speedometer's auto_reset must be False here.
            batch_end_callbacks = [
                mx.callback.Speedometer(args.batch_size, args.disp_batches,
                                        False),
                summary_writter_callback.summary_writter_eval_metric(sw)
            ]
        else:
            batch_end_callbacks = [
                mx.callback.Speedometer(args.batch_size, args.disp_batches,
                                        True)
            ]
        if 'batch_end_callback' in kwargs:
            cbs = kwargs['batch_end_callback']
            batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]

        # run
        model.fit(train,
                  begin_epoch=args.load_epoch if args.load_epoch else 0,
                  num_epoch=args.num_epochs,
                  eval_data=val,
                  eval_metric=eval_metrics,
                  kvstore=kv,
                  optimizer=args.optimizer,
                  optimizer_params=optimizer_params,
                  initializer=initializer,
                  arg_params=arg_params,
                  aux_params=aux_params,
                  batch_end_callback=batch_end_callbacks,
                  epoch_end_callback=checkpoint,
                  allow_missing=True,
                  monitor=monitor)

        # log the weights after training
        if args.summarywriter:
            arg_params, aux_params = model.get_params()
            for k, v in arg_params.items():
                if v.ndim == 4:  # only 4-D weight tensors are logged as images
                    weight = rescale_per_image(v)
                    sw.add_image(tag=k, image=weight)
    finally:
        if sw is not None:
            sw.close()
def _ssd_normalize_by_positives(loss, positive_numbers):
    """Divide ``loss`` by the positive-anchor count, or zero it when there are none."""
    if positive_numbers:
        return mx.nd.divide(loss, positive_numbers)
    return mx.nd.multiply(loss, 0)


def _ssd_hard_negative_mask(cls_pred, cls_target, positive_samples,
                            positive_numbers, negative_mining_ratio):
    """Return ``(per-anchor confidence loss, positive + hard-negative mask)``.

    Negatives are ranked by confidence loss (highest first) and kept while
    their rank is at most ``positive_numbers * negative_mining_ratio``.
    The ranking idiom (argsort of argsort) follows gluoncv's SSDMultiBoxLoss.
    """
    log_prob = mx.nd.log_softmax(cls_pred, axis=-1)
    negative_samples = 1 - positive_samples
    anchor_conf_loss = -mx.nd.pick(log_prob, cls_target, axis=-1)
    negative_conf_loss = anchor_conf_loss * negative_samples
    order = mx.nd.argsort(negative_conf_loss, axis=-1, is_ascend=False)
    rank = mx.nd.argsort(order, axis=-1, is_ascend=True)
    hard_negative_samples = rank <= mx.nd.multiply(
        positive_numbers, negative_mining_ratio).expand_dims(-1)
    return anchor_conf_loss, positive_samples + hard_negative_samples


def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72],
        box_ratios=[[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0 / 3]] * 4 +
        [[1, 2, 0.5]] * 2,
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[400, 600],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=10,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base="VGG16_512",
        pretrained_base=True,
        pretrained_path="modelparam",
        classHardNegativeMining=True,
        boxHardNegativeMining=True,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        decode_number=-1,
        multiperclass=True,
        nms_thresh=0.45,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.01,
        plot_class_thresh=0.5):
    """Train an SSD (VGG16 backbone) detector, with periodic validation and export.

    Outline: pick the device context from ``GPU_COUNT``; build the data loaders;
    resume from ``weights/<model>/<model>-<load_period>.params`` if it exists,
    otherwise create a fresh ``SSD_VGG16``; then for each epoch accumulate
    gradients over ``subdivision`` sub-batches, step the trainer, and every
    ``eval_period`` epochs compute validation losses / AP (and mxboard summaries
    when ``tensorboard``), and every ``save_period`` epochs export the model.

    Selected arguments (the rest are forwarded to the project's data-loader,
    network, prediction and metric constructors as named):
        subdivision: number of chunks each batch is split into along axis 0;
            gradients are accumulated (``grad_req='add'``) across chunks.
        GPU_COUNT: 0 -> CPU (AMP is forced off), 1 -> ``gpu(0)``,
            >1 -> data-parallel over ``gpu(0..GPU_COUNT-1)``.
        base: ``"VGG16_300"`` or ``"VGG16_512"`` (case-insensitive).
        optimizer: ``"ADAM"``, ``"RMSPROP"`` or ``"SGD"`` (case-insensitive).
        classHardNegativeMining / boxHardNegativeMining: apply 3:1 hard
            negative mining to the confidence / localization loss.

    The default list arguments are only read, never mutated.
    The function calls ``exit(0)`` on configuration errors, as the script does
    throughout; it returns ``None`` otherwise.
    """
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # Uniform list view of the context(s) for split_and_load.
    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    first_ctx = ctx[0] if isinstance(ctx, (list, tuple)) else ctx

    # Report the operating system.
    logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training SSD Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        # A CPU-side network is built first and handed to the data loaders
        # (they receive it as ``net`` together with make_target=True).
        if base.upper() == "VGG16_300":  # 300 x 300 input recommended
            net = SSD_VGG16(version=300,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            anchor_box_clip=anchor_box_clip,
                            alloc_size=anchor_alloc_size,
                            ctx=mx.cpu())
        elif base.upper() == "VGG16_512":  # 512 x 512 input recommended
            net = SSD_VGG16(version=512,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            anchor_box_clip=anchor_box_clip,
                            ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            make_target=True)
    except Exception:
        # NOTE(review): an unsupported ``base`` also lands here (``net`` is
        # unbound), so this message is not always the real cause.
        logging.info("dataset 없음")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    base = base.upper()

    # Model name encodes input size, optimizer and backbone ("P" = pretrained).
    model = (f"{input_size[0]}_{input_size[1]}_{optimizer}_" +
             ("P" if pretrained_base else "") + base)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        if base == "VGG16_300":  # 300 x 300 input recommended
            net = SSD_VGG16(version=300,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            num_classes=num_classes,
                            pretrained=pretrained_base,
                            pretrained_path=pretrained_path,
                            anchor_box_clip=anchor_box_clip,
                            alloc_size=anchor_alloc_size,
                            ctx=ctx)
        elif base == "VGG16_512":  # 512 x 512 input recommended
            net = SSD_VGG16(version=512,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            num_classes=num_classes,
                            pretrained=pretrained_base,
                            pretrained_path=pretrained_path,
                            anchor_box_clip=anchor_box_clip,
                            ctx=ctx)
        else:
            logging.warning("backbone 없음")
            exit(0)

        net.summary(mx.nd.ones(shape=input_shape, ctx=first_ctx))

        # hybridize():
        #   static_alloc - statically allocate memory to improve speed
        #                  (memory usage may increase).
        #   static_shape - optimize for invariant input shapes; requires
        #                  static_alloc. Multiscale training changes the input
        #                  shape between iterations, so it is disabled then.
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        # One forward pass before add_graph.
        net.forward(mx.nd.ones(shape=input_shape, ctx=first_ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = max(1, len(train_dataset) // batch_size)
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    # Gradients are accumulated across the ``subdivision`` chunks of a batch
    # and zeroed manually after every trainer step.
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    optimizer_specific_params = {
        "ADAM": {"beta1": 0.9, "beta2": 0.999},
        "RMSPROP": {"gamma1": 0.9, "gamma2": 0.999},
        "SGD": {"wd": 0.0005, "momentum": 0.9},
    }
    if optimizer not in optimizer_specific_params:
        logging.error("optimizer not selected")
        exit(0)
    optimizer_params = {
        "learning_rate": learning_rate,
        "lr_scheduler": lr_sch,
        **optimizer_specific_params[optimizer],
        'multi_precision': False,
    }
    if AMP:
        # update_on_kvstore=False is needed for dynamic loss scaling.
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params=optimizer_params,
                                update_on_kvstore=False)
        amp.init_trainer(trainer)
    else:
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params=optimizer_params)

    # localization loss -> Smooth L1 (Huber), confidence loss -> softmax CE.
    # Used during training when hard negative mining is disabled, and always
    # during validation.
    confidence_loss = SoftmaxCrossEntropyLoss(axis=-1,
                                              sparse_label=True,
                                              from_log_softmax=False,
                                              batch_axis=None,
                                              reduction="sum",
                                              exclude=False)
    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(from_softmax=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    # Hard negative mining: most default boxes are negatives, so negatives are
    # sorted by confidence loss and only the top ones are kept, so that the
    # negative:positive ratio is at most 3:1.
    weight_term_alpha = 1
    negative_mining_ratio = 3

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            # With a single output the split result is wrapped so the zip
            # below iterates over chunks rather than over samples.
            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]

            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    image_split = gluon.utils.split_and_load(
                        image_split, ctx_list, even_split=False)
                    cls_split = gluon.utils.split_and_load(
                        cls_split, ctx_list, even_split=False)
                    box_split = gluon.utils.split_and_load(
                        box_split, ctx_list, even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # one iteration per device (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        # SSD network inference
                        cls_pred, box_pred, anchor = net(img)

                        positive_samples = cls_target > 0  # True or False
                        positive_numbers = positive_samples.sum()

                        # ---- confidence loss ----
                        if classHardNegativeMining:
                            conf_loss, pos_hardnega = _ssd_hard_negative_mask(
                                cls_pred, cls_target, positive_samples,
                                positive_numbers, negative_mining_ratio)
                            conf_loss = mx.nd.where(
                                pos_hardnega > 0, conf_loss,
                                mx.nd.zeros_like(conf_loss))
                            conf_loss = mx.nd.sum(conf_loss)
                        else:
                            conf_loss = confidence_loss(
                                cls_pred, cls_target,
                                positive_samples.expand_dims(axis=-1))
                        conf_loss = _ssd_normalize_by_positives(
                            conf_loss, positive_numbers)
                        cls_losses.append(conf_loss.asscalar())

                        # ---- localization loss ----
                        if boxHardNegativeMining:
                            # Hard negative mining applied to the box loss too.
                            _, pos_hardnega = _ssd_hard_negative_mask(
                                cls_pred, cls_target, positive_samples,
                                positive_numbers, negative_mining_ratio)
                            pos_hardnega = mx.nd.repeat(
                                pos_hardnega.reshape(shape=(0, 0, 1)),
                                repeats=4,
                                axis=-1)
                            # Smooth L1 with rho = 1.
                            loc_loss = mx.nd.abs(box_pred - box_target)
                            loc_loss = mx.nd.where(
                                loc_loss > 1, loc_loss - 0.5,
                                (0.5 / 1) * mx.nd.square(loc_loss))
                            loc_loss = mx.nd.where(
                                pos_hardnega > 0, loc_loss,
                                mx.nd.zeros_like(loc_loss))
                            loc_loss = mx.nd.sum(loc_loss)
                        else:
                            loc_loss = localization_loss(
                                box_pred, box_target,
                                positive_samples.expand_dims(axis=-1))
                        loc_loss = _ssd_normalize_by_positives(
                            loc_loss, positive_numbers)
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss +
                                          weight_term_alpha * loc_loss)

                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # Clear the accumulated gradients (grad_req is 'add').
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0
            for image, label, cls_all, box_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]

                # Every tensor is split over the same context list, single- or
                # multi-device alike.
                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all,
                                                     ctx_list,
                                                     even_split=False)
                box_all = gluon.utils.split_and_load(box_all,
                                                     ctx_list,
                                                     even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # one iteration per device (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    pred_id, pred_score, pred_bbox = prediction(
                        cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=pred_bbox,
                                            pred_labels=pred_id,
                                            pred_scores=pred_score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        positive_samples.expand_dims(axis=-1))
                    conf_loss = _ssd_normalize_by_positives(
                        conf_loss, positive_numbers)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    loc_loss = _ssd_normalize_by_positives(
                        loc_loss, positive_numbers)
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, (c, p, r) in enumerate(zip(class_name, precision, recall)):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # Draw predictions for the first validation batch.
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)
                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)

                ground_truth_colors = {
                    k: (0, 0, 1) for k in range(num_classes)
                }

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, det_id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        # Undo the mean/std normalisation: CHW -> HWC, then
                        # back to the 0..255 range.
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(
                                mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # draw the ground-truth boxes
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # draw the predicted boxes on top
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=det_id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # For TensorBoard: BGR -> RGB and
                        # (height, width, channel) -> (channel, height, width).
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box
                        )  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            os.makedirs(weight_epoch_path, exist_ok=True)

            # Hybrid models are serialised with export(): the JSON + params
            # can be loaded by SymbolBlock.imports, mxnet.mod.Module or the
            # C++ interface. A single input is named "data"; several inputs
            # are named data0, data1, ...
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                # plain parameter file, kept for ONNX export
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))

                # Network inference, decoder and NMS in one block: convenient
                # for MXNet C++, but cannot be exported to ONNX.
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    # with preprocessing included, an image read by OpenCV
                    # can be fed as-is for C++ inference
                    preprocess=True,
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if tensorboard:
        # Flush and release the event file; queued summaries would otherwise
        # depend on interpreter shutdown to reach disk.
        summary.close()

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def train(epochs, ctx):
    """Train the module-level ``net`` for ``epochs`` epochs and log to MXBoard.

    Logged under ``./logs``: the per-batch cross-entropy, the first minibatch
    of every epoch as images, the network graph (after the first epoch), a
    gradient histogram per parameter and the train/validation accuracies.
    The logged scalars are exported to ``scalar_dict.json`` once training
    completes.

    Parameters
    ----------
    epochs : int
        Number of passes over ``train_data``.
    ctx : mxnet.Context
        Device on which the parameters are initialized and to which each
        batch is copied.
    """
    # Collect all parameters from net and its children, then initialize them.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    net.hybridize()
    # Trainer is for updating parameters with gradient.
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': opt.lr, 'momentum': opt.momentum})
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='./logs', flush_secs=5)
    try:
        global_step = 0
        for epoch in range(epochs):
            # reset data iterator and metric at beginning of epoch.
            metric.reset()
            for batch_idx, (data, label) in enumerate(train_data):
                # Copy data to ctx if necessary
                data = data.as_in_context(ctx)
                label = label.as_in_context(ctx)
                # Start recording computation graph with record() section.
                # Recorded graphs can then be differentiated with backward.
                with autograd.record():
                    output = net(data)
                    L = loss(output, label)
                sw.add_scalar(tag='cross_entropy', value=L.mean().asscalar(),
                              global_step=global_step)
                global_step += 1
                L.backward()

                # take a gradient step with batch_size equal to data.shape[0]
                trainer.step(data.shape[0])
                # update metric at last.
                metric.update([label], [output])

                if batch_idx % opt.log_interval == 0 and batch_idx > 0:
                    name, train_acc = metric.get()
                    print('[Epoch %d Batch %d] Training: %s=%f'
                          % (epoch, batch_idx, name, train_acc))

                # Log the first batch of images of each epoch. The batch
                # dimension is inferred (-1) so a first batch smaller than
                # opt.batch_size does not make the reshape fail.
                if batch_idx == 0:
                    sw.add_image('minist_first_minibatch',
                                 data.reshape((-1, 1, 28, 28)), epoch)

            if epoch == 0:
                sw.add_graph(net)

            # logging the gradients of parameters for checking convergence
            for param_name, param in net.collect_params().items():
                sw.add_histogram(tag=param_name, values=param.grad(),
                                 global_step=epoch, bins=1000)

            name, train_acc = metric.get()
            print('[Epoch %d] Training: %s=%f' % (epoch, name, train_acc))
            # logging training accuracy
            sw.add_scalar(tag='accuracy_curves', value=('train_acc', train_acc),
                          global_step=epoch)

            name, val_acc = test(ctx)
            print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))
            # logging the validation accuracy
            sw.add_scalar(tag='accuracy_curves', value=('valid_acc', val_acc),
                          global_step=epoch)

        sw.export_scalars('scalar_dict.json')
    finally:
        # Always release the event file, even when training is interrupted.
        sw.close()
def train(epochs, ctx):
    """Train the module-level ``net``, log to MXBoard and export the model.

    A forward pass on ``dummy_data(ctx)`` is run before training (the original
    comment states this initializes the binary layers). Per-batch
    cross-entropy, the first minibatch of every epoch, gradient histograms of
    all ``weight``/``bias`` parameters and the train/validation accuracies are
    written below ``./logs``. After training the network is exported and its
    graph is added to the summary.

    Parameters
    ----------
    epochs : int
        Number of passes over ``train_data``.
    ctx : mxnet.Context
        Device on which the parameters are initialized and to which each
        batch is copied.
    """
    # Collect all parameters from net and its children, then initialize them.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    # Trainer is for updating parameters with gradient.
    trainer = gluon.Trainer(net.collect_params(), 'adam')
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # do forward pass with dummy data without backwards pass to initialize binary layers
    with autograd.record():
        data, label = dummy_data(ctx)
        output = net(data)
        L = loss(output, label)

    if opt.hybridize:
        net.hybridize()

    # only weights and biases have their gradients logged in each epoch
    log_param_filter = ".*weight|.*bias"
    mode_name = "symbolic" if opt.hybridize else "gluon"

    sw = SummaryWriter(logdir='./logs/{}-{}bits/'.format(mode_name, opt.bits),
                       flush_secs=5)
    try:
        global_step = 0
        for epoch in range(epochs):
            # reset data iterator and metric at beginning of epoch.
            metric.reset()
            for batch_idx, (data, label) in enumerate(train_data):
                # Copy data to ctx if necessary
                data = data.as_in_context(ctx)
                label = label.as_in_context(ctx)
                # Start recording computation graph with record() section.
                # Recorded graphs can then be differentiated with backward.
                with autograd.record():
                    output = net(data)
                    L = loss(output, label)
                    L.backward()
                sw.add_scalar(tag='cross_entropy', value=L.mean().asscalar(),
                              global_step=global_step)
                global_step += 1

                # take a gradient step with batch_size equal to data.shape[0]
                trainer.step(data.shape[0])
                # update metric at last.
                metric.update([label], [output])

                if batch_idx % opt.log_interval == 0 and batch_idx > 0:
                    name, acc = metric.get()
                    print('[Epoch %d Batch %d] Training: %s=%f'
                          % (epoch, batch_idx, name, acc))

                # The batch dimension is inferred (-1) so a first batch
                # smaller than opt.batch_size does not make the reshape fail.
                if batch_idx == 0:
                    sw.add_image('mnist_first_minibatch',
                                 data.reshape((-1, 1, 28, 28)), epoch)

            # logging the gradients of parameters for checking convergence
            for param_name, param in net.collect_params(log_param_filter).items():
                sw.add_histogram(tag=param_name, values=param.grad(),
                                 global_step=global_step, bins=1000)

            name, acc = metric.get()
            print('[Epoch %d] Training: %s=%f' % (epoch, name, acc))
            sw.add_scalar(tag='train_acc', value=acc, global_step=global_step)

            name, val_acc = test(ctx)
            print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))
            sw.add_scalar(tag='valid_acc', value=val_acc, global_step=global_step)

        if not opt.hybridize:
            # Export needs a hybridized network that has seen one forward pass.
            net.hybridize()
            with autograd.record():
                data, label = dummy_data(ctx)
                output = net(data)
                L = loss(output, label)
        net.export("mnist-lenet-{}-{}-bit".format(mode_name, opt.bits), epoch=1)
        sw.add_graph(net)
    finally:
        # Always release the event file, even when training is interrupted.
        sw.close()
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        offset_alloc_size=(64, 64),
        anchors={"shallow": [(10, 13), (16, 30), (33, 23)],
                 "middle": [(30, 61), (62, 45), (59, 119)],
                 "deep": [(116, 90), (156, 198), (373, 326)]},
        graphviz=False,
        epoch=100,
        input_size=[416, 416],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=False,
        factor_scale=[13, 5],
        ignore_threshold=0.5,
        dynamic=False,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        Darknetlayer=53,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    """Train a YoloV3 detector with periodic evaluation, logging and export.

    Workflow, as implemented below:

    1. Pick the device(s) from ``GPU_COUNT`` (AMP is disabled on CPU).
    2. Validate ``input_size`` / ``batch_size`` and build the data loaders.
    3. Resume from ``weights/<model>`` when the symbol file and the
       ``load_period`` parameter file both exist, otherwise build a new
       ``Yolov3`` network.
    4. Train epochs ``start_epoch + 1 .. epoch``. Each batch is cut into
       ``subdivision`` pieces whose gradients are accumulated
       (``grad_req='add'``) before a single ``trainer.step``.
    5. Every ``eval_period`` epochs (when validation data exists) compute
       validation losses and AP/mAP, and optionally write MXBoard summaries.
    6. Every ``save_period`` epochs export the model.

    Parameters
    ----------
    mean, std : sequence of float
        Normalization statistics handed to the data loaders and used to
        de-normalize images before they are drawn for the summary.
    offset_alloc_size : tuple of int
        Passed to ``Yolov3`` as ``alloc_size``.
    anchors : dict
        Anchor sizes passed to ``Yolov3``.
    graphviz : bool
        Plot the network with ``gluoncv.utils.viz.plot_network``.
    epoch : int
        Last epoch index to train.
    input_size : sequence of int
        Two input dimensions; both must be multiples of 32.
    batch_log : int
        Log training progress every ``batch_log`` batches.
    batch_size, batch_interval, num_workers, multiscale, factor_scale,
    ignore_threshold, dynamic, data_augmentation :
        Forwarded to the data loaders.
    subdivision : int
        Number of pieces each training batch is split into.
    train_dataset_path, valid_dataset_path : str
        Dataset locations.
    optimizer : str
        One of ``"ADAM"``, ``"RMSPROP"``, ``"SGD"`` (case-insensitive).
    save_period, eval_period : int
        Export / evaluation periods, in epochs.
    load_period : int
        Epoch whose exported parameters are loaded when resuming.
    learning_rate, decay_lr, decay_step :
        Base learning rate and ``FactorScheduler`` factor / step (in epochs).
    GPU_COUNT : int
        0 selects the CPU, 1 a single GPU, more a list of GPUs.
    Darknetlayer, pretrained_base, pretrained_path :
        Backbone configuration passed to ``Yolov3``.
    AMP : bool
        Use automatic mixed precision (GPU only).
    valid_size : int
        Validation batch size.
    tensorboard : bool
        Write MXBoard summaries under ``mxboard/<model>``.
    valid_graph_path : str
        Folder name handed to ``get_PR_curve``.
    using_mlflow : bool
        Log the learning time through ``ml.log_metric``.
    multiperclass, nms_thresh, nms_topk, except_class_thresh :
        Forwarded to ``Prediction``.
    iou_thresh : float
        IoU threshold of the ``Voc_2007_AP`` metric.
    plot_class_thresh : float
        Score threshold for the predicted boxes drawn in the summary.

    Notes
    -----
    Configuration problems are logged and end the process with ``exit(0)``.
    """
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # split_and_load always wants a list of contexts.
    ctx_list = [ctx] if GPU_COUNT <= 1 else ctx

    def to_devices(array):
        """Scatter ``array`` along the batch axis over the training devices."""
        return gluon.utils.split_and_load(array, ctx_list, even_split=False)

    def split_batch(array):
        """Cut ``array`` into ``subdivision`` pieces along the batch axis."""
        pieces = mx.nd.split(data=array, num_outputs=subdivision, axis=0)
        # mx.nd.split returns a bare NDArray when asked for a single output.
        return [pieces] if subdivision == 1 else pieces

    def dummy_input():
        """Return an all-ones input located on the first training device."""
        first_ctx = ctx[0] if isinstance(ctx, (list, tuple)) else ctx
        return mx.nd.ones(shape=input_shape, ctx=first_ctx)

    # Report the operating system.
    logging.info(f"{platform.system()} OS")

    if GPU_COUNT == 0:
        logging.info(f'Running on {ctx}')
    else:
        gib = 1024 * 1024 * 1024
        devices = ctx if isinstance(ctx, (list, tuple)) else [ctx]
        for device_index, device in enumerate(devices):
            free_memory, total_memory = mx.context.gpu_memory_info(device_index)
            free_memory = round(free_memory / gib, 2)
            total_memory = round(total_memory / gib, 2)
            logging.info(f'Running on {device} / free memory : {free_memory}GB / total memory {total_memory}GB')

    # Force the input size to be a multiple of 32 so the strides are not
    # distorted. Either dimension being off is an error (the previous `and`
    # only rejected inputs where both dimensions were wrong).
    if input_size[0] % 32 != 0 or input_size[1] % 32 != 0:
        logging.info("The input size must be a multiple of 32")
        exit(0)

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training YoloV3 Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        # This throw-away network is only handed to the data loaders.
        net = Yolov3(Darknetlayer=Darknetlayer,
                     anchors=anchors,
                     pretrained=False,
                     ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True,
                                                          mean=mean,
                                                          std=std,
                                                          net=net,
                                                          ignore_threshold=ignore_threshold,
                                                          dynamic=dynamic,
                                                          from_sigmoid=False,
                                                          make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True,
                                                          mean=mean,
                                                          std=std,
                                                          net=net,
                                                          ignore_threshold=ignore_threshold,
                                                          dynamic=dynamic,
                                                          from_sigmoid=False,
                                                          make_target=True)
    except Exception:
        logging.info("dataset 없음")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = f"{input_size[0]}_{input_size[1]}_{optimizer}_PDark_{Darknetlayer}"
    else:
        model = f"{input_size[0]}_{input_size[1]}_{optimizer}_Dark_{Darknetlayer}"

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path,
                                        ['data'],
                                        param_path, ctx=ctx)
    else:
        start_epoch = 0
        # Strategy for accepting arbitrary input images from MXNet C++:
        # alloc_size : tuple of int, default is (128, 128)
        #     For advanced users. Define `alloc_size` to generate large enough
        #     offset maps, which will later be saved in parameters. During
        #     inference, arbitrary input images are supported by cropping the
        #     corresponding area of the anchor map. This allows exporting to
        #     symbol so it can run in C++, Scala, etc.
        net = Yolov3(Darknetlayer=Darknetlayer,
                     input_size=input_size,
                     anchors=anchors,
                     num_classes=num_classes,  # foreground only
                     pretrained=pretrained_base,
                     pretrained_path=pretrained_path,
                     alloc_size=offset_alloc_size,
                     ctx=ctx)

        net.summary(dummy_input())

        # hybridize arguments:
        #   active (bool, default True) - whether to turn hybrid on or off.
        #   static_alloc (bool, default False) - statically allocate memory to
        #       improve speed. Memory usage may increase.
        #   static_shape (bool, default False) - optimize for invariant input
        #       shapes between iterations. Must also set static_alloc to True.
        #       Change of input shapes is still allowed but slower.
        # Multi-scale training changes the input shape, so static_shape is
        # only enabled without it.
        net.hybridize(active=True, static_alloc=True, static_shape=not multiscale)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10,
                                verbose=False)
        # One forward pass is run before the graph is handed to the writer.
        net.forward(dummy_input())
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = max(1, len(train_dataset) // batch_size)
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    # Gradients are accumulated over the sub-divisions of a batch.
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    optimizer_specific = {
        "ADAM": {"beta1": 0.9, "beta2": 0.999},
        "RMSPROP": {"gamma1": 0.9, "gamma2": 0.999},
        "SGD": {"wd": 0.0005, "momentum": 0.9},
    }
    if optimizer not in optimizer_specific:
        logging.error("optimizer not selected")
        exit(0)

    optimizer_params = {"learning_rate": learning_rate,
                        "lr_scheduler": lr_sch,
                        **optimizer_specific[optimizer],
                        'multi_precision': False}
    if AMP:
        # update_on_kvstore : bool, default None
        #     Whether to perform parameter updates on kvstore. If None, the
        #     trainer chooses the more suitable option depending on the type
        #     of kvstore. If the `update_on_kvstore` argument is provided, the
        #     environment variable `MXNET_UPDATE_ON_KVSTORE` is ignored.
        # It is set to False here for dynamic loss scaling.
        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                optimizer_params=optimizer_params,
                                update_on_kvstore=False)
        amp.init_trainer(trainer)
    else:
        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                optimizer_params=optimizer_params)

    loss = Yolov3Loss(sparse_label=True,
                      from_sigmoid=False,
                      batch_axis=None,
                      num_classes=num_classes,
                      reduction="sum",
                      exclude=False)

    prediction = Prediction(
        from_sigmoid=False,
        num_classes=num_classes,
        nms_thresh=nms_thresh,
        nms_topk=nms_topk,
        except_class_thresh=except_class_thresh,
        multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):

        xcyc_loss_sum = 0
        wh_loss_sum = 0
        object_loss_sum = 0
        class_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate(
                train_dataloader,
                start=1):
            td_batch_size = image.shape[0]

            image = split_batch(image)
            xcyc_all = split_batch(xcyc_all)
            wh_all = split_batch(wh_all)
            objectness_all = split_batch(objectness_all)
            class_all = split_batch(class_all)
            weights_all = split_batch(weights_all)

            # autograd explanation:
            # https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            with autograd.record(train_mode=True):

                xcyc_all_losses = []
                wh_all_losses = []
                object_all_losses = []
                class_all_losses = []

                for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(
                        image, xcyc_all, wh_all, objectness_all, class_all, weights_all):

                    image_split = to_devices(image_split)
                    xcyc_split = to_devices(xcyc_split)
                    wh_split = to_devices(wh_split)
                    objectness_split = to_devices(objectness_split)
                    class_split = to_devices(class_split)
                    weights_split = to_devices(weights_split)

                    xcyc_losses = []
                    wh_losses = []
                    object_losses = []
                    class_losses = []
                    total_loss = []

                    # Written for N GPUs (data parallelism).
                    for img, xcyc_target, wh_target, objectness, class_target, weights in zip(
                            image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split):
                        output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                            img)
                        xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                           wh_target, objectness,
                                                                           class_target, weights)
                        xcyc_losses.append(xcyc_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())
                        object_losses.append(object_loss.asscalar())
                        class_losses.append(class_loss.asscalar())
                        total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss)

                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    xcyc_all_losses.append(sum(xcyc_losses))
                    wh_all_losses.append(sum(wh_losses))
                    object_all_losses.append(sum(object_losses))
                    class_all_losses.append(sum(class_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # Clear the accumulated gradients ('add' mode never resets them).
            for p in net.collect_params().values():
                p.zero_grad()

            xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size
            object_loss_sum += sum(object_all_losses) / td_batch_size
            class_loss_sum += sum(class_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]'
                             f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]'
                             f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]'
                             f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch)
        train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean

        logging.info(
            f"train xcyc loss : {train_xcyc_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train object loss : {train_object_loss_mean} / "
            f"train class loss : {train_class_loss_mean} / "
            f"train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            xcyc_loss_sum = 0
            wh_loss_sum = 0
            object_loss_sum = 0
            class_loss_sum = 0

            # Compute the validation losses and feed the AP metric.
            for image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]

                image = to_devices(image)
                label = to_devices(label)
                xcyc_all = to_devices(xcyc_all)
                wh_all = to_devices(wh_all)
                objectness_all = to_devices(objectness_all)
                class_all = to_devices(class_all)
                weights_all = to_devices(weights_all)

                xcyc_losses = []
                wh_losses = []
                object_losses = []
                class_losses = []

                # Written for N GPUs (data parallelism).
                for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(
                        image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]

                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    pred_id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3,
                                                      offset1, offset2, offset3, stride1, stride2, stride3)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=pred_id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                       wh_target, objectness,
                                                                       class_target, weights)
                    xcyc_losses.append(xcyc_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())
                    object_losses.append(object_loss.asscalar())
                    class_losses.append(class_loss.asscalar())

                xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size
                object_loss_sum += sum(object_losses) / vd_batch_size
                class_loss_sum += sum(class_losses) / vd_batch_size

            valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch)
            valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean

            logging.info(
                f"valid xcyc loss : {valid_xcyc_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid object loss : {valid_object_loss_mean} / "
                f"valid class loss : {valid_class_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, _, _, threshold = precision_recall.get_PR_list()
            for j, (c, p, r) in enumerate(zip(class_name, precision, recall)):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i)
            precision_recall.reset()

            if tensorboard:
                # Draw the first validation batch (written for N GPUs).
                image, label, _, _, _, _, _, _ = next(iter(valid_dataloader))

                image = to_devices(image)
                label = to_devices(label)

                ground_truth_colors = {k: (0, 0, 1) for k in range(num_classes)}

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3,
                                                     offset1, offset2, offset3, stride1, stride2, stride3)

                    for ig, gt_id, gt_box, pred_id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes):
                        # Undo the input normalization and go back to 0..255.
                        ig = ig.transpose(
                            (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # Draw the ground-truth boxes.
                        ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None,
                                                 reverse_rgb=True,
                                                 class_names=valid_dataset.classes,
                                                 absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # Draw the predicted boxes on top of them.
                        prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=pred_id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes,
                                                   absolute_coordinates=True)

                        # For the summary image: BGR -> RGB and
                        # (height, width, channel) -> (channel, height, width).
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box, axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i)
                summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean,
                                                         "valid_xcyc_loss": valid_xcyc_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean,
                                                         "valid_wh_loss": valid_wh_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean,
                                                             "valid_object_loss": valid_object_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean,
                                                            "valid_class_loss": valid_class_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="total_loss", value={
                    "train_total_loss": train_total_loss_mean,
                    "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            os.makedirs(weight_epoch_path, exist_ok=True)

            # Hybrid models can be serialized as JSON files using the export
            # function: export HybridBlock to json format that can be loaded
            # by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            # When there is only one input, it will have the name data. When
            # there are more inputs, they will be named data0, data1, etc.
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)

            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)  # for onnx
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for onnx export

                # Covers network inference, decoder and NMS - convenient from
                # MXNet C++ / cannot be exported to onnx.
                export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + (3,),
                                           epoch=i,
                                           preprocess=True,  # C++ inference can feed the image exactly as read by OpenCV
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
            # Put the parameters back on the training device(s) whether or not
            # the export succeeded.
            net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if tensorboard:
        # Flush pending events and release the event file.
        summary.close()

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
class SegmentationTrainer(object):
    """Run training and validation epochs of a segmentation network.

    The trainer owns the data loaders, the loss, an SGD ``gluon.Trainer``
    driven by an ``LRScheduler``, the evaluation metric and a lazily created
    MXBoard ``SummaryWriter``.
    """

    def __init__(self, args, model, model_cfg, trainset, valset,
                 optimizer_params, with_depth=False, image_dump_interval=200,
                 criterion=MixSoftmaxCrossEntropyLoss):
        """Set up loaders, network, loss, optimizer and metric.

        Parameters
        ----------
        args :
            Run configuration; the attributes read here are ``batch_size``,
            ``test_batch_size``, ``workers``, ``dtype``, ``ctx``, ``weights``,
            ``kvstore``, ``input_normalization`` and (later) ``logs_path``.
        model :
            Network to train; it is cast to ``args.dtype`` and moved to
            ``args.ctx``.
        model_cfg :
            Provides ``aux`` and ``aux_weight`` for the criterion.
        trainset, valset :
            Datasets wrapped in ``gluon.data.DataLoader``.
        optimizer_params : dict
            Must contain ``mode``, ``baselr`` and ``nepochs`` (consumed by the
            LR scheduler); the remaining entries go to ``gluon.Trainer``.
            The dictionary is copied, the caller's object is left untouched.
        with_depth : bool
            When true each sample carries two stacked inputs that are passed
            to the network as separate arguments.
        image_dump_interval : int
            Dump visualizations every this many global steps (<= 0 disables).
        criterion : callable
            Loss factory called as ``criterion(aux, aux_weight=...)``.

        Raises
        ------
        RuntimeError
            If ``args.weights`` is set but is not an existing file.
        """
        self.args = args
        self.model_cfg = model_cfg
        self.with_depth = with_depth

        if with_depth:
            batchify_fn = Tuple(Tuple(Stack(), Stack()), Stack())
        else:
            batchify_fn = Tuple(Stack(), Stack())

        self.trainset = trainset
        self.valset = valset
        self.train_data = gluon.data.DataLoader(trainset, args.batch_size,
                                                shuffle=True,
                                                last_batch='rollover',
                                                batchify_fn=batchify_fn,
                                                num_workers=args.workers)
        self.val_data = gluon.data.DataLoader(valset, args.test_batch_size,
                                              batchify_fn=batchify_fn,
                                              last_batch='rollover',
                                              num_workers=args.workers)

        logger.info(model)
        model.cast(args.dtype)
        model.collect_params().reset_ctx(ctx=args.ctx)

        self.net = model
        self.evaluator = SegEvalModel(model)

        if args.weights is not None:
            if os.path.isfile(args.weights):
                model.load_parameters(args.weights, ctx=args.ctx)
            else:
                raise RuntimeError(
                    f"=> no checkpoint found at '{args.weights}'")

        # create criterion
        self.criterion = criterion(model_cfg.aux,
                                   aux_weight=model_cfg.aux_weight)

        # Work on a copy: the scheduler keys are removed and extra keys are
        # added below, which must not leak into the caller's dictionary.
        optimizer_params = dict(optimizer_params)
        if args.dtype == 'float16':
            optimizer_params['multi_precision'] = True

        mode = optimizer_params.pop('mode')
        baselr = optimizer_params.pop('baselr')
        nepochs = optimizer_params.pop('nepochs')
        self.lr_scheduler = LRScheduler(mode=mode,
                                        baselr=baselr,
                                        niters=len(self.train_data) * nepochs,
                                        nepochs=nepochs)
        optimizer_params['lr_scheduler'] = self.lr_scheduler

        kv = mx.kv.create(args.kvstore)
        self.optimizer = gluon.Trainer(self.net.collect_params(), 'sgd',
                                       optimizer_params, kvstore=kv)

        # evaluation metrics
        self.metric = SegmentationMetric(trainset.NUM_CLASS)
        self.tqdm_out = TqdmToLogger(logger, level=logging.INFO)
        self.denormalizator = DeNormalize(args.input_normalization['mean'],
                                          args.input_normalization['std'])
        self.sw = None
        self.viz_pallete = _getvocpallete(trainset.NUM_CLASS)
        self.image_dump_interval = image_dump_interval

    def _ensure_summary_writer(self):
        """Create the MXBoard writer on first use."""
        if self.sw is None:
            self.sw = SummaryWriter(logdir=str(self.args.logs_path),
                                    flush_secs=5)

    def _dump_images(self, data, target, outputs, global_step):
        """Write input / ground truth / prediction of the first sample."""
        image_blob = data[0][0][0] if self.with_depth else data[0][0]
        image = self.denormalizator(
            image_blob.as_in_context(mx.cpu(0))).asnumpy() * 255
        gt_mask = target[0][0].asnumpy() + self.trainset.pred_offset
        predicted_mask = mx.nd.squeeze(
            mx.nd.argmax(outputs[0][0][0], 0)).asnumpy() + self.trainset.pred_offset

        gt_mask = visualize_mask(gt_mask.astype(np.int32),
                                 self.trainset.NUM_CLASS + 1)
        predicted_mask = visualize_mask(predicted_mask.astype(np.int32),
                                        self.trainset.NUM_CLASS + 1)
        image = image.transpose((1, 2, 0))

        if gt_mask.shape[:2] == image.shape[:2]:
            # Same resolution: show everything side by side in one image.
            result = np.hstack((image, gt_mask, predicted_mask)).transpose(
                (2, 0, 1)).astype(np.uint8)
            self.sw.add_image('Images/input_image', result,
                              global_step=global_step)
        else:
            # Masks differ in size from the input: log them separately.
            self.sw.add_image('Images/input_image',
                              image.transpose((2, 0, 1)).astype(np.uint8),
                              global_step=global_step)
            result = np.hstack((gt_mask, predicted_mask)).transpose(
                (2, 0, 1)).astype(np.uint8)
            self.sw.add_image('Images/predicted', result,
                              global_step=global_step)

    def training(self, epoch):
        """Run one training epoch, log to MXBoard and save a checkpoint.

        Parameters
        ----------
        epoch : int
            Zero-based epoch index; used for the global step and the
            progress-bar text.
        """
        self._ensure_summary_writer()

        tbar = tqdm(self.train_data, file=self.tqdm_out, ncols=100)
        self.metric.reset()
        train_loss = 0.0
        iter_per_epoch = len(self.train_data)

        for i, (data, target) in enumerate(tbar):
            global_step = iter_per_epoch * epoch + i
            data = split_and_load(data, ctx_list=self.args.ctx,
                                  batch_axis=0, even_split=False)
            target = split_and_load(target, ctx_list=self.args.ctx,
                                    batch_axis=0, even_split=False)

            with autograd.record(True):
                if self.with_depth:
                    outputs = [self.net(*X) for X in data]
                else:
                    outputs = [self.net(X) for X in data]
                losses = [self.criterion(*X, Y)
                          for X, Y in zip(outputs, target)]
                autograd.backward(losses)
            self.optimizer.step(self.args.batch_size)

            batch_loss = sum(loss.asnumpy()[0] for loss in losses) / len(losses)
            train_loss += batch_loss

            if self.image_dump_interval > 0 and global_step % self.image_dump_interval == 0:
                self._dump_images(data, target, outputs, global_step)

            self.sw.add_scalar(tag='Loss/ce',
                               value={'batch': batch_loss,
                                      'epoch_avg': train_loss / (i + 1)},
                               global_step=global_step)
            self.sw.add_scalar(tag='learning_rate',
                               value=self.lr_scheduler.learning_rate,
                               global_step=global_step)
            if hasattr(self.criterion, 'k_sum'):
                self.sw.add_scalar(tag='nfl_mult',
                                   value=self.criterion.k_sum,
                                   global_step=global_step)

            tbar.set_description(
                f'Epoch {epoch}, training loss {train_loss/(i+1):.3f}')
            mx.nd.waitall()

        self.net.hybridize()
        save_checkpoint(self.net, self.args, epoch=None)

    def validation(self, epoch):
        """Evaluate on the validation set and log the metrics.

        Parameters
        ----------
        epoch : int
            Epoch index used as the global step of the logged metrics.
        """
        self._ensure_summary_writer()

        self.metric.reset()
        tbar = tqdm(self.val_data, file=self.tqdm_out, ncols=100)
        for i, (data, target) in enumerate(tbar):
            data = split_and_load(data, ctx_list=self.args.ctx,
                                  batch_axis=0, even_split=False)
            if self.with_depth:
                outputs = [self.net(*X)[0] for X in data]
            else:
                outputs = [self.net(X)[0] for X in data]

            targets = mx.gluon.utils.split_and_load(target, self.args.ctx,
                                                    even_split=False)
            self.metric.update(targets, outputs)
            names, values = self.metric.get()
            result_str = ', '.join(
                [f'{name}: {value:4f}' for name, value in zip(names, values)])
            tbar.set_description(f'Epoch {epoch}, validation {result_str}')

        names, values = self.metric.get()
        result_str = ', '.join(
            [f'{name}: {value:4f}' for name, value in zip(names, values)])
        tbar.set_description(f'Epoch {epoch}, validation {result_str}')
        # Use the module logger like the rest of the class.
        logger.info(result_str)

        for name, value in zip(names, values):
            self.sw.add_scalar(tag=f'Metrics/{name}', value={'val': value},
                               global_step=epoch)
def check_add_image(data):
    """Write ``data`` as an image summary, then run the log-dir check.

    A ``SummaryWriter`` is opened on ``_LOGDIR``, ``data`` is logged under the
    tag ``'test_add_image'`` at global step 0, the writer is closed, and
    ``check_event_file_and_remove_logdir`` is invoked.

    Parameters
    ----------
    data
        Image payload forwarded unchanged to ``SummaryWriter.add_image``.
    """
    writer = SummaryWriter(logdir=_LOGDIR)
    writer.add_image(
        tag='test_add_image',
        image=data,
        global_step=0,
    )
    writer.close()
    check_event_file_and_remove_logdir()
dis_loss_test = 0.5 * (dis_lossfun(0, d(z)) + dis_lossfun( 1, d(concat(merged_inputs, targets, dim=1)))) loss_discriminator_test.append(float(dis_loss_test.asscalar())) gen_loss_test = gen_lossfun(1, d(concat(merged_inputs, y_hat, dim=1)), targets, y_hat) loss_generator_test.append(float(gen_loss_test.asscalar())) # ------------------------------ # Saving to logs for tensorboard # ------------------------------ summaryWriter.add_image( "input", modules.leclip(merged_inputs.expand_dims(2).sum(1)), epoch) summaryWriter.add_image("target", modules.leclip(targets), epoch) summaryWriter.add_image("pred", modules.leclip(g(merged_inputs)), epoch) summaryWriter.add_scalar( "dis/loss_discriminator_train", sum(loss_discriminator_train) / len(loss_discriminator_train), epoch) summaryWriter.add_scalar( "gen/loss_generator_train", sum(loss_generator_train) / len(loss_generator_train), epoch) summaryWriter.add_scalar( "dis/loss_discriminator_test", sum(loss_discriminator_test) / len(loss_discriminator_test), epoch) summaryWriter.add_scalar(
class ModuleLearner():
    """Train and run a hybridized model through the ``mx.mod.Module`` API."""

    def __init__(self,
                 model,
                 run_id,
                 gpu_idxs=None,
                 tensorboard_logging=False):
        """
        Parameters
        ----------
        model: HybridBlock
            Hybridized here and traced symbolically; a ``SoftmaxOutput``
            named ``'softmax'`` is appended to its output.
        run_id: str
            Joined onto ``<this file's dir>/../logs/tensorboard`` to form the
            summary log directory. Only used when ``tensorboard_logging`` is
            true.
        gpu_idxs: None or list of ints
            If None will set context to CPU.
            If list of ints, will set context to given GPUs.
        tensorboard_logging: bool
            When true, a ``mxboard.SummaryWriter`` is created and used by
            ``fit`` to log scalars and the first batch of each epoch.
        """
        logging.info("Using Module Learner.")
        model.hybridize()
        logging.info("Hybridized model.")

        # Named ``data_sym`` rather than ``input`` to avoid shadowing the builtin.
        data_sym = mx.sym.var('data')
        pre_output = model(data_sym)
        output = mx.sym.SoftmaxOutput(pre_output, name='softmax')
        context = get_context(gpu_idxs)
        self.module = mx.mod.Module(symbol=output,
                                    context=context,
                                    data_names=['data'],
                                    label_names=['softmax_label'])

        self.tensorboard_logging = tensorboard_logging
        if self.tensorboard_logging:
            # Imported lazily so mxboard is only required when logging is on.
            from mxboard import SummaryWriter
            current_folder = os.path.dirname(os.path.realpath(__file__))
            tensorboard_folder = os.path.join(current_folder, "..", "logs",
                                              "tensorboard")
            summary_filepath = os.path.join(tensorboard_folder, run_id)
            self.writer = SummaryWriter(logdir=summary_filepath)

    def _log_scalar(self, tag, value, global_step):
        """Write a scalar to the summary writer if tensorboard logging is on."""
        if self.tensorboard_logging:
            self.writer.add_scalar(tag=tag,
                                   value=value,
                                   global_step=global_step)

    def fit(self,
            train_data,
            valid_data,
            epochs=300,
            lr=None,
            lr_schedule=None,
            initializer=mx.init.Xavier(),
            optimizer=None,
            kvstore='device',
            log_frequency=10000,
            early_stopping_criteria=None):
        """
        Uses accuracy as training and validation metric.

        Parameters
        ----------
        train_data : DataIter
            Contains training data
        valid_data : DataIter
            Contains validation data
        epochs: int
            Number of epochs to run, unless stopped early by
            early_stopping_criteria.
        lr: float or int
            Learning rate. Must be given if, and only if, lr_schedule is None.
        lr_schedule : dict
            Contains change points of learning rate.
            Key is the epoch and value is the learning rate.
            Must contain epoch 0.
        initializer : mxnet.initializer.Initializer
        optimizer: mxnet.optimizer.Optimizer
            Defaults to be `mx.optimizer.SGD(learning_rate=lr_schedule[0],
            rescale_grad=1.0/batch_size, momentum=0.9)`
        kvstore : str, optional
        log_frequency : int, optional
            Number of samples between batch-speed logs; a log line is emitted
            when a batch crosses a multiple of this value.
        early_stopping_criteria: function (float -> boolean)
            Given validation accuracy, should return True if training should
            be stopped early.

        Returns
        -------
        None
        Output is logged to file.

        Raises
        ------
        AssertionError
            If neither or both of ``lr`` and ``lr_schedule`` are given, or if
            ``lr_schedule`` has no entry for epoch 0.
        """
        if lr_schedule is None:
            assert lr is not None, "lr must be defined if not using lr_schedule"
            lr_schedule = {0: lr}
        else:
            assert lr is None, "lr should not be defined if using lr_schedule"
            assert 0 in lr_schedule, \
                "lr for epoch 0 must be defined in lr_schedule"

        mod = self.module
        batch_size = train_data.provide_data[0].shape[0]
        mod.bind(data_shapes=train_data.provide_data,
                 label_shapes=train_data.provide_label)
        mod.init_params(initializer=initializer)
        if optimizer is None:
            optimizer = mx.optimizer.SGD(learning_rate=lr_schedule[0],
                                         rescale_grad=1.0 / batch_size,
                                         momentum=0.9)
        mod.init_optimizer(kvstore=kvstore, optimizer=optimizer)

        train_metric = mx.metric.create('acc')
        validation_metric = mx.metric.create('acc')
        max_val_acc = {'val_acc': 0, 'trn_acc': 0, 'epoch': 0}

        for epoch in range(epochs):
            epoch_tick = time.time()

            # update learning rate
            if epoch in lr_schedule:
                mod._optimizer.lr = lr_schedule[epoch]
                logging.info("Epoch {}, Changed learning rate.".format(epoch))
            logging.info('Epoch {}, Learning rate={}'.format(
                epoch, mod._optimizer.lr))
            self._log_scalar('learning_rate', mod._optimizer.lr, epoch + 1)

            train_data.reset()
            train_metric.reset()
            samples_processed = 0
            for batch_idx, batch in enumerate(train_data):
                batch_tick = time.time()
                mod.forward(batch, is_train=True)  # compute predictions
                mod.update_metric(
                    train_metric,
                    batch.label)  # accumulate prediction accuracy
                mod.backward()  # compute gradients
                mod.update()  # update parameters

                # log to tensorboard (on first batch)
                if self.tensorboard_logging and batch_idx == 0:
                    self.writer.add_image(tag="batch",
                                          image=batch.data[0],
                                          global_step=epoch + 1)

                # log batch speed (if a multiple of log_frequency is
                # contained in the last batch)
                log_batch = (samples_processed // log_frequency) != (
                    (samples_processed + batch_size) // log_frequency)
                if batch_idx >= 1 and log_batch:
                    # batch estimate, not averaged over multiple batches
                    speed = batch_size / (time.time() - batch_tick)
                    logging.info(
                        'Epoch {}, Batch {}, Speed={:.2f} images/second'.
                        format(epoch, batch_idx, speed))
                samples_processed += batch_size

            # log training accuracy
            _, trn_acc = train_metric.get()
            logging.info('Epoch {}, Training accuracy={}'.format(
                epoch, trn_acc))
            self._log_scalar('accuracy/training', trn_acc * 100, epoch + 1)

            # log validation accuracy
            res = mod.score(valid_data, validation_metric)
            _, val_acc = res[0]
            logging.info('Epoch {}, Validation accuracy={}'.format(
                epoch, val_acc))
            self._log_scalar('accuracy/validation', val_acc * 100, epoch + 1)

            # log maximum validation accuracy
            if val_acc > max_val_acc['val_acc']:
                max_val_acc = {
                    'val_acc': val_acc,
                    'trn_acc': trn_acc,
                    'epoch': epoch
                }
            logging.info(("Epoch {}, Max validation accuracy={} @ "
                          "Epoch {} (with training accuracy={})").format(
                              epoch, max_val_acc['val_acc'],
                              max_val_acc['epoch'], max_val_acc['trn_acc']))

            # log duration of epoch
            logging.info('Epoch {}, Duration={}'.format(
                epoch,
                time.time() - epoch_tick))

            if early_stopping_criteria and early_stopping_criteria(val_acc):
                logging.info(
                    "Epoch {}, Reached early stopping target, stopping training."
                    .format(epoch))
                break

    def predict(self, test_data, log_frequency=10000):
        """Run inference over ``test_data`` and log per-batch latency/speed.

        Parameters
        ----------
        test_data : DataIter
            Iterator whose ``provide_data`` / ``provide_label`` are used to
            bind the module and whose batches are fed to ``iter_predict``.
        log_frequency : int, optional
            Number of samples between latency logs; a log line is emitted
            when a batch crosses a multiple of this value, after a warm-up
            of five batches.

        Returns
        -------
        None
        Predictions are not returned; only timing is logged.
        """
        logging.info('Starting inference.')

        mod = self.module
        batch_size = test_data.provide_data[0].shape[0]
        mod.bind(data_shapes=test_data.provide_data,
                 label_shapes=test_data.provide_label)

        warm_up_period = 5
        samples_processed = 0
        batch_tick = time.time()
        for pred, batch_idx, batch in mod.iter_predict(test_data):
            # Block until the first output is ready before stopping the clock.
            pred[0].wait_to_read()
            batch_tock = time.time()
            # log batch speed (if a multiple of log_frequency is contained
            # in the last batch)
            log_batch = (samples_processed // log_frequency) != (
                (samples_processed + batch_size) // log_frequency)
            if batch_idx >= warm_up_period and log_batch:
                # batch estimate, not averaged over multiple batches
                latency = batch_tock - batch_tick  # seconds
                speed = batch_size / latency
                logging.info(
                    'Inference. Batch {}, Latency={:.5f} ms, Speed={:.2f} images/second'
                    .format(batch_idx, latency * 1000, speed))
            samples_processed += batch_size
            batch_tick = time.time()

        logging.info('Completed inference.')
def train():
    """Train the generator/discriminator pair and log progress with mxboard.

    Relies on module-level state: ``netG``, ``netD``, ``trainerG``,
    ``trainerD``, ``train_data``, ``ctx``, ``epochs``, ``batch_size``,
    ``pool_size``, ``lambda1``, ``GAN_loss``, ``L1_loss`` and ``facc``.

    Per batch the discriminator is updated on a pooled fake pair and a real
    pair, then the generator is updated with the GAN loss plus
    ``lambda1 * L1``. Both losses are written as scalars on every step; the
    first generated mini-batch of each epoch is written as an image; the
    discriminator accuracy is written once per epoch. The generator graph is
    written once, after the first forward pass. Scalars are exported to
    ``scalar_dict.json`` when training finishes, and the summary writer is
    closed even if training raises.
    """
    image_pool = ImagePool(pool_size)
    metric = mx.metric.CustomMetric(facc)

    logging.basicConfig(level=logging.DEBUG)

    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='./logs_', flush_secs=5)
    global_step = 0

    try:
        for epoch in range(epochs):
            if epoch == 0:
                netG.hybridize()
                netD.hybridize()

            tic = time.time()
            btic = time.time()
            train_data.reset()
            for local_step, batch in enumerate(train_data):
                ############################
                # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z)))
                ###########################
                real_in = batch.data[0].as_in_context(ctx)
                real_out = batch.data[1].as_in_context(ctx)

                fake_out = netG(real_in)
                # Use image pooling to utilize history images
                fake_concat = image_pool.query(
                    nd.concat(real_in, fake_out, dim=1))
                with autograd.record():
                    # Train with fake image
                    output = netD(fake_concat)
                    fake_label = nd.zeros(output.shape, ctx=ctx)
                    errD_fake = GAN_loss(output, fake_label)
                    metric.update([fake_label], [output])

                    # Train with real image
                    real_concat = nd.concat(real_in, real_out, dim=1)
                    output = netD(real_concat)
                    real_label = nd.ones(output.shape, ctx=ctx)
                    errD_real = GAN_loss(output, real_label)
                    errD = (errD_real + errD_fake) * 0.5
                    errD.backward()
                    metric.update([real_label], [output])

                trainerD.step(batch.data[0].shape[0])

                # The graph does not change between batches, so write it once,
                # after netG has been run for the first time.
                if global_step == 0:
                    sw.add_graph(netG)

                ############################
                # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z))
                ###########################
                with autograd.record():
                    fake_out = netG(real_in)
                    fake_concat = nd.concat(real_in, fake_out, dim=1)
                    output = netD(fake_concat)
                    real_label = nd.ones(output.shape, ctx=ctx)
                    errG = GAN_loss(
                        output, real_label) + L1_loss(real_out,
                                                      fake_out) * lambda1
                    errG.backward()

                trainerG.step(batch.data[0].shape[0])

                sw.add_scalar(tag='loss',
                              value=('d_loss', errD.mean().asscalar()),
                              global_step=global_step)
                sw.add_scalar(tag='loss',
                              value=('g_loss', errG.mean().asscalar()),
                              global_step=global_step)
                global_step += 1

                # Log the first batch of images of each epoch
                if local_step == 0:
                    # (x + 1) * 127.5 / 255 rescales the generator output
                    # before it is written as an image.
                    preview = ((fake_out + 1) * 127.5) / 255
                    sw.add_image('minist_first_minibatch', preview, epoch)

                # Print progress every ten batches.
                if local_step % 10 == 0:
                    name, acc = metric.get()
                    logging.info('speed: {} samples/s'.format(
                        batch_size / (time.time() - btic)))
                    logging.info(
                        'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d'
                        % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(),
                           acc, local_step, epoch))
                btic = time.time()

            # Read the metric after the last batch so the epoch scalar reflects
            # the whole epoch rather than the last ten-batch snapshot.
            name, acc = metric.get()
            metric.reset()
            sw.add_scalar(tag='binary_training_acc',
                          value=('acc', acc),
                          global_step=epoch)
            logging.info('\nbinary training acc at epoch %d: %s=%f' %
                         (epoch, name, acc))
            logging.info('time: %f' % (time.time() - tic))

        sw.export_scalars('scalar_dict.json')
    finally:
        # Always release the event file, including when training fails.
        sw.close()