def evaluate_acc(net, data_iter, ctx):
    data_iter.reset()
    box_metric = metric.MAE()
    outs, labels = None, None
    for i, batch in enumerate(data_iter):
        data = batch.data[0].as_in_context(ctx)
        label = batch.label[0].as_in_context(ctx)
        anchors, box_preds, cls_preds = net(data)
        # MultiBoxTarget matches the generated anchors against the ground-truth boxes
        # and returns, for every anchor, the box offset, the box mask and the class
        # label. The regression target is the offset of the matched ground truth
        # relative to the anchor (normalized by the anchor's x/y/w/h), and the network
        # prediction g is trained with smoothL1(target - g).
        # Negative mining keeps a 1:3 positive/negative ratio.
        box_offset, box_mask, cls_labels = MultiBoxTarget(
            anchors, label, cls_preds.transpose((0, 2, 1)),
            negative_mining_ratio=3.0)
        box_metric.update([box_offset], [box_preds * box_mask])
        cls_probs = nd.SoftmaxActivation(cls_preds.transpose((0, 2, 1)),
                                         mode='channel')
        # Filter the predicted boxes with non-maximum suppression (NMS).
        out = MultiBoxDetection(cls_probs, box_preds, anchors,
                                force_suppress=True, clip=False,
                                nms_threshold=0.45)
        if outs is None:
            outs = out
            labels = label
        else:
            outs = nd.concat(outs, out, dim=0)
            labels = nd.concat(labels, label, dim=0)
    AP = evaluate_MAP(outs, labels)
    return AP, box_metric
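# A minimal sketch of the box-offset encoding that the comment above refers to,
# assuming corner-format boxes (xmin, ymin, xmax, ymax) and the standard SSD
# center-size transform with variances (0.1, 0.1, 0.2, 0.2); the exact variances
# and clipping used inside MultiBoxTarget may differ. The function name is
# illustrative only.
import math

def encode_box_offset_sketch(anchor, gt, variances=(0.1, 0.1, 0.2, 0.2)):
    ax, ay = (anchor[0] + anchor[2]) / 2, (anchor[1] + anchor[3]) / 2
    aw, ah = anchor[2] - anchor[0], anchor[3] - anchor[1]
    gx, gy = (gt[0] + gt[2]) / 2, (gt[1] + gt[3]) / 2
    gw, gh = gt[2] - gt[0], gt[3] - gt[1]
    # Offsets are normalized by the anchor size (and the variances); this is what
    # the regression target "box_offset" above contains for matched anchors.
    return ((gx - ax) / aw / variances[0],
            (gy - ay) / ah / variances[1],
            math.log(gw / aw) / variances[2],
            math.log(gh / ah) / variances[3])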
def evaluate_accuracy(data_iterator, net):
    acc = metric.MAE()
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(data_ctx)
        label = label.as_in_context(data_ctx)
        output = net(data)
        prediction = nd.dot(output, scale)
        acc.update(preds=prediction, labels=label)
    return acc.get()[1]
def train(train_iter):
    net = nn.HybridSequential()
    with net.name_scope():
        net.add(
            model.DSOD(32, 6, 32, 1, 1)  # 64 6 48 1 1
        )
    net.initialize()
    box_loss = SmoothL1Loss()
    cls_loss = FocalLoss()  # focal loss instead of hard negative mining
    l1_loss = gluon.loss.L1Loss()
    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.1, 'wd': 5e-4})
    cls_metric = metric.Accuracy()
    box_metric = metric.MAE()
    filename = args.params
    if args.retrain:
        print('loading weights from the previous run')
        net.load_params(filename, ctx=mx.gpu())
    for epoch in range(args.epoch):
        train_data.reset()
        cls_metric.reset()
        box_metric.reset()
        tic = time.time()
        for i, batch in enumerate(train_data):
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            with mx.autograd.record():
                anchors, class_preds, box_preds = net(x)
                box_target, box_mask, cls_target = training_targets(
                    anchors, class_preds, y)
                loss1 = cls_loss(class_preds, cls_target)
                loss2 = l1_loss(box_preds, box_target, box_mask)
                loss = loss1 + 5 * loss2
            loss.backward()
            trainer.step(batch_size)
            cls_metric.update([cls_target], [class_preds.transpose((0, 2, 1))])
            box_metric.update([box_target], [box_preds * box_mask])
        print('Epoch %2d, train %s %.2f, %s %.5f, time %.1f sec' % (
            epoch, *cls_metric.get(), *box_metric.get(), time.time() - tic))
        net.save_params(filename)
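# FocalLoss() and SmoothL1Loss() are defined elsewhere in this project. As a point
# of reference, a minimal Gluon sketch of the focal loss FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t)
# that is used here in place of hard negative mining could look like the class below.
# The class name and the alpha/gamma defaults are illustrative assumptions, not
# values taken from the project.
from mxnet import gluon

class FocalLossSketch(gluon.loss.Loss):
    def __init__(self, axis=-1, alpha=0.25, gamma=2, batch_axis=0, **kwargs):
        super(FocalLossSketch, self).__init__(None, batch_axis, **kwargs)
        self._axis = axis
        self._alpha = alpha
        self._gamma = gamma

    def hybrid_forward(self, F, output, label):
        # Convert logits to probabilities and pick the probability of the true class.
        output = F.softmax(output)
        pt = F.pick(output, label, axis=self._axis, keepdims=True)
        # Down-weight easy examples with the (1 - p_t)^gamma modulating factor.
        loss = -self._alpha * ((1 - pt) ** self._gamma) * F.log(pt)
        return F.mean(loss, axis=self._batch_axis, exclude=True)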
def train_fun():
    cls_metric = metric.Accuracy()
    box_metric = metric.MAE()
    ctx = gpu(0)
    train_data, test_data, class_names, num_class = get_iterators(
        data_shape, batch_size)
    train_data.reshape(label_shape=(3, 5))
    train_data = test_data.sync_label_shape(train_data)
    net = ToySSD(num_class)
    net.initialize(init.Xavier(magnitude=2), ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': 0.1,
        'wd': 5e-4
    })
    cls_loss = FocalLoss()
    box_loss = SmoothL1Loss()
    for epoch in range(30):
        train_data.reset()
        cls_metric.reset()
        box_metric.reset()
        tic = time.time()
        for i, batch in enumerate(train_data):
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            with autograd.record():
                anchors, class_preds, box_preds = net(x)
                box_target, box_mask, cls_target = training_targets(
                    anchors, class_preds, y)
                loss1 = cls_loss(class_preds, cls_target)
                loss2 = box_loss(box_preds, box_target, box_mask)
                loss = loss1 + loss2
            loss.backward()
            trainer.step(batch_size)
            cls_metric.update([cls_target], [class_preds.transpose((0, 2, 1))])
            box_metric.update([box_target], [box_preds * box_mask])
        print('epoch %2d, train %s %.2f, %s %.5f, time %.1f sec' % (
            epoch, *cls_metric.get(), *box_metric.get(), time.time() - tic))
box_loss = SmoothL1Loss()
print(box_loss)

ctx = utils.try_gpu()
cls_metric = metric.Accuracy()
box_metric = metric.MAE()

train_data.reshape(label_shape=(3, 5))
train_data = test_data.sync_label_shape(train_data)
net = ToySSD(num_class)
net.initialize(init.Xavier(magnitude=2), ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {
    'learning_rate': 0.1,
    'wd': 5e-4
})

for epoch in range(30):
    # reset data iterators and metrics
    train_data.reset()
    cls_metric.reset()
    box_metric.reset()
    tic = time.time()
    for i, batch in enumerate(train_data):
        x = batch.data[0].as_in_context(ctx)
        y = batch.label[0].as_in_context(ctx)
        with autograd.record():
            anchors, class_preds, box_preds = net(x)
            box_target, box_mask, cls_target = training_targets(
                anchors, class_preds, y)
            # losses
            loss1 = cls_loss(class_preds, cls_target)
            loss2 = box_loss(box_preds, box_target, box_mask)
            loss = loss1 + loss2
        loss.backward()
        trainer.step(batch_size)
        # update metrics
        cls_metric.update([cls_target], [class_preds.transpose((0, 2, 1))])
        box_metric.update([box_target], [box_preds * box_mask])
def evaluate_acc(net, data_iter, ctx):
    data_iter.reset()
    box_metric = metric.MAE()
    outs, labels = None, None
    for i, batch in enumerate(data_iter):
        data = batch.data[0].as_in_context(ctx)
        label = batch.label[0].as_in_context(ctx)
        ssd_layers = net(data)
        arm_loc_preds, arm_cls_preds, arm_anchor_boxes, odm_loc_preds, odm_cls_preds = \
            multibox_layer(ssd_layers, num_classes, sizes, ratios, normalizations)
        # ARM targets: collapse all classes to a binary objectness label first.
        label_arm = nd.Custom(label, op_type='modify_label')
        arm_tmp = MultiBoxTarget(arm_anchor_boxes, label_arm, arm_cls_preds,
                                 overlap_threshold=.5,
                                 negative_mining_ratio=3,
                                 negative_mining_thresh=.5)
        arm_loc_target = arm_tmp[0]        # box offsets
        arm_loc_target_mask = arm_tmp[1]   # box mask (0 or 1)
        arm_cls_target = arm_tmp[2]        # class target for every anchor
        # ODM anchors: refine the ARM anchors with the ARM location predictions.
        odm_anchor_boxes = refine_anchor_generator(
            arm_anchor_boxes, arm_loc_preds)  # (batch, h*w*num_anchors, 4)
        odm_anchor_boxes_bs = nd.split(data=odm_anchor_boxes, axis=0,
                                       num_outputs=label.shape[0])  # list
        # Compute the ODM targets per image, because the refined anchors differ
        # per image while MultiBoxTarget expects a shared set of anchors.
        odm_loc_target = []
        odm_loc_target_mask = []
        odm_cls_target = []
        label_bs = nd.split(data=label, axis=0, num_outputs=label.shape[0])
        odm_cls_preds_bs = nd.split(data=odm_cls_preds, axis=0,
                                    num_outputs=label.shape[0])
        for j in range(label.shape[0]):
            if label.shape[0] == 1:
                # batch size 1: restore the batch dimension explicitly
                odm_tmp = MultiBoxTarget(
                    odm_anchor_boxes_bs[j].expand_dims(axis=0),
                    label_bs[j].expand_dims(axis=0),
                    odm_cls_preds_bs[j].expand_dims(axis=0),
                    overlap_threshold=.5,
                    negative_mining_ratio=2,
                    negative_mining_thresh=.5)
            else:
                # batch size > 1
                odm_tmp = MultiBoxTarget(
                    odm_anchor_boxes_bs[j], label_bs[j], odm_cls_preds_bs[j],
                    overlap_threshold=.5,
                    negative_mining_ratio=3,
                    negative_mining_thresh=.5)
            odm_loc_target.append(odm_tmp[0])
            odm_loc_target_mask.append(odm_tmp[1])
            odm_cls_target.append(odm_tmp[2])
        odm_loc_target = nd.concat(*odm_loc_target, dim=0)
        odm_loc_target_mask = nd.concat(*odm_loc_target_mask, dim=0)
        odm_cls_target = nd.concat(*odm_cls_target, dim=0)
        # Negative anchor filtering: anchors that the ARM confidently classifies as
        # background are dropped from the ODM targets (their mask is set to 0).
        group = nd.Custom(arm_cls_preds, odm_cls_target, odm_loc_target_mask,
                          op_type='negative_filtering')
        odm_cls_target = group[0]
        odm_loc_target_mask = group[1]
        odm_cls_prob = nd.SoftmaxActivation(odm_cls_preds, mode='channel')
        out = MultiBoxDetection(odm_cls_prob, odm_loc_preds, odm_anchor_boxes,
                                force_suppress=True, clip=False,
                                nms_threshold=.5, nms_topk=400)
        if outs is None:
            outs = out
            labels = label
        else:
            outs = nd.concat(outs, out, dim=0)
            labels = nd.concat(labels, label, dim=0)
        box_metric.update([odm_loc_target],
                          [odm_loc_preds * odm_loc_target_mask])
    AP = evaluate_MAP(outs, labels)
    return AP, box_metric
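# refine_anchor_generator() is defined elsewhere in this project. A hypothetical
# sketch of what it does, assuming corner-format anchors (xmin, ymin, xmax, ymax)
# and the standard SSD center-size decoding with variances (0.1, 0.1, 0.2, 0.2):
# the ARM location predictions are applied to the original anchors to produce the
# refined anchors that the ODM then regresses from. Function name and variances
# are assumptions for illustration.
from mxnet import nd

def refine_anchor_generator_sketch(arm_anchor_boxes, arm_loc_preds,
                                   variances=(0.1, 0.1, 0.2, 0.2)):
    # arm_anchor_boxes: (1, num_anchors, 4); arm_loc_preds: (batch, num_anchors * 4)
    al, at, ar, ab = nd.split(arm_anchor_boxes, num_outputs=4, axis=2)
    aw, ah = ar - al, ab - at
    ax, ay = (al + ar) / 2, (at + ab) / 2
    preds = arm_loc_preds.reshape((0, -1, 4))
    dx, dy, dw, dh = nd.split(preds, num_outputs=4, axis=2)
    # Shift the anchor centers and rescale the anchor sizes by the predicted offsets.
    cx = dx * variances[0] * aw + ax
    cy = dy * variances[1] * ah + ay
    w = nd.exp(dw * variances[2]) * aw
    h = nd.exp(dh * variances[3]) * ah
    # Return corner-format refined anchors: (batch, num_anchors, 4).
    return nd.concat(cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2, dim=2)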
def mytrain(net, num_classes, train_data, valid_data, ctx, start_epoch, end_epoch,
            arm_cls_loss=arm_cls_loss, cls_loss=cls_loss, box_loss=box_loss,
            trainer=None):
    if trainer is None:
        # trainer = gluon.Trainer(net.collect_params(), 'sgd',
        #                         {'learning_rate': 0.01, 'momentum': 0.9, 'wd': 50.0})
        trainer = gluon.Trainer(net.collect_params(), 'adam', {
            'learning_rate': 0.001,
            'clip_gradient': 2.0
        })
    box_metric = metric.MAE()
    # Collect the parameters so their gradients can be logged each epoch.
    params = net.collect_params()
    # param_names = params.keys()
    # Summary writer that logs data and flushes to file every 5 seconds.
    sw = SummaryWriter(logdir='./logs', flush_secs=5)
    global_step = 0
    for e in range(start_epoch, end_epoch):
        train_data.reset()
        valid_data.reset()
        box_metric.reset()
        tic = time.time()
        _loss = [0, 0]
        arm_loss = [0, 0]
        # if e == 6 or e == 100:
        #     trainer.set_learning_rate(trainer.learning_rate * 0.2)
        outs, labels = None, None
        for i, batch in enumerate(train_data):
            data = batch.data[0].as_in_context(ctx)
            label = batch.label[0].as_in_context(ctx)
            with autograd.record():
                # 1. Run the feature-extraction network.
                ssd_layers = net(data)
                arm_loc_preds, arm_cls_preds, arm_anchor_boxes, odm_loc_preds, odm_cls_preds = \
                    multibox_layer(ssd_layers, num_classes, sizes, ratios, normalizations)
                # 2. ARM targets.
                # 2.1 Collapse the labels to binary objectness ([-1, 0, ...]).
                label_arm = nd.Custom(label, op_type='modify_label')
                arm_tmp = MultiBoxTarget(arm_anchor_boxes, label_arm, arm_cls_preds,
                                         overlap_threshold=.5,
                                         negative_mining_ratio=3,
                                         negative_mining_thresh=.5)
                arm_loc_target = arm_tmp[0]        # box offsets
                arm_loc_target_mask = arm_tmp[1]   # box mask (0 or 1)
                arm_cls_target = arm_tmp[2]        # class target for every anchor
                # 3. ODM predictions.
                # 3.1 Refine the ARM anchors with the ARM location predictions to
                #     obtain the ODM anchors.
                odm_anchor_boxes = refine_anchor_generator(
                    arm_anchor_boxes, arm_loc_preds)  # (batch, h*w*num_anchors, 4)
                odm_anchor_boxes_bs = nd.split(data=odm_anchor_boxes, axis=0,
                                               num_outputs=label.shape[0])  # list
                # 3.2 Compute the ODM targets per image in the batch: the refined
                #     anchors differ per image, so MultiBoxTarget is called per sample.
                odm_loc_target = []
                odm_loc_target_mask = []
                odm_cls_target = []
                label_bs = nd.split(data=label, axis=0, num_outputs=label.shape[0])
                odm_cls_preds_bs = nd.split(data=odm_cls_preds, axis=0,
                                            num_outputs=label.shape[0])
                for j in range(label.shape[0]):
                    if label.shape[0] == 1:
                        # batch size 1: restore the batch dimension explicitly
                        odm_tmp = MultiBoxTarget(
                            odm_anchor_boxes_bs[j].expand_dims(axis=0),
                            label_bs[j].expand_dims(axis=0),
                            odm_cls_preds_bs[j].expand_dims(axis=0),
                            overlap_threshold=.5,
                            negative_mining_ratio=2,
                            negative_mining_thresh=.5)
                    else:
                        # batch size > 1
                        odm_tmp = MultiBoxTarget(
                            odm_anchor_boxes_bs[j], label_bs[j], odm_cls_preds_bs[j],
                            overlap_threshold=.5,
                            negative_mining_ratio=3,
                            negative_mining_thresh=.5)
                    odm_loc_target.append(odm_tmp[0])
                    odm_loc_target_mask.append(odm_tmp[1])
                    odm_cls_target.append(odm_tmp[2])
                # Concatenate the per-image targets back into batch tensors.
                odm_loc_target = nd.concat(*odm_loc_target, dim=0)
                odm_loc_target_mask = nd.concat(*odm_loc_target_mask, dim=0)
                odm_cls_target = nd.concat(*odm_cls_target, dim=0)
                # 4. Negative anchor filtering: anchors that the ARM confidently
                #    classifies as background are dropped from the ODM targets
                #    (their mask is set to 0).
                group = nd.Custom(arm_cls_preds, odm_cls_target,
                                  odm_loc_target_mask,
                                  op_type='negative_filtering')
                odm_cls_target = group[0]
                odm_loc_target_mask = group[1]
                # 5. Compute the losses.
                arm_loss_cls = arm_cls_loss(arm_cls_preds.transpose((0, 2, 1)),
                                            arm_cls_target)
                arm_loss_loc = box_loss(arm_loc_preds, arm_loc_target,
                                        arm_loc_target_mask)
                odm_loss_cls = cls_loss(odm_cls_preds.transpose((0, 2, 1)),
                                        odm_cls_target)
                odm_loss_loc = box_loss(odm_loc_preds, odm_loc_target,
                                        odm_loc_target_mask)
                # Normalize each branch by its number of positive anchors; every
                # positive anchor contributes 4 entries to the location mask.
                # loss = arm_loss_cls + arm_loss_loc + odm_loss_cls + odm_loss_loc
                loss = 1 / (nd.sum(arm_loc_target_mask, axis=1) / 4.0) * (arm_loss_cls + arm_loss_loc) + \
                       1 / (nd.sum(odm_loc_target_mask, axis=1) / 4.0) * (odm_loss_cls + odm_loss_loc)
            sw.add_scalar(tag='loss', value=loss.mean().asscalar(),
                          global_step=global_step)
            global_step += 1
            loss.backward(retain_graph=False)
            trainer.step(data.shape[0])
            _loss[0] += nd.mean(odm_loss_cls).asscalar()
            _loss[1] += nd.mean(odm_loss_loc).asscalar()
            arm_loss[0] += nd.mean(arm_loss_cls).asscalar()
            arm_loss[1] += nd.mean(arm_loss_loc).asscalar()
            odm_cls_prob = nd.SoftmaxActivation(odm_cls_preds, mode='channel')
            out = MultiBoxDetection(odm_cls_prob, odm_loc_preds, odm_anchor_boxes,
                                    force_suppress=True, clip=False,
                                    nms_threshold=.5, nms_topk=400)
            if outs is None:
                outs = out
                labels = label
            else:
                outs = nd.concat(outs, out, dim=0)
                labels = nd.concat(labels, label, dim=0)
            box_metric.update([odm_loc_target],
                              [odm_loc_preds * odm_loc_target_mask])
        print('------- epoch {} end ------'.format(e))
        train_AP = evaluate_MAP(outs, labels)
        valid_AP, val_box_metric = evaluate_acc(net, valid_data, ctx)
        info["train_ap"].append(train_AP)
        info["valid_ap"].append(valid_AP)
        info["loss"].append(_loss)
        print('odm loss: ', _loss)
        print('arm loss: ', arm_loss)
        if e == 0:
            sw.add_graph(net)
        # Log the gradient histogram of one convolution layer to check convergence.
        grads_4_2 = net.collect_params().get('vgg0_conv5_weight').grad()
        sw.add_histogram(tag='vgg0_conv5_weight', values=grads_4_2,
                         global_step=e, bins=1000)
        # net.export('./Model/RefineDet_MeterDetect')
        if (e + 1) % 5 == 0:
            print("epoch: %d time: %.2f cls loss: %.4f, reg loss: %.4f lr: %.5f" %
                  (e, time.time() - tic, _loss[0], _loss[1],
                   trainer.learning_rate))
print("train mae: %.4f AP: %.4f" % (box_metric.get()[1], train_AP)) print("valid mae: %.4f AP: %.4f" % (val_box_metric.get()[1], valid_AP)) sw.add_scalar(tag='train_AP', value=train_AP, global_step=e) sw.add_scalar(tag='valid_AP', value=valid_AP, global_step=e) sw.close() if True: info["loss"] = np.array(info["loss"]) info["cls_loss"] = info["loss"][:, 0] info["box_loss"] = info["loss"][:, 1] plt.figure(figsize=(12, 4)) plt.subplot(121) plot("train_ap") plot("valid_ap") plt.legend(loc="upper right") plt.subplot(122) plot("cls_loss") plot("box_loss") plt.legend(loc="upper right") plt.savefig('loss_curve.png')
def train(batch_size, train_data, test_data, net, trainer, ctx, num_epochs,
          lr_step_epochs=None, lr_decay=0.1, print_batches=100, load_epoch=0,
          model_prefix=None, period=1):
    """Train a network.

    required=True for the otherwise uninitialized arguments.
    Refer to mxnet/module/base_module.py fit() for loading a trained model.
    Refer to fit.py for setting up an lr scheduler.
    """
    logging.info("Start training on {}".format(ctx))
    # Load a trained model. load_epoch is the starting epoch: if training resumed
    # from a checkpoint saved at epoch N, this value is N.
    if load_epoch > 0:
        if os.path.exists(model_prefix + "-{}.params".format(load_epoch)):
            net.load_params(model_prefix + "-{}.params".format(load_epoch), ctx)
            logging.info("Resume training from epoch {}".format(load_epoch))
        else:
            print("The resume model does not exist.")
    # Parse the lr schedule; trainer.learning_rate / trainer.set_learning_rate(lr)
    # are used to decay the learning rate at the listed epochs.
    step_epochs = []
    if lr_step_epochs is not None:
        step_epochs = [int(l) for l in lr_step_epochs.split(',')]
    # Losses.
    cls_loss = FocalLoss()
    box_loss = SmoothL1Loss()
    # Evaluation.
    # Classification quality is measured with the usual accuracy. For box regression,
    # a common choice is the mean absolute error (MAE): the squared error over-weights
    # large deviations and is numerically too sensitive, so the absolute differences of
    # the predicted and ground-truth boxes in their 4 coordinates are used instead.
    cls_metric = metric.Accuracy()  # classification evaluation
    box_metric = metric.MAE()       # box prediction evaluation
    # Validation metrics.
    val_cls_metric = metric.Accuracy()
    val_box_metric = metric.MAE()
    # The CUDA implementation requires each image to have at least 3 labels;
    # pad with -1 labels if necessary (used when loading the pikachu dataset).
    # train_data.reshape(label_shape=(3, 5))
    # train_data = train_data.sync_label_shape(train_data)
    for epoch in range(load_epoch, num_epochs):
        # Decay the learning rate at the scheduled epochs.
        if epoch in step_epochs:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
            logging.info("Adjust learning rate to {} for epoch {}".format(
                trainer.learning_rate, epoch))
        train_loss, n = 0.0, 0.0
        # Reset data iterators and metrics (must be reset every epoch).
        train_data.reset()
        cls_metric.reset()
        box_metric.reset()
        val_cls_metric.reset()
        val_box_metric.reset()
        tic = time.time()
        for i, batch in enumerate(train_data):
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            with autograd.record():
                anchors, class_preds, box_preds = net(x)
                box_target, box_mask, cls_target = training_targets(
                    anchors, class_preds, y)
                # losses
                loss1 = cls_loss(class_preds, cls_target)
                loss2 = box_loss(box_preds, box_target, box_mask)
                loss = loss1 + loss2
            loss.backward()
            train_loss += sum([l.sum().asscalar() for l in loss])
            trainer.step(batch_size)
            n += batch_size
            # update metrics
            cls_metric.update([cls_target], [class_preds.transpose((0, 2, 1))])
            box_metric.update([box_target], [box_preds * box_mask])
            if print_batches and (i + 1) % print_batches == 0:
                logging.info("Epoch [%d]. Batch [%d]. Loss [%f]. Time %.1f sec" %
                             (epoch, n, train_loss / n, time.time() - tic))
        # metric.get() returns a (name, value) tuple.
        print("Train acc:", cls_metric.get(), box_metric.get())
        val_cls_metric, val_box_metric = evaluate_accuracy(
            test_data, net, ctx, val_cls_metric, val_box_metric)
        print("Val acc: ", val_cls_metric.get(), val_box_metric.get())
        # Save a checkpoint every `period` epochs.
        if (epoch + 1) % period == 0:
            net.save_params(model_prefix + "-{}.params".format(epoch + 1))
            logging.info("Saved checkpoint to {}-{}.params".format(
                model_prefix, epoch + 1))
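# The evaluate_accuracy() called above is not shown in this file. A minimal sketch
# compatible with its 5-argument call, assuming the same network outputs and the
# training_targets() helper used in train(), might look like this; it is an
# illustrative assumption, not the project's actual implementation.
def evaluate_accuracy(data_iter, net, ctx, cls_metric, box_metric):
    data_iter.reset()
    for batch in data_iter:
        x = batch.data[0].as_in_context(ctx)
        y = batch.label[0].as_in_context(ctx)
        anchors, class_preds, box_preds = net(x)
        box_target, box_mask, cls_target = training_targets(anchors, class_preds, y)
        # Accumulate classification accuracy and box MAE on the held-out data.
        cls_metric.update([cls_target], [class_preds.transpose((0, 2, 1))])
        box_metric.update([box_target], [box_preds * box_mask])
    return cls_metric, box_metric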
class SmoothL1Loss(gluon.loss.Loss):
    # Smooth L1 box-regression loss: quadratic near zero, linear for large errors;
    # masked entries do not contribute.
    def __init__(self, batch_axis=0, **kwargs):
        super(SmoothL1Loss, self).__init__(None, batch_axis, **kwargs)

    def hybrid_forward(self, F, output, label, mask):
        loss = F.smooth_l1((output - label) * mask, scalar=1.0)
        return loss.mean(self._batch_axis, exclude=True)


box_loss = SmoothL1Loss()

# Evaluation metrics
# Classification quality is measured with the usual accuracy. For box regression,
# a common choice is the mean absolute error. Linear regression normally uses the
# mean squared error, but, as with the loss above, the squared error over-weights
# large deviations and is numerically too sensitive; the MAE replaces the quadratic
# term with the absolute differences of the predicted and ground-truth boxes in
# their 4 coordinates.
cls_metric = metric.Accuracy()  # accuracy(y, y_hat) = 1/n * sum_i 1(y_hat_i == y_i)
box_metric = metric.MAE()       # MAE = sum_i |y_i - y_hat_i| / n

# Initialize the model and the trainer
# ctx = gpu(0)
ctx = cpu(0)
# The CUDA implementation requires each image to have at least 3 labels;
# pad two -1 labels for each instance.
train_data.reshape(label_shape=(3, 5))
train_data = test_data.sync_label_shape(train_data)
# num_class = 2
net = ToySSD(num_class)
net.initialize(init.Xavier(magnitude=2), ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {
    'learning_rate': 0.1,
    'wd': 5e-4
})
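# For reference, F.smooth_l1 with scalar=1.0 computes, per element,
#   0.5 * x**2     if |x| < 1
#   |x| - 0.5      otherwise,
# so large box errors grow linearly rather than quadratically, which is what the
# MAE discussion above is getting at. A tiny check of SmoothL1Loss (the values and
# shapes below are illustrative only):
from mxnet import nd

_out = nd.array([[0.2, 3.0]])
_lbl = nd.array([[0.0, 0.0]])
_msk = nd.array([[1.0, 1.0]])
print(box_loss(_out, _lbl, _msk))  # mean of [0.5*0.2**2, 3.0 - 0.5] = 1.26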
import time

import mxnet as mx
from mxnet import gluon
from mxnet import autograd
from mxnet import nd
from mxnet import init
from mxnet import metric
from mxnet.ndarray.contrib import MultiBoxPrior

import model
import readData

# Train the model
ctx = mx.gpu()

# Evaluation
cls_metric = metric.Accuracy()  # classification metric; see the API tutorial for details
box_metric = metric.MAE()       # box prediction metric (mean absolute error); see the API docs

data_shape = 256
batch_size = 2
train_data, val_data, class_names, num_class = readData.get_iterators(
    data_shape, batch_size)
train_data.reshape(label_shape=(3, 5))
# Synchronize the label shape with the validation iterator, to be safe.
train_data = val_data.sync_label_shape(train_data)

net = model.ToySSD(num_class)
net.initialize(init.Xavier(magnitude=2), ctx=ctx)
# Note that the weight decay is passed in the Trainer constructor.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {
    'learning_rate': 0.1,
    'wd': 5e-4
})
def mytrain(net, train_data, valid_data, ctx, start_epoch, end_epoch,
            cls_loss, box_loss, trainer=None):
    if trainer is None:
        # trainer = gluon.Trainer(net.collect_params(), 'sgd',
        #                         {'learning_rate': 0.01, 'momentum': 0.9, 'wd': 5e-1})
        trainer = gluon.Trainer(net.collect_params(), 'sgd', {
            'learning_rate': 0.1,
            'wd': 1e-3
        })
    box_metric = metric.MAE()
    for e in range(start_epoch, end_epoch):
        train_data.reset()
        valid_data.reset()
        box_metric.reset()
        tic = time.time()
        _loss = [0, 0]
        if e == 100 or e == 120 or e == 150 or e == 180 or e == 200:
            trainer.set_learning_rate(trainer.learning_rate * 0.2)
        outs, labels = None, None
        for i, batch in enumerate(train_data):
            data = batch.data[0].as_in_context(ctx)
            label = batch.label[0].as_in_context(ctx)
            with autograd.record():
                anchors, box_preds, cls_preds = net(data)
                # negative_mining_ratio=3.0 adds three times as many negative anchors
                # as positives to the mask so that they take part in the loss.
                box_offset, box_mask, cls_labels = MultiBoxTarget(
                    anchors, label, cls_preds.transpose(axes=(0, 2, 1)),
                    negative_mining_ratio=3.0)  # , overlap_threshold=0.75)
                loss1 = cls_loss(cls_preds, cls_labels)
                loss2 = box_loss(box_preds, box_offset, box_mask)
                loss = loss1 + loss2
            loss.backward()
            trainer.step(data.shape[0])
            _loss[0] += nd.mean(loss1).asscalar()
            _loss[1] += nd.mean(loss2).asscalar()
            cls_probs = nd.SoftmaxActivation(cls_preds.transpose((0, 2, 1)),
                                             mode='channel')
            out = MultiBoxDetection(cls_probs, box_preds, anchors,
                                    force_suppress=True, clip=False,
                                    nms_threshold=0.45)
            if outs is None:
                outs = out
                labels = label
            else:
                outs = nd.concat(outs, out, dim=0)
                labels = nd.concat(labels, label, dim=0)
            box_metric.update([box_offset], [box_preds * box_mask])
        train_AP = evaluate_MAP(outs, labels)
        valid_AP, val_box_metric = evaluate_acc(net, valid_data, ctx)
        info["train_ap"].append(train_AP)
        info["valid_ap"].append(valid_AP)
        info["loss"].append(_loss)
        if (e + 1) % 10 == 0:
            print("epoch: %d time: %.2f loss: %.4f, %.4f lr: %.5f" %
                  (e, time.time() - tic, _loss[0], _loss[1],
                   trainer.learning_rate))
            print("train mae: %.4f AP: %.4f" % (box_metric.get()[1], train_AP))
            print("valid mae: %.4f AP: %.4f" % (val_box_metric.get()[1], valid_AP))
    if True:
        info["loss"] = np.array(info["loss"])
        info["cls_loss"] = info["loss"][:, 0]
        info["box_loss"] = info["loss"][:, 1]
        plt.figure(figsize=(12, 4))
        plt.subplot(121)
        plot("train_ap")
        plot("valid_ap")
        plt.legend(loc="upper right")
        plt.subplot(122)
        plot("cls_loss")
        plot("box_loss")
        plt.legend(loc="upper right")
        plt.savefig('loss_curve.png')
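# A hypothetical invocation of mytrain(), assuming the network, data iterators,
# ctx and the global info/plot helpers used above are already set up elsewhere in
# the project; every name below is illustrative, and the epoch count is arbitrary.
if __name__ == '__main__':
    info = {"train_ap": [], "valid_ap": [], "loss": []}
    cls_loss = FocalLoss()
    box_loss = SmoothL1Loss()
    mytrain(net, train_data, valid_data, ctx,
            start_epoch=0, end_epoch=240,
            cls_loss=cls_loss, box_loss=box_loss)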