def test_single_method():
    N = 1000
    constant = LRScheduler('constant', base_lr=0, target_lr=1, niters=N)
    linear = LRScheduler('linear', base_lr=1, target_lr=2, niters=N)
    cosine = LRScheduler('cosine', base_lr=3, target_lr=1, niters=N)
    poly = LRScheduler('poly', base_lr=1, target_lr=0, niters=N, power=2)
    step = LRScheduler('step', base_lr=1, target_lr=0, niters=N,
                       step_iter=[100, 500], step_factor=0.1)
    step2 = LRScheduler('step', base_lr=1, target_lr=0, nepochs=2,
                        iters_per_epoch=N / 2, step_iter=[100, 500], step_factor=0.1)
    step3 = LRScheduler('step', base_lr=1, target_lr=0, nepochs=100,
                        iters_per_epoch=N / 100, step_epoch=[10, 50], step_factor=0.1)

    # Test numerical value
    for i in range(N):
        compare(constant, i, 0)

        expect_linear = 2 + (1 - 2) * (1 - i / (N - 1))
        compare(linear, i, expect_linear)

        expect_cosine = 1 + (3 - 1) * ((1 + cos(pi * i / (N - 1))) / 2)
        compare(cosine, i, expect_cosine)

        expect_poly = 0 + (1 - 0) * (pow(1 - i / (N - 1), 2))
        compare(poly, i, expect_poly)

        if i < 100:
            expect_step = 1
        elif i < 500:
            expect_step = 0.1
        else:
            expect_step = 0.01
        compare(step, i, expect_step)
        compare(step2, i, expect_step)
        compare(step3, i, expect_step)

    # Test out-of-range updates
    for i in range(10):
        constant.update(i - 3)
        linear.update(i - 3)
        cosine.update(i - 3)
        poly.update(i - 3)
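# For reference, the closed-form schedules the test above encodes can be
# written as standalone functions. A minimal sketch; the lr_* names are
# illustrative helpers, not part of the tested API.
from math import cos, pi

def lr_linear(base_lr, target_lr, i, niters):
    # Linear interpolation from base_lr (at i = 0) to target_lr (at i = niters - 1).
    return target_lr + (base_lr - target_lr) * (1 - i / (niters - 1))

def lr_cosine(base_lr, target_lr, i, niters):
    # Half-cosine decay from base_lr down to target_lr.
    return target_lr + (base_lr - target_lr) * (1 + cos(pi * i / (niters - 1))) / 2

def lr_poly(base_lr, target_lr, i, niters, power=2):
    # Polynomial decay; power=1 reduces to the linear schedule.
    return target_lr + (base_lr - target_lr) * (1 - i / (niters - 1)) ** power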
class Trainer(object):
    def __init__(self, flag, batch_size, use_global_stats=True, checkpoint_interval=5,
                 epochs=50, learning_rate=1.e-4, momentum=0.9, weight_decay=1.e-4,
                 train_OS=16, train_split='train_aug', val_split='val', resume=None,
                 test_batch_size=None,
                 data_root=os.path.expanduser('~/.mxnet/datasets/voc'),
                 num_workers=4):
        if test_batch_size is None:
            test_batch_size = batch_size
        self.running_flag = flag
        self.checkpoint_interval = checkpoint_interval

        # dataset and dataloader
        train_dataset = VOCAugSegmentation(root=data_root, split=train_split)
        val_dataset = VOCAugSegmentation(root=data_root, split=val_split)
        self.train_data = gluon.data.DataLoader(train_dataset, batch_size,
                                                shuffle=True, last_batch='rollover',
                                                num_workers=num_workers)
        self.eval_data = gluon.data.DataLoader(val_dataset, test_batch_size,
                                               last_batch='keep',
                                               num_workers=num_workers)

        # create network
        model = DeepLabv3p(OS=train_OS, classes=21, use_global_stats=use_global_stats)
        self.net = model
        print(model)

        # resume checkpoint if needed
        if resume is not None:
            if os.path.isfile(resume):
                model.load_params(resume, ctx=mx.gpu())
            else:
                raise RuntimeError("=> no checkpoint found at '{}'".format(resume))
        else:
            model.initialize(ctx=mx.gpu())

        # create criterion
        self.criterion = SoftmaxCrossEntropyLoss()

        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=learning_rate,
                                        niters=len(self.train_data), nepochs=epochs)
        self.optimizer = gluon.Trainer(self.net.collect_params(), 'sgd',
                                       {'lr_scheduler': self.lr_scheduler,
                                        'wd': weight_decay,
                                        'momentum': momentum,
                                        'multi_precision': True})

    def training(self, epoch):
        tbar = tqdm(self.train_data)
        train_loss = 0.
        for i, (data, target) in enumerate(tbar):
            data = data.copyto(mx.gpu())
            target = target.copyto(mx.gpu())
            self.lr_scheduler.update(i, epoch)
            with autograd.record(True):
                outputs = self.net(data)
                losses = self.criterion(outputs, target)
                loss = losses.mean()
                mx.nd.waitall()
            loss.backward()
            self.optimizer.step(batch_size=1)
            train_loss += loss.asscalar()
            tbar.set_description('Epoch %d, training loss %.3f' %
                                 (epoch, train_loss / (i + 1)))
        mx.nd.waitall()

    def validation(self, epoch, train=False):
        if train:
            loader = self.train_data
            flag = 'train'
        else:
            loader = self.eval_data
            flag = 'val'
        tbar = tqdm(loader)
        total_inter, total_union, total_correct, total_label = (0,) * 4
        for i, (x, y) in enumerate(tbar):
            x = x.copyto(mx.gpu())
            y = y.copyto(mx.gpu())
            pred = self.net(x)
            correct, labeled = batch_pix_accuracy(output=pred, target=y)
            inter, union = batch_intersection_union(output=pred, target=y, nclass=21)
            total_correct += correct.astype('int64')
            total_label += labeled.astype('int64')
            total_inter += inter.astype('int64')
            total_union += union.astype('int64')
            pix_acc = np.float64(1.0) * total_correct / (
                np.spacing(1, dtype=np.float64) + total_label)
            IoU = np.float64(1.0) * total_inter / (
                np.spacing(1, dtype=np.float64) + total_union)
            mIoU = IoU.mean()
            tbar.set_description('%s - Epoch %s, pix_acc: %.4f, mIoU: %.4f' %
                                 (flag, epoch, pix_acc, mIoU))
        mx.nd.waitall()
        return pix_acc, mIoU

    def save_checkpoint(self, epoch, is_best=False):
        save_checkpoint(self.running_flag, self.net, epoch,
                        self.checkpoint_interval, is_best)
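# The wiring pattern shared by these trainers: build an LRScheduler, hand it
# to gluon.Trainer via the 'lr_scheduler' option, and call update(i, epoch)
# once per iteration so the scheduler tracks the global step. A minimal
# sketch of that loop, assuming the older gluoncv.utils.LRScheduler signature
# used in these scripts; net, loss_fn, and loader are placeholder names.
import mxnet as mx
from mxnet import autograd, gluon
from gluoncv.utils import LRScheduler

def fit(net, loss_fn, loader, epochs=50, base_lr=1e-4):
    lr_scheduler = LRScheduler(mode='poly', baselr=base_lr,
                               niters=len(loader), nepochs=epochs)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'lr_scheduler': lr_scheduler,
                             'momentum': 0.9, 'wd': 1e-4})
    for epoch in range(epochs):
        for i, (data, target) in enumerate(loader):
            lr_scheduler.update(i, epoch)  # advance the schedule before the step
            with autograd.record():
                loss = loss_fn(net(data), target)
            loss.backward()
            trainer.step(data.shape[0])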
class Trainer(object):
    def __init__(self, flag, batch_size, use_global_stats=True, checkpoint_interval=5,
                 epochs=50, learning_rate=1.e-4, momentum=0.9, weight_decay=4.e-5,
                 train_OS=16, train_split='train_aug', val_split='val', resume=None,
                 test_batch_size=None,
                 data_root=os.path.expanduser('~/.mxnet/datasets/voc'),
                 ctx=[mx.gpu()], norm_layer=gluon.nn.BatchNorm, num_workers=4):
        if test_batch_size is None:
            test_batch_size = batch_size
        self.running_flag = flag
        self.checkpoint_interval = checkpoint_interval
        self.batch_size = batch_size

        # dataset and dataloader
        train_dataset = VOCAugSegmentation(root=data_root, split=train_split)
        val_dataset = VOCAugSegmentation(root=data_root, split=val_split)
        self.train_data = gluon.data.DataLoader(train_dataset, batch_size,
                                                shuffle=True, last_batch='rollover',
                                                num_workers=num_workers)
        self.eval_data = gluon.data.DataLoader(val_dataset, test_batch_size,
                                               last_batch='keep',
                                               num_workers=num_workers)

        # create network
        model = DeepLabv3p(OS=train_OS, classes=21,
                           use_global_stats=use_global_stats, norm_layer=norm_layer)
        print(model)

        # resume checkpoint if needed
        if resume is not None:
            if os.path.isfile(resume):
                model.load_parameters(resume, ctx=ctx)
            else:
                raise RuntimeError("=> no checkpoint found at '{}'".format(resume))
        else:
            model.initialize(ctx=ctx)
        self.net = DataParallelModel(model, ctx, sync=True)
        self.evaluator = DataParallelModel(SegEvalModel(model), ctx)

        # create criterion
        self.criterion = DataParallelCriterion(SoftmaxCrossEntropyLoss(), ctx,
                                               sync=True)

        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=learning_rate,
                                        niters=len(self.train_data), nepochs=epochs)
        self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                       {'lr_scheduler': self.lr_scheduler,
                                        'wd': weight_decay,
                                        'momentum': momentum,
                                        'multi_precision': True})

    def training(self, epoch):
        tbar = tqdm(self.train_data)
        train_loss = 0.
        for i, (data, target) in enumerate(tbar):
            self.lr_scheduler.update(i, epoch)
            with autograd.record(train_mode=True):
                outputs = self.net(data)
                losses = self.criterion(outputs, target)
                mx.nd.waitall()
            autograd.backward(losses)
            for loss in losses:
                train_loss += loss.asnumpy()[0] / len(losses)
            self.optimizer.step(batch_size=self.batch_size)
            tbar.set_description('Epoch %d, training loss %.3f' %
                                 (epoch, train_loss / (i + 1)))
        mx.nd.waitall()

    def validation(self, epoch, train=False):
        if train:
            loader = self.train_data
            flag = 'train'
        else:
            loader = self.eval_data
            flag = 'val'
        tbar = tqdm(loader)
        total_inter, total_union, total_correct, total_label = (0,) * 4
        for i, (x, y) in enumerate(tbar):
            outputs = self.evaluator(x, y)
            for (correct, labeled, inter, union) in outputs:
                total_correct += correct
                total_label += labeled
                total_inter += inter
                total_union += union
            pixAcc = 1.0 * total_correct / (np.spacing(1) + total_label)
            IoU = 1.0 * total_inter / (np.spacing(1) + total_union)
            mIoU = IoU.mean()
            tbar.set_description('%s - Epoch %s, validation pixAcc: %.4f, mIoU: %.4f' %
                                 (flag, epoch, pixAcc, mIoU))
        mx.nd.waitall()
        return pixAcc, mIoU

    def save_checkpoint(self, epoch, is_best=False):
        save_checkpoint(self.running_flag, self.net.module, epoch,
                        self.checkpoint_interval, is_best)
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(range(args.lr_decay_period, args.epochs,
                                    args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_decay_epoch,
                               step_factor=args.lr_decay,
                               power=2,
                               warmup_epochs=args.warmup_epochs)

    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'wd': args.wd, 'momentum': args.momentum,
                             'lr_scheduler': lr_scheduler},
                            kvstore='local')

    # targets
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx,
                                                        batch_axis=0)
                             for it in range(1, 6)]
            gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx,
                                                  batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                name4, loss4 = cls_metrics.get()
                logger.info('[Epoch {}][Batch {}], LR: {:.2E}, '
                            'Speed: {:.3f} samples/sec, '
                            '{}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                                epoch, i, trainer.learning_rate,
                                batch_size / (time.time() - btic),
                                name1, loss1, name2, loss2, name3, loss3,
                                name4, loss4))
            btic = time.time()

        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        name4, loss4 = cls_metrics.get()
        logger.info('[Epoch {}] Training cost: {:.3f}, '
                    '{}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                        epoch, (time.time() - tic),
                        name1, loss1, name2, loss2, name3, loss3, name4, loss4))
        if not (epoch + 1) % args.val_interval:
            # consider reducing the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(['{}={}'.format(k, v)
                                 for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval,
                    args.save_prefix)
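# The 'step' mode used above multiplies the base learning rate by
# step_factor at every epoch listed in lr_decay_epoch, after any warmup.
# A sketch of plain step decay under that assumption; step_lr is an
# illustrative name, not the library implementation.
def step_lr(epoch, base_lr, decay_epochs, step_factor=0.1):
    # Apply one factor for every decay boundary already passed.
    n_decays = sum(1 for e in decay_epochs if epoch >= e)
    return base_lr * step_factor ** n_decays

# e.g. with base_lr=0.001 and decay_epochs=[160, 180]:
# epochs 0-159 -> 1e-3, 160-179 -> 1e-4, 180+ -> 1e-5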
def train(net, train_data, val_data, eval_metric, polygon_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',')
                       if ls.strip()])
    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_steps,
                               step_factor=lr_decay,
                               power=2,
                               warmup_epochs=args.warmup_epochs)

    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': args.lr, 'wd': args.wd,
                             'momentum': args.momentum,
                             'lr_scheduler': lr_scheduler})

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('cls_loss')
    smoothl1_metric = mx.metric.Loss('box_loss')
    coef_center_metric = mx.metric.Loss('center_loss')
    coef_metric = mx.metric.Loss('coef_loss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        # Manual step decay, superseded by the LRScheduler above:
        # while lr_steps and epoch >= lr_steps[0]:
        #     new_lr = trainer.learning_rate * lr_decay
        #     lr_steps.pop(0)
        #     trainer.set_learning_rate(new_lr)
        #     logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        coef_metric.reset()
        coef_center_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                                     batch_axis=0)
            box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx,
                                                     batch_axis=0)
            coef_center_targets = gluon.utils.split_and_load(batch[3], ctx_list=ctx,
                                                             batch_axis=0)
            coef_targets = gluon.utils.split_and_load(batch[4], ctx_list=ctx,
                                                      batch_axis=0)
            with autograd.record():
                cls_preds = []
                box_preds = []
                coef_preds = []
                coef_center_preds = []
                for x in data:
                    cls_pred, box_pred, _, coef_center_pred, coef_pred = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                    coef_preds.append(coef_pred)
                    coef_center_preds.append(coef_center_pred)
                sum_loss, cls_loss, box_loss, coef_center_loss, coef_loss = mbox_loss(
                    cls_preds, box_preds, coef_center_preds, coef_preds,
                    cls_targets, box_targets, coef_center_targets, coef_targets)
                autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to
            # normalize by batch size anymore
            trainer.step(1)
            lr_scheduler.update(i, epoch)
            coef_center_metric.update(0, [l * batch_size for l in coef_center_loss])
            coef_metric.update(0, [l * batch_size for l in coef_loss])
            ce_metric.update(0, [l * batch_size for l in cls_loss])
            smoothl1_metric.update(0, [l * batch_size for l in box_loss])
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = ce_metric.get()
                name2, loss2 = smoothl1_metric.get()
                name3, loss3 = coef_center_metric.get()
                name4, loss4 = coef_metric.get()
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, '
                            'LR: {}, {}={:.3f}, {}={:.3f}, {}={:.3f}, '
                            '{}={:.3f}'.format(
                                epoch, i, batch_size / (time.time() - btic),
                                trainer.learning_rate,
                                name1, loss1, name2, loss2, name3, loss3,
                                name4, loss4))
            btic = time.time()

        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        name3, loss3 = coef_center_metric.get()
        name4, loss4 = coef_metric.get()
        logger.info('[Epoch {}] Training cost: {:.3f}, '
                    '{}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                        epoch, (time.time() - tic),
                        name1, loss1, name2, loss2, name3, loss3, name4, loss4))
        if (epoch + 1) % args.val_interval == 0:
            # consider reducing the frequency of validation to save time
            map_bbox, map_polygon = validate(net, val_data, ctx, eval_metric,
                                             polygon_metric)
            map_name, mean_ap = map_bbox
            polygonmap_name, polygonmean_ap = map_polygon
            val_msg = '\n'.join(['{}={}'.format(k, v)
                                 for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            polygonval_msg = '\n'.join(['{}={}'.format(k, v)
                                        for k, v in zip(polygonmap_name,
                                                        polygonmean_ap)])
            logger.info('[Epoch {}] PolygonValidation: \n{}'.format(
                epoch, polygonval_msg))
            current_map = float(polygonmean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval,
                    args.save_prefix)
class Trainer(object):
    def __init__(self, args):
        self.args = args
        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        data_kwargs = {'transform': input_transform, 'base_size': args.base_size,
                       'crop_size': args.crop_size}
        trainset = get_segmentation_dataset(
            args.dataset, split=args.train_split, mode='train', **data_kwargs)
        valset = get_segmentation_dataset(
            args.dataset, split='val', mode='val', **data_kwargs)
        self.train_data = gluon.data.DataLoader(
            trainset, args.batch_size, shuffle=True, last_batch='rollover',
            num_workers=args.workers)
        self.eval_data = gluon.data.DataLoader(
            valset, args.test_batch_size, last_batch='rollover',
            num_workers=args.workers)
        # create network
        if args.model_zoo is not None:
            model = get_model(args.model_zoo, pretrained=True)
        else:
            model = get_segmentation_model(model=args.model, dataset=args.dataset,
                                           backbone=args.backbone,
                                           norm_layer=args.norm_layer,
                                           norm_kwargs=args.norm_kwargs,
                                           aux=args.aux, crop_size=args.crop_size)
        model.cast(args.dtype)
        print(model)
        self.net = DataParallelModel(model, args.ctx, args.syncbn)
        self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
        # resume checkpoint if needed
        if args.resume is not None:
            if os.path.isfile(args.resume):
                model.load_parameters(args.resume, ctx=args.ctx)
            else:
                raise RuntimeError(
                    "=> no checkpoint found at '{}'".format(args.resume))
        # create criterion
        criterion = MixSoftmaxCrossEntropyLoss(args.aux, aux_weight=args.aux_weight)
        self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn)
        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr,
                                        niters=len(self.train_data),
                                        nepochs=args.epochs)
        kv = mx.kv.create(args.kvstore)
        optimizer_params = {'lr_scheduler': self.lr_scheduler,
                            'wd': args.weight_decay,
                            'momentum': args.momentum}
        if args.dtype == 'float16':
            optimizer_params['multi_precision'] = True
        if args.no_wd:
            for k, v in self.net.module.collect_params(
                    '.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0
        self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                       optimizer_params, kvstore=kv)
        # evaluation metrics
        self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class)

    def training(self, epoch):
        tbar = tqdm(self.train_data)
        train_loss = 0.0
        alpha = 0.2
        for i, (data, target) in enumerate(tbar):
            self.lr_scheduler.update(i, epoch)
            with autograd.record(True):
                outputs = self.net(data.astype(args.dtype, copy=False))
                losses = self.criterion(outputs, target)
                mx.nd.waitall()
            autograd.backward(losses)
            self.optimizer.step(self.args.batch_size)
            for loss in losses:
                train_loss += loss.asnumpy()[0] / len(losses)
            tbar.set_description('Epoch %d, training loss %.3f' %
                                 (epoch, train_loss / (i + 1)))
        mx.nd.waitall()
        # save every epoch
        save_checkpoint(self.net.module, self.args, False)

    def validation(self, epoch):
        self.metric.reset()
        tbar = tqdm(self.eval_data)
        for i, (data, target) in enumerate(tbar):
            outputs = self.evaluator(data.astype(args.dtype, copy=False))
            outputs = [x[0] for x in outputs]
            targets = mx.gluon.utils.split_and_load(target, args.ctx,
                                                    even_split=False)
            self.metric.update(targets, outputs)
            pixAcc, mIoU = self.metric.get()
            tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f' %
                                 (epoch, pixAcc, mIoU))
        mx.nd.waitall()
label = gluon.utils.split_and_load(batch[1], ctx_list=[context], batch_axis=0)
weight = gluon.utils.split_and_load(batch[2], ctx_list=[context], batch_axis=0)

with ag.record():
    outputs = [net(X) for X in data]
    loss = [L(yhat, y, w) for yhat, y, w in zip(outputs, label, weight)]
for l in loss:
    l.backward()
lr_scheduler.update(i, epoch)
trainer.step(batch_size)

metric.update(label, outputs)

break

#############################################################################
# Due to resource limitations, we only train the model for one batch in this
# tutorial (hence the ``break`` above).
#
# Please check out the full :download:`training script
# <../../../scripts/pose/simple_pose/train_simple_pose.py>`
# to reproduce our results.
#
# References
# ----------
#
class Trainer(object):
    def __init__(self, args):
        self.args = args
        self.two_model = False
        self.semi = False
        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            # transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
            # transforms.Normalize([0, 0, 0], [1, 1, 1]),
            # transforms.Normalize([0], [1]),  # for 1-channel input: ([0], [1]) or ([556.703], [482.175])
        ])
        # dataset and dataloader
        data_kwargs = {'transform': input_transform, 'base_size': args.base_size,
                       'crop_size': args.crop_size}
        trainset = get_segmentation_dataset(args.dataset, split=args.train_split,
                                            mode='train', **data_kwargs)
        valset = get_segmentation_dataset(args.dataset, split='val', mode='val',
                                          **data_kwargs)
        self.train_data = gluon.data.DataLoader(trainset, args.batch_size,
                                                shuffle=True, last_batch='rollover',
                                                num_workers=args.workers)
        self.eval_data = gluon.data.DataLoader(
            valset,
            args.batch_size,  # was args.test_batch_size [horse changed this]
            last_batch='rollover',
            num_workers=args.workers)
        # create network
        if args.model_zoo is not None:
            print('get model from the zoo.')
            model = get_model(args.model_zoo, pretrained=True)
            if self.two_model:
                # second, identical model
                self.model2 = get_model(args.model_zoo, pretrained=True)
        else:
            print('create model.')
            model = get_segmentation_model(model=args.model, dataset=args.dataset,
                                           backbone=args.backbone,
                                           norm_layer=args.norm_layer,
                                           norm_kwargs=args.norm_kwargs,
                                           aux=args.aux, crop_size=args.crop_size,
                                           pretrained=False)
            if self.two_model:
                self.model2 = get_segmentation_model(model=args.model,
                                                     dataset=args.dataset,
                                                     backbone=args.backbone,
                                                     norm_layer=args.norm_layer,
                                                     norm_kwargs=args.norm_kwargs,
                                                     aux=args.aux,
                                                     crop_size=args.crop_size,
                                                     pretrained=False)
        model.cast(args.dtype)
        if self.two_model:
            self.model2.cast(args.dtype)
        # model.initialize()
        # horse ref: https://discuss.mxnet.io/t/object-detection-transfer-learning/2477/2
        self.net = DataParallelModel(model, args.ctx, args.syncbn)
        self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
        if self.two_model:
            self.evaluator2 = DataParallelModel(SegEvalModel(self.model2), args.ctx)
        # resume checkpoint if needed
        if args.resume is not None:
            if os.path.isfile(args.resume):
                if not horse_changed:
                    model.load_parameters(args.resume, ctx=args.ctx)
                if horse_changed:
                    model.load_parameters(args.resume, ctx=args.ctx,
                                          allow_missing=True, ignore_extra=True)
            else:
                raise RuntimeError(
                    "=> no checkpoint found at '{}'".format(args.resume))
        # create criterion
        criterion = MixSoftmaxCrossEntropyLoss(args.aux, aux_weight=args.aux_weight)
        self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn)
        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr,
                                        niters=len(self.train_data),
                                        nepochs=args.epochs)
        kv = mx.kv.create(args.kvstore)
        optimizer_params = {'lr_scheduler': self.lr_scheduler,
                            'wd': args.weight_decay,
                            'momentum': args.momentum}
        if args.dtype == 'float16':
            optimizer_params['multi_precision'] = True
        if args.no_wd:
            for k, v in self.net.module.collect_params(
                    '.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0
        self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                       optimizer_params, kvstore=kv)
        # evaluation metrics
        self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class)
    def training(self, epoch):
        if self.two_model:
            self.model2.load_parameters(
                'runs/pascal_voc/deeplab/HVSMR/res50_backup.params',
                ctx=args.ctx)  # args.resume
            self.model2.cast(args.dtype)
            self.evaluator2 = DataParallelModel(SegEvalModel(self.model2), args.ctx)

        if horse_changed:
            print('>>> start training.')  # [horse]
            tbar = tqdm(self.train_data)
            train_loss = 0.0
            alpha = 0.2
            for i, (data, target) in enumerate(tbar):
                self.lr_scheduler.update(i, epoch)
                with autograd.record(True):
                    global print_shape
                    if print_shape:
                        # one-time per-channel statistics of the first sample
                        print('>>> data of one batch:')
                        print(data.shape, target.shape)  # horse
                        for ii in range(data.shape[1]):
                            one_sample = data[0, ii, :, :].asnumpy()
                            s_mean = np.mean(one_sample.flatten())
                            s_std = np.std(one_sample.flatten())
                            s_min = min(one_sample.flatten())
                            s_max = max(one_sample.flatten())
                            print('dim | mean | std | min | max',
                                  ii, s_mean, s_std, s_min, s_max)
                        print_shape = False
                    outputs = self.net(data.astype(args.dtype, copy=False))
                    # outputs: 2 x (14, 3, 250, 250); target: (14, 250, 250)
                    _outputs = outputs
                    # merge (batch_size, NUM_SEQ) into one axis,
                    # i.e. (batch_size * NUM_SEQ, 250, 250)
                    _target = mx.ndarray.reshape(target, shape=(-3, -2))
                    losses = self.criterion(_outputs, _target)
                    mx.nd.waitall()
                autograd.backward(losses)
                self.optimizer.step(self.args.batch_size)
                for loss in losses:
                    train_loss += loss.asnumpy()[0] / len(losses)
                tbar.set_description('Epoch %d, training loss %.3f' %
                                     (epoch, train_loss / (i + 1)))
            mx.nd.waitall()
            # save every epoch
            save_checkpoint(self.net.module, self.args, False)

        if not horse_changed:
            tbar = tqdm(self.train_data)
            train_loss = 0.0
            alpha = 0.2
            for i, (data, target) in enumerate(tbar):
                self.lr_scheduler.update(i, epoch)
                with autograd.record(True):
                    outputs = self.net(data.astype(args.dtype, copy=False))
                    # target: (4, 480, 480)
                    if self.semi:
                        # generate pseudo-labels for samples whose target is empty
                        pos = np.where(
                            np.array([i.sum() for i in target.asnumpy()]) == 0)[0]
                        if len(pos) != 0:
                            data2 = data[pos, :, :, :]
                            _outputs = self.evaluator2(
                                data2.astype(args.dtype, copy=False))
                            _outputs = [x[0] for x in _outputs]
                            label_generated = np.zeros(
                                (len(pos), target.shape[1], target.shape[2]))
                            for k in range(len(pos)):
                                label_slice = labeler_random(
                                    _outputs[0].asnumpy()[k, 0:3, :, :],
                                    crop_size=target.shape[1],
                                    prob_cut=0.46)
                                label_generated[k, :, :] = label_slice
                            target[pos, :, :] = mx.nd.array(label_generated)
                    losses = self.criterion(outputs, target)
                    mx.nd.waitall()
                autograd.backward(losses)
                self.optimizer.step(self.args.batch_size)
                for loss in losses:
                    train_loss += loss.asnumpy()[0] / len(losses)
                tbar.set_description('Epoch %d, training loss %.3f' %
                                     (epoch, train_loss / (i + 1)))
            mx.nd.waitall()
            # save every epoch
            save_checkpoint(self.net.module, self.args, False)

    def validation(self, epoch):
        if not horse_changed:
            output_to_see = False  # [horse added]
            output_score_map = False  # [horse added]
            self.metric.reset()
            tbar = tqdm(self.eval_data)
            output_index = 0  # [horse added]
            for i, (data, target) in enumerate(tbar):
                outputs = self.evaluator(data.astype(args.dtype, copy=False))
                outputs = [x[0] for x in outputs]
                targets = mx.gluon.utils.split_and_load(target, args.ctx,
                                                        even_split=False)
                if output_to_see:
                    # outputs: 1 x (18, 3, 250, 250); targets: 1 x (18, 250, 250)
                    output_prefix = 'outdir_tosee'
                    if not os.path.exists(output_prefix):
                        os.makedirs(output_prefix)
                    batch_size = self.args.batch_size
                    crop_size = self.args.crop_size
                    for sample in range(batch_size):
                        path = os.path.join(output_prefix,
                                            str(output_index) + '.png')
                        mx2img(outputs[0][sample, :, :, :], path)
                        output_index += 1
                if output_score_map:
                    score_map_dir = 'scoredir_tosee'  # args.scoredir
                    if not os.path.exists(score_map_dir):
                        os.makedirs(score_map_dir)
                    batch_size = self.args.batch_size
                    for sample in range(batch_size):
                        score_map_path = os.path.join(score_map_dir,
                                                      str(output_index) + '.pkl')
                        with open(score_map_path, 'wb') as fo:
                            pickle.dump(outputs[0].asnumpy()[sample, 0:3, :, :], fo)
                        output_index += 1
                self.metric.update(targets, outputs)
                pixAcc, mIoU, dice = self.metric.get()  # [horse changed]
                tbar.set_description(
                    'Epoch %d, validation pixAcc: %.3f, mIoU: %.3f, '
                    'dice: %.3f, %.3f, %.3f' %
                    (epoch, pixAcc, mIoU, dice[0], dice[1], dice[2]))
            mx.nd.waitall()

        if horse_changed:
            output_to_see = True
            self.metric.reset()
            tbar = tqdm(self.eval_data)
            output_index = 0
            for i, (data, target) in enumerate(tbar):
                outputs = self.evaluator(data.astype(args.dtype, copy=False))
                outputs = [x[0] for x in outputs]
                _target = mx.ndarray.reshape(target, shape=(-3, -2))
                targets = mx.gluon.utils.split_and_load(_target, args.ctx,
                                                        even_split=False)
                if output_to_see:
                    output_prefix = 'outdir_seq'
                    batch_size = self.args.batch_size
                    crop_size = self.args.crop_size
                    NUM_SEQ = int(outputs[0].shape[0] / batch_size)
                    # 3 is the number of classes, not image channels
                    outputs_out = mx.ndarray.reshape(
                        outputs[0],
                        shape=(batch_size, NUM_SEQ, 3, crop_size, crop_size))
                    targets_out = mx.ndarray.reshape(
                        targets[0],
                        shape=(batch_size, NUM_SEQ, crop_size, crop_size))
                    for sample in range(batch_size):
                        for seq in range(NUM_SEQ):
                            path = os.path.join(
                                output_prefix,
                                str(output_index) + '_' + str(seq) + '.png')
                            path_mask = os.path.join(
                                output_prefix,
                                str(output_index) + '_gt_' + str(seq) + '.png')
                            mx2img(outputs_out[sample, seq, :, :, :], path)
                            mx2img(targets_out[sample, seq, :, :], path_mask)
                        output_index += 1
                self.metric.update(targets, outputs)
                pixAcc, mIoU, dice = self.metric.get()  # [horse changed]
                tbar.set_description(
                    'Epoch %d, validation pixAcc: %.3f, mIoU: %.3f, '
                    'dice: %.3f, %.3f, %.3f' %
                    (epoch, pixAcc, mIoU, dice[0], dice[1], dice[2]))
            mx.nd.waitall()
def train():
    epochs = 101
    lr = 0.1
    momentum = 0.9
    wd = 5e-4
    plot_period = 20

    ctx = [mx.gpu(i) for i in range(2)]
    batch_size = 256

    train_set = MNIST(train=True, transform=transform_train)
    train_data = gluon.data.DataLoader(train_set, batch_size, True,
                                       num_workers=4, last_batch='discard')
    val_set = MNIST(train=False, transform=transform_val)
    val_data = gluon.data.DataLoader(val_set, batch_size, shuffle=False,
                                     num_workers=4)

    net = MnistNet(embedding_size=2)
    net.initialize(init=mx.init.MSRAPrelu(), ctx=ctx)
    net.hybridize()

    loss = CenterLoss(10, 2, 1)
    loss.initialize(ctx=ctx)

    num_batches = len(train_set) // batch_size
    train_params = net.collect_params()
    train_params.update(loss.params)
    lr_scheduler = LRScheduler("cosine", lr, niters=num_batches, nepochs=epochs,
                               targetlr=1e-8, warmup_epochs=10, warmup_lr=0.001)
    trainer = gluon.Trainer(train_params, 'nag',
                            {'lr_scheduler': lr_scheduler, 'momentum': momentum,
                             'wd': wd})

    metric = mtc.Accuracy()
    num_batch = len(train_data)

    for epoch in range(epochs):
        plot = True if (epoch % plot_period) == 0 else False
        train_loss = 0
        metric.reset()
        tic = time.time()
        ebs, lbs = [], []

        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                              batch_axis=0, even_split=False)
            labels = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                                batch_axis=0, even_split=False)

            with ag.record():
                ots = [net(X) for X in data]
                embedds = [ot[0] for ot in ots]
                outputs = [ot[1] for ot in ots]
                losses = [loss(yhat, y, emb)
                          for yhat, y, emb in zip(outputs, labels, embedds)]

            for l in losses:
                ag.backward(l)

            if plot:
                for es, ls in zip(embedds, labels):
                    assert len(es) == len(ls)
                    for idx in range(len(es)):
                        ebs.append(es[idx].asnumpy())
                        lbs.append(ls[idx].asscalar())

            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            metric.update(labels, outputs)
            train_loss += sum([l.mean().asscalar() for l in losses]) / len(losses)

        _, train_acc = metric.get()
        train_loss /= num_batch

        val_acc, val_loss, val_ebs, val_lbs = validate(net, val_data, ctx, loss, plot)

        toc = time.time()
        print('[epoch % 3d] train accuracy: %.6f, train loss: %.6f | '
              'val accuracy: %.6f, val loss: %.6f, time: %.6f'
              % (epoch, train_acc, train_loss, val_acc, val_loss, toc - tic))

        if plot:
            ebs, lbs = np.vstack(ebs), np.hstack(lbs)
            plot_result(ebs, lbs, os.path.join(
                "../../resources", "center-train-epoch{}.png".format(epoch)))
            plot_result(val_ebs, val_lbs, os.path.join(
                "../../resources", "center-val-epoch{}.png".format(epoch)))
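# The schedule above combines a warmup phase (warmup_lr ramping up to the
# base lr over warmup_epochs) with cosine decay down to targetlr. A sketch
# of that composite schedule, assuming a linear warmup; warmup_cosine_lr is
# an illustrative name, not the library implementation.
from math import cos, pi

def warmup_cosine_lr(t, total_iters, warmup_iters,
                     base_lr=0.1, warmup_lr=0.001, target_lr=1e-8):
    # t is the global iteration index: epoch * iters_per_epoch + i.
    if t < warmup_iters:
        # Linear ramp from warmup_lr up to base_lr.
        return warmup_lr + (base_lr - warmup_lr) * t / warmup_iters
    # Half-cosine decay from base_lr down to target_lr.
    frac = (t - warmup_iters) / max(1, total_iters - warmup_iters)
    return target_lr + (base_lr - target_lr) * (1 + cos(pi * frac)) / 2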
class Trainer(object):
    def __init__(self, args):
        self.args = args
        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        trainset = get_segmentation_dataset(args.dataset, split='train',
                                            transform=input_transform)
        valset = get_segmentation_dataset(args.dataset, split='val',
                                          transform=input_transform)
        self.train_data = gluon.data.DataLoader(trainset, args.batch_size,
                                                shuffle=True, last_batch='rollover',
                                                num_workers=args.workers)
        self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size,
                                               last_batch='keep',
                                               num_workers=args.workers)
        # create network
        model = get_segmentation_model(model=args.model, dataset=args.dataset,
                                       backbone=args.backbone,
                                       norm_layer=args.norm_layer,
                                       aux=args.aux, norm_kwargs=args.norm_kwargs)
        # model.hybridize(static_alloc=True, static_shape=True)
        print(model)
        self.net = DataParallelModel(model, args.ctx, args.syncbn)
        self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
        # resume checkpoint if needed
        if args.resume is not None:
            if os.path.isfile(args.resume):
                model.load_params(args.resume, ctx=args.ctx)
            else:
                raise RuntimeError(
                    "=> no checkpoint found at '{}'".format(args.resume))
        # create criterion
        criterion = SoftmaxCrossEntropyLossWithAux(args.aux)
        self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn)
        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr,
                                        niters=len(self.train_data),
                                        nepochs=args.epochs)
        kv = mx.kv.create(args.kvstore)
        self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                       {'lr_scheduler': self.lr_scheduler,
                                        'wd': args.weight_decay,
                                        'momentum': args.momentum,
                                        'multi_precision': True},
                                       kvstore=kv)

    def training(self, epoch):
        tbar = tqdm(self.train_data)
        train_loss = 0.0
        for i, (data, target) in enumerate(tbar):
            self.lr_scheduler.update(i, epoch)
            with autograd.record(True):
                outputs = self.net(data)
                losses = self.criterion(outputs, target)
                mx.nd.waitall()
            autograd.backward(losses)
            self.optimizer.step(self.args.batch_size)
            for loss in losses:
                train_loss += loss.asnumpy()[0] / len(losses)
            tbar.set_description('Epoch %d, training loss %.3f' %
                                 (epoch, train_loss / (i + 1)))
        mx.nd.waitall()
        # save every epoch
        save_checkpoint(self.net.module, self.args, False)

    def validation(self, epoch):
        total_inter, total_union, total_correct, total_label = 0, 0, 0, 0
        tbar = tqdm(self.eval_data)
        for i, (data, target) in enumerate(tbar):
            outputs = self.evaluator(data, target)
            for (correct, labeled, inter, union) in outputs:
                total_correct += correct
                total_label += labeled
                total_inter += inter
                total_union += union
            pixAcc = 1.0 * total_correct / (np.spacing(1) + total_label)
            IoU = 1.0 * total_inter / (np.spacing(1) + total_union)
            mIoU = IoU.mean()
            tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f' %
                                 (epoch, pixAcc, mIoU))
        mx.nd.waitall()
def train(net, train_data, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(range(args.lr_decay_period, args.epochs,
                                    args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_decay_epoch,
                               step_factor=args.lr_decay,
                               power=2,
                               warmup_epochs=args.warmup_epochs)

    if args.optimizer.lower() == 'adam':
        opt_name = 'adam'
        opt_param = {'wd': args.wd, 'lr_scheduler': lr_scheduler}
    elif args.optimizer.lower() == 'sgd':
        opt_name = 'sgd'
        opt_param = {'wd': args.wd, 'momentum': args.momentum,
                     'lr_scheduler': lr_scheduler}
    else:
        raise NotImplementedError(
            f'The optimizer {args.optimizer.lower()} is not implemented.')
    trainer = gluon.Trainer(net.collect_params(), opt_name, opt_param,
                            kvstore='local')

    # metrics
    obj_metrics = mx.metric.Loss('O')
    center_metrics = mx.metric.Loss('BC')
    scale_metrics = mx.metric.Loss('BS')
    cls_metrics = mx.metric.Loss('C')
    coef_metrics = mx.metric.Loss('Cf')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))

    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx,
                                                        batch_axis=0)
                             for it in range(1, 6)]
            gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx,
                                                  batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                name4, loss4 = cls_metrics.get()
                logger.info('[E {}][B {}], LR: {:.2E}, {:.1f} S/s, '
                            '{}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                                epoch, i, trainer.learning_rate,
                                batch_size / (time.time() - btic),
                                name1, loss1, name2, loss2, name3, loss3,
                                name4, loss4))
            btic = time.time()

        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        name4, loss4 = cls_metrics.get()
        logger.info('[E {}] {:.1f} sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, '
                    '{}={:.3f}'.format(epoch, (time.time() - tic),
                                       name1, loss1, name2, loss2,
                                       name3, loss3, name4, loss4))
        save_params(net, epoch, args.save_interval, args.save_prefix)
def train_net(train_epoch, ctx, batch_size, data_dir, pre_trained_model,
              output_stride, freeze_batch_norm, initial_learning_rate,
              weight_decay, base_architecture, aspp_or_vortex, resume):
    if base_architecture == 'resnet_v2_50':
        print('use resnet_v2_50')
        net = ResNet(BottleneckV2, [3, 4, 6, 3], [64, 256, 512, 1024, 2048],
                     output_stride, aspp_or_vortex)
    elif base_architecture == 'resnet_v2_101':
        print('use resnet_v2_101')
        net = ResNet(BottleneckV2, [3, 4, 23, 3], [64, 256, 512, 1024, 2048],
                     output_stride, aspp_or_vortex)

    if resume >= 0:
        print('resume for continued training')
        begin_epoch = resume + 1
        model_path = './checkpoint/deeplabv3_%s.params' % resume
        net.initialize(ctx=ctx)
        print('model_path', model_path)
        net.collect_params().load(model_path, ctx=ctx, restore_prefix='')
        if output_stride == 8:
            begin_epoch = resume + 1 - 46
        if freeze_batch_norm == 1:
            print('In the last 30K iters, freeze batch norm')
            net.collect_params(
                '.*gamma|.*beta|.*running_mean|.*running_var').setattr(
                    'grad_req', 'null')
    else:
        print('begin training')
        begin_epoch = 0
        print('before auto init')
        net.initialize(ctx=ctx)
        print('after auto init')
        net.load_params(pre_trained_model, ctx=ctx, allow_missing=True,
                        ignore_extra=True)

    loss = SoftmaxCrossEntropyLoss()
    # first 30K iters use split='train_aug'; the last 30K iters use 'trainval'.
    train_data = VOCSegDataset(root=data_dir, split='trainval')
    val_data = VOCSegDataset(root=data_dir, split='val')
    train_dataiter = gluon.data.DataLoader(train_data, batch_size=batch_size,
                                           shuffle=True, last_batch='discard')
    val_dataiter = gluon.data.DataLoader(val_data, batch_size=batch_size,
                                         last_batch='discard')
    lr_scheduler = LRScheduler(mode='poly', baselr=initial_learning_rate,
                               niters=len(train_dataiter), nepochs=train_epoch)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'lr_scheduler': lr_scheduler, 'wd': weight_decay,
                             'momentum': 0.9, 'multi_precision': True})

    for epoch in range(begin_epoch, train_epoch):
        train_loss, train_acc, meaniou, n, m = 0, 0, 0, 0, 0
        total_inter, total_union, total_correct, total_label = (0,) * 4
        iter = 0
        for i, batch in enumerate(train_dataiter):
            data, label, batch_size = _get_batch(batch, ctx)
            lr_scheduler.update(i, epoch)
            with autograd.record():
                output = [net(x) for x in data]
                losses = [loss(yhat, y) for yhat, y in zip(output, label)]
            for l in losses:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in losses])
            n += batch_size
            m += sum([y.size for y in label])

            # evaluation (use j for the inner loops so the batch index i
            # is not shadowed)
            correct, labeled = (0,) * 2
            result_pix = [batch_pix_accuracy(output_, label_)
                          for output_, label_ in zip(output, label)]
            for j in range(len(result_pix)):
                correct += result_pix[j][0]
                labeled += result_pix[j][1]
            inter, union = (0,) * 2
            result_iou = [batch_intersection_union(output_, label_, 21)
                          for output_, label_ in zip(output, label)]
            for j in range(len(result_iou)):
                inter += result_iou[j][0]
                union += result_iou[j][1]
            total_correct += correct.astype('int64')
            total_label += labeled.astype('int64')
            total_inter += inter.astype('int64')
            total_union += union.astype('int64')
            pix_acc = np.float64(1.0) * total_correct / (
                np.spacing(1, dtype=np.float64) + total_label)
            IoU = np.float64(1.0) * total_inter / (
                np.spacing(1, dtype=np.float64) + total_union)
            mIoU = IoU.mean()
            iter = iter + 1
            if iter % 10 == 0:
                print('-Epoch %s, Batch %d. Loss: %f, pix_acc: %.4f, mIoU: %.4f'
                      % (epoch, n, train_loss / n, pix_acc, mIoU))
        net.collect_params().save(
            filename='./checkpoint/deeplabv3_%s.params' % epoch)
        val_pix_acc, val_mIoU = evaluate_accuracy(val_dataiter, net, ctx)
        print('val_pix_acc: %.4f, val_mIoU: %.4f' % (val_pix_acc, val_mIoU))