def __init__(self, args): self.args = args # image transform input_transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize([.485, .456, .406], [.229, .224, .225]), ]) # dataset and dataloader data_kwargs = {'transform': input_transform, 'base_size': args.base_size, 'crop_size': args.crop_size} trainset = get_segmentation_dataset( args.dataset, split=args.train_split, mode='train', **data_kwargs) valset = get_segmentation_dataset( args.dataset, split='val', mode='val', **data_kwargs) self.train_data = gluon.data.DataLoader( trainset, args.batch_size, shuffle=True, last_batch='rollover', num_workers=args.workers) self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size, last_batch='rollover', num_workers=args.workers) # create network if args.model_zoo is not None: model = get_model(args.model_zoo, pretrained=True) else: model = get_segmentation_model(model=args.model, dataset=args.dataset, backbone=args.backbone, norm_layer=args.norm_layer, norm_kwargs=args.norm_kwargs, aux=args.aux, crop_size=args.crop_size) model.cast(args.dtype) print(model) self.net = DataParallelModel(model, args.ctx, args.syncbn) self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx) # resume checkpoint if needed if args.resume is not None: if os.path.isfile(args.resume): model.load_parameters(args.resume, ctx=args.ctx) else: raise RuntimeError("=> no checkpoint found at '{}'" \ .format(args.resume)) # create criterion criterion = MixSoftmaxCrossEntropyLoss(args.aux, aux_weight=args.aux_weight) self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn) # optimizer and lr scheduling self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr, niters=len(self.train_data), nepochs=args.epochs) kv = mx.kv.create(args.kvstore) optimizer_params = {'lr_scheduler': self.lr_scheduler, 'wd':args.weight_decay, 'momentum': args.momentum} if args.dtype == 'float16': optimizer_params['multi_precision'] = True if args.no_wd: for k, v in self.net.module.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd', optimizer_params, kvstore = kv) # evaluation metrics self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class)
def train(net, train_data, val_data, eval_metric, ctx, args): """Training pipeline""" net.collect_params().reset_ctx(ctx) if args.no_wd: for k, v in net.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 if args.label_smooth: net._target_generator._label_smooth = True if args.lr_decay_period > 0: lr_decay_epoch = list(range(args.lr_decay_period, args.epochs, args.lr_decay_period)) else: lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')] lr_scheduler = LRScheduler(mode=args.lr_mode, baselr=args.lr, niters=args.num_samples // args.batch_size, nepochs=args.epochs, step=lr_decay_epoch, step_factor=args.lr_decay, power=2, warmup_epochs=args.warmup_epochs) trainer = gluon.Trainer( net.collect_params(), 'sgd', {'wd': args.wd, 'momentum': args.momentum, 'lr_scheduler': lr_scheduler}, kvstore='local') # targets sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False) l1_loss = gluon.loss.L1Loss() # metrics obj_metrics = mx.metric.Loss('ObjLoss') center_metrics = mx.metric.Loss('BoxCenterLoss') scale_metrics = mx.metric.Loss('BoxScaleLoss') cls_metrics = mx.metric.Loss('ClassLoss') # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) log_file_path = args.save_prefix + '_train.log' log_dir = os.path.dirname(log_file_path) if log_dir and not os.path.exists(log_dir): os.makedirs(log_dir) fh = logging.FileHandler(log_file_path) logger.addHandler(fh) logger.info(args) logger.info('Start training from [Epoch {}]'.format(args.start_epoch)) best_map = [0] for epoch in range(args.start_epoch, args.epochs): if args.mixup: # TODO(zhreshold): more elegant way to control mixup during runtime try: train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5) except AttributeError: train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5) if epoch >= args.epochs - args.no_mixup_epochs: try: train_data._dataset.set_mixup(None) except AttributeError: train_data._dataset._data.set_mixup(None) tic = time.time() btic = time.time() mx.nd.waitall() net.hybridize() for i, batch in enumerate(train_data): batch_size = batch[0].shape[0] data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) # objectness, center_targets, scale_targets, weights, class_targets fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0) for it in range(1, 6)] gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0) sum_losses = [] obj_losses = [] center_losses = [] scale_losses = [] cls_losses = [] with autograd.record(): for ix, x in enumerate(data): obj_loss, center_loss, scale_loss, cls_loss = net(x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets]) sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss) obj_losses.append(obj_loss) center_losses.append(center_loss) scale_losses.append(scale_loss) cls_losses.append(cls_loss) autograd.backward(sum_losses) lr_scheduler.update(i, epoch) trainer.step(batch_size) obj_metrics.update(0, obj_losses) center_metrics.update(0, center_losses) scale_metrics.update(0, scale_losses) cls_metrics.update(0, cls_losses) if args.log_interval and not (i + 1) % args.log_interval: name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info('[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format( epoch, i, trainer.learning_rate, batch_size/(time.time()-btic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) btic = time.time() name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format( epoch, (time.time()-tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) if not (epoch + 1) % args.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = validate(net, val_data, ctx, eval_metric) val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0. save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
class Trainer(object): def __init__(self, args): self.args = args # image transform input_transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize([.485, .456, .406], [.229, .224, .225]), ]) # dataset and dataloader data_kwargs = {'transform': input_transform, 'base_size': args.base_size, 'crop_size': args.crop_size} trainset = get_segmentation_dataset( args.dataset, split=args.train_split, mode='train', **data_kwargs) valset = get_segmentation_dataset( args.dataset, split='val', mode='val', **data_kwargs) self.train_data = gluon.data.DataLoader( trainset, args.batch_size, shuffle=True, last_batch='rollover', num_workers=args.workers) self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size, last_batch='rollover', num_workers=args.workers) # create network if args.model_zoo is not None: model = get_model(args.model_zoo, pretrained=True) else: model = get_segmentation_model(model=args.model, dataset=args.dataset, backbone=args.backbone, norm_layer=args.norm_layer, norm_kwargs=args.norm_kwargs, aux=args.aux, crop_size=args.crop_size) model.cast(args.dtype) print(model) self.net = DataParallelModel(model, args.ctx, args.syncbn) self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx) # resume checkpoint if needed if args.resume is not None: if os.path.isfile(args.resume): model.load_parameters(args.resume, ctx=args.ctx) else: raise RuntimeError("=> no checkpoint found at '{}'" \ .format(args.resume)) # create criterion criterion = MixSoftmaxCrossEntropyLoss(args.aux, aux_weight=args.aux_weight) self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn) # optimizer and lr scheduling self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr, niters=len(self.train_data), nepochs=args.epochs) kv = mx.kv.create(args.kvstore) optimizer_params = {'lr_scheduler': self.lr_scheduler, 'wd':args.weight_decay, 'momentum': args.momentum} if args.dtype == 'float16': optimizer_params['multi_precision'] = True if args.no_wd: for k, v in self.net.module.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd', optimizer_params, kvstore = kv) # evaluation metrics self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class) def training(self, epoch): tbar = tqdm(self.train_data) train_loss = 0.0 alpha = 0.2 for i, (data, target) in enumerate(tbar): self.lr_scheduler.update(i, epoch) with autograd.record(True): outputs = self.net(data.astype(args.dtype, copy=False)) losses = self.criterion(outputs, target) mx.nd.waitall() autograd.backward(losses) self.optimizer.step(self.args.batch_size) for loss in losses: train_loss += loss.asnumpy()[0] / len(losses) tbar.set_description('Epoch %d, training loss %.3f'%\ (epoch, train_loss/(i+1))) mx.nd.waitall() # save every epoch save_checkpoint(self.net.module, self.args, False) def validation(self, epoch): #total_inter, total_union, total_correct, total_label = 0, 0, 0, 0 self.metric.reset() tbar = tqdm(self.eval_data) for i, (data, target) in enumerate(tbar): outputs = self.evaluator(data.astype(args.dtype, copy=False)) outputs = [x[0] for x in outputs] targets = mx.gluon.utils.split_and_load(target, args.ctx, even_split=False) self.metric.update(targets, outputs) pixAcc, mIoU = self.metric.get() tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f'%\ (epoch, pixAcc, mIoU)) mx.nd.waitall()
def train(net, train_data, val_data, eval_metric, ctx, args): """Training pipeline""" net.collect_params().reset_ctx(ctx) if args.no_wd: for k, v in net.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 if args.label_smooth: net._target_generator._label_smooth = True if args.lr_decay_period > 0: lr_decay_epoch = list( range(args.lr_decay_period, args.epochs, args.lr_decay_period)) else: lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')] lr_scheduler = LRScheduler(mode=args.lr_mode, baselr=args.lr, niters=args.num_samples // args.batch_size, nepochs=args.epochs, step=lr_decay_epoch, step_factor=args.lr_decay, power=2, warmup_epochs=args.warmup_epochs) trainer = gluon.Trainer(net.collect_params(), 'sgd', { 'wd': args.wd, 'momentum': args.momentum, 'lr_scheduler': lr_scheduler }, kvstore='local') # targets sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False) l1_loss = gluon.loss.L1Loss() # metrics obj_metrics = mx.metric.Loss('ObjLoss') center_metrics = mx.metric.Loss('BoxCenterLoss') scale_metrics = mx.metric.Loss('BoxScaleLoss') cls_metrics = mx.metric.Loss('ClassLoss') # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) log_file_path = args.save_prefix + '_train.log' log_dir = os.path.dirname(log_file_path) if log_dir and not os.path.exists(log_dir): os.makedirs(log_dir) fh = logging.FileHandler(log_file_path) logger.addHandler(fh) logger.info(args) logger.info('Start training from [Epoch {}]'.format(args.start_epoch)) best_map = [0] for epoch in range(args.start_epoch, args.epochs): if args.mixup: # TODO(zhreshold): more elegant way to control mixup during runtime try: train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5) except AttributeError: train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5) if epoch >= args.epochs - args.no_mixup_epochs: try: train_data._dataset.set_mixup(None) except AttributeError: train_data._dataset._data.set_mixup(None) tic = time.time() btic = time.time() mx.nd.waitall() net.hybridize() for i, batch in enumerate(train_data): batch_size = batch[0].shape[0] data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) # objectness, center_targets, scale_targets, weights, class_targets fixed_targets = [ gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0) for it in range(1, 6) ] gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0) sum_losses = [] obj_losses = [] center_losses = [] scale_losses = [] cls_losses = [] with autograd.record(): for ix, x in enumerate(data): obj_loss, center_loss, scale_loss, cls_loss = net( x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets]) sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss) obj_losses.append(obj_loss) center_losses.append(center_loss) scale_losses.append(scale_loss) cls_losses.append(cls_loss) autograd.backward(sum_losses) lr_scheduler.update(i, epoch) trainer.step(batch_size) obj_metrics.update(0, obj_losses) center_metrics.update(0, center_losses) scale_metrics.update(0, scale_losses) cls_metrics.update(0, cls_losses) if args.log_interval and not (i + 1) % args.log_interval: name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info( '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}' .format(epoch, i, trainer.learning_rate, batch_size / (time.time() - btic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) btic = time.time() name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info( '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}' .format(epoch, (time.time() - tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) if not (epoch + 1) % args.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = validate(net, val_data, ctx, eval_metric) val_msg = '\n'.join( ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0. save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
class Trainer(object): def __init__(self, args): self.args = args # image transform input_transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize([.485, .456, .406], [.229, .224, .225]), ]) # dataset and dataloader trainset = get_segmentation_dataset( args.dataset, split='train', transform=input_transform, crop_size=args.crop_size) valset = get_segmentation_dataset( args.dataset, split='val', transform=input_transform, crop_size=args.crop_size) self.train_data = gluon.data.DataLoader( trainset, args.batch_size, shuffle=True, last_batch='rollover', num_workers=args.workers) self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size, last_batch='rollover', num_workers=args.workers) # create network model = get_segmentation_model(model=args.model, dataset=args.dataset, height=args.crop_size, width=args.crop_size, backbone=args.backbone, norm_layer=args.norm_layer, norm_kwargs=args.norm_kwargs, aux=args.aux) model.cast(args.dtype) print(model) self.net = DataParallelModel(model, args.ctx, args.syncbn) self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx) # resume checkpoint if needed if args.resume is not None: if os.path.isfile(args.resume): model.load_parameters(args.resume, ctx=args.ctx) else: raise RuntimeError("=> no checkpoint found at '{}'" \ .format(args.resume)) # create criterion criterion = MixSoftmaxCrossEntropyLoss(args.aux, aux_weight=args.aux_weight) self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn) # optimizer and lr scheduling self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr, niters=len(self.train_data), nepochs=args.epochs) kv = mx.kv.create(args.kvstore) optimizer_params = {'lr_scheduler': self.lr_scheduler, 'wd':args.weight_decay, 'momentum': args.momentum} if args.dtype == 'float16': optimizer_params['multi_precision'] = True self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd', optimizer_params, kvstore = kv) # evaluation metrics self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class) def training(self, epoch): tbar = tqdm(self.train_data) train_loss = 0.0 alpha = 0.2 for i, (data, target) in enumerate(tbar): self.lr_scheduler.update(i, epoch) with autograd.record(True): outputs = self.net(data.astype(args.dtype, copy=False)) losses = self.criterion(outputs, target) mx.nd.waitall() autograd.backward(losses) self.optimizer.step(self.args.batch_size) for loss in losses: train_loss += loss.asnumpy()[0] / len(losses) tbar.set_description('Epoch %d, training loss %.3f'%\ (epoch, train_loss/(i+1))) mx.nd.waitall() # save every epoch save_checkpoint(self.net.module, self.args, False) def validation(self, epoch): #total_inter, total_union, total_correct, total_label = 0, 0, 0, 0 self.metric.reset() tbar = tqdm(self.eval_data) for i, (data, target) in enumerate(tbar): outputs = self.evaluator(data.astype(args.dtype, copy=False)) outputs = [x[0] for x in outputs] targets = mx.gluon.utils.split_and_load(target, args.ctx, even_split=False) self.metric.update(targets, outputs) pixAcc, mIoU = self.metric.get() tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f'%\ (epoch, pixAcc, mIoU)) mx.nd.waitall()
context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] lr_decay = opt.lr_decay lr_decay_period = opt.lr_decay_period if opt.lr_decay_period > 0: lr_decay_epoch = list( range(lr_decay_period, opt.num_epochs, lr_decay_period)) else: lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')] lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch] num_batches = num_training_samples // batch_size lr_scheduler = LRSequential([ LRScheduler('linear', base_lr=0, target_lr=opt.lr, nepochs=opt.warmup_epochs, iters_per_epoch=num_batches), LRScheduler(opt.lr_mode, base_lr=opt.lr, target_lr=0, nepochs=opt.num_epochs - opt.warmup_epochs, iters_per_epoch=num_batches, step_epoch=lr_decay_epoch, step_factor=lr_decay, power=2) ]) kwargs = {'ctx': context, 'pretrained': True} optimizer = 'SGD'
class Trainer(object): def __init__(self, args): self.args = args # image transform input_transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize([.485, .456, .406], [.229, .224, .225]), ]) # dataset and dataloader trainset = get_segmentation_dataset( args.dataset, split='train', transform=input_transform) valset = get_segmentation_dataset( args.dataset, split='val', transform=input_transform) self.train_data = gluon.data.DataLoader( trainset, args.batch_size, shuffle=True, last_batch='rollover', num_workers=args.workers) self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size, last_batch='keep', num_workers=args.workers) # create network model = get_segmentation_model(model=args.model, dataset=args.dataset, backbone=args.backbone, norm_layer=args.norm_layer, aux=args.aux, norm_kwargs=args.norm_kwargs) # model.hybridize(static_alloc=True, static_shape=True) print(model) self.net = DataParallelModel(model, args.ctx, args.syncbn) self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx) # resume checkpoint if needed if args.resume is not None: if os.path.isfile(args.resume): model.load_params(args.resume, ctx=args.ctx) else: raise RuntimeError("=> no checkpoint found at '{}'" \ .format(args.resume)) # create criterion criterion = SoftmaxCrossEntropyLossWithAux(args.aux) self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn) # optimizer and lr scheduling self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr, niters=len(self.train_data), nepochs=args.epochs) kv = mx.kv.create(args.kvstore) self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd', {'lr_scheduler': self.lr_scheduler, 'wd':args.weight_decay, 'momentum': args.momentum, 'multi_precision': True}, kvstore = kv) def training(self, epoch): tbar = tqdm(self.train_data) train_loss = 0.0 for i, (data, target) in enumerate(tbar): self.lr_scheduler.update(i, epoch) with autograd.record(True): outputs = self.net(data) losses = self.criterion(outputs, target) mx.nd.waitall() autograd.backward(losses) self.optimizer.step(self.args.batch_size) for loss in losses: train_loss += loss.asnumpy()[0] / len(losses) tbar.set_description('Epoch %d, training loss %.3f'%\ (epoch, train_loss/(i+1))) mx.nd.waitall() # save every epoch save_checkpoint(self.net.module, self.args, False) def validation(self, epoch): total_inter, total_union, total_correct, total_label = 0, 0, 0, 0 tbar = tqdm(self.eval_data) for i, (data, target) in enumerate(tbar): outputs = self.evaluator(data, target) for (correct, labeled, inter, union) in outputs: total_correct += correct total_label += labeled total_inter += inter total_union += union pixAcc = 1.0 * total_correct / (np.spacing(1) + total_label) IoU = 1.0 * total_inter / (np.spacing(1) + total_union) mIoU = IoU.mean() tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f'%\ (epoch, pixAcc, mIoU)) mx.nd.waitall()
os.makedirs(os.path.join("./models")) logger = logging.getLogger('TRAIN') logger.setLevel("INFO") logger.addHandler(logging.StreamHandler()) logger.addHandler( logging.FileHandler("./log/mob-arcloss%s.log" % datetime.strftime(datetime.now(), '%m%d_%H'))) num_batches = len(train_set) // batch_size train_params = net.collect_params() train_params.update(loss.params) lr_scheduler = LRScheduler(mode="cosine", baselr=lr, targetlr=1e-6, niters=num_batches, nepochs=epochs, warmup_lr=lr / 10, warmup_epochs=warmup_epochs, warmup_mode='linear') trainer = gluon.Trainer( train_params, 'sgd', { 'learning_rate': lr, 'momentum': momentum, 'wd': wd, "lr_scheduler": lr_scheduler, "multi_precision": True }) lr_counter = 0 logger.info([margin, scale, lr, batch_size]) logger.info("Batches per epoch: {}, Classes: {}".format(
def main(): opt = parse_args() filehandler = logging.FileHandler(opt.logging_file) streamhandler = logging.StreamHandler() logger = logging.getLogger('') logger.setLevel(logging.INFO) logger.addHandler(filehandler) logger.addHandler(streamhandler) logger.info(opt) batch_size = opt.batch_size classes = 1000 num_training_samples = 1281167 num_gpus = opt.num_gpus batch_size *= max(1, num_gpus) context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] num_workers = opt.num_workers lr_decay = opt.lr_decay lr_decay_period = opt.lr_decay_period if opt.lr_decay_period > 0: lr_decay_epoch = list( range(lr_decay_period, opt.num_epochs, lr_decay_period)) else: lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')] lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch] num_batches = num_training_samples // batch_size lr_scheduler = LRSequential([ LRScheduler('linear', base_lr=0, target_lr=opt.lr, nepochs=opt.warmup_epochs, iters_per_epoch=num_batches), LRScheduler(opt.lr_mode, base_lr=opt.lr, target_lr=0, nepochs=opt.num_epochs - opt.warmup_epochs, iters_per_epoch=num_batches, step_epoch=lr_decay_epoch, step_factor=lr_decay, power=2) ]) model_name = opt.model kwargs = { 'ctx': context, 'pretrained': opt.use_pretrained, 'classes': classes } if opt.use_gn: from gluoncv.nn import GroupNorm kwargs['norm_layer'] = GroupNorm if model_name.startswith('vgg'): kwargs['batch_norm'] = opt.batch_norm elif model_name.startswith('resnext'): kwargs['use_se'] = opt.use_se if opt.last_gamma: kwargs['last_gamma'] = True if opt.configuration is not None: kwargs['configuration'] = opt.configuration optimizer = 'nag' optimizer_params = { 'wd': opt.wd, 'momentum': opt.momentum, 'lr_scheduler': lr_scheduler } if opt.dtype != 'float32': optimizer_params['multi_precision'] = True if model_name in _models: net = _models[model_name](**kwargs) else: net = get_model(model_name, **kwargs) net.cast(opt.dtype) if opt.resume_params is not '': net.load_parameters(opt.resume_params, ctx=context) # teacher model for distillation training if opt.teacher is not None and opt.hard_weight < 1.0: teacher_name = opt.teacher teacher = get_model(teacher_name, pretrained=True, classes=classes, ctx=context) teacher.cast(opt.dtype) distillation = True else: distillation = False # Two functions for reading data from record file or raw images def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers): rec_train = os.path.expanduser(rec_train) rec_train_idx = os.path.expanduser(rec_train_idx) rec_val = os.path.expanduser(rec_val) rec_val_idx = os.path.expanduser(rec_val_idx) jitter_param = 0.4 lighting_param = 0.1 input_size = opt.input_size crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875 resize = int(math.ceil(input_size / crop_ratio)) mean_rgb = [123.68, 116.779, 103.939] std_rgb = [58.393, 57.12, 57.375] def batch_fn(batch, ctx): data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) return data, label train_data = mx.io.ImageRecordIter( path_imgrec=rec_train, path_imgidx=rec_train_idx, preprocess_threads=num_workers, shuffle=True, batch_size=batch_size, data_shape=(3, input_size, input_size), mean_r=mean_rgb[0], mean_g=mean_rgb[1], mean_b=mean_rgb[2], std_r=std_rgb[0], std_g=std_rgb[1], std_b=std_rgb[2], rand_mirror=True, random_resized_crop=True, max_aspect_ratio=4. / 3., min_aspect_ratio=3. / 4., max_random_area=1, min_random_area=0.08, brightness=jitter_param, saturation=jitter_param, contrast=jitter_param, pca_noise=lighting_param, ) val_data = mx.io.ImageRecordIter( path_imgrec=rec_val, path_imgidx=rec_val_idx, preprocess_threads=num_workers, shuffle=False, batch_size=batch_size, resize=resize, data_shape=(3, input_size, input_size), mean_r=mean_rgb[0], mean_g=mean_rgb[1], mean_b=mean_rgb[2], std_r=std_rgb[0], std_g=std_rgb[1], std_b=std_rgb[2], ) return train_data, val_data, batch_fn def get_data_loader(data_dir, batch_size, num_workers): normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) jitter_param = 0.4 lighting_param = 0.1 input_size = opt.input_size crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875 resize = int(math.ceil(input_size / crop_ratio)) def batch_fn(batch, ctx): data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0) return data, label transform_train = transforms.Compose([ transforms.RandomResizedCrop(input_size), transforms.RandomFlipLeftRight(), transforms.RandomColorJitter(brightness=jitter_param, contrast=jitter_param, saturation=jitter_param), transforms.RandomLighting(lighting_param), transforms.ToTensor(), normalize ]) transform_test = transforms.Compose([ transforms.Resize(resize, keep_ratio=True), transforms.CenterCrop(input_size), transforms.ToTensor(), normalize ]) train_data = gluon.data.DataLoader(imagenet.classification.ImageNet( data_dir, train=True).transform_first(transform_train), batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers) val_data = gluon.data.DataLoader(imagenet.classification.ImageNet( data_dir, train=False).transform_first(transform_test), batch_size=batch_size, shuffle=False, num_workers=num_workers) return train_data, val_data, batch_fn if opt.use_rec: train_data, val_data, batch_fn = get_data_rec(opt.rec_train, opt.rec_train_idx, opt.rec_val, opt.rec_val_idx, batch_size, num_workers) else: train_data, val_data, batch_fn = get_data_loader( opt.data_dir, batch_size, num_workers) if opt.mixup: train_metric = mx.metric.RMSE() else: train_metric = mx.metric.Accuracy() acc_top1 = mx.metric.Accuracy() acc_top5 = mx.metric.TopKAccuracy(5) save_frequency = opt.save_frequency if opt.save_dir and save_frequency: save_dir = opt.save_dir makedirs(save_dir) else: save_dir = '' save_frequency = 0 def mixup_transform(label, classes, lam=1, eta=0.0): if isinstance(label, nd.NDArray): label = [label] res = [] for l in label: y1 = l.one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes) y2 = l[::-1].one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes) res.append(lam * y1 + (1 - lam) * y2) return res def smooth(label, classes, eta=0.1): if isinstance(label, nd.NDArray): label = [label] smoothed = [] for l in label: res = l.one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes) smoothed.append(res) return smoothed def test(ctx, val_data): if opt.use_rec: val_data.reset() acc_top1.reset() acc_top5.reset() for i, batch in enumerate(val_data): data, label = batch_fn(batch, ctx) outputs = [net(X.astype(opt.dtype, copy=False)) for X in data] acc_top1.update(label, outputs) acc_top5.update(label, outputs) _, top1 = acc_top1.get() _, top5 = acc_top5.get() return (1 - top1, 1 - top5) def train(ctx): if isinstance(ctx, mx.Context): ctx = [ctx] if opt.resume_params is '': net.initialize(mx.init.MSRAPrelu(), ctx=ctx) if opt.no_wd: for k, v in net.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params) if opt.resume_states is not '': trainer.load_states(opt.resume_states) if opt.label_smoothing or opt.mixup: sparse_label_loss = False else: sparse_label_loss = True if distillation: L = gcv.loss.DistillationSoftmaxCrossEntropyLoss( temperature=opt.temperature, hard_weight=opt.hard_weight, sparse_label=sparse_label_loss) else: L = gluon.loss.SoftmaxCrossEntropyLoss( sparse_label=sparse_label_loss) best_val_score = 1 for epoch in range(opt.resume_epoch, opt.num_epochs): tic = time.time() if opt.use_rec: train_data.reset() train_metric.reset() btic = time.time() for i, batch in enumerate(train_data): data, label = batch_fn(batch, ctx) if opt.mixup: lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha) if epoch >= opt.num_epochs - opt.mixup_off_epoch: lam = 1 data = [lam * X + (1 - lam) * X[::-1] for X in data] if opt.label_smoothing: eta = 0.1 else: eta = 0.0 label = mixup_transform(label, classes, lam, eta) elif opt.label_smoothing: hard_label = label label = smooth(label, classes) if distillation: teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \ for X in data] with ag.record(): outputs = [ net(X.astype(opt.dtype, copy=False)) for X in data ] if distillation: loss = [ L(yhat.astype('float32', copy=False), y.astype('float32', copy=False), p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob) ] else: loss = [ L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label) ] for l in loss: l.backward() trainer.step(batch_size) if opt.mixup: output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \ for out in outputs] train_metric.update(label, output_softmax) else: if opt.label_smoothing: train_metric.update(hard_label, outputs) else: train_metric.update(label, outputs) if opt.log_interval and not (i + 1) % opt.log_interval: train_metric_name, train_metric_score = train_metric.get() logger.info( 'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f' % (epoch, i, batch_size * opt.log_interval / (time.time() - btic), train_metric_name, train_metric_score, trainer.learning_rate)) btic = time.time() train_metric_name, train_metric_score = train_metric.get() throughput = int(batch_size * i / (time.time() - tic)) err_top1_val, err_top5_val = test(ctx, val_data) logger.info('[Epoch %d] training: %s=%f' % (epoch, train_metric_name, train_metric_score)) logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' % (epoch, throughput, time.time() - tic)) logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' % (epoch, err_top1_val, err_top5_val)) if err_top1_val < best_val_score: best_val_score = err_top1_val net.save_parameters( '%s/%.4f-imagenet-%s-%d-best.params' % (save_dir, best_val_score, model_name, epoch)) trainer.save_states( '%s/%.4f-imagenet-%s-%d-best.states' % (save_dir, best_val_score, model_name, epoch)) if save_frequency and save_dir and (epoch + 1) % save_frequency == 0: net.save_parameters('%s/imagenet-%s-%d.params' % (save_dir, model_name, epoch)) trainer.save_states('%s/imagenet-%s-%d.states' % (save_dir, model_name, epoch)) if save_frequency and save_dir: net.save_parameters('%s/imagenet-%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1)) trainer.save_states('%s/imagenet-%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1)) if opt.mode == 'hybrid': net.hybridize(static_alloc=True, static_shape=True) if distillation: teacher.hybridize(static_alloc=True, static_shape=True) train(context)
def train(net, train_data, val_data, eval_metric, ctx, args): import gluoncv as gcv gcv.utils.check_version("0.6.0") from gluoncv import data as gdata from gluoncv import utils as gutils from gluoncv.data.batchify import Pad, Stack, Tuple from gluoncv.data.dataloader import RandomTransformDataLoader from gluoncv.data.transforms.presets.yolo import ( YOLO3DefaultTrainTransform, YOLO3DefaultValTransform, ) from gluoncv.model_zoo import get_model from gluoncv.utils import LRScheduler, LRSequential from gluoncv.utils.metrics.coco_detection import COCODetectionMetric from gluoncv.utils.metrics.voc_detection import VOC07MApMetric """Training pipeline""" net.collect_params().reset_ctx(ctx) if args.no_wd: for k, v in net.collect_params(".*beta|.*gamma|.*bias").items(): v.wd_mult = 0.0 if args.label_smooth: net._target_generator._label_smooth = True if args.lr_decay_period > 0: lr_decay_epoch = list( range(args.lr_decay_period, args.epochs, args.lr_decay_period)) else: lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(",")] lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch] num_batches = args.num_samples // args.batch_size lr_scheduler = LRSequential([ LRScheduler( "linear", base_lr=0, target_lr=args.lr, nepochs=args.warmup_epochs, iters_per_epoch=num_batches, ), LRScheduler( args.lr_mode, base_lr=args.lr, nepochs=args.epochs - args.warmup_epochs, iters_per_epoch=num_batches, step_epoch=lr_decay_epoch, step_factor=args.lr_decay, power=2, ), ]) if args.horovod: hvd.broadcast_parameters(net.collect_params(), root_rank=0) trainer = hvd.DistributedTrainer( net.collect_params(), "sgd", { "wd": args.wd, "momentum": args.momentum, "lr_scheduler": lr_scheduler }, ) else: trainer = gluon.Trainer( net.collect_params(), "sgd", { "wd": args.wd, "momentum": args.momentum, "lr_scheduler": lr_scheduler }, kvstore="local", update_on_kvstore=(False if args.amp else None), ) if args.amp: amp.init_trainer(trainer) # targets sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False) l1_loss = gluon.loss.L1Loss() # metrics obj_metrics = mx.metric.Loss("ObjLoss") center_metrics = mx.metric.Loss("BoxCenterLoss") scale_metrics = mx.metric.Loss("BoxScaleLoss") cls_metrics = mx.metric.Loss("ClassLoss") # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) log_file_path = args.save_prefix + "_train.log" log_dir = os.path.dirname(log_file_path) if log_dir and not os.path.exists(log_dir): os.makedirs(log_dir) fh = logging.FileHandler(log_file_path) logger.addHandler(fh) logger.info(args) logger.info("Start training from [Epoch {}]".format(args.start_epoch)) best_map = [0] for epoch in range(args.start_epoch, args.num_epochs): if args.mixup: # TODO(zhreshold): more elegant way to control mixup during runtime try: train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5) except AttributeError: train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5) if epoch >= args.num_epochs - args.no_mixup_epochs: try: train_data._dataset.set_mixup(None) except AttributeError: train_data._dataset._data.set_mixup(None) tic = time.time() btic = time.time() mx.nd.waitall() net.hybridize() for i, batch in enumerate(train_data): data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) # objectness, center_targets, scale_targets, weights, class_targets fixed_targets = [ gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0) for it in range(1, 6) ] gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0) sum_losses = [] obj_losses = [] center_losses = [] scale_losses = [] cls_losses = [] with autograd.record(): for ix, x in enumerate(data): obj_loss, center_loss, scale_loss, cls_loss = net( x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets]) sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss) obj_losses.append(obj_loss) center_losses.append(center_loss) scale_losses.append(scale_loss) cls_losses.append(cls_loss) if args.amp: with amp.scale_loss(sum_losses, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(sum_losses) trainer.step(batch_size) if not args.horovod or hvd.rank() == 0: obj_metrics.update(0, obj_losses) center_metrics.update(0, center_losses) scale_metrics.update(0, scale_losses) cls_metrics.update(0, cls_losses) if args.log_interval and not (i + 1) % args.log_interval: name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info( "[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}" .format( epoch, i, trainer.learning_rate, args.batch_size / (time.time() - btic), name1, loss1, name2, loss2, name3, loss3, name4, loss4, )) btic = time.time() if not args.horovod or hvd.rank() == 0: name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info( "[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}" .format( epoch, (time.time() - tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4, )) if not (epoch + 1) % args.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = validate(net, val_data, ctx, eval_metric) val_msg = "\n".join( ["{}={}".format(k, v) for k, v in zip(map_name, mean_ap)]) logger.info("[Epoch {}] Validation: \n{}".format( epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0.0 save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix) # save model net.set_nms(nms_thresh=0.45, nms_topk=400, post_nms=100) net(mx.nd.ones((1, 3, args.data_shape, args.data_shape), ctx=ctx[0])) net.export("%s/model" % os.environ["SM_MODEL_DIR"])