def training(self, epoch):
    tbar = tqdm(self.train_data)
    train_loss = 0.0
    alpha = 0.2
    for i, (data, target) in enumerate(tbar):
        with autograd.record(True):
            outputs = self.net(data.astype(args.dtype, copy=False))
            losses = self.criterion(outputs, target)
            mx.nd.waitall()
            autograd.backward(losses)
        self.optimizer.step(self.args.batch_size)
        for loss in losses:
            train_loss += loss.asnumpy()[0] / len(losses)
        tbar.set_description('Epoch %d, training loss %.3f' %
                             (epoch, train_loss / (i + 1)))
        mx.nd.waitall()

    # save every epoch
    save_checkpoint(self.net.module, self.args, False)
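# A minimal sketch of the `save_checkpoint` helper called above (assumed, since
# it is not defined in this snippet): persist the latest parameters every epoch
# and keep a copy of the best ones. The `args.save_dir` attribute is an
# assumption for illustration.
import os
import shutil

def save_checkpoint(net, args, is_best=False):
    """Save parameters to `args.save_dir`; duplicate the best checkpoint."""
    directory = args.save_dir
    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = os.path.join(directory, 'checkpoint.params')
    net.save_parameters(filename)  # standard Gluon weight serialization
    if is_best:
        shutil.copyfile(filename, os.path.join(directory, 'model_best.params'))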
}, kvstore=kv)

##############################################################################
# The training loop
# -----------------
#
train_loss = 0.0
epoch = 0
for i, (data, target) in enumerate(train_data):
    lr_scheduler.update(i, epoch)
    with autograd.record(True):
        outputs = model(data)
        losses = criterion(outputs, target)
        mx.nd.waitall()
        autograd.backward(losses)
    optimizer.step(batch_size)
    for loss in losses:
        train_loss += loss.asnumpy()[0] / len(losses)
    print('Epoch %d, batch %d, training loss %.3f' %
          (epoch, i, train_loss / (i + 1)))
    # just demo for 2 iters
    if i > 1:
        print('Terminated for this demo...')
        break

##############################################################################
# You can `Start Training Now`_.
#
# References
# ----------
def train_model(train_dataset, epochs=50):
    ctx = mx.gpu(0)
    net = gcv.model_zoo.get_model('ssd_512_resnet50_v1_custom',
                                  classes=train_dataset.classes,
                                  transfer='coco')
    net.collect_params().reset_ctx(ctx)
    width, height = 512, 512  # suppose we use 512 as the base training size
    gcv.utils.random.seed(233)
    batch_size = 4
    # you can make this larger (if your CPU has more cores) to accelerate data loading
    num_workers = 4

    # anchors are needed by the train transform to generate training targets
    with autograd.train_mode():
        _, _, anchors = net(mx.nd.zeros((1, 3, height, width), ctx))
    anchors = anchors.as_in_context(mx.cpu())
    train_transform = gcv.data.transforms.presets.ssd.SSDDefaultTrainTransform(
        width, height, anchors)
    batchify_fn = Tuple(Stack(), Stack(), Stack())
    train_loader = mx.gluon.data.DataLoader(
        train_dataset.transform(train_transform),
        batch_size,
        shuffle=True,
        batchify_fn=batchify_fn,
        last_batch='rollover',
        num_workers=num_workers)

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    for k, v in net.collect_params().items():
        if 'convpredictor' not in k:
            # freeze upper layers
            v.grad_req = 'null'

    trainer = mx.gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': 0.001,
        'wd': 0.0005,
        'momentum': 0.9
    })
    net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(epochs):
        tic = time.time()
        btic = time.time()
        for i, batch in enumerate(train_loader):
            data = mx.gluon.utils.split_and_load(batch[0], ctx_list=[ctx], batch_axis=0)
            cls_targets = mx.gluon.utils.split_and_load(batch[1], ctx_list=[ctx], batch_axis=0)
            box_targets = mx.gluon.utils.split_and_load(batch[2], ctx_list=[ctx], batch_axis=0)
            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)
            ce_metric.update(0, [l * batch_size for l in cls_loss])
            smoothl1_metric.update(0, [l * batch_size for l in box_loss])
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            print('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'
                  .format(epoch, i, batch_size / (time.time() - btic),
                          name1, loss1, name2, loss2))
            btic = time.time()
    return net
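# Usage sketch for `train_model` above. The dataset constructor here is a
# placeholder, not part of the original snippet; any detection dataset exposing
# `.classes` and `.transform()` works.
#
# train_dataset = gcv.data.VOCDetection(splits=[(2007, 'trainval')])
# net = train_model(train_dataset, epochs=1)
# net.save_parameters('ssd_512_resnet50_v1_custom_finetuned.params')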
def _train_loop(self, train_data, val_data, train_eval_data, time_limit=math.inf):
    start_tic = time.time()
    wh_loss = MaskedL1Loss(weight=self._cfg.center_net.wh_weight)
    heatmap_loss = HeatmapFocalLoss(from_logits=True)
    center_reg_loss = MaskedL1Loss(weight=self._cfg.center_net.center_reg_weight)
    heatmap_loss_metric = mx.metric.Loss('HeatmapFocal')
    wh_metric = mx.metric.Loss('WHL1')
    center_reg_metric = mx.metric.Loss('CenterRegL1')
    self._logger.info('Start training from [Epoch %d]',
                      max(self._cfg.train.start_epoch, self.epoch))
    mean_ap = [-1]
    cp_name = ''
    self._time_elapsed += time.time() - start_tic
    for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch),
                            self._cfg.train.epochs):
        epoch = self.epoch
        tic = time.time()
        last_tic = time.time()
        if self._best_map >= 1.0:
            self._logger.info('[Epoch %d] Early stopping as mAP is reaching 1.0', epoch)
            break
        wh_metric.reset()
        center_reg_metric.reset()
        heatmap_loss_metric.reset()
        self.net.hybridize()
        for i, batch in enumerate(train_data):
            btic = time.time()
            if self._time_elapsed > time_limit:
                self._logger.warning(f'`time_limit={time_limit}` reached, exit early...')
                return {'train_map': float(mean_ap[-1]),
                        'valid_map': self._best_map,
                        'time': self._time_elapsed,
                        'checkpoint': cp_name}
            split_data = [
                gluon.utils.split_and_load(batch[ind], ctx_list=self.ctx,
                                           batch_axis=0, even_split=False)
                for ind in range(6)
            ]
            data, heatmap_targets, wh_targets, wh_masks, \
                center_reg_targets, center_reg_masks = split_data
            batch_size = self._cfg.train.batch_size
            with autograd.record():
                sum_losses = []
                heatmap_losses = []
                wh_losses = []
                center_reg_losses = []
                wh_preds = []
                center_reg_preds = []
                for x, heatmap_target, wh_target, wh_mask, center_reg_target, \
                        center_reg_mask in zip(*split_data):
                    heatmap_pred, wh_pred, center_reg_pred = self.net(x)
                    wh_preds.append(wh_pred)
                    center_reg_preds.append(center_reg_pred)
                    wh_losses.append(wh_loss(wh_pred, wh_target, wh_mask))
                    center_reg_losses.append(
                        center_reg_loss(center_reg_pred, center_reg_target,
                                        center_reg_mask))
                    heatmap_losses.append(heatmap_loss(heatmap_pred, heatmap_target))
                    curr_loss = heatmap_losses[-1] + wh_losses[-1] + center_reg_losses[-1]
                    sum_losses.append(curr_loss)
                autograd.backward(sum_losses)
            self.trainer.step(len(sum_losses))  # step with the number of GPUs
            heatmap_loss_metric.update(0, heatmap_losses)
            wh_metric.update(0, wh_losses)
            center_reg_metric.update(0, center_reg_losses)
            if self._cfg.train.log_interval and not (i + 1) % self._cfg.train.log_interval:
                name2, loss2 = wh_metric.get()
                name3, loss3 = center_reg_metric.get()
                name4, loss4 = heatmap_loss_metric.get()
                self._logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, '
                    'LR={}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                        epoch, i, batch_size / (time.time() - last_tic),
                        self.trainer.learning_rate, name2, loss2,
                        name3, loss3, name4, loss4))
                last_tic = time.time()
            self._time_elapsed += time.time() - btic

        post_tic = time.time()
        name2, loss2 = wh_metric.get()
        name3, loss3 = center_reg_metric.get()
        name4, loss4 = heatmap_loss_metric.get()
        self._logger.info(
            '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                epoch, (time.time() - tic), name2, loss2, name3, loss3, name4, loss4))
        if (epoch % self._cfg.valid.interval == 0) or \
                (epoch == self._cfg.train.epochs - 1):
            # consider reducing the frequency of validation to save time
            map_name, mean_ap = self._evaluate(val_data)
            val_msg = '\n'.join(['{}={}'.format(k, v)
                                 for k, v in zip(map_name, mean_ap)])
            self._logger.info('[Epoch %d] Validation: \n%s', epoch, val_msg)
            current_map = float(mean_ap[-1])
            if current_map > self._best_map:
                cp_name = os.path.join(self._logdir, _BEST_CHECKPOINT_FILE)
                self._logger.info(
                    '[Epoch %d] Current best map: %f vs previous %f, saved to %s',
                    self.epoch, current_map, self._best_map, cp_name)
                self.save(cp_name)
                self._best_map = current_map
            if self._reporter:
                self._reporter(epoch=epoch, map_reward=current_map)
        self._time_elapsed += time.time() - post_tic

    # mAP on train data
    tic = time.time()
    map_name, mean_ap = self._evaluate(train_eval_data)
    self._time_elapsed += time.time() - tic
    return {'train_map': float(mean_ap[-1]),
            'valid_map': self._best_map,
            'time': self._time_elapsed,
            'checkpoint': cp_name}
def _train_loop(self, train_data, val_data, train_eval_data):
    # fix seed for mxnet, numpy and python builtin random generator.
    gutils.random.seed(self._cfg.train.seed)
    # loss and metric
    mbox_loss = SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')
    # lr decay policy
    lr_decay = float(self._cfg.train.lr_decay)
    lr_steps = sorted([float(ls) for ls in self._cfg.train.lr_decay_epoch])
    self._logger.info('Start training from [Epoch %d]',
                      max(self._cfg.train.start_epoch, self.epoch))
    self.net.collect_params().reset_ctx(self.ctx)
    for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch),
                            self._cfg.train.epochs):
        epoch = self.epoch
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = self.trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            self.trainer.set_learning_rate(new_lr)
            self._logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        self.net.hybridize(static_alloc=True, static_shape=True)
        for i, batch in enumerate(train_data):
            if self._cfg.train.dali:
                # dali iterator returns a mxnet.io.DataBatch
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [nd.cast(d.label[1], dtype='float32') for d in batch]
            else:
                data = gluon.utils.split_and_load(batch[0], ctx_list=self.ctx,
                                                  batch_axis=0, even_split=False)
                cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=self.ctx,
                                                         batch_axis=0, even_split=False)
                box_targets = gluon.utils.split_and_load(batch[2], ctx_list=self.ctx,
                                                         batch_axis=0, even_split=False)
            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = self.net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if self._cfg.ssd.amp:
                    with amp.scale_loss(sum_loss, self.trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            self.trainer.step(1)
            if not self._cfg.horovod or hvd.rank() == 0:
                local_batch_size = int(self._cfg.train.batch_size //
                                       (hvd.size() if self._cfg.horovod else 1))
                ce_metric.update(0, [l * local_batch_size for l in cls_loss])
                smoothl1_metric.update(0, [l * local_batch_size for l in box_loss])
                if self._cfg.train.log_interval and \
                        not (i + 1) % self._cfg.train.log_interval:
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    self._logger.info(
                        '[Epoch %d][Batch %d], Speed: %f samples/sec, %s=%f, %s=%f',
                        epoch, i, self._cfg.train.batch_size / (time.time() - btic),
                        name1, loss1, name2, loss2)
                btic = time.time()

        if not self._cfg.horovod or hvd.rank() == 0:
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            self._logger.info('[Epoch %d] Training cost: %f, %s=%f, %s=%f',
                              epoch, (time.time() - tic), name1, loss1, name2, loss2)
            if (epoch % self._cfg.valid.val_interval == 0) or \
                    (self._cfg.save_interval and epoch % self._cfg.save_interval == 0):
                # consider reducing the frequency of validation to save time
                map_name, mean_ap = self._evaluate(val_data)
                val_msg = '\n'.join(['{}={}'.format(k, v)
                                     for k, v in zip(map_name, mean_ap)])
                self._logger.info('[Epoch %d] Validation: \n%s', epoch, str(val_msg))
                current_map = float(mean_ap[-1])
                if current_map > self._best_map:
                    cp_name = os.path.join(self._logdir, 'best_checkpoint.pkl')
                    self._logger.info(
                        '[Epoch %d] Current best map: %f vs previous %f, saved to %s',
                        self.epoch, current_map, self._best_map, cp_name)
                    self.save(cp_name)
                    self._best_map = current_map
                if self._reporter:
                    self._reporter(epoch=epoch, map_reward=current_map)
        self._time_elapsed += time.time() - btic

    # mAP on train data
    map_name, mean_ap = self._evaluate(train_eval_data)
    return {'train_map': float(mean_ap[-1]),
            'valid_map': self._best_map,
            'time': self._time_elapsed}
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(
        net.collect_params(), 'sgd',
        {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum})

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)
            ce_metric.update(0, [l * batch_size for l in cls_loss])
            smoothl1_metric.update(0, [l * batch_size for l in box_loss])
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = ce_metric.get()
                name2, loss2 = smoothl1_metric.get()
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                    epoch, i, batch_size / (time.time() - btic), name1, loss1, name2, loss2))
            btic = time.time()

        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
            epoch, (time.time() - tic), name1, loss1, name2, loss2))
        if (epoch % args.val_interval == 0) or \
                (args.save_interval and epoch % args.save_interval == 0):
            # consider reducing the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
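# A minimal sketch of the `save_params` helper referenced above (assumed, since
# it is not defined in this snippet): track the best mAP in the mutable
# `best_map` list and write checkpoints on the given interval.
def save_params(net, best_map, current_map, epoch, save_interval, prefix):
    current_map = float(current_map)
    if current_map > best_map[0]:
        best_map[0] = current_map
        net.save_parameters('{:s}_best.params'.format(prefix))
        with open(prefix + '_best_map.log', 'a') as f:
            f.write('{:04d}:\t{:.4f}\n'.format(epoch, current_map))
    if save_interval and epoch % save_interval == 0:
        net.save_parameters('{:s}_{:04d}_{:.4f}.params'.format(prefix, epoch, current_map))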
def forward_backward(self, x):
    data, label, gt_mask, rpn_cls_targets, rpn_box_targets, rpn_box_masks = x
    with autograd.record():
        gt_label = label[:, :, 4:5]
        gt_box = label[:, :, :4]
        cls_pred, box_pred, mask_pred, roi, samples, matches, rpn_score, rpn_box, \
            anchors, cls_targets, box_targets, box_masks, indices = \
            self.net(data, gt_box, gt_label)
        # losses of rpn
        rpn_score = rpn_score.squeeze(axis=-1)
        num_rpn_pos = (rpn_cls_targets >= 0).sum()
        rpn_loss1 = self.rpn_cls_loss(
            rpn_score, rpn_cls_targets,
            rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
        rpn_loss2 = self.rpn_box_loss(
            rpn_box, rpn_box_targets, rpn_box_masks) * rpn_box.size / num_rpn_pos
        # rpn overall loss, use sum rather than average
        rpn_loss = rpn_loss1 + rpn_loss2
        # losses of rcnn
        num_rcnn_pos = (cls_targets >= 0).sum()
        rcnn_loss1 = self.rcnn_cls_loss(
            cls_pred, cls_targets,
            cls_targets.expand_dims(-1) >= 0) * cls_targets.size / num_rcnn_pos
        rcnn_loss2 = self.rcnn_box_loss(
            box_pred, box_targets, box_masks) * box_pred.size / num_rcnn_pos
        rcnn_loss = rcnn_loss1 + rcnn_loss2
        # generate targets for mask
        roi = mx.nd.concat(
            *[mx.nd.take(roi[i], indices[i]) for i in range(indices.shape[0])],
            dim=0).reshape((indices.shape[0], -1, 4))
        m_cls_targets = mx.nd.concat(
            *[mx.nd.take(cls_targets[i], indices[i]) for i in range(indices.shape[0])],
            dim=0).reshape((indices.shape[0], -1))
        matches = mx.nd.concat(
            *[mx.nd.take(matches[i], indices[i]) for i in range(indices.shape[0])],
            dim=0).reshape((indices.shape[0], -1))
        mask_targets, mask_masks = self.net.mask_target(roi, gt_mask, matches, m_cls_targets)
        # loss of mask
        mask_loss = self.rcnn_mask_loss(
            mask_pred, mask_targets, mask_masks) * mask_targets.size / mask_masks.sum()
        # overall losses
        total_loss = rpn_loss.sum() + rcnn_loss.sum() + mask_loss.sum()

        rpn_loss1_metric = rpn_loss1.mean()
        rpn_loss2_metric = rpn_loss2.mean()
        rcnn_loss1_metric = rcnn_loss1.sum()
        rcnn_loss2_metric = rcnn_loss2.sum()
        mask_loss_metric = mask_loss.sum()
        rpn_acc_metric = [[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]]
        rpn_l1_loss_metric = [[rpn_box_targets, rpn_box_masks], [rpn_box]]
        rcnn_acc_metric = [[cls_targets], [cls_pred]]
        rcnn_l1_loss_metric = [[box_targets, box_masks], [box_pred]]
        rcnn_mask_metric = [[mask_targets, mask_masks], [mask_pred]]
        rcnn_fgmask_metric = [[mask_targets, mask_masks], [mask_pred]]
        if args.amp:
            with amp.scale_loss(total_loss, self._optimizer) as scaled_losses:
                autograd.backward(scaled_losses)
        else:
            total_loss.backward()
    return rpn_loss1_metric, rpn_loss2_metric, rcnn_loss1_metric, rcnn_loss2_metric, \
        mask_loss_metric, rpn_acc_metric, rpn_l1_loss_metric, rcnn_acc_metric, \
        rcnn_l1_loss_metric, rcnn_mask_metric, rcnn_fgmask_metric
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0
    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch]
    num_batches = args.num_samples // args.batch_size
    lr_scheduler = LRSequential([
        LRScheduler('linear', base_lr=0, target_lr=args.lr,
                    nepochs=args.warmup_epochs, iters_per_epoch=num_batches),
        LRScheduler(args.lr_mode, base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay, power=2),
    ])

    trainer = gluon.Trainer(
        net.collect_params(), 'sgd',
        {'wd': args.wd, 'momentum': args.momentum, 'lr_scheduler': lr_scheduler},
        kvstore='local')

    # targets
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0)
                             for it in range(1, 6)]
            gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            trainer.step(batch_size)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                name4, loss4 = cls_metrics.get()
                logger.info('[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, '
                            '{}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                                epoch, i, trainer.learning_rate,
                                batch_size / (time.time() - btic),
                                name1, loss1, name2, loss2, name3, loss3, name4, loss4))
            btic = time.time()

        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        name4, loss4 = cls_metrics.get()
        logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
            epoch, (time.time() - tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4))
        if not (epoch + 1) % args.val_interval:
            # consider reducing the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
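# A sketch of the `validate` helper used above (assumed; it mirrors the usual
# GluonCV detection evaluation pattern with a VOC-style `eval_metric`).
def validate(net, val_data, ctx, eval_metric):
    """Run detection on the validation set and accumulate eval_metric."""
    eval_metric.reset()
    net.hybridize()
    for batch in val_data:
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        for x, y in zip(data, label):
            ids, scores, bboxes = net(x)
            # ground-truth columns: [xmin, ymin, xmax, ymax, class_id, (difficult)]
            gt_ids = y.slice_axis(axis=-1, begin=4, end=5)
            gt_bboxes = y.slice_axis(axis=-1, begin=0, end=4)
            gt_difficults = y.slice_axis(axis=-1, begin=5, end=6) if y.shape[-1] > 5 else None
            eval_metric.update([bboxes], [ids], [scores], [gt_bboxes], [gt_ids], [gt_difficults])
    return eval_metric.get()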
def train(train_loader, val_loader, batch_size, save_as, lr_scheduler):
    optimizer = 'adam'
    # Set parameters
    optimizer_params = {'lr_scheduler': lr_scheduler}
    # Define our trainer for net
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    criterion_clothes = gloss.SoftmaxCrossEntropyLoss()
    criterion_color = gloss.SoftmaxCrossEntropyLoss()

    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(args.save_as + "_train.log")
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_acc = -100
    for epoch in range(args.start_epoch, args.epoch):
        print('epoch:', epoch, ', learning rate:', trainer.learning_rate)
        tic = time.time()
        train_metric_clothes.reset()
        train_metric_colors.reset()
        train_loss_clothes = 0
        train_loss_color = 0

        # Loop through each batch of training data
        for i, batch in enumerate(train_loader):
            clothes_labels = batch[2].as_in_context(context)
            color_labels = batch[1].as_in_context(context)
            with autograd.record():
                outputs = net(batch[0].as_in_context(context))
                loss_clothes = criterion_clothes(outputs[0], clothes_labels)
                loss_color = criterion_color(outputs[1], color_labels)
            lr_scheduler.update(i, epoch)
            # Backpropagation
            autograd.backward([loss_clothes, loss_color])
            # Optimize
            trainer.step(batch_size)
            # Update metrics
            train_loss_clothes += loss_clothes.sum().asscalar()
            train_loss_color += loss_color.sum().asscalar()
            train_metric_clothes.update(clothes_labels, outputs[0])
            train_metric_colors.update(color_labels, outputs[1])

        name, train_clothes_acc = train_metric_clothes.get()
        name, train_color_acc = train_metric_colors.get()
        # Evaluate on validation data
        validate_clothes_acc, validate_color_acc = validate(context, val_loader)
        if (validate_clothes_acc + validate_color_acc) > best_acc:
            best_acc = validate_clothes_acc + validate_color_acc
            net.save_parameters(save_as + "_best")

        # Update history and print metrics
        train_history.update([1 - train_clothes_acc, 1 - train_color_acc,
                              (1 - train_clothes_acc + 1 - train_color_acc) / 2,
                              1 - validate_clothes_acc, 1 - validate_color_acc,
                              (1 - validate_clothes_acc + 1 - validate_color_acc) / 2])
        logger.info('[Epoch {}] lr={:.2E} train_clothes_acc={:.3f} train_color_acc={:.3f} '
                    'train_acc_avg={:.3f} train_clothes_loss={:.3f}, train_color_loss={:.3f}, '
                    'validate_clothes_acc={:.3f}, validate_color_acc={:.3f}, '
                    'validate_acc_avg={:.3f} time: {}'.format(
                        epoch, trainer.learning_rate, train_clothes_acc, train_color_acc,
                        (train_clothes_acc + train_color_acc) / 2,
                        train_loss_clothes, train_loss_color,
                        validate_clothes_acc, validate_color_acc,
                        (validate_clothes_acc + validate_color_acc) / 2,
                        time.time() - tic))

    # We can plot the metric scores with:
    train_history.plot()
    net.save_parameters(save_as + "_" + str(args.epoch) + "epoch")
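# A minimal sketch of the `validate` helper called above (assumed, since it is
# not defined in this snippet): run both heads over the validation loader and
# return the two accuracies. The batch layout follows the training loop
# (batch[1] = color labels, batch[2] = clothes labels).
def validate(context, val_loader):
    metric_clothes = mx.metric.Accuracy()
    metric_colors = mx.metric.Accuracy()
    for batch in val_loader:
        outputs = net(batch[0].as_in_context(context))
        metric_clothes.update(batch[2].as_in_context(context), outputs[0])
        metric_colors.update(batch[1].as_in_context(context), outputs[1])
    return metric_clothes.get()[1], metric_colors.get()[1]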
net.hybridize(static_alloc=True, static_shape=True)
for i, batch in enumerate(train_data):
    batch_size = batch[0].shape[0]
    data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
    cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
    box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
    with autograd.record():
        cls_preds = []
        box_preds = []
        for x in data:
            cls_pred, box_pred, _ = net(x)
            cls_preds.append(cls_pred)
            box_preds.append(box_pred)
        sum_loss, cls_loss, box_loss = mbox_loss(
            cls_preds, box_preds, cls_targets, box_targets)
        autograd.backward(sum_loss)
    # since we have already normalized the loss, we don't want to normalize
    # by batch-size anymore
    trainer.step(1)
    ce_metric.update(0, [l * batch_size for l in cls_loss])
    smoothl1_metric.update(0, [l * batch_size for l in box_loss])
    name1, loss1 = ce_metric.get()
    name2, loss2 = smoothl1_metric.get()
    if i % 20 == 0:
        print('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
            epoch, i, batch_size / (time.time() - btic), name1, loss1, name2, loss2))
    btic = time.time()

#############################################################################################
# Save finetuned weights to disk
net.save_parameters('ssd_512_mobilenet1.0_pikachu.params')
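#############################################################################################
# The saved weights can later be restored for inference with the matching
# architecture; the custom model name and class list below are assumptions
# for illustration.
#
# net = gcv.model_zoo.get_model('ssd_512_mobilenet1.0_custom',
#                               classes=['pikachu'], pretrained_base=False)
# net.load_parameters('ssd_512_mobilenet1.0_pikachu.params')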
            idt_B = netG_B(A)
            loss_idt_B = cyc_loss(idt_B, A) * opt.lambda_A * opt.lambda_idt
            loss_G_A = gan_loss(netD_A(fake_B), True)
            loss_G_B = gan_loss(netD_B(fake_A), True)
            loss_cycle_A = cyc_loss(rec_A, A) * opt.lambda_A
            loss_cycle_B = cyc_loss(rec_B, B) * opt.lambda_B
            loss_G = loss_G_A + loss_G_B + loss_cycle_A + loss_cycle_B + loss_idt_A + loss_idt_B
            loss_G_list.append(loss_G)
            fake_A_list.append(fake_A)
            fake_B_list.append(fake_B)
            losses_log.add(loss_G_A=loss_G_A, loss_cycle_A=loss_cycle_A,
                           loss_idt_A=loss_idt_A, loss_G_B=loss_G_B,
                           loss_cycle_B=loss_cycle_B, loss_idt_B=loss_idt_B,
                           real_A=A, fake_B=fake_B, rec_A=rec_A, idt_A=idt_A,
                           real_B=B, fake_A=fake_A, rec_B=rec_B, idt_B=idt_B)
        autograd.backward(loss_G_list)
    optimizer_GA.step(opt.batchSize)
    optimizer_GB.step(opt.batchSize)
    with autograd.record():
        for A, B, fake_A, fake_B in zip(real_A, real_B, fake_A_list, fake_B_list):
            # train D_A
            # real
            fake_B_tmp = fake_B_pool.query(fake_B)
            pred_real = netD_A(B)
            loss_D_real = gan_loss(pred_real, True)
            pred_fake = netD_A(fake_B_tmp.detach())
            loss_D_fake = gan_loss(pred_fake, False)
            loss_D_A = (loss_D_real + loss_D_fake) * 0.5
            loss_D_A_list.append(loss_D_A)
            # train D_B
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.image_channel = 3

    data_dir = args.data_dir
    if args.task == 'gender':
        data_dir = args.gender_data_dir
    elif args.task == 'age':
        data_dir = args.age_data_dir
    print('data dir', data_dir)
    path_imgrec = None
    path_imglist = None
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    image_size = prop.image_size
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    assert args.num_classes > 0
    print('num_classes', args.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None
    begin_epoch = 0
    net = get_model()
    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2)  # resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2)  # inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)
    net.hybridize()
    if args.mode == 'gluon':
        if len(args.pretrained) == 0:
            pass
        else:
            net.load_params(args.pretrained, allow_missing=True, ignore_extra=True)
        net.initialize(initializer)
        net.collect_params().reset_ctx(ctx)

    val_iter = None
    if args.task == '':
        train_iter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=args.rand_mirror,
            mean=mean,
            cutoff=args.cutoff,
        )
    else:
        train_iter = FaceImageIterAge(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            task=args.task,
            shuffle=True,
            rand_mirror=args.rand_mirror,
            mean=mean,
            cutoff=args.cutoff,
        )

    if args.task == 'age':
        metric = CompositeEvalMetric([MAEMetric(), CUMMetric()])
    elif args.task == 'gender':
        metric = CompositeEvalMetric([AccMetric()])
    else:
        metric = CompositeEvalMetric([AccMetric()])

    ver_list = []
    ver_name_list = []
    if args.task == '':
        for name in args.eval.split(','):
            path = os.path.join(data_dir, name + ".bin")
            if os.path.exists(path):
                data_set = verification.load_bin(path, image_size)
                ver_list.append(data_set)
                ver_name_list.append(name)
                print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], net, ctx, batch_size=args.batch_size)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    def val_test(nbatch=0):
        acc = 0.0
        if len(args.age_data_dir) > 0:
            val_iter = FaceImageIterAge(
                batch_size=args.batch_size,
                data_shape=data_shape,
                path_imgrec=os.path.join(args.age_data_dir, 'val.rec'),
                task=args.task,
                shuffle=False,
                rand_mirror=False,
                mean=mean,
            )
            _metric = MAEMetric()
            val_metric = mx.metric.create(_metric)
            val_metric.reset()
            _metric2 = CUMMetric()
            val_metric2 = mx.metric.create(_metric2)
            val_metric2.reset()
            val_iter.reset()
            for batch in val_iter:
                data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
                label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
                outputs = []
                for x in data:
                    outputs.append(net(x)[2])
                val_metric.update(label, outputs)
                val_metric2.update(label, outputs)
            _value = val_metric.get_name_value()[0][1]
            print('[%d][VMAE]: %f' % (nbatch, _value))
            _value = val_metric2.get_name_value()[0][1]
            if args.task == 'age':
                acc = _value
            print('[%d][VCUM]: %f' % (nbatch, _value))
        if len(args.gender_data_dir) > 0:
            val_iter = FaceImageIterAge(
                batch_size=args.batch_size,
                data_shape=data_shape,
                path_imgrec=os.path.join(args.gender_data_dir, 'val.rec'),
                task=args.task,
                shuffle=False,
                rand_mirror=False,
                mean=mean,
            )
            _metric = AccMetric()
            val_metric = mx.metric.create(_metric)
            val_metric.reset()
            val_iter.reset()
            for batch in val_iter:
                data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
                label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
                outputs = []
                for x in data:
                    outputs.append(net(x)[1])
                val_metric.update(label, outputs)
            _value = val_metric.get_name_value()[0][1]
            if args.task == 'gender':
                acc = _value
            print('[%d][VACC]: %f' % (nbatch, _value))
        return acc

    total_time = 0
    num_epochs = 0
    best_acc = [0]
    highest_acc = [0.0, 0.0]  # lfw and target
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        lr_steps = [100000, 140000, 160000]
        p = 512.0 / args.batch_size
        for l in range(len(lr_steps)):
            lr_steps[l] = int(lr_steps[l] * p)
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    kv = mx.kv.create('device')
    if args.mode == 'gluon':
        trainer = gluon.Trainer(net.collect_params(), 'sgd',
                                {'learning_rate': args.lr, 'wd': args.wd,
                                 'momentum': args.mom, 'multi_precision': True},
                                kvstore=kv)
    else:
        _rescale = 1.0 / args.ctx_num
        opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom,
                            wd=args.wd, rescale_grad=_rescale)
        _cb = mx.callback.Speedometer(args.batch_size, 20)
        arg_params = None
        aux_params = None
        data = mx.sym.var('data')
        label = mx.sym.var('softmax_label')
        if args.margin_a > 0.0:
            fc7 = net(data, label)
        else:
            fc7 = net(data)
        ceop = gluon.loss.SoftmaxCrossEntropyLoss()
        loss = ceop(fc7, label)
        loss = mx.sym.mean(loss)
        sym = mx.sym.Group([mx.symbol.BlockGrad(fc7),
                            mx.symbol.MakeLoss(loss, name='softmax')])

    def _batch_callback():
        mbatch = global_step[0]
        global_step[0] += 1
        for _lr in lr_steps:
            if mbatch == _lr:
                args.lr *= 0.1
                if args.mode == 'gluon':
                    trainer.set_learning_rate(args.lr)
                else:
                    opt.lr = args.lr
                print('lr change to', args.lr)
                break
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', args.lr, mbatch)
        if mbatch > 0 and mbatch % args.verbose == 0:
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if args.task == 'age' or args.task == 'gender':
                acc = val_test(mbatch)
                if acc >= highest_acc[-1]:
                    highest_acc[-1] = acc
                    is_highest = True
                    do_save = True
            else:
                acc_list = ver_test(mbatch)
                if len(acc_list) > 0:
                    lfw_score = acc_list[0]
                    if lfw_score > highest_acc[0]:
                        highest_acc[0] = lfw_score
                        if lfw_score >= 0.998:
                            do_save = True
                    if acc_list[-1] >= highest_acc[-1]:
                        highest_acc[-1] = acc_list[-1]
                        if lfw_score >= 0.99:
                            do_save = True
                            is_highest = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt > 1:
                do_save = True
            if do_save:
                print('saving', msave)
                fname = os.path.join(args.prefix, 'model-gluon.params')
                net.save_params(fname)
                fname = os.path.join(args.prefix, 'model')
                net.export(fname, msave)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    def _batch_callback_sym(param):
        _cb(param)
        _batch_callback()

    if args.mode != 'gluon':
        model = mx.mod.Module(context=ctx, symbol=sym)
        model.fit(train_iter,
                  begin_epoch=0,
                  num_epoch=args.end_epoch,
                  eval_data=None,
                  eval_metric=metric,
                  kvstore='device',
                  optimizer=opt,
                  initializer=initializer,
                  arg_params=arg_params,
                  aux_params=aux_params,
                  allow_missing=True,
                  batch_end_callback=_batch_callback_sym,
                  epoch_end_callback=None)
    else:
        loss_weight = 1.0
        if args.task == 'age':
            loss_weight = 1.0 / AGE
        loss = nd.SoftmaxOutput
        while True:
            tic = time.time()
            train_iter.reset()
            metric.reset()
            btic = time.time()
            for i, batch in enumerate(train_iter):
                _batch_callback()
                data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
                label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
                outputs = []
                Ls = []
                with ag.record():
                    for x, y in zip(data, label):
                        if args.task == '':
                            if args.margin_a > 0.0:
                                z = net(x, y)
                            else:
                                z = net(x)
                        else:
                            z = net(x)
                        if args.task == 'gender':
                            L = loss(z[1], y)
                            Ls.append(L)
                            outputs.append(z[1])
                        elif args.task == 'age':
                            for k in range(AGE):
                                _z = nd.slice_axis(z[2], axis=1, begin=k * 2, end=k * 2 + 2)
                                _y = nd.slice_axis(y, axis=1, begin=k, end=k + 1)
                                _y = nd.flatten(_y)
                                L = loss(_z, _y)
                                Ls.append(L)
                            outputs.append(z[2])
                        else:
                            L = loss(z, y)
                            Ls.append(L)
                            outputs.append(z)
                    # store the loss and do backward after we have done forward
                    # on all GPUs for better speed on multiple GPUs.
                    ag.backward(Ls)
                n = batch.data[0].shape[0]
                trainer.step(n)
                metric.update(label, outputs)
                if i > 0 and i % 20 == 0:
                    name, acc = metric.get()
                    if len(name) == 2:
                        logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f' % (
                            num_epochs, i, args.batch_size / (time.time() - btic),
                            name[0], acc[0], name[1], acc[1]))
                    else:
                        logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f' % (
                            num_epochs, i, args.batch_size / (time.time() - btic),
                            name[0], acc[0]))
                    btic = time.time()

            epoch_time = time.time() - tic
            # The first epoch will usually be much slower than subsequent epochs,
            # so don't factor it into the average.
            if num_epochs > 0:
                total_time = total_time + epoch_time
            logger.info('[Epoch %d] time cost: %f' % (num_epochs, epoch_time))
            num_epochs = num_epochs + 1
            if num_epochs > 1:
                print('Average epoch time: {}'.format(float(total_time) / (num_epochs - 1)))
                            'momentum': 0.9,
                            'multi_precision': True},
                           kvstore=kv)

##############################################################################
# The training loop
# -----------------
#
train_loss = 0.0
epoch = 0
for i, (data, target) in enumerate(train_data):
    with autograd.record(True):
        outputs = model(data)
        losses = criterion(outputs, target)
        mx.nd.waitall()
        autograd.backward(losses)
    optimizer.step(batch_size)
    for loss in losses:
        train_loss += loss.asnumpy()[0] / len(losses)
    print('Epoch %d, batch %d, training loss %.3f' % (epoch, i, train_loss / (i + 1)))
    # just demo for 2 iters
    if i > 1:
        print('Terminated for this demo...')
        break

##############################################################################
# You can `Start Training Now`_.
#
# References
# ----------
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(
        net.collect_params(), 'sgd',
        {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum},
        update_on_kvstore=(False if args.amp else None))

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)
        for i, batch in enumerate(train_data):
            if args.dali:
                # dali iterator returns a mxnet.io.DataBatch
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [nd.cast(d.label[1], dtype='float32') for d in batch]
            else:
                data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
                cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
                box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if args.amp:
                    with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)
            if not args.horovod or hvd.rank() == 0:
                local_batch_size = int(args.batch_size // (hvd.size() if args.horovod else 1))
                ce_metric.update(0, [l * local_batch_size for l in cls_loss])
                smoothl1_metric.update(0, [l * local_batch_size for l in box_loss])
                if args.log_interval and not (i + 1) % args.log_interval:
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'
                                .format(epoch, i, args.batch_size / (time.time() - btic),
                                        name1, loss1, name2, loss2))
                btic = time.time()

        if not args.horovod or hvd.rank() == 0:
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
                epoch, (time.time() - tic), name1, loss1, name2, loss2))
            if (epoch % args.val_interval == 0) or \
                    (args.save_interval and epoch % args.save_interval == 0):
                # consider reducing the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
def train():
    """Training loop for language model."""
    print(model)
    from_epoch = 0
    model.initialize(mx.init.Xavier(factor_type='out'), ctx=context)
    trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps}
    trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params)
    if args.from_epoch:
        from_epoch = args.from_epoch
        checkpoint_name = '%s.%s' % (args.save, format(from_epoch - 1, '02d'))
        model.load_parameters(checkpoint_name)
        trainer.load_states('%s.state' % args.save)
        print('Loaded parameters from checkpoint %s' % (checkpoint_name))
    model.hybridize(static_alloc=True, static_shape=True)
    encoder_params = model.encoder.collect_params().values()
    embedding_params = list(model.embedding.collect_params().values())
    for epoch in range(from_epoch, args.epochs):
        sys.stdout.flush()
        total_L = 0.0
        start_epoch_time = time.time()
        start_log_interval_time = time.time()
        hiddens = [model.begin_state(batch_size=args.batch_size,
                                     func=mx.nd.zeros, ctx=ctx) for ctx in context]
        nbatch = 0
        has_next = True
        train_data_iter = iter(train_data)
        data, target, mask, sample = next(train_data_iter)
        while has_next:
            nbatch += 1
            hiddens = detach(hiddens)
            Ls = []
            with autograd.record():
                for j, (X, y, m, s, h) in enumerate(
                        zip(data, target, mask, sample, hiddens)):
                    output, h, new_target = model(X, y, h, s)
                    output = output.reshape((-3, -1))
                    new_target = new_target.reshape((-1,))
                    l = loss(output, new_target) * m.reshape((-1,))
                    Ls.append(l / args.batch_size)
                    hiddens[j] = h
            autograd.backward(Ls)

            # prefetch the next batch of data
            try:
                data, target, mask, sample = next(train_data_iter)
            except StopIteration:
                has_next = False

            # rescale embedding grad
            for ctx in context:
                x = embedding_params[0].grad(ctx)
                x[:] *= args.batch_size
                encoder_grad = [p.grad(ctx) for p in encoder_params]
                # perform gradient clipping per ctx
                gluon.utils.clip_global_norm(encoder_grad, args.clip)

            trainer.step(len(context))

            total_L += sum([mx.nd.sum(L).asscalar() / args.bptt for L in Ls])
            if nbatch % args.log_interval == 0:
                cur_L = total_L / args.log_interval / len(context)
                ppl = math.exp(cur_L) if cur_L < 100 else float('inf')
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f, '
                      'throughput %.2f samples/s'
                      % (epoch, nbatch, cur_L, ppl,
                         train_batch_size * args.log_interval /
                         (time.time() - start_log_interval_time)))
                total_L = 0.0
                start_log_interval_time = time.time()
                sys.stdout.flush()

        end_epoch_time = time.time()
        print('Epoch %d took %.2f seconds.' % (epoch, end_epoch_time - start_epoch_time))
        mx.nd.waitall()
        checkpoint_name = '%s.%s' % (args.save, format(epoch, '02d'))
        model.save_parameters(checkpoint_name)
        trainer.save_states('%s.state' % args.save)
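# A minimal sketch of the `detach` helper used above to truncate
# backpropagation-through-time (assumed, since it is not defined in this
# snippet): recursively detach hidden states from the computation graph so
# gradients do not flow across batch boundaries.
def detach(hidden):
    if isinstance(hidden, (tuple, list)):
        return [detach(h) for h in hidden]
    return hidden.detach()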
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    trainer = gluon.Trainer(
        net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum,
         'clip_gradient': 5})

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    rcnn_mask_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    metrics = [mx.metric.Loss('RPN_Conf'),
               mx.metric.Loss('RPN_SmoothL1'),
               mx.metric.Loss('RCNN_CrossEntropy'),
               mx.metric.Loss('RCNN_SmoothL1'),
               mx.metric.Loss('RCNN_Mask')]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    rcnn_mask_metric = MaskAccMetric()
    rcnn_fgmask_metric = MaskFGAccMetric()
    metrics2 = [rpn_acc_metric, rpn_bbox_metric,
                rcnn_acc_metric, rcnn_bbox_metric,
                rcnn_mask_metric, rcnn_fgmask_metric]

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        base_lr = trainer.learning_rate
        for i, batch in enumerate(train_data):
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info('[Epoch 0 Iteration {}] Set learning rate to {}'.format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            batch_size = len(batch[0])
            losses = []
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            with autograd.record():
                for data, label, gt_mask, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(*batch):
                    gt_label = label[:, :, 4:5]
                    gt_box = label[:, :, :4]
                    cls_pred, box_pred, mask_pred, roi, samples, matches, \
                        rpn_score, rpn_box, anchors = net(data, gt_box)
                    # losses of rpn
                    rpn_score = rpn_score.squeeze(axis=-1)
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    rpn_loss1 = rpn_cls_loss(
                        rpn_score, rpn_cls_targets,
                        rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
                    rpn_loss2 = rpn_box_loss(
                        rpn_box, rpn_box_targets, rpn_box_masks) * rpn_box.size / num_rpn_pos
                    # rpn overall loss, use sum rather than average
                    rpn_loss = rpn_loss1 + rpn_loss2
                    # generate targets for rcnn
                    cls_targets, box_targets, box_masks = net.target_generator(
                        roi, samples, matches, gt_label, gt_box)
                    # losses of rcnn
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = rcnn_cls_loss(
                        cls_pred, cls_targets,
                        cls_targets >= 0) * cls_targets.size / \
                        cls_targets.shape[0] / num_rcnn_pos
                    rcnn_loss2 = rcnn_box_loss(box_pred, box_targets, box_masks) * \
                        box_pred.size / box_pred.shape[0] / num_rcnn_pos
                    rcnn_loss = rcnn_loss1 + rcnn_loss2
                    # generate targets for mask
                    mask_targets, mask_masks = net.mask_target(roi, gt_mask, matches, cls_targets)
                    # loss of mask
                    mask_loss = rcnn_mask_loss(mask_pred, mask_targets, mask_masks) * \
                        mask_targets.size / mask_targets.shape[0] / mask_masks.sum()
                    # overall losses
                    losses.append(rpn_loss.sum() + rcnn_loss.sum() + mask_loss.sum())
                    metric_losses[0].append(rpn_loss1.sum())
                    metric_losses[1].append(rpn_loss2.sum())
                    metric_losses[2].append(rcnn_loss1.sum())
                    metric_losses[3].append(rcnn_loss2.sum())
                    metric_losses[4].append(mask_loss.sum())
                    add_losses[0].append([[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]])
                    add_losses[1].append([[rpn_box_targets, rpn_box_masks], [rpn_box]])
                    add_losses[2].append([[cls_targets], [cls_pred]])
                    add_losses[3].append([[box_targets, box_masks], [box_pred]])
                    add_losses[4].append([[mask_targets, mask_masks], [mask_pred]])
                    add_losses[5].append([[mask_targets, mask_masks], [mask_pred]])
                autograd.backward(losses)
                for metric, record in zip(metrics, metric_losses):
                    metric.update(0, record)
                for metric, records in zip(metrics2, add_losses):
                    for pred in records:
                        metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            # update metrics
            if args.log_interval and not (i + 1) % args.log_interval:
                msg = ','.join(['{}={:.3f}'.format(*metric.get())
                                for metric in metrics + metrics2])
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                    epoch, i, args.log_interval * batch_size / (time.time() - btic), msg))
                btic = time.time()

        msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
        logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
            epoch, (time.time() - tic), msg))
        if not (epoch + 1) % args.val_interval:
            # consider reducing the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args)
            val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, logger, best_map, current_map, epoch, args.save_interval, args.save_prefix)
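# A minimal sketch of the `get_lr_at_iter` warmup helper referenced above
# (assumed): ramp linearly from one third of the base learning rate up to the
# full rate as `alpha` goes from 0 to 1.
def get_lr_at_iter(alpha):
    return 1. / 3. * (1 - alpha) + alpha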
def run(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], offset_alloc_size=(64, 64), anchors={"shallow": [(10, 13), (16, 30), (33, 23)], "middle": [(30, 61), (62, 45), (59, 119)], "deep": [(116, 90), (156, 198), (373, 326)]}, graphviz=False, epoch=100, input_size=[416, 416], batch_log=100, batch_size=16, batch_interval=10, subdivision=4, train_dataset_path="Dataset/train", valid_dataset_path="Dataset/valid", multiscale=False, factor_scale=[13, 5], ignore_threshold=0.5, dynamic=False, data_augmentation=True, num_workers=4, optimizer="ADAM", save_period=5, load_period=10, learning_rate=0.001, decay_lr=0.999, decay_step=10, GPU_COUNT=0, Darknetlayer=53, pretrained_base=True, pretrained_path="modelparam", AMP=True, valid_size=8, eval_period=5, tensorboard=True, valid_graph_path="valid_Graph", using_mlflow=True, multiperclass=True, nms_thresh=0.5, nms_topk=500, iou_thresh=0.5, except_class_thresh=0.05, plot_class_thresh=0.5): if GPU_COUNT == 0: ctx = mx.cpu(0) AMP = False elif GPU_COUNT == 1: ctx = mx.gpu(0) else: ctx = [mx.gpu(i) for i in range(GPU_COUNT)] # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if isinstance(ctx, (list, tuple)): for i, c in enumerate(ctx): free_memory, total_memory = mx.context.gpu_memory_info(i) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB') else: if GPU_COUNT == 1: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB') else: logging.info(f'Running on {ctx}') # 입력 사이즈를 32의 배수로 지정해 버리기 - stride가 일그러지는 것을 막기 위함 if input_size[0] % 32 != 0 and input_size[1] % 32 != 0: logging.info("The input size must be a multiple of 32") exit(0) if GPU_COUNT > 0 and batch_size < GPU_COUNT: logging.info("batch size must be greater than gpu number") exit(0) if AMP: amp.init() if multiscale: logging.info("Using MultiScale") if data_augmentation: logging.info("Using Data Augmentation") logging.info("training YoloV3 Detector") input_shape = (1, 3) + tuple(input_size) try: net = Yolov3(Darknetlayer=Darknetlayer, anchors=anchors, pretrained=False, ctx=mx.cpu()) train_dataloader, train_dataset = traindataloader(multiscale=multiscale, factor_scale=factor_scale, augmentation=data_augmentation, path=train_dataset_path, input_size=input_size, batch_size=batch_size, batch_interval=batch_interval, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, ignore_threshold=ignore_threshold, dynamic=dynamic, from_sigmoid=False, make_target=True) valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path, input_size=input_size, batch_size=valid_size, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, ignore_threshold=ignore_threshold, dynamic=dynamic, from_sigmoid=False, make_target=True) except Exception: logging.info("dataset 없음") exit(0) train_update_number_per_epoch = len(train_dataloader) if train_update_number_per_epoch < 1: logging.warning("train batch size가 데이터 수보다 큼") exit(0) valid_list = glob.glob(os.path.join(valid_dataset_path, "*")) if valid_list: valid_update_number_per_epoch = len(valid_dataloader) if 
valid_update_number_per_epoch < 1: logging.warning("valid batch size가 데이터 수보다 큼") exit(0) num_classes = train_dataset.num_class # 클래스 수 name_classes = train_dataset.classes optimizer = optimizer.upper() if pretrained_base: model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "Dark_" + str(Darknetlayer) else: model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_Dark_" + str(Darknetlayer) weight_path = f"weights/{model}" sym_path = os.path.join(weight_path, f'{model}-symbol.json') param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params') if os.path.exists(param_path) and os.path.exists(sym_path): start_epoch = load_period logging.info(f"loading {os.path.basename(param_path)} weights\n") net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx) else: start_epoch = 0 ''' mxnet c++에서 arbitrary input image 를 받기 위한 전략 alloc_size : tuple of int, default is (128, 128) For advanced users. Define `alloc_size` to generate large enough offset maps, which will later saved in parameters. During inference, we support arbitrary input image by cropping corresponding area of the anchor map. This allow us to export to symbol so we can run it in c++, Scalar, etc. ''' net = Yolov3(Darknetlayer=Darknetlayer, input_size=input_size, anchors=anchors, num_classes=num_classes, # foreground만 pretrained=pretrained_base, pretrained_path=pretrained_path, alloc_size=offset_alloc_size, ctx=ctx) if isinstance(ctx, (list, tuple)): net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.summary(mx.nd.ones(shape=input_shape, ctx=ctx)) ''' active (bool, default True) – Whether to turn hybrid on or off. static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase. static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower. ''' if multiscale: net.hybridize(active=True, static_alloc=True, static_shape=False) else: net.hybridize(active=True, static_alloc=True, static_shape=True) if start_epoch + 1 >= epoch + 1: logging.info("this model has already been optimized") exit(0) if tensorboard: summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False) if isinstance(ctx, (list, tuple)): net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.forward(mx.nd.ones(shape=input_shape, ctx=ctx)) summary.add_graph(net) if graphviz: gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model) # optimizer unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size step = unit * decay_step lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate) for p in net.collect_params().values(): if p.grad_req != "null": p.grad_req = 'add' if AMP: ''' update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. 
''' if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False}, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False}, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False}, update_on_kvstore=False) # for Dynamic loss scaling else: logging.error("optimizer not selected") exit(0) amp.init_trainer(trainer) else: if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False}) elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False}) elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False}) else: logging.error("optimizer not selected") exit(0) loss = Yolov3Loss(sparse_label=True, from_sigmoid=False, batch_axis=None, num_classes=num_classes, reduction="sum", exclude=False) prediction = Prediction( from_sigmoid=False, num_classes=num_classes, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes) start_time = time.time() for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch): xcyc_loss_sum = 0 wh_loss_sum = 0 object_loss_sum = 0 class_loss_sum = 0 time_stamp = time.time() for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate( train_dataloader, start=1): td_batch_size = image.shape[0] image = mx.nd.split(data=image, num_outputs=subdivision, axis=0) xcyc_all = mx.nd.split(data=xcyc_all, num_outputs=subdivision, axis=0) wh_all = mx.nd.split(data=wh_all, num_outputs=subdivision, axis=0) objectness_all = mx.nd.split(data=objectness_all, num_outputs=subdivision, axis=0) class_all = mx.nd.split(data=class_all, num_outputs=subdivision, axis=0) weights_all = mx.nd.split(data=weights_all, num_outputs=subdivision, axis=0) if subdivision == 1: image = [image] xcyc_all = [xcyc_all] wh_all = [wh_all] objectness_all = [objectness_all] class_all = [class_all] weights_all = [weights_all] ''' autograd explanation: https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html ''' with autograd.record(train_mode=True): xcyc_all_losses = [] wh_all_losses = [] object_all_losses = [] class_all_losses = [] for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(image, xcyc_all, wh_all, objectness_all, class_all, weights_all): if GPU_COUNT <= 1: image_split = gluon.utils.split_and_load(image_split, [ctx], even_split=False) xcyc_split = gluon.utils.split_and_load(xcyc_split, 
[ctx], even_split=False) wh_split = gluon.utils.split_and_load(wh_split, [ctx], even_split=False) objectness_split = gluon.utils.split_and_load(objectness_split, [ctx], even_split=False) class_split = gluon.utils.split_and_load(class_split, [ctx], even_split=False) weights_split = gluon.utils.split_and_load(weights_split, [ctx], even_split=False) else: image_split = gluon.utils.split_and_load(image_split, ctx, even_split=False) xcyc_split = gluon.utils.split_and_load(xcyc_split, ctx, even_split=False) wh_split = gluon.utils.split_and_load(wh_split, ctx, even_split=False) objectness_split = gluon.utils.split_and_load(objectness_split, ctx, even_split=False) class_split = gluon.utils.split_and_load(class_split, ctx, even_split=False) weights_split = gluon.utils.split_and_load(weights_split, ctx, even_split=False) xcyc_losses = [] wh_losses = [] object_losses = [] class_losses = [] total_loss = [] # code that handles N GPUs (data parallelism) for img, xcyc_target, wh_target, objectness, class_target, weights in zip(image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split): output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target, wh_target, objectness, class_target, weights) xcyc_losses.append(xcyc_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) object_losses.append(object_loss.asscalar()) class_losses.append(class_loss.asscalar()) total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss) if AMP: with amp.scale_loss(total_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(total_loss) xcyc_all_losses.append(sum(xcyc_losses)) wh_all_losses.append(sum(wh_losses)) object_all_losses.append(sum(object_losses)) class_all_losses.append(sum(class_losses)) trainer.step(batch_size=td_batch_size, ignore_stale_grad=False) # clear the accumulated gradients for p in net.collect_params().values(): p.zero_grad() xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size wh_loss_sum += sum(wh_all_losses) / td_batch_size object_loss_sum += sum(object_all_losses) / td_batch_size class_loss_sum += sum(class_all_losses) / td_batch_size if batch_count % batch_log == 0: logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],' f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],' f'[Lr = {trainer.learning_rate}]' f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]' f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]' f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]' f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]') time_stamp = time.time() train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch) train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch) train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch) train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch) train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean logging.info( f"train xcyc loss : {train_xcyc_loss_mean} / " f"train wh loss : {train_wh_loss_mean} / " f"train object loss : {train_object_loss_mean} / " f"train class loss : {train_class_loss_mean} / " f"train total loss : {train_total_loss_mean}" ) if i % eval_period == 0 and valid_list: xcyc_loss_sum = 0 wh_loss_sum = 0 object_loss_sum = 0 class_loss_sum = 0 # compute the validation loss for image, label, xcyc_all, 
wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader: vd_batch_size, _, height, width = image.shape if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) xcyc_all = gluon.utils.split_and_load(xcyc_all, [ctx], even_split=False) wh_all = gluon.utils.split_and_load(wh_all, [ctx], even_split=False) objectness_all = gluon.utils.split_and_load(objectness_all, [ctx], even_split=False) class_all = gluon.utils.split_and_load(class_all, [ctx], even_split=False) weights_all = gluon.utils.split_and_load(weights_all, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) xcyc_all = gluon.utils.split_and_load(xcyc_all, ctx, even_split=False) wh_all = gluon.utils.split_and_load(wh_all, ctx, even_split=False) objectness_all = gluon.utils.split_and_load(objectness_all, ctx, even_split=False) class_all = gluon.utils.split_and_load(class_all, ctx, even_split=False) weights_all = gluon.utils.split_and_load(weights_all, ctx, even_split=False) xcyc_losses = [] wh_losses = [] object_losses = [] class_losses = [] total_loss = [] # code that handles N GPUs (data parallelism) for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all): gt_box = lb[:, :, :4] gt_id = lb[:, :, 4:5] output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3) precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score, gt_boxes=gt_box, gt_labels=gt_id) xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target, wh_target, objectness, class_target, weights) xcyc_losses.append(xcyc_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) object_losses.append(object_loss.asscalar()) class_losses.append(class_loss.asscalar()) total_loss.append(sum(xcyc_losses) + sum(wh_losses) + sum(object_losses) + sum(class_losses)) xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size wh_loss_sum += sum(wh_losses) / vd_batch_size object_loss_sum += sum(object_losses) / vd_batch_size class_loss_sum += sum(class_losses) / vd_batch_size valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch) valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch) valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch) valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch) valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean logging.info( f"valid xcyc loss : {valid_xcyc_loss_mean} / " f"valid wh loss : {valid_wh_loss_mean} / " f"valid object loss : {valid_object_loss_mean} / " f"valid class loss : {valid_class_loss_mean} / " f"valid total loss : {valid_total_loss_mean}" ) AP_appender = [] round_position = 2 class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list() for j, c, p, r in zip(range(len(recall)), class_name, precision, recall): name, AP = precision_recall.get_AP(c, p, r) logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%") AP_appender.append(AP) mAP_result = np.mean(AP_appender) logging.info(f"mAP : {round(mAP_result * 100, 
round_position)}%") precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall, threshold=threshold, AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i) precision_recall.reset() if tensorboard: # gpu N 개를 대비한 코드 (Data Parallelism) dataloader_iter = iter(valid_dataloader) image, label, _, _, _, _, _, _ = next(dataloader_iter) if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) ground_truth_colors = {} for k in range(num_classes): ground_truth_colors[k] = (0, 0, 1) batch_image = [] for img, lb in zip(image, label): gt_boxes = lb[:, :, :4] gt_ids = lb[:, :, 4:5] output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3) for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes): ig = ig.transpose( (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context) ig = (ig * 255).clip(0, 255) # ground truth box 그리기 ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None, reverse_rgb=True, class_names=valid_dataset.classes, absolute_coordinates=True, colors=ground_truth_colors) # prediction box 그리기 prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id, thresh=plot_class_thresh, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True) # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다. prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB) prediction_box = np.transpose(prediction_box, axes=(2, 0, 1)) batch_image.append(prediction_box) # (batch, channel, height, width) summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i) summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean, "valid_xcyc_loss": valid_xcyc_loss_mean}, global_step=i) summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean, "valid_wh_loss": valid_wh_loss_mean}, global_step=i) summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean, "valid_object_loss": valid_object_loss_mean}, global_step=i) summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean, "valid_class_loss": valid_class_loss_mean}, global_step=i) summary.add_scalar(tag="total_loss", value={ "train_total_loss": train_total_loss_mean, "valid_total_loss": valid_total_loss_mean}, global_step=i) params = net.collect_params().values() if GPU_COUNT > 1: for c in ctx: for p in params: summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default') else: for p in params: summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default') if i % save_period == 0: weight_epoch_path = os.path.join(weight_path, str(i)) if not os.path.exists(weight_epoch_path): os.makedirs(weight_epoch_path) ''' Hybrid models can be serialized as JSON files using the export function Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface. When there are only one input, it will have name data. 
When there are more than one input, they will be named data0, data1, etc. ''' if GPU_COUNT >= 1: context = mx.gpu(0) else: context = mx.cpu(0) postnet = PostNet(net=net, auxnet=prediction) try: net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True) # for onnx net.save_parameters(os.path.join(weight_path, f"{i}.params")) # for ONNX export # handles network inference, decoding and NMS in one block - convenient in MXNet C++ / cannot be exported to ONNX. export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"), block=postnet, data_shape=tuple(input_size) + tuple((3,)), epoch=i, preprocess=True, # at C++ inference time, feed the image exactly as read by OpenCV layout='HWC', ctx=context, remove_amp_cast=True) except Exception as E: logging.error(f"exception during json/param model export : {E}") else: logging.info("json/param model export succeeded") net.collect_params().reset_ctx(ctx) end_time = time.time() learning_time = end_time - start_time logging.info(f"learning time : approx. {learning_time / 3600:0.2f}H") logging.info("optimization completed") if using_mlflow: ml.log_metric("learning time", round(learning_time / 3600, 2))
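# --- Added note: run() above accumulates gradients over `subdivision` sub-batches
# (grad_req='add' plus an explicit zero_grad after each optimizer step). Below is
# a minimal, hedged sketch of that pattern with generic stand-ins; `net`,
# `loss_fn`, `trainer` and `splits` are illustrative names, not from the script.
import mxnet as mx
from mxnet import autograd, gluon

def accumulated_step(net, loss_fn, trainer, splits, batch_size):
    # assumes every trainable parameter was set up once with grad_req='add'
    with autograd.record():
        for data, label in splits:  # sub-batches of one logical batch
            autograd.backward(loss_fn(net(data), label))
    trainer.step(batch_size)        # single update for the whole logical batch
    for p in net.collect_params().values():
        if p.grad_req != 'null':
            p.zero_grad()           # clear the accumulated gradients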
metrics = ConfusMatMulticls(nb_cls=2, output="batch_stat.txt") test_metric = ConfusMatMulticls(nb_cls=2) for epoch in range(num_epochs): t0 = time.time() total_loss = 0 metrics.reset() count = 0 nbatch = 0 for data, label in train_loader: batch_size = data.shape[0] with ag.record(): preds = model(data) losses = criterion(preds, label) ag.backward(losses) total_loss += sum([l.sum().asscalar() for l in losses]) trainer.step(batch_size) metrics.update(batch=nbatch, labels=label, preds=preds) count = count + batch_size nbatch += 1 confusionMat, tps, tns, fps, fns = metrics.get() acc = (tps + tns) / (tps + tns + fps + fns) recalls = tps / ((tps + fns) + 1e-8) precisions = tps / ((tps + fps) + 1e-8) f1s = 2 * (recalls * precisions) / ((recalls + precisions) + 1e-8)
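# A standalone, hedged check of the epsilon-smoothed metrics computed above;
# the counts below are made-up numbers, not from any real run.
import numpy as np

def prf_from_counts(tps, tns, fps, fns, eps=1e-8):
    acc = (tps + tns) / (tps + tns + fps + fns)
    recalls = tps / ((tps + fns) + eps)
    precisions = tps / ((tps + fps) + eps)
    f1s = 2 * (recalls * precisions) / ((recalls + precisions) + eps)
    return acc, recalls, precisions, f1s

print(prf_from_counts(np.array([90.0]), np.array([80.0]), np.array([10.0]), np.array([20.0])))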
def train(net, train_data, val_data, eval_metric, ctx, args): """Training pipeline""" net.collect_params().reset_ctx(ctx) if args.lr_decay_period > 0: lr_decay_epoch = list( range(args.lr_decay_period, args.epochs, args.lr_decay_period)) else: lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')] lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch] num_batches = args.num_samples // args.batch_size lr_scheduler = LRSequential([ LRScheduler('linear', base_lr=0, target_lr=args.lr, nepochs=args.warmup_epochs, iters_per_epoch=num_batches), LRScheduler(args.lr_mode, base_lr=args.lr, nepochs=args.epochs - args.warmup_epochs, iters_per_epoch=num_batches, step_epoch=lr_decay_epoch, step_factor=args.lr_decay, power=2), ]) trainer = gluon.Trainer(net.collect_params(), 'sgd', { 'lr_scheduler': lr_scheduler, 'wd': args.wd, 'momentum': args.momentum }, update_on_kvstore=(False if args.amp else None)) if args.amp: amp.init_trainer(trainer) print("train_efficientdet.py-148 train classes=", classes, len(classes)) cls_box_loss = EfficientDetLoss(len(classes) + 1, rho=0.1, lambd=50.0) ce_metric = mx.metric.Loss('FocalLoss') smoothl1_metric = mx.metric.Loss('SmoothL1') # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) log_file_path = args.save_prefix + '_train.log' log_dir = os.path.dirname(log_file_path) if log_dir and not os.path.exists(log_dir): os.makedirs(log_dir) fh = logging.FileHandler(log_file_path) logger.addHandler(fh) logger.info(args) logger.info('Start training from [Epoch {}]'.format(args.start_epoch)) best_map = [0] for epoch in range(args.start_epoch + 1, args.epochs + 1): logger.info("[Epoch {}] Set learning rate to {}".format( epoch, trainer.learning_rate)) ce_metric.reset() smoothl1_metric.reset() tic = time.time() btic = time.time() net.hybridize() for i, batch in enumerate(train_data): data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0) box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0) with autograd.record(): cls_preds = [] box_preds = [] for x in data: cls_pred, box_pred, _ = net(x) cls_preds.append(cls_pred) box_preds.append(box_pred) sum_loss, cls_loss, box_loss = cls_box_loss( cls_preds, box_preds, cls_targets, box_targets) if args.amp: with amp.scale_loss(sum_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(sum_loss) # since we have already normalized the loss, we don't want to normalize # by batch-size anymore trainer.step(1) local_batch_size = int(args.batch_size) ce_metric.update(0, [l * local_batch_size for l in cls_loss]) smoothl1_metric.update(0, [l * local_batch_size for l in box_loss]) if args.log_interval and not (i + 1) % args.log_interval: name1, loss1 = ce_metric.get() name2, loss2 = smoothl1_metric.get() logger.info( '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}' .format(epoch, i, args.batch_size / (time.time() - btic), name1, loss1, name2, loss2)) btic = time.time() name1, loss1 = ce_metric.get() name2, loss2 = smoothl1_metric.get() logger.info( '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format( epoch, (time.time() - tic), name1, loss1, name2, loss2)) if (epoch % args.val_interval == 0) or (args.save_interval and epoch % args.save_interval == 0): # consider reduce the frequency of validation to save time map_name, mean_ap = validate(net, val_data, ctx, eval_metric) val_msg = '\n'.join( 
['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0. save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
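# A hedged, self-contained sketch of the warmup-then-decay schedule that the
# pipeline above builds with LRSequential; all numbers here are illustrative,
# not the script's command-line flags.
from gluoncv.utils import LRScheduler, LRSequential

iters_per_epoch = 100
schedule = LRSequential([
    LRScheduler('linear', base_lr=0, target_lr=0.001,
                nepochs=2, iters_per_epoch=iters_per_epoch),  # linear warmup
    LRScheduler('step', base_lr=0.001, nepochs=18,
                iters_per_epoch=iters_per_epoch,
                step_epoch=[10, 15], step_factor=0.1),        # step decay
])
for it in (0, 100, 199, 1000, 1600):
    print(it, schedule(it))  # inspect the learning rate at a few iterations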
running_reward = running_reward * 0.99 + t * 0.01 R = 0 for i in range(len(rewards)-1, -1, -1): R = rewards[i] + args.gamma * R rewards[i] = R rewards = np.array(rewards) rewards -= rewards.mean() rewards /= rewards.std() + np.finfo(rewards.dtype).eps # compute loss and gradient L = sum([loss(value, mx.nd.array([r])) for r, value in zip(rewards, values)]) final_nodes = [L] for logp, r, v in zip(heads, rewards, values): reward = r - v.asnumpy()[0,0] # Here we differentiate the stochastic graph, which corresponds to the # first term of equation (6) in https://arxiv.org/pdf/1506.05254.pdf # The optimizer minimizes the loss, but we want to maximize the reward, # so we use -reward here. final_nodes.append(logp*(-reward)) autograd.backward(final_nodes) trainer.step(t) if epoch % args.log_interval == 0: print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format( epoch, t, running_reward)) if running_reward > 200: print("Solved! Running reward is now {} and " "the last episode runs to {} time steps!".format(running_reward, t)) break
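# Standalone check of the reward-to-go computation above: discounted returns,
# normalized to zero mean and unit variance (eps guards against a zero std).
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    R, out = 0.0, np.zeros(len(rewards))
    for i in range(len(rewards) - 1, -1, -1):
        R = rewards[i] + gamma * R
        out[i] = R
    out -= out.mean()
    out /= out.std() + np.finfo(out.dtype).eps
    return out

print(discounted_returns([1.0, 1.0, 1.0]))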
def _train_loop(self, train_data, val_data, train_eval_data, time_limit=math.inf): start_tic = time.time() # fix seed for mxnet, numpy and python builtin random generator. gutils.random.seed(self._cfg.train.seed) # metrics obj_metrics = mx.metric.Loss('ObjLoss') center_metrics = mx.metric.Loss('BoxCenterLoss') scale_metrics = mx.metric.Loss('BoxScaleLoss') cls_metrics = mx.metric.Loss('ClassLoss') trainer = self.trainer self._logger.info('Start training from [Epoch %d]', max(self._cfg.train.start_epoch, self.epoch)) early_stopper = EarlyStopperOnPlateau( patience=self._cfg.train.early_stop_patience, min_delta=self._cfg.train.early_stop_min_delta, baseline_value=self._cfg.train.early_stop_baseline, max_value=self._cfg.train.early_stop_max_value) mean_ap = [-1] cp_name = '' self._time_elapsed += time.time() - start_tic for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch), self._cfg.train.epochs): epoch = self.epoch if self._best_map >= 1.0: self._logger.info('[Epoch {}] Early stopping as mAP is reaching 1.0'.format(epoch)) break should_stop, stop_message = early_stopper.get_early_stop_advice() if should_stop: self._logger.info('[Epoch {}] '.format(epoch) + stop_message) break tic = time.time() last_tic = time.time() if self._cfg.train.mixup: # TODO(zhreshold): more elegant way to control mixup during runtime try: train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5) except AttributeError: train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5) if epoch >= self._cfg.train.epochs - self._cfg.train.no_mixup_epochs: try: train_data._dataset.set_mixup(None) except AttributeError: train_data._dataset._data.set_mixup(None) mx.nd.waitall() self.net.hybridize() for i, batch in enumerate(train_data): btic = time.time() if self._time_elapsed > time_limit: self._logger.warning(f'`time_limit={time_limit}` reached, exit early...') return {'train_map': float(mean_ap[-1]), 'valid_map': self._best_map, 'time': self._time_elapsed, 'checkpoint': cp_name} data = gluon.utils.split_and_load(batch[0], ctx_list=self.ctx, batch_axis=0, even_split=False) # objectness, center_targets, scale_targets, weights, class_targets fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=self.ctx, batch_axis=0, even_split=False) for it in range(1, 6)] gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=self.ctx, batch_axis=0, even_split=False) sum_losses = [] obj_losses = [] center_losses = [] scale_losses = [] cls_losses = [] with autograd.record(): for ix, x in enumerate(data): obj_loss, center_loss, scale_loss, cls_loss = self.net(x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets]) sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss) obj_losses.append(obj_loss) center_losses.append(center_loss) scale_losses.append(scale_loss) cls_losses.append(cls_loss) if self._cfg.yolo3.amp: with amp.scale_loss(sum_losses, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(sum_losses) trainer.step(self.batch_size) if (not self._cfg.horovod or hvd.rank() == 0): obj_metrics.update(0, obj_losses) center_metrics.update(0, center_losses) scale_metrics.update(0, scale_losses) cls_metrics.update(0, cls_losses) if self._cfg.train.log_interval and not (i + 1) % self._cfg.train.log_interval: name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() self._logger.info( '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec,' ' {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format( epoch, i, 
trainer.learning_rate, self._cfg.train.batch_size / (time.time() - last_tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) last_tic = time.time() self._time_elapsed += time.time() - btic post_tic = time.time() if (not self._cfg.horovod or hvd.rank() == 0): name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() self._logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format( epoch, (time.time() - tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) if not (epoch + 1) % self._cfg.valid.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = self._evaluate(val_data) val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) self._logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg)) current_map = float(mean_ap[-1]) if current_map > self._best_map: cp_name = os.path.join(self._logdir, _BEST_CHECKPOINT_FILE) self._logger.info('[Epoch %d] Current best map: %f vs previous %f, saved to %s', self.epoch, current_map, self._best_map, cp_name) self.save(cp_name) self._best_map = current_map if self._reporter: self._reporter(epoch=epoch, map_reward=current_map) early_stopper.update(current_map, epoch=epoch) self._time_elapsed += time.time() - post_tic # map on train data tic = time.time() map_name, mean_ap = self._evaluate(train_eval_data) self._time_elapsed += time.time() - tic return {'train_map': float(mean_ap[-1]), 'valid_map': self._best_map, 'time': self._time_elapsed, 'checkpoint': cp_name}
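# A minimal sketch of the best-checkpoint bookkeeping used in _train_loop above;
# save_fn and the file name are placeholders for whatever the trainer exposes
# (e.g. self.save), not the original API.
import os

def maybe_save_best(save_fn, current_map, best_map, logdir, fname='best_checkpoint.pkl'):
    if current_map > best_map[0]:
        cp_name = os.path.join(logdir, fname)
        save_fn(cp_name)            # persist the improved model
        best_map[0] = current_map   # remember the new best validation mAP
        return cp_name
    return None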
def train_crnn(net, train_dataset, val_dataset=None, gpus=[7], base_lr=1e-3, momentum=.9, wd=1e-4, log_interval=50): criterion = mx.gluon.loss.CTCLoss(layout='NTC', label_layout='NT') train_loader = mx.gluon.data.DataLoader(train_dataset, shuffle=True, batch_size=16, num_workers=16) if val_dataset is not None: val_loader = mx.gluon.data.DataLoader(val_dataset, shuffle=True, batch_size=32) ctx_list = [mx.gpu(x) for x in gpus] net.collect_params().reset_ctx(ctx_list) net.hybridize(static_alloc=True, static_shape=True) trainer = mx.gluon.Trainer( net.collect_params(), 'adam', { 'learning_rate': base_lr, # 'wd': wd, # 'momentum': momentum, 'clip_gradient': 5 }) metric = mx.metric.Loss(name="ctc_loss") acc_metric = SentenceAccuMetric(name="accu") eval_metrics = mx.metric.CompositeEvalMetric() eval_metrics.add(metric) eval_metrics.add(acc_metric) btic = time.time() step = 0 for n_epoch in range(100): if n_epoch == 4: trainer.set_learning_rate(base_lr * 0.1) for n_batch, data_batch in enumerate(train_loader): data, label, label_lengths = [ x.as_in_context(ctx_list[0]).astype('f') for x in data_batch ] # label_cat = [l[:l_l.asscalar()] for l,l_l in zip(label, label_lengths)] # label_cat = mx.nd.concat(*label_cat, dim=0) # label_cat = label_cat.asnumpy() with ag.record(): y = net(data) # loss = criterion(y.reshape(1, -1, y.shape[2]), label_cat.reshape(1, -1)) # type: mx.nd.NDArray loss = criterion( y, label, mx.nd.array([y.shape[1]] * y.shape[0], ctx=y.context), label_lengths) loss = loss / data.shape[0] loss = loss.sum() ag.backward(loss) trainer.step(batch_size=1) metric.update(None, preds=loss) acc_metric.update(labels=label, preds=y) step += 1 if n_batch % 1000 == 0: save_path = "output/weight-{}-{}-{:.3f}.params".format( n_epoch, n_batch, acc_metric.get()[1]) net.collect_params().save(save_path) trainer.save_states(save_path + ".trainer") if n_batch % log_interval == 0: msg = ','.join([ '{}={:.5f}'.format(w, v) for w, v in zip(*eval_metrics.get()) ]) msg += ",lr={}".format(trainer.learning_rate) msg += ",Speed: {:.3f} samples/sec".format( (log_interval * data.shape[0]) / (time.time() - btic), ) logging.info("Epoch={},Step={},N_Batch={},".format( n_epoch, step, n_batch) + msg) btic = time.time() eval_metrics.reset() acc_metric.reset()
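# A hedged, self-contained example of the gluon CTCLoss configuration used by
# train_crnn above (layout='NTC', label_layout='NT'); shapes and labels are
# made-up values, only meant to show the call signature.
import mxnet as mx

ctc = mx.gluon.loss.CTCLoss(layout='NTC', label_layout='NT')
N, T, C = 2, 20, 11                      # batch, timesteps, vocab size + blank
pred = mx.nd.random.uniform(shape=(N, T, C))
label = mx.nd.array([[1, 2, 3, 4, 5], [1, 2, 3, 0, 0]])
pred_lengths = mx.nd.array([T] * N)
label_lengths = mx.nd.array([5, 3])
print(ctc(pred, label, pred_lengths, label_lengths))  # one loss per sequence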
def train(self, nb_epoch=1): """Train the model and update the model parameters.""" stats = dict() if self.is_worker: start_time = time.time() if self.trainer: # Imperative API for epoch in range(nb_epoch): self.train_data.reset() if self.metrics: self.metrics.reset() # metrics will accumulate for one batch batch_start_time = time.time() epoch_start_time = time.time() for i, batch in enumerate(self.train_data): data = gluon.utils.split_and_load( batch.data[0].astype("float32"), ctx_list=[mx.cpu()], batch_axis=0) label = gluon.utils.split_and_load( batch.label[0].astype("float32"), ctx_list=[mx.cpu()], batch_axis=0) outputs = [] Ls = [] from mxnet import autograd as ag with ag.record(): for x, y in zip(data, label): z = self.model(x) # forward L = self.loss(z, y) # store the loss and do backward on a batch for better speed Ls.append(L) outputs.append(z) ag.backward(Ls) self.trainer.step(batch.data[0].shape[0]) if self.metrics: self.metrics.update(label, outputs) if not (i + 1) % self.config["log_interval"]: # This would be logged on driver for each worker process. iteration_log = \ "Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f" \ % (epoch, i, self.config["batch_size"] / (time.time() - batch_start_time), "loss", Ls[0].asnumpy().mean()) if self.metrics: names, accs = self.metrics.get() names, accs = to_list(names), to_list(accs) for name, acc in zip(names, accs): iteration_log += " %s=%f" % (name, acc) self.logger.info(iteration_log) batch_start_time = time.time() # Epoch time log self.logger.info("[Epoch %d] time cost: %f" % (epoch, time.time() - epoch_start_time)) # Epoch metrics log on train data if self.metrics: epoch_train_log = "[Epoch %d] training: " % epoch names, accs = self.metrics.get() names, accs = to_list(names), to_list(accs) for name, acc in zip(names, accs): epoch_train_log += "%s=%f " % (name, acc) self.logger.info(epoch_train_log) # Epoch metrics log on validation data if any: if self.val_data: self.metrics.reset() self.val_data.reset() for batch in self.val_data: data = gluon.utils.split_and_load( batch.data[0].astype("float32", copy=False), ctx_list=[mx.cpu()], batch_axis=0) label = gluon.utils.split_and_load( batch.label[0].astype("float32", copy=False), ctx_list=[mx.cpu()], batch_axis=0) outputs = [self.model(X) for X in data] self.metrics.update(label, outputs) epoch_val_log = "[Epoch %d] validation: " % epoch names, accs = self.metrics.get() names, accs = to_list(names), to_list(accs) for name, acc in zip(names, accs): epoch_val_log += "%s=%f " % (name, acc) self.logger.info(epoch_val_log) # TODO: save checkpoints if self.metrics: names, accs = self.metrics.get() names, accs = to_list(names), to_list(accs) for name, acc in zip(names, accs): stats[name] = acc else: # Symbolic API # TODO: seems no history (i.e. validation accuracy) returned by fit? 
if "init" not in self.config: from mxnet.initializer import Uniform self.config["init"] = Uniform(0.01) # This is the default value for MXNet self.model.fit(train_data=self.train_data, num_epoch=nb_epoch, initializer=self.config["init"], kvstore=self.kv, optimizer=self.config["optimizer"], optimizer_params=self.config["optimizer_params"], eval_data=self.val_data, # TODO: eval and validation metrics could be different eval_metric=self.metrics, validation_metric=self.metrics, batch_end_callback=mx.callback.Speedometer( self.config["batch_size"], self.config["log_interval"]), epoch_end_callback=None if "model" not in self.config else mx.callback.do_checkpoint(self.config["model"])) epoch_time = time.time() - start_time stats["epoch_time"] = epoch_time return stats
def train(gt_labeling_task, epochs, base_network, classes, learning_rate, wd, momentum, model_dir, train, labels, current_host, hosts): """ Transfer learning. """ import gluoncv as gcv from gluoncv import model_zoo, data, utils # get the pretrained model and reset its classes to the Ground Truth label set model = gcv.model_zoo.get_model(base_network, classes=classes, pretrained_base=False, transfer='voc') # images and labels from Ground Truth are downloaded by SageMaker into the training instance train_dataset = GroundTruthDetectionDataset(split='train', label_path=labels, data_path=train, task=gt_labeling_task) val_dataset = GroundTruthDetectionDataset(split='val', label_path=labels, data_path=train, task=gt_labeling_task) # define the dataloader train_loader = get_dataloader(model, train_dataset, val_dataset, 512, 512, 16, 1) # check if GPUs are available ctx = [mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()] print('ctx:', ctx) # reassign parameters to context ctx model.collect_params().reset_ctx(ctx) # define the Trainer trainer = gluon.Trainer(model.collect_params(), 'sgd', { 'learning_rate': learning_rate, 'wd': wd, 'momentum': momentum }) # SSD losses: Confidence Loss (cross entropy) + Location Loss (smooth L1) mbox_loss = gcv.loss.SSDMultiBoxLoss() ce_metric = mx.metric.Loss('CrossEntropy') smoothl1_metric = mx.metric.Loss('SmoothL1') # start transfer learning for epoch in range(0, epochs): ce_metric.reset() smoothl1_metric.reset() tic = time.time() btic = time.time() # hybridize the model model.hybridize(static_alloc=True, static_shape=True) # iterate over training batches for i, batch in enumerate(train_loader): # load data on the right context batch_size = batch[0].shape[0] # splits an NDArray into len(ctx_list) slices and loads each slice onto one context data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0) box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0) # forward pass with autograd.record(): cls_preds = [] box_preds = [] for x in data: cls_pred, box_pred, _ = model(x) cls_preds.append(cls_pred) box_preds.append(box_pred) sum_loss, cls_loss, box_loss = mbox_loss( cls_preds, box_preds, cls_targets, box_targets) autograd.backward(sum_loss) # update model parameters trainer.step(1) # update and print metrics ce_metric.update(0, [l * batch_size for l in cls_loss]) smoothl1_metric.update(0, [l * batch_size for l in box_loss]) name1, loss1 = ce_metric.get() name2, loss2 = smoothl1_metric.get() if i % 1 == 0: print( '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}' .format(epoch, i, batch_size / (time.time() - btic), name1, loss1, name2, loss2)) btic = time.time() # save model model.set_nms(nms_thresh=0.45, nms_topk=400, post_nms=100) model(mx.nd.ones((1, 3, 512, 512), ctx=ctx[0])) model.export('%s/model' % model_dir) return model
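# Hedged sketch of round-tripping the model exported above: export() writes
# <prefix>-symbol.json and <prefix>-0000.params (epoch defaults to 0), which
# SymbolBlock.imports can reload; the file names here are illustrative.
import mxnet as mx
from mxnet import gluon

deserialized = gluon.SymbolBlock.imports('model-symbol.json', ['data'],
                                         'model-0000.params', ctx=mx.cpu())
out = deserialized(mx.nd.ones((1, 3, 512, 512)))  # same input shape as training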
def _worker(loss): autograd.backward(loss)
def train(self, train_data, epochs=1, batch_size=32, validation_data=None, train_resize_batch_num=None): """Train the model and update the model parameters.""" stats = dict() if self.is_worker: from zoo.orca.data.shard import RayPartition if isinstance(train_data, RayPartition): from zoo.orca.data.utils import ray_partition_get_data_label data, label = ray_partition_get_data_label( train_data.get_data(), allow_tuple=False, allow_list=False) train_data_iter = mx.io.NDArrayIter(data=data, label=label, batch_size=batch_size, shuffle=True) if train_resize_batch_num is not None: train_data_iter = mx.io.ResizeIter(train_data_iter, train_resize_batch_num) if validation_data: data_val, label_val = ray_partition_get_data_label( validation_data.get_data(), allow_tuple=False, allow_list=False) val_data_iter = mx.io.NDArrayIter(data=data_val, label=label_val, batch_size=batch_size, shuffle=True) else: val_data_iter = None else: # data_creator functions; should return Iter or DataLoader config = self.config if "batch_size" not in config: config["batch_size"] = batch_size train_data_iter = train_data(config, self.kv) val_data_iter = validation_data( config, self.kv) if validation_data else None start_time = time.time() if self.trainer: # Imperative API for epoch in range(epochs): train_data_iter.reset() if self.eval_metrics: self.eval_metrics.reset( ) # metrics will accumulate for one batch batch_start_time = time.time() epoch_start_time = time.time() for i, batch in enumerate(train_data_iter): data = gluon.utils.split_and_load( batch.data[0].astype("float32"), ctx_list=[mx.cpu()], batch_axis=0) label = gluon.utils.split_and_load( batch.label[0].astype("float32"), ctx_list=[mx.cpu()], batch_axis=0) outputs = [] Ls = [] from mxnet import autograd as ag with ag.record(): for x, y in zip(data, label): z = self.model(x) # forward L = self.loss(z, y) # store the loss and do backward on a batch for better speed Ls.append(L) outputs.append(z) ag.backward(Ls) self.trainer.step(batch.data[0].shape[0]) if self.eval_metrics: self.eval_metrics.update(label, outputs) if not (i + 1) % self.config["log_interval"]: # This would be logged on driver for each worker process. 
iteration_log = \ "Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f" \ % (epoch, i, batch_size / (time.time() - batch_start_time), "loss", Ls[0].asnumpy().mean()) if self.eval_metrics: names, accs = self.eval_metrics.get() names, accs = to_list(names), to_list(accs) for name, acc in zip(names, accs): iteration_log += " %s=%f" % (name, acc) self.logger.info(iteration_log) batch_start_time = time.time() # Epoch time log self.logger.info("[Epoch %d] time cost: %f" % (epoch, time.time() - epoch_start_time)) # Epoch metrics log on train data if self.eval_metrics: epoch_train_log = "[Epoch %d] training: " % epoch names, accs = self.eval_metrics.get() names, accs = to_list(names), to_list(accs) for name, acc in zip(names, accs): epoch_train_log += "%s=%f " % (name, acc) self.logger.info(epoch_train_log) # Epoch metrics log on validation data if any: if val_data_iter: self.val_metrics.reset() val_data_iter.reset() for batch in val_data_iter: data = gluon.utils.split_and_load( batch.data[0].astype("float32", copy=False), ctx_list=[mx.cpu()], batch_axis=0) label = gluon.utils.split_and_load( batch.label[0].astype("float32", copy=False), ctx_list=[mx.cpu()], batch_axis=0) outputs = [self.model(X) for X in data] self.val_metrics.update(label, outputs) epoch_val_log = "[Epoch %d] validation: " % epoch names, accs = self.val_metrics.get() names, accs = to_list(names), to_list(accs) for name, acc in zip(names, accs): epoch_val_log += "%s=%f " % (name, acc) self.logger.info(epoch_val_log) # TODO: save checkpoints if self.eval_metrics: names, accs = self.eval_metrics.get() names, accs = to_list(names), to_list(accs) for name, acc in zip(names, accs): stats[name] = acc else: # Symbolic API # TODO: seems no history (i.e. validation accuracy) returned by fit? if "init" not in self.config: from mxnet.initializer import Uniform self.config["init"] = Uniform( 0.01) # This is the default value for MXNet if self.eval_metrics is None: self.eval_metrics = 'acc' self.model.fit( train_data=train_data_iter, num_epoch=epochs, initializer=self.config["init"], kvstore=self.kv, optimizer=self.config["optimizer"], optimizer_params=self.config["optimizer_params"], eval_data=val_data_iter, eval_metric=self.eval_metrics, validation_metric=self.val_metrics, batch_end_callback=mx.callback.Speedometer( batch_size, self.config["log_interval"]), epoch_end_callback=None if "model" not in self.config else mx.callback.do_checkpoint(self.config["model"])) epoch_time = time.time() - start_time stats["epoch_time"] = epoch_time if isinstance(train_data, RayPartition): del train_data if validation_data and isinstance(validation_data, RayPartition): del validation_data return stats
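# A small, hedged example of the NDArrayIter construction used in this worker:
# in-memory arrays wrapped into an iterator of fixed-size batches; the data is
# random and only meant to show the API.
import numpy as np
import mxnet as mx

data = np.random.rand(100, 8).astype('float32')
label = np.random.randint(0, 2, size=(100,)).astype('float32')
it = mx.io.NDArrayIter(data=data, label=label, batch_size=32, shuffle=True)
for batch in it:
    print(batch.data[0].shape, batch.label[0].shape)
it.reset()  # rewind before the next epoch, as the training loop above does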
def train_job(net, train_data, val_data, eval_metric, ctx, args): """Training pipeline""" net.collect_params().reset_ctx(ctx) if args.no_wd: for k, v in net.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 if args.label_smooth: net._target_generator._label_smooth = True if args.lr_decay_period > 0: lr_decay_epoch = list(range(args.lr_decay_period, args.epochs, args.lr_decay_period)) else: lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')] lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch] num_batches = args.num_samples // args.batch_size lr_scheduler = LRSequential([ LRScheduler('linear', base_lr=0, target_lr=args.lr, nepochs=args.warmup_epochs, iters_per_epoch=num_batches), LRScheduler(args.lr_mode, base_lr=args.lr, nepochs=args.epochs - args.warmup_epochs, iters_per_epoch=num_batches, step_epoch=lr_decay_epoch, step_factor=args.lr_decay, power=2), ]) trainer = gluon.Trainer( net.collect_params(), 'sgd', {'wd': args.wd, 'momentum': args.momentum, 'lr_scheduler': lr_scheduler}, kvstore='local') # targets sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False) l1_loss = gluon.loss.L1Loss() # metrics obj_metrics = mx.metric.Loss('ObjLoss') center_metrics = mx.metric.Loss('BoxCenterLoss') scale_metrics = mx.metric.Loss('BoxScaleLoss') cls_metrics = mx.metric.Loss('ClassLoss') # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) log_file_path = args.save_prefix + '_train.log' log_dir = os.path.dirname(log_file_path) if log_dir and not os.path.exists(log_dir): os.makedirs(log_dir) fh = logging.FileHandler(log_file_path) logger.addHandler(fh) logger.info(args) logger.info('Start training from [Epoch {}]'.format(args.start_epoch)) best_map = [0] for epoch in range(args.start_epoch, args.epochs): if args.mixup: # TODO(zhreshold): more elegant way to control mixup during runtime try: train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5) except AttributeError: train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5) if epoch >= args.epochs - args.no_mixup_epochs: try: train_data._dataset.set_mixup(None) except AttributeError: train_data._dataset._data.set_mixup(None) tic = time.time() btic = time.time() mx.nd.waitall() net.hybridize() for i, batch in enumerate(train_data): batch_size = batch[0].shape[0] data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) # objectness, center_targets, scale_targets, weights, class_targets fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0) for it in range(1, 6)] gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0) sum_losses = [] obj_losses = [] center_losses = [] scale_losses = [] cls_losses = [] with autograd.record(): for ix, x in enumerate(data): obj_loss, center_loss, scale_loss, cls_loss = net(x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets]) sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss) obj_losses.append(obj_loss) center_losses.append(center_loss) scale_losses.append(scale_loss) cls_losses.append(cls_loss) autograd.backward(sum_losses) trainer.step(batch_size) obj_metrics.update(0, obj_losses) center_metrics.update(0, center_losses) scale_metrics.update(0, scale_losses) cls_metrics.update(0, cls_losses) if args.log_interval and not (i + 1) % args.log_interval: name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info('[Epoch {}][Batch {}], LR: {:.2E}, Speed: 
{:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format( epoch, i, trainer.learning_rate, batch_size/(time.time()-btic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) btic = time.time() name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format( epoch, (time.time()-tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) if not (epoch + 1) % args.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = validate(net, val_data, ctx, eval_metric) val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0. CWMetrics.CW_eval("yolov3-darknet53-custom", is_training=True, obj_loss=loss1, bcenter_loss=loss2, bscale_loss=loss3, class_loss=loss4, m_ap=current_map) save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
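# Minimal, hedged sketch of the split_and_load data-parallel step that train_job
# (and most loops in this file) relies on; net and loss_fn are generic
# stand-ins, not the YOLOv3 network above.
import mxnet as mx
from mxnet import autograd, gluon

def parallel_step(net, loss_fn, trainer, batch, ctx_list):
    data = gluon.utils.split_and_load(batch[0], ctx_list=ctx_list, batch_axis=0)
    label = gluon.utils.split_and_load(batch[1], ctx_list=ctx_list, batch_axis=0)
    losses = []
    with autograd.record():
        for x, y in zip(data, label):      # one slice per device
            losses.append(loss_fn(net(x), y))
    autograd.backward(losses)              # backward on every device's loss
    trainer.step(batch[0].shape[0])        # normalize by the global batch size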
def train(opt, ctx): if isinstance(ctx, mx.Context): ctx = [ctx] train_data, val_data = get_data_iters(dataset, batch_size, opt) net.collect_params().reset_ctx(ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', optimizer_params={ 'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum, 'multi_precision': True }, kvstore=kv) loss = gluon.loss.SoftmaxCrossEntropyLoss() total_time = 0 num_epochs = 0 best_acc = [0] for epoch in range(opt.start_epoch, opt.epochs): trainer = update_learning_rate(opt.lr, trainer, epoch, opt.lr_factor, lr_steps) tic = time.time() train_data.reset() metric.reset() btic = time.time() for i, batch in enumerate(train_data): data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0) label = gluon.utils.split_and_load(batch.label[0].astype( opt.dtype), ctx_list=ctx, batch_axis=0) outputs = [] Ls = [] with ag.record(): for x, y in zip(data, label): z = net(x) L = loss(z, y) # store the loss and do backward after we have done forward # on all GPUs for better speed on multiple GPUs. Ls.append(L) outputs.append(z) ag.backward(Ls) trainer.step(batch.data[0].shape[0]) metric.update(label, outputs) if opt.log_interval and not (i + 1) % opt.log_interval: name, acc = metric.get() logger.info( 'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f' % (epoch, i, batch_size / (time.time() - btic), name[0], acc[0], name[1], acc[1])) btic = time.time() epoch_time = time.time() - tic # The first epoch will usually be much slower than the subsequent epochs, # so don't factor it into the average if num_epochs > 0: total_time = total_time + epoch_time num_epochs = num_epochs + 1 name, acc = metric.get() logger.info('[Epoch %d] training: %s=%f, %s=%f' % (epoch, name[0], acc[0], name[1], acc[1])) logger.info('[Epoch %d] time cost: %f' % (epoch, epoch_time)) name, val_acc = test(ctx, val_data) logger.info('[Epoch %d] validation: %s=%f, %s=%f' % (epoch, name[0], val_acc[0], name[1], val_acc[1])) # save the model if it meets the requirements save_checkpoint(epoch, val_acc[0], best_acc) if num_epochs > 1: print('Average epoch time: {}'.format( float(total_time) / (num_epochs - 1)))
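# The update_learning_rate helper is not shown in this snippet; a plausible
# implementation (an assumption, not the original) multiplies the base lr by
# lr_factor once for every boundary in lr_steps the epoch has already passed.
def update_learning_rate(base_lr, trainer, epoch, lr_factor, lr_steps):
    lr = base_lr
    for step in lr_steps:
        if epoch >= step:
            lr *= lr_factor
    trainer.set_learning_rate(lr)
    return trainer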
def train(net, train_data, train_dataset, val_data, eval_metric, ctx, save_prefix, start_epoch, num_samples): """Training pipeline""" net.collect_params().reset_ctx(ctx) if FLAGS.no_wd: for k, v in net.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 if FLAGS.label_smooth: net._target_generator._label_smooth = True if FLAGS.lr_decay_period > 0: lr_decay_epoch = list(range(FLAGS.lr_decay_period, FLAGS.epochs, FLAGS.lr_decay_period)) else: lr_decay_epoch = FLAGS.lr_decay_epoch # for handling reloading from past epoch lr_decay_epoch_tmp = list() for e in lr_decay_epoch: if int(e) <= start_epoch: FLAGS.lr = FLAGS.lr * FLAGS.lr_decay else: lr_decay_epoch_tmp.append(int(e) - start_epoch - FLAGS.warmup_epochs) lr_decay_epoch = lr_decay_epoch_tmp num_batches = num_samples // FLAGS.batch_size lr_scheduler = LRSequential([ LRScheduler('linear', base_lr=0, target_lr=FLAGS.lr, nepochs=FLAGS.warmup_epochs, iters_per_epoch=num_batches), LRScheduler(FLAGS.lr_mode, base_lr=FLAGS.lr, nepochs=FLAGS.epochs - FLAGS.warmup_epochs - start_epoch, iters_per_epoch=num_batches, step_epoch=lr_decay_epoch, step_factor=FLAGS.lr_decay, power=2), ]) trainer = gluon.Trainer( net.collect_params(), 'sgd', {'wd': FLAGS.wd, 'momentum': FLAGS.momentum, 'lr_scheduler': lr_scheduler}, kvstore='local') # targets sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False) l1_loss = gluon.loss.L1Loss() # metrics obj_metrics = mx.metric.Loss('ObjLoss') center_metrics = mx.metric.Loss('BoxCenterLoss') scale_metrics = mx.metric.Loss('BoxScaleLoss') cls_metrics = mx.metric.Loss('ClassLoss') # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) log_file_path = save_prefix + '_train.log' log_dir = os.path.dirname(log_file_path) if log_dir and not os.path.exists(log_dir): os.makedirs(log_dir) fh = logging.FileHandler(log_file_path) logger.addHandler(fh) # logger.info(FLAGS) # set up tensorboard summary writer tb_sw = SummaryWriter(log_dir=os.path.join(log_dir, 'tb'), comment=FLAGS.save_prefix) # Check if wanting to resume logger.info('Start training from [Epoch {}]'.format(start_epoch)) if FLAGS.resume.strip() and os.path.exists(save_prefix+'_best_map.log'): with open(save_prefix+'_best_map.log', 'r') as f: lines = [line.split()[1] for line in f.readlines()] best_map = [float(lines[-1])] else: best_map = [0] # Training loop num_batches = int(len(train_dataset)/FLAGS.batch_size) for epoch in range(start_epoch, FLAGS.epochs+1): st = time.time() if FLAGS.mixup: # TODO(zhreshold): more elegant way to control mixup during runtime try: train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5) except AttributeError: train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5) if epoch >= FLAGS.epochs - FLAGS.no_mixup_epochs: try: train_data._dataset.set_mixup(None) except AttributeError: train_data._dataset._data.set_mixup(None) tic = time.time() btic = time.time() if not FLAGS.nd_only: net.hybridize() for i, batch in enumerate(train_data): batch_size = batch[0].shape[0] if FLAGS.max_epoch_time > 0 and (time.time()-st)/60 > FLAGS.max_epoch_time: logger.info('Max epoch time of %d minutes reached after completing %d%% of epoch. 
' 'Moving on to next epoch' % (FLAGS.max_epoch_time, int(100*(i/num_batches)))) break if FLAGS.features_dir is not None: f1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) f2 = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0) f3 = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0) # objectness, center_targets, scale_targets, weights, class_targets fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0) for it in range(3, 8)] gt_boxes = gluon.utils.split_and_load(batch[8], ctx_list=ctx, batch_axis=0) else: data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) # objectness, center_targets, scale_targets, weights, class_targets fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0) for it in range(1, 6)] gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0) sum_losses = [] obj_losses = [] center_losses = [] scale_losses = [] cls_losses = [] if FLAGS.features_dir is not None: with autograd.record(): for ix, (x1, x2, x3) in enumerate(zip(f1, f2, f3)): obj_loss, center_loss, scale_loss, cls_loss = net(x1, x2, x3, gt_boxes[ix], *[ft[ix] for ft in fixed_targets]) sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss) obj_losses.append(obj_loss) center_losses.append(center_loss) scale_losses.append(scale_loss) cls_losses.append(cls_loss) autograd.backward(sum_losses) else: with autograd.record(): for ix, x in enumerate(data): obj_loss, center_loss, scale_loss, cls_loss = net(x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets]) sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss) obj_losses.append(obj_loss) center_losses.append(center_loss) scale_losses.append(scale_loss) cls_losses.append(cls_loss) autograd.backward(sum_losses) if FLAGS.motion_stream is None: trainer.step(batch_size) else: trainer.step(batch_size, ignore_stale_grad=True) # we don't use all layers of each stream obj_metrics.update(0, obj_losses) center_metrics.update(0, center_losses) scale_metrics.update(0, scale_losses) cls_metrics.update(0, cls_losses) if FLAGS.log_interval and not (i + 1) % FLAGS.log_interval: name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info('[Epoch {}][Batch {}/{}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, ' '{}={:.3f}, {}={:.3f}'.format(epoch, i, num_batches, trainer.learning_rate, batch_size/(time.time()-btic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) tb_sw.add_scalar(tag='Training_' + name1, scalar_value=loss1, global_step=(epoch * len(train_data) + i)) tb_sw.add_scalar(tag='Training_' + name2, scalar_value=loss2, global_step=(epoch * len(train_data) + i)) tb_sw.add_scalar(tag='Training_' + name3, scalar_value=loss3, global_step=(epoch * len(train_data) + i)) tb_sw.add_scalar(tag='Training_' + name4, scalar_value=loss4, global_step=(epoch * len(train_data) + i)) btic = time.time() name1, loss1 = obj_metrics.get() name2, loss2 = center_metrics.get() name3, loss3 = scale_metrics.get() name4, loss4 = cls_metrics.get() logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format( epoch, (time.time()-tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4)) if not (epoch + 1) % FLAGS.val_interval: # consider reduce the frequency of validation to save time logger.info('End Epoch {}: # samples: {}, seconds: {}, samples/sec: {:.2f}'.format( epoch, len(train_data)*batch_size, 
time.time() - st, (len(train_data)*batch_size)/(time.time() - st))) st = time.time() map_name, mean_ap = validate(net, val_data, ctx, eval_metric) logger.info('End Val: # samples: {}, seconds: {}, samples/sec: {:.2f}'.format( len(val_data)*batch_size, time.time() - st, (len(val_data) * batch_size)/(time.time() - st))) val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) tb_sw.add_scalar(tag='Validation_mAP', scalar_value=float(mean_ap[-1]), global_step=(epoch * len(train_data) + i)) logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0. save_params(net, best_map, current_map, epoch, FLAGS.save_interval, save_prefix)
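# Hedged sketch of the scalar logging used above; the add_scalar signature
# matches tensorboardX's SummaryWriter, and the path, tag and values here are
# illustrative only.
from tensorboardX import SummaryWriter

tb_sw = SummaryWriter(log_dir='./logs/tb', comment='demo')
for step, loss_val in enumerate([0.9, 0.7, 0.6]):
    tb_sw.add_scalar(tag='Training_ObjLoss', scalar_value=loss_val, global_step=step)
tb_sw.close()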
def train(train_epoch=20): dataset = gcv.data.RecordFileDetection('train.rec') print(dataset) classes = ['mercedes', 'person', 'car'] # three foreground classes here image, label = dataset[10] print('label:', label) # display image and label #ax = viz.plot_bbox(image, bboxes=label[:, :4], labels=label[:, 4:5], class_names=classes) #plt.show() # load the model net = gcv.model_zoo.get_model('ssd_512_mobilenet1.0_voc', pretrained=True) print('old classes', net.classes) net.reset_class(classes) net = gcv.model_zoo.get_model('ssd_512_mobilenet1.0_custom', classes=classes, pretrained_base=False, transfer='voc') train_data = get_dataloader(net, dataset, 512, 16, 0) try: a = mx.nd.zeros((1, ), ctx=mx.gpu(0)) ctx = [mx.gpu(0)] except Exception: ctx = [mx.cpu()] net.collect_params().reset_ctx(ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', { 'learning_rate': 0.001, 'wd': 0.0005, 'momentum': 0.9 }) mbox_loss = gcv.loss.SSDMultiBoxLoss() ce_metric = mx.metric.Loss('CrossEntropy') smoothl1_metric = mx.metric.Loss('SmoothL1') for epoch in range(0, train_epoch): ce_metric.reset() smoothl1_metric.reset() tic = time.time() btic = time.time() net.hybridize(static_alloc=True, static_shape=True) for i, batch in enumerate(train_data): batch_size = batch[0].shape[0] data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0) box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0) with autograd.record(): cls_preds = [] box_preds = [] for x in data: cls_pred, box_pred, _ = net(x) cls_preds.append(cls_pred) box_preds.append(box_pred) sum_loss, cls_loss, box_loss = mbox_loss( cls_preds, box_preds, cls_targets, box_targets) autograd.backward(sum_loss) # since we have already normalized the loss, we don't want to normalize # by batch-size anymore trainer.step(1) ce_metric.update(0, [l * batch_size for l in cls_loss]) smoothl1_metric.update(0, [l * batch_size for l in box_loss]) name1, loss1 = ce_metric.get() name2, loss2 = smoothl1_metric.get() if i % 20 == 0: print( '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}' .format(epoch, i, batch_size / (time.time() - btic), name1, loss1, name2, loss2)) #net.save_parameters('ssd_512_mobilenet1.0_benz.params') btic = time.time() net.save_parameters('ssd_512_mobilenet1.0_benz.params')
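# Hedged sketch of reusing the parameters saved above for inference; the image
# file name is a placeholder, and the transform is gluoncv's standard SSD preset.
import gluoncv as gcv
import mxnet as mx

net = gcv.model_zoo.get_model('ssd_512_mobilenet1.0_custom',
                              classes=['mercedes', 'person', 'car'],
                              pretrained_base=False)
net.load_parameters('ssd_512_mobilenet1.0_benz.params')
x, image = gcv.data.transforms.presets.ssd.load_test('test.jpg', 512)
ids, scores, bboxes = net(x)  # class ids, confidences, and boxes per detection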
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    import gluoncv as gcv
    gcv.utils.check_version('0.6.0')
    from gluoncv import data as gdata
    from gluoncv import utils as gutils
    from gluoncv.model_zoo import get_model
    from gluoncv.data.batchify import Tuple, Stack, Pad
    from gluoncv.data.transforms.presets.yolo import YOLO3DefaultTrainTransform
    from gluoncv.data.transforms.presets.yolo import YOLO3DefaultValTransform
    from gluoncv.data.dataloader import RandomTransformDataLoader
    from gluoncv.utils.metrics.voc_detection import VOC07MApMetric
    from gluoncv.utils.metrics.coco_detection import COCODetectionMetric
    from gluoncv.utils import LRScheduler, LRSequential

    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0
    if args.label_smooth:
        net._target_generator._label_smooth = True
    if args.lr_decay_period > 0:
        lr_decay_epoch = list(range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch]
    num_batches = args.num_samples // args.batch_size
    batch_size = args.batch_size  # trainer.step below normalizes by the global batch size
    lr_scheduler = LRSequential([
        LRScheduler('linear', base_lr=0, target_lr=args.lr,
                    nepochs=args.warmup_epochs, iters_per_epoch=num_batches),
        LRScheduler(args.lr_mode, base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay, power=2),
    ])
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(net.collect_params(), 'sgd', {
            'wd': args.wd, 'momentum': args.momentum, 'lr_scheduler': lr_scheduler})
    else:
        trainer = gluon.Trainer(
            net.collect_params(), 'sgd',
            {'wd': args.wd, 'momentum': args.momentum, 'lr_scheduler': lr_scheduler},
            kvstore='local', update_on_kvstore=(False if args.amp else None))
    if args.amp:
        amp.init_trainer(trainer)

    # targets
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.num_epochs):
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.num_epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0)
                             for it in range(1, 6)]
            gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                if args.amp:
                    with amp.scale_loss(sum_losses, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_losses)
            trainer.step(batch_size)
            if not args.horovod or hvd.rank() == 0:
                obj_metrics.update(0, obj_losses)
                center_metrics.update(0, center_losses)
                scale_metrics.update(0, scale_losses)
                cls_metrics.update(0, cls_losses)
                if args.log_interval and not (i + 1) % args.log_interval:
                    name1, loss1 = obj_metrics.get()
                    name2, loss2 = center_metrics.get()
                    name3, loss3 = scale_metrics.get()
                    name4, loss4 = cls_metrics.get()
                    logger.info(
                        '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, '
                        '{}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                            epoch, i, trainer.learning_rate,
                            args.batch_size / (time.time() - btic),
                            name1, loss1, name2, loss2, name3, loss3, name4, loss4))
                btic = time.time()
        if not args.horovod or hvd.rank() == 0:
            name1, loss1 = obj_metrics.get()
            name2, loss2 = center_metrics.get()
            name3, loss3 = scale_metrics.get()
            name4, loss4 = cls_metrics.get()
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                    epoch, (time.time() - tic), name1, loss1, name2, loss2,
                    name3, loss3, name4, loss4))
            if not (epoch + 1) % args.val_interval:
                # consider reducing the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)

    # save model
    net.set_nms(nms_thresh=0.45, nms_topk=400, post_nms=100)
    net(mx.nd.ones((1, 3, args.data_shape, args.data_shape), ctx=ctx[0]))
    net.export('%s/model' % os.environ['SM_MODEL_DIR'])
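# NOTE: `save_params` is used by several of the training loops in this file
# but is not defined in this excerpt. A minimal sketch, assuming the usual
# GluonCV checkpointing convention: the best mAP is tracked in a one-element
# list and periodic snapshots are written alongside the best model. (The
# Faster R-CNN variant further below passes an extra `logger` argument; the
# idea is the same.)
def save_params(net, best_map, current_map, epoch, save_interval, prefix):
    current_map = float(current_map)
    if current_map > best_map[0]:
        best_map[0] = current_map
        net.save_parameters('{:s}_best.params'.format(prefix))
        with open(prefix + '_best_map.log', 'a') as f:
            f.write('{:04d}:\t{:.4f}\n'.format(epoch, current_map))
    if save_interval and epoch % save_interval == 0:
        net.save_parameters('{:s}_{:04d}_{:.4f}.params'.format(prefix, epoch, current_map))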
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    trainer = gluon.Trainer(
        net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {'learning_rate': args.lr,
         'wd': args.wd,
         'momentum': args.momentum,
         'clip_gradient': 5})

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # TODO(zhreshold) losses?
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    metrics = [mx.metric.Loss('RPN_Conf'),
               mx.metric.Loss('RPN_SmoothL1'),
               mx.metric.Loss('RCNN_CrossEntropy'),
               mx.metric.Loss('RCNN_SmoothL1')]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric]

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        mix_ratio = 1.0
        if args.mixup:
            # TODO(zhreshold) only evenly-weighted mixup is supported for now;
            # the target generator needs to be modified otherwise
            train_data._dataset.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            if epoch >= args.epochs - args.no_mixup_epochs:
                train_data._dataset.set_mixup(None)
                mix_ratio = 1.0
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True)
        base_lr = trainer.learning_rate
        for i, batch in enumerate(train_data):
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info('[Epoch 0 Iteration {}] Set learning rate to {}'.format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            batch_size = len(batch[0])
            losses = []
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            with autograd.record():
                for data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(*batch):
                    gt_label = label[:, :, 4:5]
                    gt_box = label[:, :, :4]
                    cls_pred, box_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net(data, gt_box)
                    # losses of rpn
                    rpn_score = rpn_score.squeeze(axis=-1)
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    rpn_loss1 = rpn_cls_loss(rpn_score, rpn_cls_targets,
                                             rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
                    rpn_loss2 = rpn_box_loss(rpn_box, rpn_box_targets,
                                             rpn_box_masks) * rpn_box.size / num_rpn_pos
                    # rpn overall loss, use sum rather than average
                    rpn_loss = rpn_loss1 + rpn_loss2
                    # generate targets for rcnn
                    cls_targets, box_targets, box_masks = net.target_generator(
                        roi, samples, matches, gt_label, gt_box)
                    # losses of rcnn
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = rcnn_cls_loss(cls_pred, cls_targets,
                                               cls_targets >= 0) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
                    rcnn_loss2 = rcnn_box_loss(box_pred, box_targets,
                                               box_masks) * box_pred.size / box_pred.shape[0] / num_rcnn_pos
                    rcnn_loss = rcnn_loss1 + rcnn_loss2
                    # overall losses
                    losses.append(rpn_loss.sum() * mix_ratio + rcnn_loss.sum() * mix_ratio)
                    metric_losses[0].append(rpn_loss1.sum() * mix_ratio)
                    metric_losses[1].append(rpn_loss2.sum() * mix_ratio)
                    metric_losses[2].append(rcnn_loss1.sum() * mix_ratio)
                    metric_losses[3].append(rcnn_loss2.sum() * mix_ratio)
                    add_losses[0].append([[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]])
                    add_losses[1].append([[rpn_box_targets, rpn_box_masks], [rpn_box]])
                    add_losses[2].append([[cls_targets], [cls_pred]])
                    add_losses[3].append([[box_targets, box_masks], [box_pred]])
                autograd.backward(losses)
                for metric, record in zip(metrics, metric_losses):
                    metric.update(0, record)
                for metric, records in zip(metrics2, add_losses):
                    for pred in records:
                        metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            # update metrics
            if args.log_interval and not (i + 1) % args.log_interval:
                # msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
                msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics + metrics2])
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                    epoch, i, args.log_interval * batch_size / (time.time() - btic), msg))
                btic = time.time()

        msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
        logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(epoch, (time.time() - tic), msg))
        # validation is disabled in this variant; the hook is kept for reference
        # if not (epoch + 1) % args.val_interval:
        #     # consider reducing the frequency of validation to save time
        #     map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
        #     val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
        #     logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
        #     current_map = float(mean_ap[-1])
        # else:
        #     current_map = 0.
        current_map = 0
        save_params(net, logger, best_map, current_map, epoch, args.save_interval, args.save_prefix)
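# NOTE: `get_lr_at_iter` drives the warmup above but is not defined in this
# excerpt. A minimal sketch, assuming the GluonCV Faster R-CNN convention of
# ramping linearly from one third of the base learning rate up to the full
# value over the warmup iterations:
def get_lr_at_iter(alpha):
    # alpha goes from 0 to 1 across the warmup window
    return 1. / 3. * (1 - alpha) + alpha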
box_targets = gluon.utils.split_and_load(batch[3], ctx_list=ctx, batch_axis=0)
with autograd.record():
    cls_preds = []
    ori_preds = []
    box_preds = []
    for x in data:
        cls_pred, ori_pred, box_pred, _ = net(x)
        cls_preds.append(cls_pred)
        ori_preds.append(ori_pred)
        box_preds.append(box_pred)
    sum_loss, cls_loss, ori_loss, box_loss = mbox_loss(
        cls_preds, ori_preds, box_preds, cls_targets, ori_targets, box_targets)
    autograd.backward(sum_loss)
# since we have already normalized the loss, we don't want to normalize
# by batch-size anymore
trainer.step(1)
ce_metric.update(0, [l * batch_size for l in cls_loss])
ori_ce_metric.update(0, [l * batch_size for l in ori_loss])
smoothl1_metric.update(0, [l * batch_size for l in box_loss])
name1, loss1 = ce_metric.get()
name3, loss3 = ori_ce_metric.get()
name2, loss2 = smoothl1_metric.get()
pbar.set_postfix({
    'loss': '{0:1.5f}'.format(loss1 + loss2 + loss3),
    'loss_ce': '{0:1.4f}'.format(loss1),
def train(net, train_data, val_data, eval_metric, polygon_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0
    if args.label_smooth:
        net._target_generator._label_smooth = True
    if args.lr_decay_period > 0:
        lr_decay_epoch = list(range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_decay_epoch,
                               step_factor=args.lr_decay,
                               power=2,
                               warmup_epochs=args.warmup_epochs)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'wd': args.wd,
        'momentum': args.momentum,
        'lr_scheduler': lr_scheduler
    }, kvstore='local')

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    coef_metrics = mx.metric.Loss('CoefLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        # net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0)
                             for it in range(1, 7)]
            gt_boxes = gluon.utils.split_and_load(batch[7], ctx_list=ctx, batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            # coef_center_losses = []
            coef_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, coef_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    if args.only_bbox:
                        sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss)
                    else:
                        sum_losses.append(obj_loss + center_loss + scale_loss + coef_loss + cls_loss)
                        # coef_center_losses.append(coef_center_loss)
                        coef_losses.append(coef_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            if not args.only_bbox:
                # coef_center_metrics.update(0, coef_center_losses)
                coef_metrics.update(0, coef_losses)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                if not args.only_bbox:
                    # name4, loss4 = coef_center_metrics.get()
                    name5, loss5 = coef_metrics.get()
                name6, loss6 = cls_metrics.get()
                if args.only_bbox:
                    logger.info(
                        '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, '
                        '{}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                            epoch, i, trainer.learning_rate,
                            batch_size / (time.time() - btic),
                            name1, loss1, name2, loss2, name3, loss3, name6, loss6))
                else:
                    logger.info(
                        '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, '
                        '{}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                            epoch, i, trainer.learning_rate,
                            batch_size / (time.time() - btic),
                            name1, loss1, name2, loss2, name3, loss3,
                            name5, loss5, name6, loss6))
                btic = time.time()
            break  # save the model after a single batch, for speed testing

        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        if not args.only_bbox:
            # name4, loss4 = coef_center_metrics.get()
            name5, loss5 = coef_metrics.get()
        name6, loss6 = cls_metrics.get()
        if args.only_bbox:
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                    epoch, (time.time() - tic), name1, loss1, name2, loss2, name3, loss3, name6, loss6))
        else:
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                    epoch, (time.time() - tic), name1, loss1, name2, loss2, name3, loss3,
                    name5, loss5, name6, loss6))
        # the leading `False` disables validation here on purpose
        if False and not epoch % args.val_interval:
            # consider reducing the frequency of validation to save time
            map_bbox, map_polygon = validate(net, val_data, ctx, eval_metric, polygon_metric, args)
            map_name, mean_ap = map_bbox
            polygonmap_name, polygonmean_ap = map_polygon
            val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            polygonval_msg = '\n'.join(['{}={}'.format(k, v)
                                        for k, v in zip(polygonmap_name, polygonmean_ap)])
            logger.info('[Epoch {}] PolygonValidation: \n{}'.format(epoch, polygonval_msg))
            current_map = float(polygonmean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
def train(self): print("Training has begun....") episode_rewards = 0 final_rewards = 0 running_reward = 10 train_episodes_finished = 0 train_scores = [0] num_action_index = 0 for episode in range(0, self.env.episodes): # modify this line below env.reset should send back the next pack of 8 frames # we could use instead of env.reset the preprocess function self.action_server.reset_last_action() next_frame_bundle = self.env.reset() s1 = next_frame_bundle #update the number of steps depending on number of episodes if episode < 100: self.env.local_learning_steps = self.env.learning_steps elif episode < 200: self.env.local_learning_steps = self.env.learning_steps * 2 elif episode < 300: self.env.local_learning_steps = self.env.learning_steps * 3 else: self.env.local_learning_steps = self.env.learning_steps * 4 rewards = [] values = [] actions = [] heads = [] with autograd.record(): for learning_step in range(self.env.local_learning_steps): # Converts and down-samples the input image prob, value = self.model(s1) # dont always take the argmax, instead pick randomly based on probability index, logp = mx.nd.sample_multinomial(prob, get_prob=True) action = index.asnumpy()[0].astype(np.int64) # self.actions.append(self.env.action_map[action]) self.actions.append(action) # print('#', num_action_index,': ' , 'action Number: ', action, self.env.action_space[action]) num_action_index += 1 # skip frames reward = 0 # env step could be a set of funtions: # a function that packages 8 frames # a function that sends back the optical flow # when these two functions returns something we can set done (below) to true # not sure about the underscore next_frame_bundle, rew, done = self.env.step(action) reward += rew print( "EP: {:<5} | STEP {:<3} | ACTION: {:<8} | REWARD: {:4f}" .format(episode, learning_step, self.env.action_space[action], rew)) isterminal = done rewards.append(reward) actions.append(action) values.append(value) heads.append(logp) if isterminal: #print("finished_game") break s1 = next_frame_bundle if not isterminal else None train_scores.append(np.sum(rewards)) # reverse accumulate and normalize rewards R = 0 for i in range(len(rewards) - 1, -1, -1): R = rewards[i] + self.gamma * R rewards[i] = R rewards = np.array(rewards) rewards -= rewards.mean() rewards /= rewards.std() + np.finfo(rewards.dtype).eps # compute loss and gradient L = sum([ self.loss(value, mx.nd.array([r]).as_in_context(self.ctx)) for r, value in zip(rewards, values) ]) final_nodes = [L] for logp, r, v in zip(heads, rewards, values): reward = r - v.asnumpy()[0, 0] # Here we differentiate the stochastic graph, corresponds to the # first term of equation (6) in https://arxiv.org/pdf/1506.05254.pdf # Optimizer minimizes the loss but we want to maximizing the reward, # so use we use -reward here. final_nodes.append(logp * (-reward)) autograd.backward(final_nodes) self.optimizer.step(s1.shape[0]) if episode % self.env.display_count == 0: train_scores = np.array(train_scores) print( "Episodes {}\t".format(episode), "Results: mean: %.1f +/- %.1f," % (train_scores.mean(), train_scores.std()), "min: %.1f," % train_scores.min(), "max: %.1f," % train_scores.max(), "actions: ", np.unique(actions, return_counts=True)) train_scores = [] if episode % 5 == 0 and episode != 0: self.model.save_params("./params/mkEpisodes_%d.params" % episode) pass
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    '''
    AMP does not support every operator; in particular, modulated (deformable v2)
    convolution is not supported.
    '''
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # log the operating system
    logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()
    if multiscale:
        logging.info("Using MultiScale")
    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)
    scale_factor = 4  # fixed
    logging.info(f"scale factor {scale_factor}")

    try:
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale, factor_scale=factor_scale,
            augmentation=data_augmentation, path=train_dataset_path,
            input_size=input_size, batch_size=batch_size,
            batch_interval=batch_interval, num_workers=num_workers,
            shuffle=True, mean=mean, std=std,
            scale_factor=scale_factor, make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path, input_size=input_size,
            batch_size=valid_size, num_workers=num_workers,
            shuffle=True, mean=mean, std=std,
            scale_factor=scale_factor, make_target=True)
    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size is larger than the number of training samples")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size is larger than the number of validation samples")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes
    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([
                            ('heatmap', {'num_output': num_classes, 'bias': -2.19}),
                            ('offset', {'num_output': 2}),
                            ('wh', {'num_output': 2})]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)

    if isinstance(ctx, (list, tuple)):
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
    else:
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

    '''
    active (bool, default True) - Whether to turn hybrid on or off.
    static_alloc (bool, default False) - Statically allocate memory to improve speed.
        Memory usage may increase.
    static_shape (bool, default False) - Optimize for invariant input shapes between
        iterations. Must also set static_alloc to True. Change of input shapes is
        still allowed but slower.
    '''
    if multiscale:
        net.hybridize(active=True, static_alloc=True, static_shape=False)
    else:
        net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10, flush_secs=10, verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr,
                                             stop_factor_lr=1e-12, base_lr=learning_rate)

    # accumulate gradients across the subdivisions below
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
            Whether to perform parameter updates on kvstore. If None, then the trainer
            will choose the more suitable option depending on the type of kvstore.
            If the `update_on_kvstore` argument is provided, the environment variable
            `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(), optimizer,
                optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch,
                                  "beta1": 0.9, "beta2": 0.999, 'multi_precision': False},
                update_on_kvstore=False)  # for dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(), optimizer,
                optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch,
                                  "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False},
                update_on_kvstore=False)  # for dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(), optimizer,
                optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch,
                                  "wd": 0.0001, "momentum": 0.9, 'multi_precision': False},
                update_on_kvstore=False)  # for dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)
        amp.init_trainer(trainer)
    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(), optimizer,
                optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch,
                                  "beta1": 0.9, "beta2": 0.999, 'multi_precision': False})
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(), optimizer,
                optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch,
                                  "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False})
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(), optimizer,
                optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch,
                                  "wd": 0.0001, "momentum": 0.9, 'multi_precision': False})
        else:
            logging.error("optimizer not selected")
            exit(0)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size, topk=topk, scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):
        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()

        # building the targets inside the train dataloader is much faster for training
        for batch_count, (image, _, heatmap, offset_target, wh_target,
                          mask_target, _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]
            image_split = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            heatmap_split = mx.nd.split(data=heatmap, num_outputs=subdivision, axis=0)
            offset_target_split = mx.nd.split(data=offset_target, num_outputs=subdivision, axis=0)
            wh_target_split = mx.nd.split(data=wh_target, num_outputs=subdivision, axis=0)
            mask_target_split = mx.nd.split(data=mask_target, num_outputs=subdivision, axis=0)
            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]

            # autograd explanation:
            # https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            with autograd.record(train_mode=True):
                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []
                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):
                    if GPU_COUNT <= 1:
                        image_part = gluon.utils.split_and_load(image_part, [ctx], even_split=False)
                        heatmap_part = gluon.utils.split_and_load(heatmap_part, [ctx], even_split=False)
                        offset_target_part = gluon.utils.split_and_load(offset_target_part, [ctx], even_split=False)
                        wh_target_part = gluon.utils.split_and_load(wh_target_part, [ctx], even_split=False)
                        mask_target_part = gluon.utils.split_and_load(mask_target_part, [ctx], even_split=False)
                    else:
                        image_part = gluon.utils.split_and_load(image_part, ctx, even_split=False)
                        heatmap_part = gluon.utils.split_and_load(heatmap_part, ctx, even_split=False)
                        offset_target_part = gluon.utils.split_and_load(offset_target_part, ctx, even_split=False)
                        wh_target_part = gluon.utils.split_and_load(wh_target_part, ctx, even_split=False)
                        mask_target_part = gluon.utils.split_and_load(mask_target_part, ctx, even_split=False)

                    # prediction, target space for data parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # handles N GPUs (data parallelism)
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part,
                            wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size
                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())
                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # clear the accumulated gradients
            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                    f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                    f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum, train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean
        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / "
            f"train offset loss : {train_offset_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train total loss : {train_total_loss_mean}")

        if i % eval_period == 0 and valid_list:
            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # compute the validation losses
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                    heatmap_split = gluon.utils.split_and_load(heatmap_all, [ctx], even_split=False)
                    offset_target_split = gluon.utils.split_and_load(offset_target_all, [ctx], even_split=False)
                    wh_target_split = gluon.utils.split_and_load(wh_target_all, [ctx], even_split=False)
                    mask_target_split = gluon.utils.split_and_load(mask_target_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)
                    heatmap_split = gluon.utils.split_and_load(heatmap_all, ctx, even_split=False)
                    offset_target_split = gluon.utils.split_and_load(offset_target_all, ctx, even_split=False)
                    wh_target_split = gluon.utils.split_and_load(wh_target_all, ctx, even_split=False)
                    mask_target_split = gluon.utils.split_and_load(mask_target_all, ctx, even_split=False)

                # prediction, target space for data parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # handles N GPUs (data parallelism)
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    id, score, bbox = prediction(heatmap_pred, offset_pred, wh_pred)
                    precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score,
                                            gt_boxes=gt_box * scale_factor, gt_labels=gt_id)
                    heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size
                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum, valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean
            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / "
                f"valid offset loss : {valid_offset_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}")

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)
            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall,
                                          threshold=threshold, AP=AP_appender, mAP=mAP_result,
                                          folder_name=valid_graph_path, epoch=i)
            precision_recall.reset()

            if tensorboard:
                # handles N GPUs (data parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred)
                    for ig, gt_id, gt_box, heatmap, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, heatmap_pred, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + \
                             mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # draw the heatmap
                        heatmap = mx.nd.multiply(heatmap, 255.0)  # rescale to the 0-255 range
                        heatmap = mx.nd.max(heatmap, axis=0, keepdims=True)  # max over the channel axis
                        heatmap = mx.nd.transpose(heatmap, axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(heatmap, repeats=3, axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy()  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(heatmap, dsize=(input_size[1], input_size[0]))  # restore the original size
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :, (0, 1, 2)] = heatmap[:, :, (2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(heatmap, axes=(2, 0, 1))  # (channel=3, height, width)

                        # draw the ground-truth boxes
                        ground_truth = plot_bbox(ig, gt_box * scale_factor, scores=None, labels=gt_id,
                                                 thresh=None, reverse_rgb=True,
                                                 class_names=valid_dataset.classes,
                                                 absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # draw the predicted boxes
                        prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id,
                                                   thresh=plot_class_thresh, reverse_rgb=False,
                                                   class_names=valid_dataset.classes,
                                                   absolute_coordinates=True)

                        # for TensorBoard: BGR -> RGB and (height, width, channel) -> (channel, height, width)
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box, axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                all_image = np.concatenate([np.array(batch_image), np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result", image=all_image, global_step=i)
                summary.add_scalar(tag="heatmap_loss",
                                   value={"train_heatmap_loss_mean": train_heatmap_loss_mean,
                                          "valid_heatmap_loss_mean": valid_heatmap_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={"train_offset_loss_mean": train_offset_loss_mean,
                                          "valid_offset_loss_mean": valid_offset_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={"train_wh_loss_mean": train_wh_loss_mean,
                                          "valid_wh_loss_mean": valid_wh_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={"train_total_loss": train_total_loss_mean,
                                          "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c),
                                                  global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(),
                                              global_step=i, bins='default')

        if i % save_period == 0:
            if not os.path.exists(weight_path):
                os.makedirs(weight_path)
            '''
            Hybrid models can be serialized as JSON files using the export function.
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports,
            mxnet.mod.Module or the C++ interface.
            When there is only one input, it will be named data. When there are more than
            one inputs, they will be named data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # creates a new object
            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for ONNX export
                # handles network inference, decoding and NMS - convenient from the MXNet C++ API
                export_block_for_cplusplus(
                    path=os.path.join(weight_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3,)),
                    epoch=i,
                    preprocess=True,  # in C++ you can feed the image exactly as read by OpenCV
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json/param model export failed : {E}")
            else:
                logging.info("json/param model export succeeded")
            net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : about {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")
    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def train(epochs, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    if config.train_cfg.param_init:
        init_func = getattr(mx.init, config.train_cfg.init)
        net.initialize(init_func(), ctx=ctx, force_reinit=True)
    else:
        net.load_parameters(config.train_cfg.param_file, ctx=ctx)
    summary(net, stat_name, nd.uniform(shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
    # net = nn.HybridBlock()
    net.hybridize()
    root = config.dir_cfg.dataset
    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(root=root, train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)
    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(root=root, train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    trainer_arg = {'learning_rate': config.lr_cfg.lr,
                   'wd': config.lr_cfg.wd,
                   'lr_scheduler': lr_sch}
    extra_arg = eval(config.lr_cfg.extra_arg)
    trainer_arg.update(extra_arg)
    trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
    if config.train_cfg.amp:
        amp.init_trainer(trainer)
    metric = mx.metric.Accuracy()
    train_metric = mx.metric.RMSE()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
        sparse_label=False if config.data_cfg.mixup else True)
    train_history = TrainingHistory(['training-error', 'validation-error'])
    # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
    loss_history = TrainingHistory(['training-loss', 'validation-loss'])

    iteration = 0
    best_val_score = 0
    # print('start training')
    sig_state.emit(1)
    sig_pgbar.emit(0)
    # signal.emit('Training')
    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        metric.reset()
        train_loss = 0
        num_batch = len(train_data)
        alpha = 1
        for i, batch in enumerate(train_data):
            if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                profiler.set_state('run')
                is_profiler_run = True
            if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                sw.add_graph(net)
            lam = np.random.beta(alpha, alpha)
            if epoch >= epochs - 20 or not config.data_cfg.mixup:
                lam = 1
            data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            if not config.data_cfg.mixup:
                data = data_1
                label = label_1
            else:
                data = [lam * X + (1 - lam) * X[::-1] for X in data_1]
                label = []
                for Y in label_1:
                    y1 = label_transform(Y, classes)
                    y2 = label_transform(Y[::-1], classes)
                    label.append(lam * y1 + (1 - lam) * y2)
            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            if config.train_cfg.amp:
                with ag.record():
                    with amp.scale_loss(loss, trainer) as scaled_loss:
                        ag.backward(scaled_loss)
                        # scaled_loss.backward()
            else:
                for l in loss:
                    l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])
            output_softmax = [nd.SoftmaxActivation(out) for out in output]
            train_metric.update(label, output_softmax)
            metric.update(label_1, output_softmax)
            name, acc = train_metric.get()
            if config.save_cfg.tensorboard:
                sw.add_scalar(tag='lr', value=trainer.learning_rate, global_step=iteration)
            if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                nd.waitall()
                profiler.set_state('stop')
                profiler.dump()
            iteration += 1
            sig_pgbar.emit(iteration)
            if check_flag()[0]:
                sig_state.emit(2)
                while check_flag()[0] or check_flag()[1]:
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

        epoch_time = time.time() - tic
        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        _, train_acc = metric.get()
        # run validation once per epoch (the original ran test() twice)
        name, val_acc, val_loss = test(ctx, val_data)
        # if config.data_cfg.mixup:
        #     train_history.update([acc, 1-val_acc])
        #     plt.cla()
        #     train_history.plot(save_path='%s/%s_history.png' %
        #                        (plot_name, model_name))
        # else:
        train_history.update([1 - train_acc, 1 - val_acc])
        plt.cla()
        train_history.plot(save_path='%s/%s_history.png' % (plot_name, model_name))
        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                (save_dir, best_val_score, model_name, epoch))
        current_lr = trainer.learning_rate
        logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                     (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, epoch_time))
        loss_history.update([train_loss, val_loss])
        plt.cla()
        loss_history.plot(save_path='%s/%s_loss.png' % (plot_name, model_name),
                          y_lim=(0, 2), legend_loc='best')
        if config.save_cfg.tensorboard:
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc, 'test_acc': val_acc},
                            global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss, 'test_loss': val_loss},
                            global_step=epoch)
        sig_table.emit([epoch, train_loss, train_acc, val_loss, val_acc, current_lr, epoch_time])
        csv_writer.writerow([epoch, train_loss, train_acc, val_loss, val_acc, current_lr, epoch_time])
        csv_file.flush()
        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_parameters('%s/cifar10-%s-%d.params' % (save_dir, model_name, epoch))
    if save_period and save_dir:
        net.save_parameters('%s/cifar10-%s-%d.params' % (save_dir, model_name, epochs - 1))
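# NOTE: `label_transform` is required by the mixup branch above but is not
# defined in this excerpt. A minimal sketch, assuming it one-hot encodes the
# integer class labels so two label vectors can be mixed convexly:
def label_transform(label, classes):
    # `label` is a 1-D NDArray of integer class ids; `classes` is the class count
    ind = label.astype('int')
    res = nd.zeros((ind.shape[0], classes), ctx=label.context)
    res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
    return res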
def train(self):
    self.net.collect_params().reset_ctx(self.ctx)
    num_batches = self.args.num_samples // self.args.batch_size
    trainer = gluon.Trainer(self.net.collect_params(), 'sgd', {
        'learning_rate': self.args.learning_rate,
        'wd': self.args.wd,
        'momentum': self.args.momentum
    }, update_on_kvstore=None)

    # Learning rate decay policy
    lr_decay = float(self.args.lr_decay)
    lr_steps = sorted([float(ls) for ls in self.args.lr_decay_epoch.split(',') if ls.strip()])

    # Losses
    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')
    best_map = [0.]

    # Epoch loop
    for epoch in range(self.args.start_epoch, self.args.epochs):
        # Batch count can vary from epoch to epoch +/-1
        num_batches = len(self.train_data)
        self.beforeEpoch(epoch, num_batches=num_batches)
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info('[Epoch {}] Set learning rate to {}'.format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        self.net.hybridize(static_alloc=True, static_shape=True)

        # Batch loop
        for i, batch in enumerate(self.train_data):
            self.beforeBatch(i, epoch, num_batches)
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=self.ctx, batch_axis=0)
            cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=self.ctx, batch_axis=0)
            box_targets = gluon.utils.split_and_load(batch[2], ctx_list=self.ctx, batch_axis=0)
            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = self.net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                autograd.backward(sum_loss)
            trainer.step(1)
            ce_metric.update(0, [l * batch_size for l in cls_loss])
            smoothl1_metric.update(0, [l * batch_size for l in box_loss])
            speed = batch_size / (time.time() - btic)
            self.afterBatch(i, epoch, num_batches, trainer.learning_rate, speed,
                            metrics=[ce_metric, smoothl1_metric])
            btic = time.time()

        current_mAP = self.validateEpoch(epoch, epoch_time=(time.time() - tic),
                                         validate_params={'static_shape': True})
        self.saveParams(best_map, current_mAP, epoch)
        self.afterEpoch(epoch)
    return epoch
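# NOTE: the beforeEpoch/beforeBatch/afterBatch/afterEpoch callbacks used by the
# trainer above are not shown in this excerpt; they are assumed to be simple,
# overridable hooks on the same class. A minimal sketch of what the base class
# might provide, with a logging afterBatch:
class TrainingHooks:
    def beforeEpoch(self, epoch, num_batches):
        pass

    def beforeBatch(self, i, epoch, num_batches):
        pass

    def afterBatch(self, i, epoch, num_batches, lr, speed, metrics=()):
        # log metric values every 20 batches
        if i % 20 == 0:
            msg = ', '.join(['{}={:.3f}'.format(*m.get()) for m in metrics])
            print('[Epoch {}][Batch {}/{}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}'.format(
                epoch, i, num_batches, lr, speed, msg))

    def afterEpoch(self, epoch):
        pass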
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum
    })

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    # mbox_loss = gcv.loss.SSDMultiBoxLoss()
    mbox_loss = gcv.loss.YOLACTMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')
    sq_metric = mx.metric.Loss('SigmoidBCE')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        sq_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
            mask_targets = gluon.utils.split_and_load(batch[3], ctx_list=ctx, batch_axis=0)
            matches = gluon.utils.split_and_load(batch[4], ctx_list=ctx, batch_axis=0)
            with autograd.record():
                cls_preds = []
                box_preds = []
                masks = []
                maskeocs = []
                bts = []
                for x, bt in zip(data, box_targets):
                    cls_pred, box_pred, anchor, maskeoc, mask = net(x)
                    bts.append(net.bbox_decoder(bt, anchor))
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                    masks.append(mask)
                    maskeocs.append(maskeoc)
                sum_loss, cls_loss, box_loss, mask_loss = mbox_loss(
                    cls_preds, box_preds, masks, maskeocs, cls_targets,
                    box_targets, mask_targets, matches, bts)
                autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)
            ce_metric.update(0, [l * batch_size for l in cls_loss])
            smoothl1_metric.update(0, [l * batch_size for l in box_loss])
            sq_metric.update(0, [l * batch_size for l in mask_loss])
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = ce_metric.get()
                name2, loss2 = smoothl1_metric.get()
                name3, loss3 = sq_metric.get()
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                        epoch, i, batch_size / (time.time() - btic),
                        name1, loss1, name2, loss2, name3, loss3))
                btic = time.time()
            break

        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        name3, loss3 = sq_metric.get()
        logger.info(
            '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                epoch, (time.time() - tic), name1, loss1, name2, loss2, name3, loss3))
        if (epoch % args.val_interval == 0) or \
                (args.save_interval and epoch % args.save_interval == 0) or (epoch >= 50):
            # consider reducing the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
def train(opt, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    kv = mx.kv.create(opt.kvstore)
    train_data, val_data = get_data_iters(dataset, batch_size, kv.num_workers, kv.rank)
    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': opt.lr,
                             'wd': opt.wd,
                             'momentum': opt.momentum,
                             'multi_precision': True},
                            kvstore=kv)
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    total_time = 0
    num_epochs = 0
    best_acc = [0]
    for epoch in range(opt.start_epoch, opt.epochs):
        trainer = update_learning_rate(opt.lr, trainer, epoch, opt.lr_factor, lr_steps)
        tic = time.time()
        train_data.reset()
        metric.reset()
        btic = time.time()
        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype),
                                              ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype),
                                               ctx_list=ctx, batch_axis=0)
            outputs = []
            Ls = []
            with ag.record():
                for x, y in zip(data, label):
                    z = net(x)
                    L = loss(z, y)
                    # store the loss and do backward after we have done forward
                    # on all GPUs for better speed on multiple GPUs.
                    Ls.append(L)
                    outputs.append(z)
                ag.backward(Ls)
            trainer.step(batch.data[0].shape[0])
            metric.update(label, outputs)
            if opt.log_interval and not (i + 1) % opt.log_interval:
                name, acc = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f' % (
                    epoch, i, batch_size / (time.time() - btic),
                    name[0], acc[0], name[1], acc[1]))
                btic = time.time()

        epoch_time = time.time() - tic

        # The first epoch will usually be much slower than the subsequent epochs,
        # so don't factor it into the average
        if num_epochs > 0:
            total_time = total_time + epoch_time
        num_epochs = num_epochs + 1

        name, acc = metric.get()
        logger.info('[Epoch %d] training: %s=%f, %s=%f' % (epoch, name[0], acc[0], name[1], acc[1]))
        logger.info('[Epoch %d] time cost: %f' % (epoch, epoch_time))
        name, val_acc = test(ctx, val_data)
        logger.info('[Epoch %d] validation: %s=%f, %s=%f' % (
            epoch, name[0], val_acc[0], name[1], val_acc[1]))

        # save model if it meets the requirements
        save_checkpoint(epoch, val_acc[0], best_acc)
    if num_epochs > 1:
        print('Average epoch time: {}'.format(float(total_time) / (num_epochs - 1)))
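# NOTE: `update_learning_rate` is called at the top of each epoch above but is
# not defined in this excerpt. A minimal sketch, assuming the classic MXNet
# example schedule: the base lr is multiplied by `ratio` once for every
# milestone in `steps` that has already passed.
def update_learning_rate(lr, trainer, epoch, ratio, steps):
    """Set the learning rate to the initial value decayed by ratio every N epochs."""
    new_lr = lr * (ratio ** int(np.sum(np.array(steps) < epoch)))
    trainer.set_learning_rate(new_lr)
    return trainer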
def train():
    """Training loop for language model."""
    print(model)
    from_epoch = 0
    model.initialize(mx.init.Xavier(factor_type='out'), ctx=context)
    trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps}
    trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params)
    if args.from_epoch:
        from_epoch = args.from_epoch
        checkpoint_name = '%s.%s' % (args.save, format(from_epoch - 1, '02d'))
        model.load_parameters(checkpoint_name)
        trainer.load_states('%s.state' % args.save)
        print('Loaded parameters from checkpoint %s' % (checkpoint_name))

    model.hybridize(static_alloc=True, static_shape=True)
    encoder_params = model.encoder.collect_params().values()
    embedding_params = list(model.embedding.collect_params().values())

    for epoch in range(from_epoch, args.epochs):
        sys.stdout.flush()
        total_L = 0.0
        start_epoch_time = time.time()
        start_log_interval_time = time.time()
        hiddens = [model.begin_state(batch_size=args.batch_size,
                                     func=mx.nd.zeros, ctx=ctx) for ctx in context]
        nbatch = 0
        has_next = True
        train_data_iter = iter(train_data)
        data, target, mask, sample = next(train_data_iter)

        while has_next:
            nbatch += 1
            hiddens = detach(hiddens)
            Ls = []
            with autograd.record():
                for j, (X, y, m, s, h) in enumerate(zip(data, target, mask, sample, hiddens)):
                    output, h, new_target = model(X, y, h, s)
                    output = output.reshape((-3, -1))
                    new_target = new_target.reshape((-1,))
                    l = loss(output, new_target) * m.reshape((-1,))
                    Ls.append(l / args.batch_size)
                    hiddens[j] = h
            autograd.backward(Ls)

            # prefetch the next batch of data
            try:
                data, target, mask, sample = next(train_data_iter)
            except StopIteration:
                has_next = False

            # rescale embedding grad
            for ctx in context:
                x = embedding_params[0].grad(ctx)
                x[:] *= args.batch_size
                encoder_grad = [p.grad(ctx) for p in encoder_params]
                # perform gradient clipping per ctx
                gluon.utils.clip_global_norm(encoder_grad, args.clip)

            trainer.step(len(context))

            total_L += sum([mx.nd.sum(L).asscalar() / args.bptt for L in Ls])
            if nbatch % args.log_interval == 0:
                cur_L = total_L / args.log_interval / len(context)
                ppl = math.exp(cur_L) if cur_L < 100 else float('inf')
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f, '
                      'throughput %.2f samples/s'
                      % (epoch, nbatch, cur_L, ppl,
                         train_batch_size * args.log_interval / (time.time() - start_log_interval_time)))
                total_L = 0.0
                start_log_interval_time = time.time()
                sys.stdout.flush()

        end_epoch_time = time.time()
        print('Epoch %d took %.2f seconds.' % (epoch, end_epoch_time - start_epoch_time))
        mx.nd.waitall()
        checkpoint_name = '%s.%s' % (args.save, format(epoch, '02d'))
        model.save_parameters(checkpoint_name)
        trainer.save_states('%s.state' % args.save)
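# NOTE: `detach` is applied to the hidden states above but is not defined in
# this excerpt. A minimal sketch, assuming the standard truncated-BPTT helper
# that cuts the autograd graph at batch boundaries so gradients do not flow
# across batches:
def detach(hidden):
    if isinstance(hidden, (tuple, list)):
        hidden = [detach(h) for h in hidden]
    else:
        hidden = hidden.detach()
    return hidden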