def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size,
                   num_workers, args):
    """Build the train/val dataloaders for YOLOv3.

    Training batches stack the image plus the five generated target arrays
    and pad the variable-length ground-truth boxes with -1; validation
    batches stack images and pad labels.
    """
    width = height = data_shape
    # Image + 5 fixed targets are stacked; ragged gt boxes are padded.
    train_bfn = Tuple(
        *([Stack() for _ in range(6)] + [Pad(axis=0, pad_val=-1)]))
    if args.no_random_shape:
        # Single fixed input resolution for every batch.
        fixed_transform = YOLO3DefaultTrainTransform(width, height, net,
                                                     mixup=args.mixup)
        train_loader = gluon.data.DataLoader(
            train_dataset.transform(fixed_transform),
            batch_size,
            True,
            batchify_fn=train_bfn,
            last_batch='rollover',
            num_workers=num_workers)
    else:
        # Multi-scale training: resolutions 320..608 (multiples of 32),
        # re-drawn every 10 batches by the random-transform loader.
        scale_transforms = [
            YOLO3DefaultTrainTransform(s * 32, s * 32, net, mixup=args.mixup)
            for s in range(10, 20)
        ]
        train_loader = RandomTransformDataLoader(
            scale_transforms,
            train_dataset,
            batch_size=batch_size,
            interval=10,
            last_batch='rollover',
            shuffle=True,
            batchify_fn=train_bfn,
            num_workers=num_workers)
    val_bfn = Tuple(Stack(), Pad(pad_val=-1))
    val_loader = gluon.data.DataLoader(
        val_dataset.transform(YOLO3DefaultValTransform(width, height)),
        batch_size,
        False,
        batchify_fn=val_bfn,
        last_batch='keep',
        num_workers=num_workers)
    return train_loader, val_loader
def _get_dataloader(net, test_dataset, data_shape, batch_size, num_workers,
                    num_devices, args):
    """Build a test dataloader for the configured meta-architecture.

    Supports 'yolo3' (fixed-size batches) and 'faster_rcnn' (one sample
    per device, appended fields); any other value raises.
    """
    if args.meta_arch == 'yolo3':
        width = height = data_shape
        yolo_bfn = Tuple(Stack(), Pad(pad_val=-1))
        return gluon.data.DataLoader(
            test_dataset.transform(YOLO3DefaultValTransform(width, height)),
            batch_size,
            False,
            batchify_fn=yolo_bfn,
            last_batch='keep',
            num_workers=num_workers)
    if args.meta_arch == 'faster_rcnn':
        # Faster R-CNN evaluates variable-sized images, so every field is
        # appended rather than stacked.
        rcnn_bfn = Tuple(Append(), Append(), Append())
        short = net.short[-1] if isinstance(net.short,
                                            (tuple, list)) else net.short
        # validation use 1 sample per device
        return gluon.data.DataLoader(
            test_dataset.transform(
                FasterRCNNDefaultValTransform(short, net.max_size)),
            num_devices,
            False,
            batchify_fn=rcnn_bfn,
            last_batch='keep',
            num_workers=args.num_workers)
    raise NotImplementedError('%s not implemented.' % args.meta_arch)
def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size,
                   num_workers):
    """Build fixed-shape train/val dataloaders for YOLOv3.

    Unlike the variant taking ``args``, this always trains at a single
    resolution (no random-shape loader, no mixup).
    """
    width = height = data_shape
    # Image + 5 generated targets stacked; ragged gt boxes padded with -1.
    train_bfn = Tuple(
        *([Stack() for _ in range(6)] + [Pad(axis=0, pad_val=-1)]))
    train_loader = gluon.data.DataLoader(
        train_dataset.transform(YOLO3DefaultTrainTransform(width, height, net)),
        batch_size,
        True,
        batchify_fn=train_bfn,
        last_batch='rollover',
        num_workers=num_workers)
    val_bfn = Tuple(Stack(), Pad(pad_val=-1))
    val_loader = gluon.data.DataLoader(
        val_dataset.transform(YOLO3DefaultValTransform(width, height)),
        batch_size,
        False,
        batchify_fn=val_bfn,
        last_batch='keep',
        num_workers=num_workers)
    return train_loader, val_loader
def get_dataloader(self):
    """Build and store train/val dataloaders for the configured network.

    Reads self.width/height, datasets, batch_size, num_workers and
    self.network ('ssd' or 'yolo'); sets self.train_loader and
    self.val_loader (and, for SSD only, self.val_loader_loss).
    Raises ValueError for any other network name.
    """
    width, height = self.width, self.height
    train_dataset = self.train_dataset
    val_dataset = self.val_dataset
    batch_size = self.batch_size
    num_workers = self.num_workers
    network = self.network
    print('aqui 0')  # NOTE(review): leftover debug print ("here 0")
    if network == 'ssd':
        # use fake data to generate fixed anchors for target generation
        with autograd.train_mode():
            _, _, anchors = self.net(mx.nd.zeros((1, 3, height, width)))
        batchify_fn = Tuple(
            Stack(), Stack(),
            Stack())  # stack image, cls_targets, box_targets
        train_loader = gluon.data.DataLoader(train_dataset.transform(
            SSDDefaultTrainTransform(width, height, anchors)),
                                             batch_size,
                                             True,
                                             batchify_fn=batchify_fn,
                                             last_batch='rollover',
                                             num_workers=num_workers)
        # The real validation loader (mAP-style evaluation, padded labels).
        val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
        val_loader = gluon.data.DataLoader(val_dataset.transform(
            SSDDefaultValTransform(width, height)),
                                           batch_size,
                                           False,
                                           batchify_fn=val_batchify_fn,
                                           last_batch='keep',
                                           num_workers=num_workers)
        # NOTE(review): this context block only aliases `anchors`; it does
        # not copy them to the GPU — confirm whether a .as_in_context()
        # was intended here.
        with mx.Context(mx.gpu(0)):
            anchors2 = anchors
        # Extra val loader that produces training-style targets so the
        # training loss can also be computed on validation data.
        val_loader_loss = gluon.data.DataLoader(val_dataset.transform(
            SSDCustomValTransform(width, height, anchors2)),
                                                batch_size,
                                                True,
                                                batchify_fn=batchify_fn,
                                                last_batch='rollover',
                                                num_workers=num_workers)
        self.val_loader_loss = val_loader_loss
    elif network == 'yolo':
        print('aqui 1')  # NOTE(review): leftover debug print ("here 1")
        # Image + 5 generated YOLO targets stacked; ragged gt boxes padded.
        batchify_fn = Tuple(
            *([Stack() for _ in range(6)] +
              [Pad(axis=0, pad_val=-1)
               for _ in range(1)]))  # stack image, all targets generated
        # if args.no_random_shape:
        train_loader = gluon.data.DataLoader(train_dataset.transform(
            YOLO3DefaultTrainTransform(width, height, self.net)),
                                             batch_size,
                                             True,
                                             batchify_fn=batchify_fn,
                                             last_batch='rollover',
                                             num_workers=num_workers)
        val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
        val_loader = gluon.data.DataLoader(val_dataset.transform(
            YOLO3DefaultValTransform(width, height)),
                                           batch_size,
                                           False,
                                           batchify_fn=val_batchify_fn,
                                           last_batch='keep',
                                           num_workers=num_workers)
        print('aqui 2')  # NOTE(review): leftover debug print ("here 2")
    else:
        raise ValueError("Network {} not implemented".format(network))
    # NOTE(review): self.val_loader_loss is only set on the 'ssd' path.
    self.val_loader = val_loader
    self.train_loader = train_loader
def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size,
                   num_workers, args):
    """Get train/val dataloaders for YOLOv3.

    Fixes vs. the previous revision:
    - the docstring was placed after the imports, so it was a no-op string
      statement rather than a docstring; it is now the first statement;
    - unused function-local imports (gdata, gutils, get_model, LRScheduler,
      LRSequential, COCODetectionMetric, VOC07MApMetric) removed;
    - leftover debug prints (dataset length / first sample shapes) removed.

    Parameters mirror the other loaders in this file: `net` is used to
    generate training targets inside the transform, `data_shape` is the
    square input resolution, and `args` supplies no_random_shape and mixup.
    Returns (train_loader, val_loader).
    """
    import gluoncv as gcv

    gcv.utils.check_version("0.6.0")
    from gluoncv.data.batchify import Pad, Stack, Tuple
    from gluoncv.data.dataloader import RandomTransformDataLoader
    from gluoncv.data.transforms.presets.yolo import (
        YOLO3DefaultTrainTransform,
        YOLO3DefaultValTransform,
    )

    width, height = data_shape, data_shape
    # Image + 5 generated targets are stacked; ragged gt boxes padded with -1.
    batchify_fn = Tuple(
        *([Stack() for _ in range(6)] + [Pad(axis=0, pad_val=-1)])
    )  # stack image, all targets generated
    if args.no_random_shape:
        train_loader = gluon.data.DataLoader(
            train_dataset.transform(
                YOLO3DefaultTrainTransform(width, height, net, mixup=args.mixup)
            ),
            batch_size,
            True,
            batchify_fn=batchify_fn,
            last_batch="rollover",
            num_workers=num_workers,
        )
    else:
        # Multi-scale training: resolutions 320..608, re-drawn every 10 batches.
        transform_fns = [
            YOLO3DefaultTrainTransform(x * 32, x * 32, net, mixup=args.mixup)
            for x in range(10, 20)
        ]
        train_loader = RandomTransformDataLoader(
            transform_fns,
            train_dataset,
            batch_size=batch_size,
            interval=10,
            last_batch="rollover",
            shuffle=True,
            batchify_fn=batchify_fn,
            num_workers=num_workers,
        )
    val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
    val_loader = gluon.data.DataLoader(
        val_dataset.transform(YOLO3DefaultValTransform(width, height)),
        batch_size,
        False,
        batchify_fn=val_batchify_fn,
        last_batch="keep",
        num_workers=num_workers,
    )
    return train_loader, val_loader
def train(net, async_net, ctx, args):
    """Training pipeline.

    Trains `net` on the contexts in `ctx`, reading data via an
    AugmentedManifestDetection dataset (SageMaker pipe mode). `async_net`
    is used only to generate training targets inside the data transforms.
    Per epoch: accumulate per-loss metrics, optionally validate every
    `args.val_interval` epochs, save progress/checkpoints, and stop early
    when the validation score has not improved for
    `args.early_stopping_patience` epochs.
    """
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        # Exclude batch-norm scale/shift and biases from weight decay.
        for k, v in net.collect_params(".*beta|.*gamma|.*bias").items():
            v.wd_mult = 0.0
    if args.label_smooth:
        net._target_generator._label_smooth = True
    if args.lr_decay_period > 0:
        # Uniform decay every lr_decay_period epochs...
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        # ...otherwise at the explicit comma-separated epoch list.
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    # Linear warmup followed by the configured decay mode.
    # NOTE(review): iters_per_epoch is given args.batch_size, not the number
    # of batches per epoch — confirm this is intentional.
    lr_scheduler = LRSequential([
        LRScheduler("linear",
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=args.batch_size),
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=args.batch_size,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])
    if (args.optimizer == "sgd"):
        trainer = gluon.Trainer(net.collect_params(), args.optimizer, {
            "wd": args.wd,
            "momentum": args.momentum,
            "lr_scheduler": lr_scheduler
        },
                                kvstore="local")
    elif (args.optimizer == "adam"):
        trainer = gluon.Trainer(net.collect_params(), args.optimizer,
                                {"lr_scheduler": lr_scheduler},
                                kvstore="local")
    else:
        # Any other optimizer: library defaults, no scheduler attached.
        trainer = gluon.Trainer(net.collect_params(), args.optimizer,
                                kvstore="local")

    # targets
    #sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    #l1_loss = gluon.loss.L1Loss()

    # Intermediate Metrics (order matches the net's loss outputs + total):
    train_metrics = (
        mx.metric.Loss("ObjLoss"),
        mx.metric.Loss("BoxCenterLoss"),
        mx.metric.Loss("BoxScaleLoss"),
        mx.metric.Loss("ClassLoss"),
        mx.metric.Loss("TotalLoss"),
    )
    train_metric_ixs = range(len(train_metrics))
    target_metric_ix = -1  # Train towards TotalLoss (the last one)

    # Evaluation Metrics:
    val_metric = VOC07MApMetric(iou_thresh=0.5)

    # Data transformations:
    train_dataset = gluon_pipe_mode.AugmentedManifestDetection(
        args.train,
        length=args.num_samples_train,
    )
    # Stack image + 5 generated targets; pad ragged gt boxes with -1.
    train_batchify_fn = batchify.Tuple(
        *([batchify.Stack() for _ in range(6)] +
          [batchify.Pad(axis=0, pad_val=-1) for _ in range(1)]))
    if args.no_random_shape:
        logger.debug("Creating train DataLoader without random transform")
        train_transforms = YOLO3DefaultTrainTransform(args.data_shape,
                                                      args.data_shape,
                                                      net=async_net,
                                                      mixup=args.mixup)
        train_dataloader = gluon.data.DataLoader(
            train_dataset.transform(train_transforms),
            batch_size=args.batch_size,
            batchify_fn=train_batchify_fn,
            last_batch="discard",
            num_workers=args.num_workers,
            shuffle=
            False,  # Note that shuffle *cannot* be used with AugmentedManifestDetection
        )
    else:
        logger.debug("Creating train DataLoader with random transform")
        # Multi-scale training: resolutions 320..608 (multiples of 32).
        train_transforms = [
            YOLO3DefaultTrainTransform(x * 32,
                                       x * 32,
                                       net=async_net,
                                       mixup=args.mixup)
            for x in range(10, 20)
        ]
        train_dataloader = RandomTransformDataLoader(
            train_transforms,
            train_dataset,
            interval=10,
            batch_size=args.batch_size,
            batchify_fn=train_batchify_fn,
            last_batch="discard",
            num_workers=args.num_workers,
            shuffle=
            False,  # Note that shuffle *cannot* be used with AugmentedManifestDetection
        )
    validation_dataset = None
    validation_dataloader = None
    if args.validation:
        validation_dataset = gluon_pipe_mode.AugmentedManifestDetection(
            args.validation,
            length=args.num_samples_validation,
        )
        validation_dataloader = gluon.data.DataLoader(
            validation_dataset.transform(
                YOLO3DefaultValTransform(args.data_shape, args.data_shape), ),
            args.batch_size,
            shuffle=False,
            batchify_fn=batchify.Tuple(batchify.Stack(),
                                       batchify.Pad(pad_val=-1)),
            last_batch="keep",
            num_workers=args.num_workers,
        )

    # Prepare the inference-time configuration for our model's setup:
    # (This will be saved alongside our network structure/params)
    inference_config = config.InferenceConfig(image_size=args.data_shape)

    logger.info(args)
    logger.info(f"Start training from [Epoch {args.start_epoch}]")
    prev_best_score = float("-inf")
    best_epoch = args.start_epoch
    logger.info("Sleeping for 3s in case training data file not yet ready")
    time.sleep(3)
    # NOTE(review): runs args.epochs epochs *starting at* start_epoch
    # (start_epoch..start_epoch+epochs), unlike the streaming variant.
    for epoch in range(args.start_epoch, args.start_epoch + args.epochs):
        # if args.mixup:
        #     # TODO(zhreshold): more elegant way to control mixup during runtime
        #     try:
        #         train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
        #     except AttributeError:
        #         train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
        #     if epoch >= args.epochs - args.no_mixup_epochs:
        #         try:
        #             train_data._dataset.set_mixup(None)
        #         except AttributeError:
        #             train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        logger.debug(
            f"Input data dir contents: {os.listdir('/opt/ml/input/data/')}")
        for i, batch in enumerate(train_dataloader):
            logger.debug(f"Epoch {epoch}, minibatch {i}")
            batch_size = batch[0].shape[0]
            # Split each batch field across the devices in ctx.
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0,
                                              even_split=False)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [
                gluon.utils.split_and_load(batch[it],
                                           ctx_list=ctx,
                                           batch_axis=0,
                                           even_split=False)
                for it in range(1, 6)
            ]
            gt_boxes = gluon.utils.split_and_load(batch[6],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
            # One tracker list per metric, filled across all device slices.
            loss_trackers = tuple([] for metric in train_metrics)
            with autograd.record():
                for ix, x in enumerate(data):
                    losses_raw = net(x, gt_boxes[ix],
                                     *[ft[ix] for ft in fixed_targets])
                    # net outputs: [obj_loss, center_loss, scale_loss, cls_loss]
                    # Each a mx.ndarray 1xbatch_size. This is the same order as our
                    # train_metrics, so we just need to add a total vector:
                    total_loss = sum(losses_raw)
                    losses = losses_raw + [total_loss]

                    # If any sample's total loss is non-finite, sum will be:
                    if not isfinite(sum(total_loss)):
                        logger.error(
                            f"[Epoch {epoch}][Minibatch {i}] got non-finite losses: {losses_raw}"
                        )
                        # TODO: Terminate training if losses or gradient go infinite?
                    for ix in train_metric_ixs:
                        loss_trackers[ix].append(losses[ix])
                # Backprop only through the TotalLoss tracker.
                autograd.backward(loss_trackers[target_metric_ix])
            trainer.step(batch_size)
            for ix in train_metric_ixs:
                train_metrics[ix].update(0, loss_trackers[ix])
            if args.log_interval and not (i + 1) % args.log_interval:
                train_metrics_current = map(lambda metric: metric.get(),
                                            train_metrics)
                metrics_msg = "; ".join([
                    f"{name}={val:.3f}" for name, val in train_metrics_current
                ])
                logger.info(
                    f"[Epoch {epoch}][Minibatch {i}] LR={trainer.learning_rate:.2E}; "
                    f"Speed={batch_size/(time.time()-btic):.3f} samples/sec; {metrics_msg};"
                )
                btic = time.time()
        train_metrics_current = map(lambda metric: metric.get(),
                                    train_metrics)
        metrics_msg = "; ".join(
            [f"{name}={val:.3f}" for name, val in train_metrics_current])
        logger.info(
            f"[Epoch {epoch}] TrainingCost={time.time()-tic:.3f}; {metrics_msg};"
        )
        if not (epoch + 1) % args.val_interval:
            logger.info(f"Validating [Epoch {epoch}]")
            metric_names, metric_values = validate(
                net, validation_dataloader, epoch, ctx,
                VOC07MApMetric(iou_thresh=0.5), args)
            if isinstance(metric_names, list):
                val_msg = "; ".join(
                    [f"{k}={v}" for k, v in zip(metric_names, metric_values)])
                # Last metric (overall mAP) drives checkpointing/early stop.
                current_score = float(metric_values[-1])
            else:
                val_msg = f"{metric_names}={metric_values}"
                current_score = metric_values
            logger.info(f"[Epoch {epoch}] Validation: {val_msg};")
        else:
            current_score = float("-inf")
        save_progress(
            net,
            inference_config,
            current_score,
            prev_best_score,
            args.model_dir,
            epoch,
            args.checkpoint_interval,
            args.checkpoint_dir,
        )
        if current_score > prev_best_score:
            prev_best_score = current_score
            best_epoch = epoch
        if (args.early_stopping and epoch >= args.early_stopping_min_epochs
                and (epoch - best_epoch) >= args.early_stopping_patience):
            logger.info(
                f"[Epoch {epoch}] No improvement since epoch {best_epoch}: Stopping early"
            )
            break
def train(net, async_net, ctx, args):
    """Training pipeline (stream-batch variant).

    Like the non-streaming train(), but each epoch pulls "stream-batches"
    from pipe_detection_minibatch and builds a fresh DataLoader per
    stream-batch, with defensive recovery around corrupted minibatches.
    `async_net` is used only to generate targets in the data transforms.
    """
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        # Exclude batch-norm scale/shift and biases from weight decay.
        for k, v in net.collect_params(".*beta|.*gamma|.*bias").items():
            v.wd_mult = 0.0
    if args.label_smooth:
        net._target_generator._label_smooth = True
    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    # Linear warmup then configured decay.
    # NOTE(review): iters_per_epoch receives args.batch_size, not the number
    # of batches per epoch — confirm this is intentional.
    lr_scheduler = LRSequential([
        LRScheduler("linear",
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=args.batch_size),
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=args.batch_size,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])
    if (args.optimizer == "sgd"):
        trainer = gluon.Trainer(net.collect_params(), args.optimizer, {
            "wd": args.wd,
            "momentum": args.momentum,
            "lr_scheduler": lr_scheduler
        },
                                kvstore="local")
    elif (args.optimizer == "adam"):
        trainer = gluon.Trainer(net.collect_params(), args.optimizer,
                                {"lr_scheduler": lr_scheduler},
                                kvstore="local")
    else:
        # Any other optimizer: library defaults, no scheduler attached.
        trainer = gluon.Trainer(net.collect_params(), args.optimizer,
                                kvstore="local")

    # targets
    #sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    #l1_loss = gluon.loss.L1Loss()

    # Intermediate Metrics (order matches the net's loss outputs + total):
    train_metrics = (
        mx.metric.Loss("ObjLoss"),
        mx.metric.Loss("BoxCenterLoss"),
        mx.metric.Loss("BoxScaleLoss"),
        mx.metric.Loss("ClassLoss"),
        mx.metric.Loss("TotalLoss"),
    )
    train_metric_ixs = range(len(train_metrics))
    target_metric_ix = -1  # Train towards TotalLoss (the last one)

    # Evaluation Metrics:
    val_metric = VOC07MApMetric(iou_thresh=0.5)

    # Data transformations:
    # Stack image + 5 generated targets; pad ragged gt boxes with -1.
    train_batchify_fn = Tuple(*([Stack() for _ in range(6)] +
                                [Pad(axis=0, pad_val=-1) for _ in range(1)]))
    # Either one fixed-size transform, or a list of multi-scale transforms
    # (320..608) for the random-transform loader.
    train_transforms = (YOLO3DefaultTrainTransform(
        args.data_shape, args.data_shape, net=async_net, mixup=args.mixup)
                        if args.no_random_shape else [
                            YOLO3DefaultTrainTransform(
                                x * 32, x * 32,
                                net=async_net, mixup=args.mixup)
                            for x in range(10, 20)
                        ])
    validation_batchify_fn = None
    validation_transforms = None
    if args.validation:
        validation_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
        validation_transforms = YOLO3DefaultValTransform(
            args.data_shape, args.data_shape)
    logger.info(args)
    logger.info(f"Start training from [Epoch {args.start_epoch}]")
    prev_best_score = float("-inf")
    best_epoch = args.start_epoch
    logger.info("Sleeping for 3s in case training data file not yet ready")
    time.sleep(3)
    for epoch in range(args.start_epoch, args.epochs):
        # if args.mixup:
        #     # TODO(zhreshold): more elegant way to control mixup during runtime
        #     try:
        #         train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
        #     except AttributeError:
        #         train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
        #     if epoch >= args.epochs - args.no_mixup_epochs:
        #         try:
        #             train_data._dataset.set_mixup(None)
        #         except AttributeError:
        #             train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        logger.debug(
            f'Input data dir contents: {os.listdir("/opt/ml/input/data/")}')
        # Each epoch streams dataset chunks ("stream-batches") from the pipe.
        train_data_gen = pipe_detection_minibatch(
            epoch, channel=args.train, batch_size=args.stream_batch_size)
        for ix_streambatch, train_dataset in enumerate(train_data_gen):
            # TODO: Mixup is kinda rubbish if it's only within a (potentially small) batch
            if args.mixup:
                train_dataset = MixupDetection(train_dataset)

            # Create dataloader for the stream-batch:
            if args.no_random_shape:
                logger.debug(
                    "Creating train DataLoader without random transform")
                train_dataloader = gluon.data.DataLoader(
                    train_dataset.transform(train_transforms),
                    batch_size=args.batch_size,
                    batchify_fn=train_batchify_fn,
                    last_batch="discard",
                    num_workers=args.num_workers,
                    shuffle=True,
                )
            else:
                logger.debug("Creating train DataLoader with random transform")
                train_dataloader = RandomTransformDataLoader(
                    train_transforms,
                    train_dataset,
                    interval=10,
                    batch_size=args.batch_size,
                    batchify_fn=train_batchify_fn,
                    last_batch="discard",
                    num_workers=args.num_workers,
                    shuffle=True,
                )
            if args.mixup:
                logger.debug("Shuffling stream-batch")
                # TODO(zhreshold): more elegant way to control mixup during runtime
                try:
                    train_dataloader._dataset.set_mixup(
                        np.random.beta, 1.5, 1.5)
                except AttributeError:
                    train_dataloader._dataset._data.set_mixup(
                        np.random.beta, 1.5, 1.5)
                # Disable mixup for the final no_mixup_epochs epochs.
                if epoch >= args.epochs - args.no_mixup_epochs:
                    try:
                        train_dataloader._dataset.set_mixup(None)
                    except AttributeError:
                        train_dataloader._dataset._data.set_mixup(None)
            logger.debug(
                f"Training on stream-batch {ix_streambatch} ({len(train_dataset)} records)"
            )

            # TODO: Improve stream-batching robustness to drop loop guard clauses
            # While it would be nice to simply `for i, batch in enumerate(train_dataloader):`,
            # corrupted image buffers are somehow sneaking through the stream-batch at the moment.
            #
            # For now, we catch and tolerate these errors - trying to resume stream-batch process
            # where possible and otherwise discarding the remainder of the stream-batch :-(
            done = False
            i = -1
            dataiter = iter(train_dataloader)
            while not done:
                i += 1
                batch = None
                # Inner retry loop: skip past minibatches that fail to load.
                while not batch:
                    try:
                        batch = next(dataiter)
                    except StopIteration:
                        done = True
                        break
                    except ValueError:
                        # Some problem with the minibatch prevented loading - try the next
                        logger.warn(
                            f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                            f"Failed to load minibatch {i}, trying next...")
                        i += 1
                    # NOTE(review): bare except — also swallows
                    # KeyboardInterrupt/SystemExit; consider `except Exception`.
                    except:
                        logger.error(
                            f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                            f"Failed to iterate minibatch {i}: Discarding remainder"
                        )
                        break
                if not batch:
                    logger.debug(
                        f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                        f"Done after {i} minibatches")
                    break
                logger.debug(
                    f"Epoch {epoch}, stream batch {ix_streambatch}, minibatch {i}"
                )
                batch_size = batch[0].shape[0]
                # Split each batch field across the devices in ctx.
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
                # objectness, center_targets, scale_targets, weights, class_targets
                fixed_targets = [
                    gluon.utils.split_and_load(batch[it],
                                               ctx_list=ctx,
                                               batch_axis=0,
                                               even_split=False)
                    for it in range(1, 6)
                ]
                gt_boxes = gluon.utils.split_and_load(batch[6],
                                                      ctx_list=ctx,
                                                      batch_axis=0,
                                                      even_split=False)
                # One tracker list per metric, filled across device slices.
                loss_trackers = tuple([] for metric in train_metrics)
                with autograd.record():
                    for ix, x in enumerate(data):
                        losses_raw = net(x, gt_boxes[ix],
                                         *[ft[ix] for ft in fixed_targets])
                        # net outputs: [obj_loss, center_loss, scale_loss, cls_loss]
                        # Each a mx.ndarray 1xbatch_size. This is the same order as our
                        # train_metrics, so we just need to add a total vector:
                        total_loss = sum(losses_raw)
                        losses = losses_raw + [total_loss]

                        # If any sample's total loss is non-finite, sum will be:
                        if not isfinite(sum(total_loss)):
                            logger.error(
                                f"[Epoch {epoch}][Streambatch {ix_streambatch}][Minibatch {i}] "
                                f"got non-finite losses: {losses_raw}")
                            # TODO: Terminate training if losses or gradient go infinite?
                        for ix in train_metric_ixs:
                            loss_trackers[ix].append(losses[ix])
                    # Backprop only through the TotalLoss tracker.
                    autograd.backward(loss_trackers[target_metric_ix])
                trainer.step(batch_size)
                for ix in train_metric_ixs:
                    train_metrics[ix].update(0, loss_trackers[ix])
                if args.log_interval and not (i + 1) % args.log_interval:
                    train_metrics_current = map(lambda metric: metric.get(),
                                                train_metrics)
                    metrics_msg = "; ".join([
                        f"{name}={val:.3f}"
                        for name, val in train_metrics_current
                    ])
                    logger.info(
                        f"[Epoch {epoch}][Streambatch {ix_streambatch}][Minibatch {i}] "
                        f"LR={trainer.learning_rate:.2E}; "
                        f"Speed={batch_size/(time.time()-btic):.3f} samples/sec; {metrics_msg};"
                    )
                    btic = time.time()
        train_metrics_current = map(lambda metric: metric.get(),
                                    train_metrics)
        metrics_msg = "; ".join(
            [f"{name}={val:.3f}" for name, val in train_metrics_current])
        logger.info(
            f"[Epoch {epoch}] TrainingCost={time.time()-tic:.3f}; {metrics_msg};"
        )
        if not (epoch + 1) % args.val_interval:
            logger.info(f"Validating [Epoch {epoch}]")
            # Validation streams from args.validation; loaders are built
            # inside validate() from the transforms/batchify_fn passed here.
            metric_names, metric_values = validate(
                net, args.validation, epoch, ctx,
                VOC07MApMetric(iou_thresh=0.5), validation_transforms,
                validation_batchify_fn, args)
            if isinstance(metric_names, list):
                val_msg = "; ".join(
                    [f"{k}={v}" for k, v in zip(metric_names, metric_values)])
                # Last metric (overall mAP) drives checkpointing/early stop.
                current_score = float(metric_values[-1])
            else:
                val_msg = f"{metric_names}={metric_values}"
                current_score = metric_values
            logger.info(f"[Epoch {epoch}] Validation: {val_msg};")
        else:
            current_score = float("-inf")
        save_progress(net, current_score, prev_best_score, args.model_dir,
                      epoch, args.checkpoint_interval, args.checkpoint_dir)
        if current_score > prev_best_score:
            prev_best_score = current_score
            best_epoch = epoch
        if (args.early_stopping and epoch >= args.early_stopping_min_epochs
                and (epoch - best_epoch) >= args.early_stopping_patience):
            logger.info(
                f"[Epoch {epoch}] No improvement since epoch {best_epoch}: Stopping early"
            )
            break
args = parse_args()

# training contexts
ctx = [mx.gpu(int(i)) for i in args.gpus.split(',') if i.strip()]
ctx = ctx if ctx else [mx.cpu()]

# network name
net_name = '_'.join(('yolo3', args.network, args.dataset))
args.save_prefix += net_name

# get network
net = get_model(net_name, pretrained_base=True)

# training data: resize images to 500x500 after all data augmentation
width, height = 500, 500
train_transform = YOLO3DefaultTrainTransform(width, height, net)
# BUG FIX: YOLO3DefaultValTransform's third positional parameter is `mean`,
# so passing `net` corrupted the normalization; the val transform needs no net.
val_transform = YOLO3DefaultValTransform(width, height)
# BUG FIX: removed `width, height = data_shape, data_shape` — `data_shape`
# was never defined here (NameError) and would have clobbered the 500x500
# already chosen above.

# stack image + all generated targets; pad ragged ground-truth boxes with -1
batchify_fn = Tuple(*([Stack() for _ in range(6)] +
                      [Pad(axis=0, pad_val=-1) for _ in range(1)]))

# get datasets
train_dataset, val_dataset, eval_metric = get_voc_dataset(args.dataset, args)

# get data loaders
train_loader = gluon.data.DataLoader(
    train_dataset.transform(train_transform), args.batch_size, True,
    batchify_fn=batchify_fn, last_batch='rollover',
    num_workers=args.num_workers)
val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
val_loader = gluon.data.DataLoader(
    val_dataset.transform(val_transform), args.batch_size, False,
    batchify_fn=val_batchify_fn, last_batch='keep',
    num_workers=args.num_workers)

# define eval_metric
#eval_metric=VOC07MApMetric(iou_thresh=0.5, class_names=val_dataset.classes)