Example #1
def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size,
                   num_workers, args):
    """Get dataloader."""
    width, height = data_shape, data_shape
    batchify_fn = Tuple(
        *([Stack() for _ in range(6)] +
          [Pad(axis=0, pad_val=-1)
           for _ in range(1)]))  # stack image, all targets generated
    if args.no_random_shape:
        train_loader = gluon.data.DataLoader(train_dataset.transform(
            YOLO3DefaultTrainTransform(width, height, net, mixup=args.mixup)),
                                             batch_size,
                                             True,
                                             batchify_fn=batchify_fn,
                                             last_batch='rollover',
                                             num_workers=num_workers)
    else:
        transform_fns = [
            YOLO3DefaultTrainTransform(x * 32, x * 32, net, mixup=args.mixup)
            for x in range(10, 20)
        ]
        train_loader = RandomTransformDataLoader(transform_fns,
                                                 train_dataset,
                                                 batch_size=batch_size,
                                                 interval=10,
                                                 last_batch='rollover',
                                                 shuffle=True,
                                                 batchify_fn=batchify_fn,
                                                 num_workers=num_workers)
    val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
    val_loader = gluon.data.DataLoader(val_dataset.transform(
        YOLO3DefaultValTransform(width, height)),
                                       batch_size,
                                       False,
                                       batchify_fn=val_batchify_fn,
                                       last_batch='keep',
                                       num_workers=num_workers)
    return train_loader, val_loader
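This snippet assumes MXNet Gluon and GluonCV's batchify utilities and YOLOv3 preset transforms are already imported; a minimal import block covering the names used above (the same imports appear inline in a later example):

import mxnet as mx
from mxnet import gluon
from gluoncv.data.batchify import Tuple, Stack, Pad
from gluoncv.data.dataloader import RandomTransformDataLoader
from gluoncv.data.transforms.presets.yolo import (
    YOLO3DefaultTrainTransform,
    YOLO3DefaultValTransform,
)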
Example #2
def _get_dataloader(net, test_dataset, data_shape, batch_size, num_workers, num_devices,
                    args):
    """Get dataloader."""
    if args.meta_arch == 'yolo3':
        width, height = data_shape, data_shape
        val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
        test_loader = gluon.data.DataLoader(
            test_dataset.transform(YOLO3DefaultValTransform(width, height)),
            batch_size, False, batchify_fn=val_batchify_fn, last_batch='keep',
            num_workers=num_workers)
        return test_loader
    elif args.meta_arch == 'faster_rcnn':
        # Get Faster R-CNN dataloader.
        test_bfn = Tuple(*[Append() for _ in range(3)])
        short = net.short[-1] if isinstance(net.short, (tuple, list)) else net.short
        # validation uses 1 sample per device
        test_loader = gluon.data.DataLoader(
            test_dataset.transform(FasterRCNNDefaultValTransform(short, net.max_size)),
            num_devices, False, batchify_fn=test_bfn, last_batch='keep',
            num_workers=args.num_workers)
        return test_loader
    else:
        raise NotImplementedError('%s not implemented.' % args.meta_arch)
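Below is a hypothetical call for the 'yolo3' branch; argument values are placeholders, and `net` and `test_dataset` are assumed to be a GluonCV YOLOv3 model and a detection dataset:

from types import SimpleNamespace

args = SimpleNamespace(meta_arch='yolo3', num_workers=4)  # placeholder config, not from the original
test_loader = _get_dataloader(net, test_dataset, data_shape=416, batch_size=8,
                              num_workers=args.num_workers, num_devices=1, args=args)
for images, labels in test_loader:
    print(images.shape, labels.shape)  # labels are padded with -1 where boxes are absent
    break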
Example #3
def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size,
                   num_workers):
    """Get dataloader."""
    width, height = data_shape, data_shape
    batchify_fn = Tuple(
        *([Stack() for _ in range(6)] +
          [Pad(axis=0, pad_val=-1)
           for _ in range(1)]))  # stack image, all targets generated
    train_loader = gluon.data.DataLoader(train_dataset.transform(
        YOLO3DefaultTrainTransform(width, height, net)),
                                         batch_size,
                                         True,
                                         batchify_fn=batchify_fn,
                                         last_batch='rollover',
                                         num_workers=num_workers)
    val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
    val_loader = gluon.data.DataLoader(val_dataset.transform(
        YOLO3DefaultValTransform(width, height)),
                                       batch_size,
                                       False,
                                       batchify_fn=val_batchify_fn,
                                       last_batch='keep',
                                       num_workers=num_workers)
    return train_loader, val_loader
Example #4
    def get_dataloader(self):
        width, height = self.width, self.height
        train_dataset = self.train_dataset
        val_dataset = self.val_dataset
        batch_size = self.batch_size
        num_workers = self.num_workers
        network = self.network
        print('here 0')
        if network == 'ssd':
            # use fake data to generate fixed anchors for target generation
            with autograd.train_mode():
                _, _, anchors = self.net(mx.nd.zeros((1, 3, height, width)))

            batchify_fn = Tuple(
                Stack(), Stack(),
                Stack())  # stack image, cls_targets, box_targets
            train_loader = gluon.data.DataLoader(train_dataset.transform(
                SSDDefaultTrainTransform(width, height, anchors)),
                                                 batch_size,
                                                 True,
                                                 batchify_fn=batchify_fn,
                                                 last_batch='rollover',
                                                 num_workers=num_workers)

            # Real validation loader
            val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
            val_loader = gluon.data.DataLoader(val_dataset.transform(
                SSDDefaultValTransform(width, height)),
                                               batch_size,
                                               False,
                                               batchify_fn=val_batchify_fn,
                                               last_batch='keep',
                                               num_workers=num_workers)

            # reuse the anchors generated above for the loss-computing validation loader
            with mx.Context(mx.gpu(0)):
                anchors2 = anchors

            val_loader_loss = gluon.data.DataLoader(val_dataset.transform(
                SSDCustomValTransform(width, height, anchors2)),
                                                    batch_size,
                                                    True,
                                                    batchify_fn=batchify_fn,
                                                    last_batch='rollover',
                                                    num_workers=num_workers)
            self.val_loader_loss = val_loader_loss
        elif network == 'yolo':
            print('here 1')
            batchify_fn = Tuple(
                *([Stack() for _ in range(6)] +
                  [Pad(axis=0, pad_val=-1)
                   for _ in range(1)]))  # stack image, all targets generated
            # if args.no_random_shape:
            train_loader = gluon.data.DataLoader(train_dataset.transform(
                YOLO3DefaultTrainTransform(width, height, self.net)),
                                                 batch_size,
                                                 True,
                                                 batchify_fn=batchify_fn,
                                                 last_batch='rollover',
                                                 num_workers=num_workers)

            val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
            val_loader = gluon.data.DataLoader(val_dataset.transform(
                YOLO3DefaultValTransform(width, height)),
                                               batch_size,
                                               False,
                                               batchify_fn=val_batchify_fn,
                                               last_batch='keep',
                                               num_workers=num_workers)
            print('here 2')
        else:
            raise ValueError("Network {} not implemented".format(network))

        self.val_loader = val_loader
        self.train_loader = train_loader
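The SSD branch above derives its anchor boxes by forwarding a dummy image in train mode. A standalone sketch of the same trick, using an illustrative model name and input size (not taken from the original):

import mxnet as mx
from mxnet import autograd
from gluoncv import model_zoo

net = model_zoo.get_model('ssd_512_mobilenet1.0_voc', pretrained_base=False)
net.initialize()
with autograd.train_mode():
    # forward a zero image so the network emits its fixed anchors alongside the predictions
    _, _, anchors = net(mx.nd.zeros((1, 3, 512, 512)))
print(anchors.shape)  # (1, num_anchors, 4)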
Example #5
def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size, num_workers, args):
    """Get dataloader."""
    import gluoncv as gcv

    gcv.utils.check_version("0.6.0")
    from gluoncv import data as gdata
    from gluoncv import utils as gutils
    from gluoncv.data.batchify import Pad, Stack, Tuple
    from gluoncv.data.dataloader import RandomTransformDataLoader
    from gluoncv.data.transforms.presets.yolo import (
        YOLO3DefaultTrainTransform,
        YOLO3DefaultValTransform,
    )
    from gluoncv.model_zoo import get_model
    from gluoncv.utils import LRScheduler, LRSequential
    from gluoncv.utils.metrics.coco_detection import COCODetectionMetric
    from gluoncv.utils.metrics.voc_detection import VOC07MApMetric

    width, height = data_shape, data_shape
    batchify_fn = Tuple(
        *([Stack() for _ in range(6)] + [Pad(axis=0, pad_val=-1) for _ in range(1)])
    )  # stack image, all targets generated
    if args.no_random_shape:
        print(len(train_dataset))
        img, label = train_dataset[0]
        print(img.shape, label.shape)
        train_loader = gluon.data.DataLoader(
            train_dataset.transform(
                YOLO3DefaultTrainTransform(width, height, net, mixup=args.mixup)
            ),
            batch_size,
            True,
            batchify_fn=batchify_fn,
            last_batch="rollover",
            num_workers=num_workers,
        )
    else:
        transform_fns = [
            YOLO3DefaultTrainTransform(x * 32, x * 32, net, mixup=args.mixup) for x in range(10, 20)
        ]
        train_loader = RandomTransformDataLoader(
            transform_fns,
            train_dataset,
            batch_size=batch_size,
            interval=10,
            last_batch="rollover",
            shuffle=True,
            batchify_fn=batchify_fn,
            num_workers=num_workers,
        )
    val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
    val_loader = gluon.data.DataLoader(
        val_dataset.transform(YOLO3DefaultValTransform(width, height)),
        batch_size,
        False,
        batchify_fn=val_batchify_fn,
        last_batch="keep",
        num_workers=num_workers,
    )
    return train_loader, val_loader
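This variant inlines its GluonCV imports but still expects `gluon` (from mxnet) to be available in the defining module. A hypothetical call, with placeholder values and assuming `net`, `train_dataset`, and `val_dataset` already exist:

from types import SimpleNamespace

# Hypothetical flags, not from the original; net, train_dataset and val_dataset are assumed to exist.
args = SimpleNamespace(no_random_shape=False, mixup=False)
train_loader, val_loader = get_dataloader(net, train_dataset, val_dataset,
                                          data_shape=416, batch_size=16,
                                          num_workers=4, args=args)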
Example #6
def train(net, async_net, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params(".*beta|.*gamma|.*bias").items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]

    lr_scheduler = LRSequential([
        LRScheduler("linear",
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=args.batch_size),
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=args.batch_size,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])
    if (args.optimizer == "sgd"):
        trainer = gluon.Trainer(net.collect_params(),
                                args.optimizer, {
                                    "wd": args.wd,
                                    "momentum": args.momentum,
                                    "lr_scheduler": lr_scheduler
                                },
                                kvstore="local")
    elif (args.optimizer == "adam"):
        trainer = gluon.Trainer(net.collect_params(),
                                args.optimizer, {"lr_scheduler": lr_scheduler},
                                kvstore="local")
    else:
        trainer = gluon.Trainer(net.collect_params(),
                                args.optimizer,
                                kvstore="local")

    # targets
    #sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    #l1_loss = gluon.loss.L1Loss()

    # Intermediate Metrics:
    train_metrics = (
        mx.metric.Loss("ObjLoss"),
        mx.metric.Loss("BoxCenterLoss"),
        mx.metric.Loss("BoxScaleLoss"),
        mx.metric.Loss("ClassLoss"),
        mx.metric.Loss("TotalLoss"),
    )
    train_metric_ixs = range(len(train_metrics))
    target_metric_ix = -1  # Train towards TotalLoss (the last one)

    # Evaluation Metrics:
    val_metric = VOC07MApMetric(iou_thresh=0.5)

    # Data transformations:
    train_dataset = gluon_pipe_mode.AugmentedManifestDetection(
        args.train,
        length=args.num_samples_train,
    )
    train_batchify_fn = batchify.Tuple(
        *([batchify.Stack() for _ in range(6)] +
          [batchify.Pad(axis=0, pad_val=-1) for _ in range(1)]))
    if args.no_random_shape:
        logger.debug("Creating train DataLoader without random transform")
        train_transforms = YOLO3DefaultTrainTransform(args.data_shape,
                                                      args.data_shape,
                                                      net=async_net,
                                                      mixup=args.mixup)
        train_dataloader = gluon.data.DataLoader(
            train_dataset.transform(train_transforms),
            batch_size=args.batch_size,
            batchify_fn=train_batchify_fn,
            last_batch="discard",
            num_workers=args.num_workers,
            shuffle=False,  # Note that shuffle *cannot* be used with AugmentedManifestDetection
        )
    else:
        logger.debug("Creating train DataLoader with random transform")
        train_transforms = [
            YOLO3DefaultTrainTransform(x * 32,
                                       x * 32,
                                       net=async_net,
                                       mixup=args.mixup)
            for x in range(10, 20)
        ]
        train_dataloader = RandomTransformDataLoader(
            train_transforms,
            train_dataset,
            interval=10,
            batch_size=args.batch_size,
            batchify_fn=train_batchify_fn,
            last_batch="discard",
            num_workers=args.num_workers,
            shuffle=False,  # Note that shuffle *cannot* be used with AugmentedManifestDetection
        )
    validation_dataset = None
    validation_dataloader = None
    if args.validation:
        validation_dataset = gluon_pipe_mode.AugmentedManifestDetection(
            args.validation,
            length=args.num_samples_validation,
        )
        validation_dataloader = gluon.data.DataLoader(
            validation_dataset.transform(
                YOLO3DefaultValTransform(args.data_shape, args.data_shape), ),
            args.batch_size,
            shuffle=False,
            batchify_fn=batchify.Tuple(batchify.Stack(),
                                       batchify.Pad(pad_val=-1)),
            last_batch="keep",
            num_workers=args.num_workers,
        )

    # Prepare the inference-time configuration for our model's setup:
    # (This will be saved alongside our network structure/params)
    inference_config = config.InferenceConfig(image_size=args.data_shape)

    logger.info(args)
    logger.info(f"Start training from [Epoch {args.start_epoch}]")
    prev_best_score = float("-inf")
    best_epoch = args.start_epoch
    logger.info("Sleeping for 3s in case training data file not yet ready")
    time.sleep(3)
    for epoch in range(args.start_epoch, args.start_epoch + args.epochs):
        #         if args.mixup:
        #             # TODO(zhreshold): more elegant way to control mixup during runtime
        #             try:
        #                 train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
        #             except AttributeError:
        #                 train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
        #             if epoch >= args.epochs - args.no_mixup_epochs:
        #                 try:
        #                     train_data._dataset.set_mixup(None)
        #                 except AttributeError:
        #                     train_data._dataset._data.set_mixup(None)

        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()

        logger.debug(
            f"Input data dir contents: {os.listdir('/opt/ml/input/data/')}")
        for i, batch in enumerate(train_dataloader):
            logger.debug(f"Epoch {epoch}, minibatch {i}")

            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0,
                                              even_split=False)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [
                gluon.utils.split_and_load(batch[it],
                                           ctx_list=ctx,
                                           batch_axis=0,
                                           even_split=False)
                for it in range(1, 6)
            ]
            gt_boxes = gluon.utils.split_and_load(batch[6],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
            loss_trackers = tuple([] for metric in train_metrics)
            with autograd.record():
                for ix, x in enumerate(data):
                    losses_raw = net(x, gt_boxes[ix],
                                     *[ft[ix] for ft in fixed_targets])
                    # net outputs: [obj_loss, center_loss, scale_loss, cls_loss]
                    # Each a mx.ndarray 1xbatch_size. This is the same order as our
                    # train_metrics, so we just need to add a total vector:
                    total_loss = sum(losses_raw)
                    losses = losses_raw + [total_loss]

                    # If any sample's total loss is non-finite, the batch sum will be non-finite too:
                    if not isfinite(sum(total_loss)):
                        logger.error(
                            f"[Epoch {epoch}][Minibatch {i}] got non-finite losses: {losses_raw}"
                        )
                        # TODO: Terminate training if losses or gradient go infinite?

                    for ix in train_metric_ixs:
                        loss_trackers[ix].append(losses[ix])

                autograd.backward(loss_trackers[target_metric_ix])
            trainer.step(batch_size)
            for ix in train_metric_ixs:
                train_metrics[ix].update(0, loss_trackers[ix])

            if args.log_interval and not (i + 1) % args.log_interval:
                train_metrics_current = map(lambda metric: metric.get(),
                                            train_metrics)
                metrics_msg = "; ".join([
                    f"{name}={val:.3f}" for name, val in train_metrics_current
                ])
                logger.info(
                    f"[Epoch {epoch}][Minibatch {i}] LR={trainer.learning_rate:.2E}; "
                    f"Speed={batch_size/(time.time()-btic):.3f} samples/sec; {metrics_msg};"
                )
            btic = time.time()

        train_metrics_current = map(lambda metric: metric.get(), train_metrics)
        metrics_msg = "; ".join(
            [f"{name}={val:.3f}" for name, val in train_metrics_current])
        logger.info(
            f"[Epoch {epoch}] TrainingCost={time.time()-tic:.3f}; {metrics_msg};"
        )

        if not (epoch + 1) % args.val_interval:
            logger.info(f"Validating [Epoch {epoch}]")

            metric_names, metric_values = validate(
                net, validation_dataloader, epoch, ctx,
                VOC07MApMetric(iou_thresh=0.5), args)
            if isinstance(metric_names, list):
                val_msg = "; ".join(
                    [f"{k}={v}" for k, v in zip(metric_names, metric_values)])
                current_score = float(metric_values[-1])
            else:
                val_msg = f"{metric_names}={metric_values}"
                current_score = metric_values
            logger.info(f"[Epoch {epoch}] Validation: {val_msg};")
        else:
            current_score = float("-inf")

        save_progress(
            net,
            inference_config,
            current_score,
            prev_best_score,
            args.model_dir,
            epoch,
            args.checkpoint_interval,
            args.checkpoint_dir,
        )
        if current_score > prev_best_score:
            prev_best_score = current_score
            best_epoch = epoch

        if (args.early_stopping and epoch >= args.early_stopping_min_epochs
                and (epoch - best_epoch) >= args.early_stopping_patience):
            logger.info(
                f"[Epoch {epoch}] No improvement since epoch {best_epoch}: Stopping early"
            )
            break
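The training loop above calls a validate() helper that isn't shown. A minimal sketch, assuming it follows the standard GluonCV detection evaluation pattern and returns (metric_names, metric_values); the epoch and args parameters from the call above are accepted but unused here:

def validate(net, val_data, epoch, ctx, eval_metric, args):
    """Run one evaluation pass and return (metric_names, metric_values)."""
    eval_metric.reset()
    net.hybridize()
    for batch in val_data:
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        for x, y in zip(data, label):
            ids, scores, bboxes = net(x)  # in inference mode the YOLOv3 block returns detections
            # ground-truth layout: [xmin, ymin, xmax, ymax, class_id, (difficult)]
            gt_bboxes = y.slice_axis(axis=-1, begin=0, end=4)
            gt_ids = y.slice_axis(axis=-1, begin=4, end=5)
            gt_difficults = y.slice_axis(axis=-1, begin=5, end=6) if y.shape[-1] > 5 else None
            eval_metric.update([bboxes.clip(0, batch[0].shape[2])], [ids], [scores],
                               [gt_bboxes], [gt_ids], [gt_difficults])
    return eval_metric.get()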
Example #7
def train(net, async_net, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params(".*beta|.*gamma|.*bias").items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]

    lr_scheduler = LRSequential([
        LRScheduler("linear",
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=args.batch_size),
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=args.batch_size,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])
    if (args.optimizer == "sgd"):
        trainer = gluon.Trainer(net.collect_params(),
                                args.optimizer, {
                                    "wd": args.wd,
                                    "momentum": args.momentum,
                                    "lr_scheduler": lr_scheduler
                                },
                                kvstore="local")
    elif (args.optimizer == "adam"):
        trainer = gluon.Trainer(net.collect_params(),
                                args.optimizer, {"lr_scheduler": lr_scheduler},
                                kvstore="local")
    else:
        trainer = gluon.Trainer(net.collect_params(),
                                args.optimizer,
                                kvstore="local")

    # targets
    #sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    #l1_loss = gluon.loss.L1Loss()

    # Intermediate Metrics:
    train_metrics = (
        mx.metric.Loss("ObjLoss"),
        mx.metric.Loss("BoxCenterLoss"),
        mx.metric.Loss("BoxScaleLoss"),
        mx.metric.Loss("ClassLoss"),
        mx.metric.Loss("TotalLoss"),
    )
    train_metric_ixs = range(len(train_metrics))
    target_metric_ix = -1  # Train towards TotalLoss (the last one)

    # Evaluation Metrics:
    val_metric = VOC07MApMetric(iou_thresh=0.5)

    # Data transformations:
    train_batchify_fn = Tuple(*([Stack() for _ in range(6)] +
                                [Pad(axis=0, pad_val=-1) for _ in range(1)]))
    train_transforms = (YOLO3DefaultTrainTransform(
        args.data_shape, args.data_shape, net=async_net,
        mixup=args.mixup) if args.no_random_shape else [
            YOLO3DefaultTrainTransform(
                x * 32, x * 32, net=async_net, mixup=args.mixup)
            for x in range(10, 20)
        ])
    validation_batchify_fn = None
    validation_transforms = None
    if args.validation:
        validation_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
        validation_transforms = YOLO3DefaultValTransform(
            args.data_shape, args.data_shape)

    logger.info(args)
    logger.info(f"Start training from [Epoch {args.start_epoch}]")
    prev_best_score = float("-inf")
    best_epoch = args.start_epoch
    logger.info("Sleeping for 3s in case training data file not yet ready")
    time.sleep(3)
    for epoch in range(args.start_epoch, args.epochs):
        #         if args.mixup:
        #             # TODO(zhreshold): more elegant way to control mixup during runtime
        #             try:
        #                 train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
        #             except AttributeError:
        #                 train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
        #             if epoch >= args.epochs - args.no_mixup_epochs:
        #                 try:
        #                     train_data._dataset.set_mixup(None)
        #                 except AttributeError:
        #                     train_data._dataset._data.set_mixup(None)

        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()

        logger.debug(
            f'Input data dir contents: {os.listdir("/opt/ml/input/data/")}')
        train_data_gen = pipe_detection_minibatch(
            epoch, channel=args.train, batch_size=args.stream_batch_size)
        for ix_streambatch, train_dataset in enumerate(train_data_gen):
            # TODO: Mixup is kinda rubbish if it's only within a (potentially small) batch
            if args.mixup:
                train_dataset = MixupDetection(train_dataset)

            # Create dataloader for the stream-batch:
            if args.no_random_shape:
                logger.debug(
                    "Creating train DataLoader without random transform")
                train_dataloader = gluon.data.DataLoader(
                    train_dataset.transform(train_transforms),
                    batch_size=args.batch_size,
                    batchify_fn=train_batchify_fn,
                    last_batch="discard",
                    num_workers=args.num_workers,
                    shuffle=True,
                )
            else:
                logger.debug("Creating train DataLoader with random transform")
                train_dataloader = RandomTransformDataLoader(
                    train_transforms,
                    train_dataset,
                    interval=10,
                    batch_size=args.batch_size,
                    batchify_fn=train_batchify_fn,
                    last_batch="discard",
                    num_workers=args.num_workers,
                    shuffle=True,
                )

            if args.mixup:
                logger.debug("Shuffling stream-batch")
                # TODO(zhreshold): more elegant way to control mixup during runtime
                try:
                    train_dataloader._dataset.set_mixup(
                        np.random.beta, 1.5, 1.5)
                except AttributeError:
                    train_dataloader._dataset._data.set_mixup(
                        np.random.beta, 1.5, 1.5)
                if epoch >= args.epochs - args.no_mixup_epochs:
                    try:
                        train_dataloader._dataset.set_mixup(None)
                    except AttributeError:
                        train_dataloader._dataset._data.set_mixup(None)

            logger.debug(
                f"Training on stream-batch {ix_streambatch} ({len(train_dataset)} records)"
            )
            # TODO: Improve stream-batching robustness to drop loop guard clauses
            # While it would be nice to simply `for i, batch in enumerate(train_dataloader):`,
            # corrupted image buffers are somehow sneaking through the stream-batch at the moment.
            #
            # For now, we catch and tolerate these errors - trying to resume stream-batch process
            # where possible and otherwise discarding the remainder of the stream-batch :-(
            done = False
            i = -1
            dataiter = iter(train_dataloader)
            while not done:
                i += 1
                batch = None
                while not batch:
                    try:
                        batch = next(dataiter)
                    except StopIteration:
                        done = True
                        break
                    except ValueError:
                        # Some problem with the minibatch prevented loading - try the next
                        logger.warning(
                            f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                            f"Failed to load minibatch {i}, trying next...")
                        i += 1
                    except:
                        logger.error(
                            f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                            f"Failed to iterate minibatch {i}: Discarding remainder"
                        )
                        break

                if not batch:
                    logger.debug(
                        f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                        f"Done after {i} minibatches")
                    break
                logger.debug(
                    f"Epoch {epoch}, stream batch {ix_streambatch}, minibatch {i}"
                )

                batch_size = batch[0].shape[0]
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
                # objectness, center_targets, scale_targets, weights, class_targets
                fixed_targets = [
                    gluon.utils.split_and_load(batch[it],
                                               ctx_list=ctx,
                                               batch_axis=0,
                                               even_split=False)
                    for it in range(1, 6)
                ]
                gt_boxes = gluon.utils.split_and_load(batch[6],
                                                      ctx_list=ctx,
                                                      batch_axis=0,
                                                      even_split=False)
                loss_trackers = tuple([] for metric in train_metrics)
                with autograd.record():
                    for ix, x in enumerate(data):
                        losses_raw = net(x, gt_boxes[ix],
                                         *[ft[ix] for ft in fixed_targets])
                        # net outputs: [obj_loss, center_loss, scale_loss, cls_loss]
                        # Each a mx.ndarray 1xbatch_size. This is the same order as our
                        # train_metrics, so we just need to add a total vector:
                        total_loss = sum(losses_raw)
                        losses = losses_raw + [total_loss]

                        # If any sample's total loss is non-finite, the batch sum will be non-finite too:
                        if not isfinite(sum(total_loss)):
                            logger.error(
                                f"[Epoch {epoch}][Streambatch {ix_streambatch}][Minibatch {i}] "
                                f"got non-finite losses: {losses_raw}")
                            # TODO: Terminate training if losses or gradient go infinite?

                        for ix in train_metric_ixs:
                            loss_trackers[ix].append(losses[ix])

                    autograd.backward(loss_trackers[target_metric_ix])
                trainer.step(batch_size)
                for ix in train_metric_ixs:
                    train_metrics[ix].update(0, loss_trackers[ix])

                if args.log_interval and not (i + 1) % args.log_interval:
                    train_metrics_current = map(lambda metric: metric.get(),
                                                train_metrics)
                    metrics_msg = "; ".join([
                        f"{name}={val:.3f}"
                        for name, val in train_metrics_current
                    ])
                    logger.info(
                        f"[Epoch {epoch}][Streambatch {ix_streambatch}][Minibatch {i}] "
                        f"LR={trainer.learning_rate:.2E}; "
                        f"Speed={batch_size/(time.time()-btic):.3f} samples/sec; {metrics_msg};"
                    )
                btic = time.time()

        train_metrics_current = map(lambda metric: metric.get(), train_metrics)
        metrics_msg = "; ".join(
            [f"{name}={val:.3f}" for name, val in train_metrics_current])
        logger.info(
            f"[Epoch {epoch}] TrainingCost={time.time()-tic:.3f}; {metrics_msg};"
        )

        if not (epoch + 1) % args.val_interval:
            logger.info(f"Validating [Epoch {epoch}]")

            metric_names, metric_values = validate(
                net, args.validation, epoch, ctx,
                VOC07MApMetric(iou_thresh=0.5), validation_transforms,
                validation_batchify_fn, args)
            if isinstance(metric_names, list):
                val_msg = "; ".join(
                    [f"{k}={v}" for k, v in zip(metric_names, metric_values)])
                current_score = float(metric_values[-1])
            else:
                val_msg = f"{metric_names}={metric_values}"
                current_score = metric_values
            logger.info(f"[Epoch {epoch}] Validation: {val_msg};")
        else:
            current_score = float("-inf")

        save_progress(net, current_score, prev_best_score, args.model_dir,
                      epoch, args.checkpoint_interval, args.checkpoint_dir)
        if current_score > prev_best_score:
            prev_best_score = current_score
            best_epoch = epoch

        if (args.early_stopping and epoch >= args.early_stopping_min_epochs
                and (epoch - best_epoch) >= args.early_stopping_patience):
            logger.info(
                f"[Epoch {epoch}] No improvement since epoch {best_epoch}: Stopping early"
            )
            break
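The mixup handling in this variant wraps the stream-batch dataset in GluonCV's MixupDetection. A small, self-contained usage sketch (the VOC split is illustrative):

import numpy as np
from gluoncv.data import VOCDetection, MixupDetection

dataset = MixupDetection(VOCDetection(splits=[(2007, 'trainval')]))
dataset.set_mixup(np.random.beta, 1.5, 1.5)  # blend pairs of images with Beta(1.5, 1.5) weights
img, label = dataset[0]                      # label gains an extra column of per-box mixup weights
dataset.set_mixup(None)                      # turn mixup off, e.g. for the final no_mixup_epochs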
Example #8
    args = parse_args()

    # training contexts
    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',') if i.strip()]
    ctx = ctx if ctx else [mx.cpu()]

    # network name
    net_name = '_'.join(('yolo3', args.network, args.dataset))
    args.save_prefix += net_name
    # get network
    net = get_model(net_name, pretrained_base=True)

    # training data
    width, height = 500, 500  # resize image to 500x500 after all data augmentation
    train_transform = YOLO3DefaultTrainTransform(width, height, net)
    val_transform = YOLO3DefaultValTransform(width, height)

    batchify_fn = Tuple(*([Stack() for _ in range(6)] + [Pad(axis=0, pad_val=-1) for _ in range(1)]))  # stack image, all targets generated
    # get datasets
    train_dataset, val_dataset, eval_metric = get_voc_dataset(args.dataset, args)

    # get data loaders
    train_loader = gluon.data.DataLoader(train_dataset.transform(train_transform), args.batch_size, True,
                                         batchify_fn=batchify_fn, last_batch='rollover', num_workers=args.num_workers)

    val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
    val_loader = gluon.data.DataLoader(val_dataset.transform(val_transform), args.batch_size, False,
                                       batchify_fn=val_batchify_fn, last_batch='keep', num_workers=args.num_workers)

    # define eval_metric
    #eval_metric=VOC07MApMetric(iou_thresh=0.5, class_names=val_dataset.classes)
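get_voc_dataset() is not defined in this snippet; a plausible sketch following GluonCV's usual Pascal VOC setup (splits and metric choice are assumptions, not taken from the original):

def get_voc_dataset(dataset_name, args):
    """Return train/val datasets and a VOC-style mAP metric."""
    from gluoncv import data as gdata
    from gluoncv.utils.metrics.voc_detection import VOC07MApMetric
    if dataset_name.lower() == 'voc':
        train_dataset = gdata.VOCDetection(splits=[(2007, 'trainval'), (2012, 'trainval')])
        val_dataset = gdata.VOCDetection(splits=[(2007, 'test')])
        eval_metric = VOC07MApMetric(iou_thresh=0.5, class_names=val_dataset.classes)
    else:
        raise NotImplementedError('Dataset: {} not implemented.'.format(dataset_name))
    return train_dataset, val_dataset, eval_metric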