Example #1
def batch_fn(batch, ctx):
    if opt.num_segments > 1:
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False, multiplier=opt.num_segments)
    else:
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
    label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
    return data, label
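For orientation, a minimal sketch of what split_and_load does, using mxnet.gluon.utils.split_and_load with two CPU contexts standing in for GPUs. Note that the multiplier argument seen above is not part of that core API; these examples evidently use a project-specific variant (such as GluonCV's) that accepts it.

import mxnet as mx
from mxnet.gluon.utils import split_and_load

# A batch of 8 samples is split 4/4 across the two contexts.
ctx = [mx.cpu(0), mx.cpu(1)]
batch = mx.nd.arange(24).reshape((8, 3))
parts = split_and_load(batch, ctx_list=ctx, batch_axis=0, even_split=False)
print([p.shape for p in parts])  # [(4, 3), (4, 3)]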
Example #2
def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    num_test_iter = len(val_data)
    val_loss_epoch = 0
    for i, batch in enumerate(val_data):
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        val_outputs = []
        for _, X in enumerate(data):
#            X = X.reshape((-1,) + X.shape[2:])
#            X = X.reshape((-1,15)+X.shape[-2:])
            # -3 merges two consecutive axes, -2 copies the rest (MXNet reshape codes)
            X = X.reshape((-3, -3, -2))
            pred = net(X)
            val_outputs.append(pred)
            
        loss = [L(yhat, y) for yhat, y in zip(val_outputs, label)]
        
        acc_top1.update(label, val_outputs)
        acc_top5.update(label, val_outputs)
        
        val_loss_epoch += sum([l.mean().asscalar() for l in loss]) / len(loss)
    
    _, top1 = acc_top1.get()
    _, top5 = acc_top5.get()
    val_loss = val_loss_epoch / num_test_iter
    
    return (top1, top5, val_loss)
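The active reshape in Example #2 relies on MXNet's special reshape codes: -3 merges two consecutive axes into one, and -2 copies the remaining axes unchanged. A toy sketch (the array name and shape are illustrative only):

import mxnet as mx

# (batch, segments, length*channels, H, W, ...) -> merge axes pairwise
x = mx.nd.zeros((2, 3, 5, 3, 4, 4))
y = x.reshape((-3, -3, -2))  # (2*3, 5*3, 4, 4)
print(y.shape)  # (6, 15, 4, 4)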
Example #3
def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    num_test_iter = len(val_data)
    val_loss_epoch = 0
    for i, batch in enumerate(val_data):
        data_bgs = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        data_fgs = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
        val_outputs = []
        for _, (X_bgs, X_fgs) in enumerate(zip(data_bgs, data_fgs)):
            #print('X_bgs',X_bgs.shape) # (10, 8, 3, 224, 224)
            #print('X_fgs',X_fgs.shape) # (10, 8, 3, 224, 224)
            X_bgs = X_bgs.reshape((-1, ) + X_bgs.shape[2:])
            X_fgs = X_fgs.reshape((-1, ) + X_fgs.shape[2:])
            #print('X_bgs',X_bgs.shape) #(80, 3, 224, 224)
            #print('X_fgs',X_fgs.shape) #(80, 3, 224, 224)
            pred = net(X_bgs, X_fgs)
            val_outputs.append(pred)

        loss = [L(yhat, y) for yhat, y in zip(val_outputs, label)]

        acc_top1.update(label, val_outputs)
        acc_top5.update(label, val_outputs)

        val_loss_epoch += sum([l.mean().asscalar() for l in loss]) / len(loss)

    _, top1 = acc_top1.get()
    _, top5 = acc_top5.get()
    val_loss = val_loss_epoch / num_test_iter

    return (top1, top5, val_loss)
Example #4
def _get_data_and_label(self, batch, ctx, batch_axis=0):
    data = batch[0]
    gt_bboxes = batch[-1]
    data = split_and_load(data, ctx_list=ctx, batch_axis=batch_axis)
    targets = list(zip(*[split_and_load(batch[i], ctx_list=ctx, batch_axis=batch_axis)
                         for i in range(1, len(batch) - 1)]))
    gt_bboxes = split_and_load(gt_bboxes, ctx_list=ctx, batch_axis=batch_axis)
    return data, targets, gt_bboxes
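The zip(*[...]) in Example #4 transposes a per-target list of per-device slices into a per-device list of target tuples, so each device receives all of its training targets together. A plain-Python sketch with hypothetical placeholder values:

# Two targets, each already split across two devices by split_and_load.
t1 = ['t1@gpu0', 't1@gpu1']
t2 = ['t2@gpu0', 't2@gpu1']
targets = list(zip(t1, t2))
print(targets)  # [('t1@gpu0', 't2@gpu0'), ('t1@gpu1', 't2@gpu1')]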
Example #5
def train_batch_fn(data, ctx):
    """split and load data in GPU"""
    template = split_and_load(data[0], ctx_list=ctx, batch_axis=0)
    search = split_and_load(data[1], ctx_list=ctx, batch_axis=0)
    label_cls = split_and_load(data[2], ctx_list=ctx, batch_axis=0)
    label_loc = split_and_load(data[3], ctx_list=ctx, batch_axis=0)
    label_loc_weight = split_and_load(data[4], ctx_list=ctx, batch_axis=0)
    return template, search, label_cls, label_loc, label_loc_weight
Example #6
def batch_fn(batch, ctx):
    data = split_and_load(batch[0],
                          ctx_list=ctx,
                          batch_axis=0,
                          even_split=False)
    label = split_and_load(batch[1],
                           ctx_list=ctx,
                           batch_axis=0,
                           even_split=False)
    return data, label
Example #7
def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    num_test_iter = len(val_data)
    val_loss_epoch = 0
    for i, batch in enumerate(val_data):
        data_bgs = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        data_fgs = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
        val_outputs = []
        for _, (X_bgs, X_fgs) in enumerate(zip(data_bgs, data_fgs)):
            #print('X_bgs',X_bgs.shape) # (10, 8, 3, 224, 224)
            #print('X_fgs',X_fgs.shape) # (10, 8, 3, 224, 224)
            X_bgs = X_bgs.reshape((-1, ) + X_bgs.shape[2:])
            X_fgs = X_fgs.reshape((-1, ) + X_fgs.shape[2:])
            #print('X_bgs',X_bgs.shape) #(80, 3, 224, 224)
            #print('X_fgs',X_fgs.shape) #(80, 3, 224, 224)
            x_bgs = net_bgs(X_bgs)
            x_fgs = net_fgs(X_fgs)
            if opt.fusion_method == 'avg':
                x = nd.stack(x_bgs, x_fgs)
                x = nd.mean(x, axis=0)
            elif opt.fusion_method == 'max':
                x = nd.stack(x_bgs, x_fgs)
                x = nd.max(x, axis=0)
            elif opt.fusion_method == 'bgs':
                x = x_bgs
            elif opt.fusion_method == 'fgs':
                x = x_fgs
            else:
                raise ValueError('fusion_method not supported')
            pred = x
            #pred = net(X_bgs,X_fgs)
            val_outputs.append(pred)

        loss = [L(yhat, y) for yhat, y in zip(val_outputs, label)]

        acc_top1.update(label, val_outputs)
        acc_top5.update(label, val_outputs)

        val_loss_epoch += sum([l.mean().asscalar() for l in loss]) / len(loss)

    _, top1 = acc_top1.get()
    _, top5 = acc_top5.get()
    val_loss = val_loss_epoch / num_test_iter

    return (top1, top5, val_loss)
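Example #7 fuses the two stream outputs by stacking them on a new axis and reducing over it. A minimal sketch of the 'avg' and 'max' fusion branches on dummy predictions:

import mxnet as mx

p_bgs = mx.nd.array([[0.2, 0.8]])
p_fgs = mx.nd.array([[0.6, 0.4]])
stacked = mx.nd.stack(p_bgs, p_fgs)   # shape (2, 1, 2)
print(mx.nd.mean(stacked, axis=0))    # 'avg' fusion -> [[0.4, 0.6]]
print(mx.nd.max(stacked, axis=0))     # 'max' fusion -> [[0.6, 0.8]]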
Example #8
def _validation(self):
    """validation"""
    cudnn_auto_tune(False)
    tbar = tqdm(self.val_iter)
    for i, (data, label) in enumerate(tbar):
        gpu_datas = split_and_load(data=data,
                                   ctx_list=self.ctx,
                                   even_split=False)
        gpu_labels = split_and_load(data=label,
                                    ctx_list=self.ctx,
                                    even_split=False)
        for gpu_data, gpu_label in zip(gpu_datas, gpu_labels):
            self.metric.update(gpu_label, self.net.evaluate(gpu_data))
        tbar.set_description('pixAcc: %.4f, mIoU: %.4f' %
                             (self.metric.get()))
    pixel_acc, mean_iou = self.metric.get()
    self.metric.reset()
    cudnn_auto_tune(True)
    return pixel_acc, mean_iou
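The per-device metric updates in Example #8 work because Gluon metrics accumulate across calls, so updating once per device slice is equivalent to updating on the whole batch. A small sketch with hand-made labels and predictions:

import mxnet as mx

acc = mx.metric.Accuracy()
# Labels and predictions as two per-device slices.
labels = [mx.nd.array([0, 1]), mx.nd.array([1, 0])]
preds = [mx.nd.array([[0.9, 0.1], [0.2, 0.8]]),
         mx.nd.array([[0.3, 0.7], [0.6, 0.4]])]
for y, p in zip(labels, preds):
    acc.update(y, p)
print(acc.get())  # ('accuracy', 1.0)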
Example #9
    def evaluate_batch(self, estimator,
                       val_batch,
                       batch_axis=0):
        """Evaluate the estimator model on a batch of validation data.

        Parameters
        ----------
        estimator : Estimator
            Reference to the estimator
        val_batch : tuple
            Data and label of a batch from the validation data loader.
        batch_axis : int, default 0
            Batch axis to split the validation data into devices.
        """
        data = split_and_load(val_batch[0], ctx_list=estimator.context, batch_axis=batch_axis, even_split=False)
        label = split_and_load(val_batch[1], ctx_list=estimator.context, batch_axis=batch_axis, even_split=False)
        mx.nd.waitall()
        det_bboxes = []
        det_ids = []
        det_scores = []
        gt_bboxes = []
        gt_ids = []
        gt_difficults = []
        for x, y in zip(data, label):
            # get prediction results
            with autograd.predict_mode():
                ids, scores, bboxes = estimator.val_net(x)
            det_ids.append(ids.copy().asnumpy())
            det_scores.append(scores.copy().asnumpy())
            # clip to image size
            det_bboxes.append(bboxes.clip(0, val_batch[0].shape[2]).copy().asnumpy())
            # split ground truths
            gt_ids.append(y.slice_axis(axis=-1, begin=4, end=5).copy().asnumpy())
            gt_bboxes.append(y.slice_axis(axis=-1, begin=0, end=4).copy().asnumpy())
            gt_difficults.append(y.slice_axis(axis=-1, begin=5, end=6).copy().asnumpy() if y.shape[-1] > 5 else None)
        # pred = [estimator.val_net(x) for x in data]
        # loss = [estimator.val_loss(y_hat, y) for y_hat, y in zip(pred, label)]
        pred = (det_bboxes, det_ids, det_scores)
        label = (gt_bboxes, gt_ids, gt_difficults)

        return data, label, pred, 0.
Example #10
def fit(run, ctx, log_interval=5, no_val=False, logger=None):
    net = FitFactory.get_model(wandb.config, ctx)
    train_iter, num_train = FitFactory.data_iter(
        wandb.config.data_name,
        wandb.config.bs_train,
        root=get_dataset_info(wandb.config.data_name)[0],
        split='train',  # sometimes would be 'trainval'
        mode='train',
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    val_iter, num_valid = FitFactory.data_iter(
        wandb.config.data_name,
        wandb.config.bs_val,
        shuffle=False,
        last_batch='keep',
        root=get_dataset_info(wandb.config.data_name)[0],
        split='val',
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    criterion = FitFactory.get_criterion(
        wandb.config.aux,
        wandb.config.aux_weight,
        # focal_kwargs={'alpha': 1.0, 'gamma': 0.5},
        # sensitive_kwargs={
        #     'nclass': get_dataset_info(wandb.config.data_name)[1],
        #     'alpha': 1.0,
        #     'gamma': 1.0}
    )
    trainer = FitFactory.create_trainer(net,
                                        wandb.config,
                                        iters_per_epoch=len(train_iter))
    metric = SegmentationMetric(
        nclass=get_dataset_info(wandb.config.data_name)[1])

    wandb.config.num_train = num_train
    wandb.config.num_valid = num_valid

    t_start = get_strftime()
    logger.info(f'Training start: {t_start}')
    for k, v in wandb.config.items():
        logger.info(f'{k}: {v}')
    logger.info('-----> end hyper-parameters <-----')
    wandb.config.start_time = t_start  # reuse the logged start time

    best_score = .0
    for epoch in range(wandb.config.epochs):
        train_loss = .0
        tbar = tqdm(train_iter)
        for i, (data, target) in enumerate(tbar):
            gpu_datas = split_and_load(data, ctx_list=ctx)
            gpu_targets = split_and_load(target, ctx_list=ctx)
            with autograd.record():
                loss_gpus = [
                    criterion(*net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)
                ]
            for loss in loss_gpus:
                autograd.backward(loss)
            trainer.step(wandb.config.bs_train)
            nd.waitall()
            loss_temp = .0  # sum up all sample loss
            for loss in loss_gpus:
                loss_temp += loss.sum().asscalar()
            train_loss += (loss_temp / wandb.config.bs_train)
            tbar.set_description('Epoch %d, training loss %.5f' %
                                 (epoch, train_loss / (i + 1)))
            if (i % log_interval == 0) or (i + 1 == len(train_iter)):
                wandb.log({
                    f'train_loss_batch, interval={log_interval}':
                    train_loss / (i + 1)
                })
        wandb.log({
            'train_loss_epoch': train_loss / len(train_iter),
            'custom_step': epoch
        })

        if not no_val:
            cudnn_auto_tune(False)
            val_loss = .0
            vbar = tqdm(val_iter)
            for i, (data, target) in enumerate(vbar):
                gpu_datas = split_and_load(data=data,
                                           ctx_list=ctx,
                                           even_split=False)
                gpu_targets = split_and_load(data=target,
                                             ctx_list=ctx,
                                             even_split=False)
                loss_temp = .0
                for gpu_data, gpu_target in zip(gpu_datas, gpu_targets):
                    loss_gpu = criterion(*net(gpu_data), gpu_target)
                    loss_temp += loss_gpu.sum().asscalar()
                    metric.update(gpu_target, net.evaluate(gpu_data))
                vbar.set_description('Epoch %d, val PA %.4f, mIoU %.4f' %
                                     (epoch, metric.get()[0], metric.get()[1]))
                val_loss += (loss_temp / wandb.config.bs_val)
                nd.waitall()
            pix_acc, mean_iou = metric.get()
            wandb.log({
                'val_PA': pix_acc,
                'val_mIoU': mean_iou,
                'val_loss': val_loss / len(val_iter)
            })
            metric.reset()
            if mean_iou > best_score:
                save_checkpoint(model=net,
                                model_name=wandb.config.model_name.lower(),
                                backbone=wandb.config.backbone.lower(),
                                data_name=wandb.config.data_name.lower(),
                                time_stamp=wandb.config.start_time,
                                is_best=True)
                best_score = mean_iou
            cudnn_auto_tune(True)

    save_checkpoint(model=net,
                    model_name=wandb.config.model_name.lower(),
                    backbone=wandb.config.backbone.lower(),
                    data_name=wandb.config.data_name.lower(),
                    time_stamp=wandb.config.start_time,
                    is_best=False)

    run.finish()
Example #11
lr_decay_count = 0

for epoch in range(epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0

    # Learning rate decay
    if epoch == lr_decay_epoch[lr_decay_count]:
        trainer.set_learning_rate(trainer.learning_rate*lr_decay)
        lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
        # Extract data and label
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

        # AutoGrad
        with ag.record():
            output = []
            for _, X in enumerate(data):
                X = X.reshape((-1,) + X.shape[2:])
                pred = net(X)
                output.append(pred)
            loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

        # Backpropagation
        for l in loss:
            l.backward()
Example #12
def train(cfg,
          ctx_lst,
          project_name,
          log_interval=5,
          no_val=False,
          lr=None,
          wd=None):
    wandb.init(job_type='train',
               dir=my_tools.root_dir(),
               config=cfg,
               project=project_name)
    if lr and wd:
        wandb.config.lr = lr
        wandb.config.wd = wd

    ctx = my_tools.get_contexts(ctx_lst)
    wandb.config.ctx = ctx

    data_factory = DataFactory(wandb.config.data_name)
    model_factory = ModelFactory(wandb.config.model_name)

    norm_layer, norm_kwargs = my_tools.get_norm_layer(wandb.config.norm,
                                                      len(ctx))
    model_kwargs = {
        'nclass': data_factory.num_class,
        'backbone': wandb.config.backbone,
        'pretrained_base': wandb.config.backbone_init.get('manner') == 'cls',
        'aux': wandb.config.aux,
        'crop_size': wandb.config.crop_size,
        'base_size': wandb.config.base_size,
        'dilate': wandb.config.dilate,
        'norm_layer': norm_layer,
        'norm_kwargs': norm_kwargs,
    }
    net = model_factory.get_model(
        model_kwargs,
        resume=wandb.config.resume,
        lr_mult=wandb.config.lr_mult,
        backbone_init_manner=wandb.config.backbone_init.get('manner'),
        backbone_ckpt=wandb.config.backbone_init.get('backbone_ckpt'),
        prior_classes=wandb.config.backbone_init.get('prior_classes'),
        ctx=ctx)
    if net.symbolize:
        net.hybridize()

    num_worker = 0 if platform.system() == 'Windows' else 16
    train_set = data_factory.seg_dataset(
        split='train',  # sometimes would be 'trainval'
        mode='train',
        transform=my_tools.image_transform(),
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    train_iter = DataLoader(train_set,
                            wandb.config.bs_train,
                            shuffle=True,
                            last_batch='discard',
                            num_workers=num_worker)
    val_set = data_factory.seg_dataset(split='val',
                                       mode='val',
                                       transform=my_tools.image_transform(),
                                       base_size=wandb.config.base_size,
                                       crop_size=wandb.config.crop_size)
    val_iter = DataLoader(val_set,
                          wandb.config.bs_val,
                          shuffle=False,
                          last_batch='keep',
                          num_workers=num_worker)
    wandb.config.num_train = len(train_set)
    wandb.config.num_valid = len(val_set)

    criterion = _get_criterion(wandb.config.aux, wandb.config.aux_weight)
    criterion.initialize(ctx=ctx)
    wandb.config.criterion = type(criterion)

    if wandb.config.optimizer == 'adam':
        trainer = Trainer(net.collect_params(),
                          'adam',
                          optimizer_params={
                              'learning_rate': wandb.config.lr,
                              'wd': wandb.config.wd,
                              'beta1': wandb.config.adam.get('adam_beta1'),
                              'beta2': wandb.config.adam.get('adam_beta2')
                          })
    elif wandb.config.optimizer in ('sgd', 'nag'):
        scheduler = _lr_scheduler(
            mode=wandb.config.lr_scheduler,
            base_lr=wandb.config.lr,
            target_lr=wandb.config.target_lr,
            nepochs=wandb.config.epochs,
            iters_per_epoch=len(train_iter),
            step_epoch=wandb.config.step.get('step_epoch'),
            step_factor=wandb.config.step.get('step_factor'),
            power=wandb.config.poly.get('power'))
        trainer = Trainer(net.collect_params(),
                          wandb.config.optimizer,
                          optimizer_params={
                              'lr_scheduler': scheduler,
                              'wd': wandb.config.wd,
                              'momentum': wandb.config.momentum,
                              'multi_precision': True
                          })
    else:
        raise RuntimeError(f"Unknown optimizer: {wandb.config.optimizer}")

    metric = SegmentationMetric(data_factory.num_class)

    logger = get_logger(name='train', level=10)
    t_start = my_tools.get_strftime()
    logger.info(f'Training start: {t_start}')
    for k, v in wandb.config.items():
        logger.info(f'{k}: {v}')
    logger.info('-----> end hyper-parameters <-----')
    wandb.config.start_time = t_start

    best_score = .0
    best_epoch = 0
    for epoch in range(wandb.config.epochs):
        train_loss = .0
        tbar = tqdm(train_iter)
        for i, (data, target) in enumerate(tbar):
            gpu_datas = split_and_load(data, ctx_list=ctx)
            gpu_targets = split_and_load(target, ctx_list=ctx)
            with autograd.record():
                loss_gpus = [
                    criterion(*net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)
                ]
            for loss in loss_gpus:
                autograd.backward(loss)
            trainer.step(wandb.config.bs_train)
            nd.waitall()
            train_loss += sum([loss.mean().asscalar()
                               for loss in loss_gpus]) / len(loss_gpus)
            tbar.set_description(
                'Epoch-%d [training], loss %.5f, %s' %
                (epoch, train_loss /
                 (i + 1), my_tools.get_strftime('%Y-%m-%d %H:%M:%S')))
            if (i % log_interval == 0) or (i + 1 == len(train_iter)):
                wandb.log({
                    f'train_loss_batch, interval={log_interval}':
                    train_loss / (i + 1)
                })

        wandb.log({
            'train_loss_epoch': train_loss / (len(train_iter)),
            'custom_step': epoch
        })

        if not no_val:
            val_loss = .0
            vbar = tqdm(val_iter)
            for i, (data, target) in enumerate(vbar):
                gpu_datas = split_and_load(data=data,
                                           ctx_list=ctx,
                                           even_split=False)
                gpu_targets = split_and_load(data=target,
                                             ctx_list=ctx,
                                             even_split=False)
                loss_gpus = []
                for gpu_data, gpu_target in zip(gpu_datas, gpu_targets):
                    gpu_output = net(gpu_data)
                    loss_gpus.append(criterion(*gpu_output, gpu_target))
                    metric.update(gpu_target, gpu_output[0])
                val_loss += sum([loss.mean().asscalar()
                                 for loss in loss_gpus]) / len(loss_gpus)
                vbar.set_description(
                    'Epoch-%d [validation], PA %.4f, mIoU %.4f' %
                    (epoch, metric.get()[0], metric.get()[1]))
                nd.waitall()
            pix_acc, mean_iou = metric.get()
            wandb.log({
                'val_PA': pix_acc,
                'val_mIoU': mean_iou,
                'val_loss': val_loss / len(val_iter),
                'custom_step': epoch
            })
            metric.reset()
            if mean_iou > best_score:
                my_tools.save_checkpoint(
                    model=net,
                    model_name=wandb.config.model_name.lower(),
                    backbone=wandb.config.backbone.lower(),
                    data_name=wandb.config.data_name.lower(),
                    time_stamp=wandb.config.start_time,
                    is_best=True)
                best_score = mean_iou
                best_epoch = epoch

    logger.info(
        f'Best val mIoU={round(best_score * 100, 2)} at epoch: {best_epoch}')
    wandb.config.best_epoch = best_epoch
    my_tools.save_checkpoint(model=net,
                             model_name=wandb.config.model_name.lower(),
                             backbone=wandb.config.backbone.lower(),
                             data_name=wandb.config.data_name.lower(),
                             time_stamp=wandb.config.start_time,
                             is_best=False)
Example #13
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment
    ctx = [mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()]
    # retrieve the hyperparameters we set in notebook (with some defaults)

    # number of training examples utilized in one iteration
    batch_size = args.batch_size
    # number of times the entire dataset is passed forward and backward through the network
    epochs = args.epochs
    # step size at each iteration while moving toward a minimum of the loss function
    learning_rate = args.learning_rate
    # momentum remembers the update Δw at each iteration and determines the next update
    # as a linear combination of the gradient and the previous update
    momentum = args.momentum
    # optimizers adjust the attributes of the network, such as weights and learning rate, to reduce the loss
    optimizer = args.optimizer
    # weight decay: after each update, the weights are multiplied by a factor slightly less than 1
    wd = args.wd
    optimizer_params = {
        'learning_rate': learning_rate,
        'wd': wd,
        'momentum': momentum
    }
    log_interval = args.log_interval

    # In this example, we use an Inflated 3D model (I3D) with a ResNet50 backbone
    # trained on the Kinetics400 dataset, and replace the last classification
    # (dense) layer to match the number of classes in the dataset.
    model_name = args.network
    # number of classes in the dataset
    nclass = 2
    # number of workers for the data loader
    num_workers = 1

    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    CHECKPOINTS_DIR = '/opt/ml/checkpoints'
    checkpoints_enabled = os.path.exists(CHECKPOINTS_DIR)

    data_dir = args.train
    segments = 'rawframes'
    train = 'annotations/{}_train_list_rawframes.txt'.format(args.task)

    # load the data with the data loader
    train_data = load_data(data_dir, batch_size, num_workers, segments, train)
    # define the network
    net = define_network(ctx, model_name, nclass)
    # define the gluon trainer
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    # define the loss function
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    # define the training metric
    train_metric = mx.metric.Accuracy()
    train_history = TrainingHistory(['training-acc'])
    net.hybridize()
    # learning rate decay hyperparameters
    lr_decay_count = 0
    lr_decay = 0.1
    lr_decay_epoch = [40, 80, 100]
    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        train_loss = 0

        # Learning rate decay
        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
            lr_decay_count += 1

        # Loop through each batch of training data
        for i, batch in enumerate(train_data):
            # Extract data and label
            data = split_and_load(batch[0],
                                  ctx_list=ctx,
                                  batch_axis=0,
                                  even_split=False)
            label = split_and_load(batch[1],
                                   ctx_list=ctx,
                                   batch_axis=0,
                                   even_split=False)

            # AutoGrad
            with ag.record():
                output = []
                for _, X in enumerate(data):
                    X = X.reshape((-1, ) + X.shape[2:])
                    pred = net(X)
                    output.append(pred)
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

            # Backpropagation
            for l in loss:
                l.backward()

            # Optimize
            trainer.step(batch_size)

            # Update metrics
            train_loss += sum([l.mean().asscalar() for l in loss])
            train_metric.update(label, output)

            # stop early after 100 batches per epoch
            if i == 100:
                break

        name, acc = train_metric.get()

        # Update history and print metrics
        train_history.update([acc])
        print('[Epoch %d] train=%f loss=%f time: %f' %
              (epoch, acc, train_loss / (i + 1), time.time() - tic))

    print('saving the model')
    save(net, model_dir)
Example #14
for epoch in range(opt.resume_epoch, opt.epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0
    btic = time.time()

    # Learning rate decay
    #    if epoch == lr_decay_epoch[lr_decay_count]:
    #        trainer.set_learning_rate(trainer.learning_rate*lr_decay)
    #        lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
        # Extract data and label
        data_bgs = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        data_fgs = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
        # AutoGrad
        with ag.record():
            output = []
            for _, (X_bgs, X_fgs) in enumerate(zip(data_bgs, data_fgs)):
                if opt.reshape_type == 'tsn':
                    X_bgs = X_bgs.reshape((-1, ) + X_bgs.shape[2:])
                    X_fgs = X_fgs.reshape((-1, ) + X_fgs.shape[2:])
                elif opt.reshape_type == 'c3d' or '3d' in opt.model:
                    X_bgs = nd.transpose(data=X_bgs, axes=(0, 2, 1, 3, 4))
                    X_fgs = nd.transpose(data=X_fgs, axes=(0, 2, 1, 3, 4))
                    #X = nd.transpose(data=X,axes=(0,2,1,3,4))
                elif opt.new_length != 1 and opt.reshape_type == 'tsn_newlength':
                    #X = X.reshape((-3,-3,-2))
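The 'c3d'/'3d' branch in Example #14 moves the channel axis in front of the temporal axis, since 3-D CNNs in Gluon expect channel-first (NCTHW) input. A quick shape check on a dummy clip batch:

import mxnet as mx

x = mx.nd.zeros((2, 8, 3, 224, 224))            # (N, T, C, H, W)
x3d = mx.nd.transpose(x, axes=(0, 2, 1, 3, 4))  # -> (N, C, T, H, W)
print(x3d.shape)  # (2, 3, 8, 224, 224)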
Example #15
for epoch in range(epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0

    # Learning rate decay
    if epoch == lr_decay_epoch[lr_decay_count]:
        trainer.set_learning_rate(trainer.learning_rate * lr_decay)
        lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
        # Extract data and label
        data = split_and_load(batch[0],
                              ctx_list=ctx,
                              batch_axis=0,
                              multiplier=3)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

        # AutoGrad
        with ag.record():
            output = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

        # Backpropagation
        for l in loss:
            l.backward()

        # Optimize
        trainer.step(batch_size)
Example #16
def _get_data_and_label(self, batch, ctx, batch_axis=0):
    data = batch[0]
    label = batch[1]
    data = split_and_load(data, ctx_list=ctx, batch_axis=batch_axis)
    label = split_and_load(label, ctx_list=ctx, batch_axis=batch_axis)
    return data, label
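Several validation loops above pass even_split=False so that a last, smaller batch can still be distributed. A sketch of the resulting uneven slices, assuming mxnet.gluon.utils.split_and_load:

import mxnet as mx
from mxnet.gluon.utils import split_and_load

ctx = [mx.cpu(0), mx.cpu(1)]
batch = mx.nd.arange(14).reshape((7, 2))  # 7 samples cannot be split evenly
parts = split_and_load(batch, ctx_list=ctx, batch_axis=0, even_split=False)
print([p.shape for p in parts])  # [(3, 2), (4, 2)]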
Example #17
    def fit(self):
        last_miou = .0  # record the best validation mIoU
        loss_step = 0  # step count
        for epoch in range(self.conf.epochs):
            train_loss = .0
            start = time.time()
            for i, (data, target) in enumerate(self.train_iter):
                gpu_datas = split_and_load(data, ctx_list=self.ctx)
                gpu_targets = split_and_load(target, ctx_list=self.ctx)
                with autograd.record():
                    loss_gpu = [
                        self.criterion(*self.net(gpu_data), gpu_target)
                        for gpu_data, gpu_target in zip(
                            gpu_datas, gpu_targets)
                    ]
                for loss in loss_gpu:
                    autograd.backward(loss)
                self.trainer.step(self.conf.bs_train)
                nd.waitall()
                loss_temp = .0
                for losses in loss_gpu:
                    loss_temp += losses.sum().asscalar()
                train_loss += (loss_temp / self.conf.bs_train)
                # log every n batch
                # add loss to draw curve, train_loss <class numpy.float64>
                interval = 5 if loss_step < 5000 else 50
                if (i % interval == 0) or (i + 1 == len(self.train_iter)):
                    fitlog.add_loss(name='loss',
                                    value=round(train_loss / (i + 1), 5),
                                    step=loss_step)
                    loss_step += 1
                    self.logger.info(
                        "Epoch %d, batch %d, training loss %.5f." %
                        (epoch, i, train_loss / (i + 1)))
            # log each epoch
            self.logger.info(
                ">>>>>> Epoch %d complete, time cost: %.1f sec. <<<<<<" %
                (epoch, time.time() - start))
            # validation each epoch
            if self.val:
                pixel_acc, mean_iou = self._validation()
                self.logger.info(
                    "Epoch %d validation, PixelAccuracy: %.4f, mIoU: %.4f." %
                    (epoch, pixel_acc, mean_iou))
                fitlog.add_metric(value=mean_iou, step=epoch, name='mIoU')
                fitlog.add_metric(value=pixel_acc, step=epoch, name='PA')
                if mean_iou > last_miou:
                    f_name = self._save_model(tag='best')
                    self.logger.info(
                        "Epoch %d mIoU: %.4f > %.4f(previous), save model: %s"
                        % (epoch, mean_iou, last_miou, f_name))
                    last_miou = mean_iou

        # save the final-epoch params
        f_name = self._save_model(tag='last')
        self.logger.info(">>>>>> Training complete, save model: %s. <<<<<<" %
                         f_name)
        # record
        fitlog.add_best_metric(value=round(last_miou, 4), name='mIoU')
        fitlog.add_other(value=self.id, name='record_id')
        fitlog.add_other(value=self.num_train, name='train')
        fitlog.add_other(value=self.num_val, name='val')