Ejemplo n.º 1
0
    def __init__(self, args):
        """Create Momentum solvers for the network parameters.

        Convolution weight kernels (parameter names ending in "conv/W")
        are held in a separate solver so that weight decay can later be
        applied to them only.

        Args:
            args: parsed command-line arguments; must provide
                ``learning_rate``, ``momentum``, ``batch_size`` and
                ``accum_times``.
        """
        # Fetch the parameter dict once instead of calling
        # nn.get_parameters() twice, then partition it by name.
        params = nn.get_parameters()
        param_convweights = {
            k: v for k, v in params.items() if k.endswith("conv/W")
        }
        param_others = {
            k: v for k, v in params.items() if not k.endswith("conv/W")
        }

        # Create solvers (only Convolution kernels require weight decay).
        convweights = S.Momentum(args.learning_rate, args.momentum)
        others = S.Momentum(args.learning_rate, args.momentum)
        convweights.set_parameters(param_convweights)
        others.set_parameters(param_others)

        # Init parameter gradients.
        convweights.zero_grad()
        others.zero_grad()

        # Set attributes.
        self.convweights = convweights
        self.others = others
        self.args = args
        # Effective batch size when gradients are accumulated over
        # accum_times sub-batches.
        self.batch_size = args.batch_size * args.accum_times
        self.rate = args.accum_times
        self.count = 0
Ejemplo n.º 2
0
def sample_arch_and_train(args, data_dict, controller_weights_dict):
    """
        Execute these processes:
        1. For a certain number of times, let the controller construct sample
           architectures and test their performances.
           (By calling get_sample_and_feedback.)
        2. By using the performances acquired by the previous process, train
           the controller.
        3. Select the one architecture with the best validation accuracy and
           train its parameters.

        Returns:
            (sample_arch, val_acc): the best sampled architecture and its
            validation accuracy after training.
    """

    solver = S.Momentum(args.control_lr)  # create solver for the controller
    # retain_state=True keeps momentum buffers across repeated calls.
    solver.set_parameters(controller_weights_dict,
                          reset=False,
                          retain_state=True)
    solver.zero_grad()

    val_list = list()
    arch_list = list()

    # auto_forward: the controller graph is executed eagerly while sampling.
    with nn.auto_forward():
        for c in range(args.num_candidate):
            output_line = " Architecture {} / {} ".format((c + 1),
                                                          args.num_candidate)
            print("{0:-^80s}".format(output_line))

            # sample one architecture and get its feedback for RL as loss
            loss, val_acc, sample_arch = get_sample_and_feedback(
                args, data_dict)

            val_list.append(val_acc)
            arch_list.append(sample_arch)
            loss.backward()  # accumulate gradient each time

        print("{0:-^80s}\n".format(" Reinforcement Learning Phase "))
        # NOTE(review): this prints the loss of the LAST candidate only;
        # the gradients, however, are accumulated over all candidates.
        print("current accumulated loss:", loss.d)

        # NOTE: weight decay coefficient is hard-coded here.
        solver.weight_decay(0.025)
        solver.update()  # train the controller

        print("\n{0:-^80s}\n".format(" CNN Learning Phase "))
        # Pick the candidate with the highest validation accuracy.
        best_idx = np.argmax(val_list)
        sample_arch = arch_list[best_idx]
        print("Train the model whose architecture is:")
        show_arch(sample_arch)
        print("and its accuracy is: {:.2f} %\n".format(100 * np.max(val_list)))
        print("Learnable Parameters:", params_count(nn.get_parameters()))

    # train a child network which achieves the best validation accuracy.
    val_acc = CNN_run(args, sample_arch, data_dict, with_train=True)

    return sample_arch, val_acc
Ejemplo n.º 3
0
def train():
    """
    Main script: train a ResNet classifier on Tiny ImageNet.

    Builds train/validation graphs, runs a gradient-accumulation training
    loop with periodic validation, monitoring, parameter saving, and
    step-wise learning-rate decay.
    """

    args = get_args()

    # Get context.
    from nnabla.contrib.context import extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Dataset
    # We use Tiny ImageNet from Stanford CS231N class.
    # https://tiny-imagenet.herokuapp.com/
    # Tiny ImageNet consists of 200 categories, each category has 500 images
    # in training set. The image size is 64x64. To adapt ResNet into 64x64
    # image inputs, the input image size of ResNet is set as 56x56, and
    # the stride in the first conv and the first max pooling are removed.
    data = data_iterator_tiny_imagenet(args.batch_size, 'train')
    vdata = data_iterator_tiny_imagenet(args.batch_size, 'val')

    num_classes = 200
    tiny = True  # TODO: Switch ILSVRC2012 dataset and TinyImageNet.
    t_model = get_model(
        args, num_classes, test=False, tiny=tiny)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    v_model = get_model(
        args, num_classes, test=True, tiny=tiny)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward

    # Create Solver.
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=10)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)

    # Training loop.
    for i in range(args.max_iter):
        # Save parameters
        if i % args.model_save_interval == 0:
            nn.save_parameters(os.path.join(
                args.model_save_path, 'param_%06d.h5' % i))

        # Validation (note: also runs at i == 0, before any training step).
        if i % args.val_interval == 0:

            # Clear all intermediate memory to save memory.
            # t_model.loss.clear_recursive()

            l = 0.0
            e = 0.0
            for j in range(args.val_iter):
                images, labels = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                # Cast inputs on device; uint8 images keep memory small.
                v_model.image.data.cast(np.uint8, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                l += v_model.loss.d
                e += categorical_error(v_model.pred.d, v_model.label.d)
            monitor_vloss.add(i, l / args.val_iter)
            monitor_verr.add(i, e / args.val_iter)

            # Clear all intermediate memory to save memory.
            # v_model.loss.clear_recursive()

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        # Gradient accumulation loop: backward() adds into existing grads,
        # so accum_grad sub-batches form one effective large batch.
        for j in range(args.accum_grad):
            images, labels = data.next()
            t_model.image.d = images
            t_model.label.d = labels
            t_model.image.data.cast(np.uint8, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            l += t_model.loss.d
            e += categorical_error(t_model.pred.d, t_model.label.d)
        solver.weight_decay(args.weight_decay)
        solver.update()
        monitor_loss.add(i, l / args.accum_grad)
        monitor_err.add(i, e / args.accum_grad)
        monitor_time.add(i)

        # Learning rate decay at scheduled iter
        if i in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)
    nn.save_parameters(os.path.join(args.model_save_path,
                                    'param_%06d.h5' % args.max_iter))
Ejemplo n.º 4
0
def CNN_run(args, both_archs, data_dict, with_train=False, after_search=False):
    """
    Construct the CNN described by ``both_archs`` and evaluate it on the
    validation set; optionally train it first.

    Args:
        args: parsed command-line arguments.
        both_archs: architecture description forwarded to
            construct_architecture.
        data_dict: dict providing "train_data", "valid_data" (each a
            (iterator, mean, std) tuple) and "basic_info"
            (channels, height, width, num_class).
        with_train: if True, train the network before the final validation.
        after_search: if True, this is the final retraining after the
            architecture search (requires with_train=True); enables extra
            filters, retrain-specific hyperparameters and checkpointing.

    Returns:
        Validation accuracy, i.e. 1.0 - error rate.
    """

    num_cells = args.num_cells
    num_nodes = args.num_nodes

    if after_search:
        assert with_train is True, "when you train the network after architecture search, set with_train=True"
    tdata, mean_val_train, std_val_train = data_dict["train_data"]
    vdata, mean_val_valid, std_val_valid = data_dict["valid_data"]
    channels, image_height, image_width, num_class = data_dict["basic_info"]
    batch_size = args.batch_size

    output_filter = args.output_filter

    if with_train:
        if after_search:
            num_epoch = args.epoch_on_retrain
            if args.additional_filters_on_retrain > 0:
                output_filter += args.additional_filters_on_retrain
        else:
            num_epoch = args.epoch_per_search

        one_epoch = tdata.size // batch_size
        max_iter = num_epoch * one_epoch

    val_iter = args.val_iter

    monitor_path = args.monitor_path
    model_save_path = args.monitor_path
    decay_rate = args.weight_decay
    initial_lr = args.child_lr

    # NOTE(review): this local is never read below; the save condition at
    # the checkpointing step uses args.model_save_interval directly.
    model_save_interval = args.model_save_interval

    # Validation graph (shared by in-training and final validation).
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    input_image_valid = {"image": image_valid}

    vdata._reset()  # rewind data

    test = True
    pred_valid, _, _ = construct_architecture(image_valid, num_class, num_cells, num_nodes,
                                              both_archs, output_filter, test)

    if with_train:
        if after_search:
            # setting for training after architecture search
            with_grad_clip = args.with_grad_clip_on_retrain
            grad_clip = args.grad_clip_value
            lr_control = args.lr_control_on_retrain
        else:
            with_grad_clip = args.with_grad_clip_on_search
            grad_clip = args.grad_clip_value
            lr_control = args.lr_control_on_search

        # prepare variables used for training
        image_train = nn.Variable(
            (batch_size, channels, image_height, image_width))
        label_train = nn.Variable((batch_size, 1))
        input_image_train = {"image": image_train, "label": label_train}

        tdata._reset()  # rewind data

        test = False
        pred_train, aux_logits, used_weights = construct_architecture(image_train, num_class, num_cells, num_nodes,
                                                                      both_archs, output_filter, test)
        loss_train = loss_function(pred_train, aux_logits, label_train)

        # Only the weights actually used by this architecture are trained.
        used_weights_dict = {key_name: nn.get_parameters(
        )[key_name] for key_name in used_weights}

        # Create monitor.
        monitor = Monitor(monitor_path)
        monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
        # modified to display accuracy.
        monitor_err = MonitorSeries("Training accuracy", monitor, interval=100)
        # modified to display accuracy.
        monitor_verr = MonitorSeries("Test accuracy", monitor, interval=1)

        # Solvers (retain_state keeps momentum across repeated calls).
        solver = S.Momentum(initial_lr)
        solver.set_parameters(
            used_weights_dict, reset=False, retain_state=True)

        # Training-loop
        for i in range(max_iter):
            if i > 0 and i % one_epoch == 0:
                # Validation during training.
                ve = 0.
                for j in range(val_iter):
                    image, label = vdata.next()
                    # Normalize with validation-set statistics.
                    image = image / 255.0
                    image = (image - mean_val_valid) / std_val_valid
                    input_image_valid["image"].d = image
                    pred_valid.forward()
                    ve += categorical_error(pred_valid.d, label)
                ve /= val_iter
                monitor_verr.add(i, 1.0 - ve)  # modified to display accuracy.

            if after_search and int(i % args.model_save_interval) == 0:
                nn.save_parameters(os.path.join(
                    args.model_save_path, 'params_%06d.h5' % i))

            # Forward/Zerograd/Backward
            image, label = tdata.next()
            image = image / 255.0
            image = (image - mean_val_train) / std_val_train
            input_image_train["image"].d = image
            input_image_train["label"].d = label
            loss_train.forward()

            if lr_control:
                new_lr = learning_rate_scheduler(i, max_iter, initial_lr, 0)
                solver.set_learning_rate(new_lr)

            solver.zero_grad()
            loss_train.backward()

            # Clip each used weight's gradient by norm if enabled.
            if with_grad_clip:
                for k, v in used_weights_dict.items():
                    if np.linalg.norm(v.g) > grad_clip:
                        v.grad.copy_from(F.clip_by_norm(v.grad, grad_clip))

            # Solvers update
            solver.weight_decay(decay_rate)
            solver.update()
            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(i, loss_train.d.copy())
            monitor_err.add(i, 1.0 - e)  # modified to display accuracy.

    # Validation (After training or when called for evaluation only)
    ve = 0.
    for j in range(val_iter):
        image, label = vdata.next()
        image = image / 255.0
        image = (image - mean_val_valid) / std_val_valid
        input_image_valid["image"].d = image
        pred_valid.forward()
        ve += categorical_error(pred_valid.d, label)
    ve /= val_iter

    if with_train:
        print("Validation Accuracy on Trained CNN:",
              '{:.2f}'.format(100*(1.0 - ve)), "%\n")

    if after_search:
        nn.save_parameters(os.path.join(
            args.model_save_path, 'params_%06d.h5' % (max_iter)))

    return 1.0 - ve
Ejemplo n.º 5
0
def train():
    """
    Main script: train a ResNet classifier on ImageNet or Tiny ImageNet.

    Supports resuming from a checkpoint, gradient accumulation, periodic
    validation/monitoring, step-wise learning-rate decay, and exporting
    NNP files before and after training.
    """

    args = get_args()

    # Get context.
    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = get_extension_context(extension_module,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    if args.tiny_mode:
        # We use Tiny ImageNet from Stanford CS231N class.
        # (Tiny ImageNet, https://tiny-imagenet.herokuapp.com/)
        # Tiny ImageNet consists of 200 categories, each category has 500 images
        # in training set. The image size is 64x64. To adapt ResNet into 64x64
        # image inputs, the input image size of ResNet is set as 56x56, and
        # the stride in the first conv and the first max pooling are removed.
        # Please check README.
        data = data_iterator_tiny_imagenet(args.batch_size, 'train')
        vdata = data_iterator_tiny_imagenet(args.batch_size, 'val')
        num_classes = 200
    else:
        # We use ImageNet.
        # (ImageNet, https://imagenet.herokuapp.com/)
        # ImageNet consists of 1000 categories, each category has 1280 images
        # in training set. The image size is various. To adapt ResNet into
        # 320x320 image inputs, the input image size of ResNet is set as
        # 224x224. We need to get tar file and create cache file(320x320 images).
        # Please check README.
        data = data_iterator_imagenet(args.batch_size,
                                      args.train_cachefile_dir)
        vdata = data_iterator_imagenet(args.batch_size, args.val_cachefile_dir)
        num_classes = 1000
    t_model = get_model(args, num_classes, test=False, tiny=args.tiny_mode)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward

    # TODO: need_grad should be passed to get_unlinked_variable after v1.0.3 fix.
    t_pred2 = t_model.pred.get_unlinked_variable()
    t_pred2.need_grad = False

    # Top-n error metric on the training batch (graph is unlinked so the
    # error computation does not backprop into the model).
    t_e = F.mean(F.top_n_error(t_pred2, t_model.label))
    v_model = get_model(args, num_classes, test=True, tiny=args.tiny_mode)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward

    # TODO: need_grad should be passed to get_unlinked_variable after v1.0.3 fix.
    v_pred2 = v_model.pred.get_unlinked_variable()
    v_pred2.need_grad = False

    v_e = F.mean(F.top_n_error(v_pred2, v_model.label))

    # Save_nnp_Epoch0
    contents = save_nnp({'x': v_model.image}, {'y': v_model.pred},
                        args.batch_size)
    save.save(os.path.join(args.model_save_path, 'Imagenet_result_epoch0.nnp'),
              contents)

    # Create Solver.
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    start_point = 0
    if args.checkpoint is not None:
        # load weights and solver state info from specified checkpoint file.
        start_point = load_checkpoint(args.checkpoint, solver)

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=10)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed("Validation time",
                                         monitor,
                                         interval=10)

    # Training loop.
    for i in range(start_point, args.max_iter):
        # Save parameters
        if i % args.model_save_interval == 0:
            # save checkpoint file
            save_checkpoint(args.model_save_path, i, solver)

        # Validation
        if i % args.val_interval == 0 and i != 0:

            # Clear all intermediate memory to save memory.
            # t_model.loss.clear_recursive()

            l = 0.0
            e = 0.0
            for j in range(args.val_iter):
                images, labels = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                # Cast inputs on device; uint8 images keep memory small.
                v_model.image.data.cast(np.uint8, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                l += v_model.loss.d
                e += v_e.d
            monitor_vloss.add(i, l / args.val_iter)
            monitor_verr.add(i, e / args.val_iter)
            monitor_vtime.add(i)

            # Clear all intermediate memory to save memory.
            # v_model.loss.clear_recursive()

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        # NOTE(review): this closure is re-defined on every iteration;
        # it could be hoisted above the loop without changing behavior.
        def accumulate_error(l, e, t_model, t_e):
            l += t_model.loss.d
            e += t_e.d
            return l, e

        # Gradient accumulation loop: backward() adds into existing grads,
        # so accum_grad sub-batches form one effective large batch.
        for j in range(args.accum_grad):
            images, labels = data.next()
            t_model.image.d = images
            t_model.label.d = labels
            t_model.image.data.cast(np.uint8, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)
            l, e = accumulate_error(l, e, t_model, t_e)

        solver.weight_decay(args.weight_decay)
        solver.update()

        monitor_loss.add(i, l / args.accum_grad)
        monitor_err.add(i, e / args.accum_grad)
        monitor_time.add(i)

        # Learning rate decay at scheduled iter
        if i in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)
    nn.save_parameters(
        os.path.join(args.model_save_path, 'param_%06d.h5' % args.max_iter))

    # Save_nnp
    contents = save_nnp({'x': v_model.image}, {'y': v_model.pred},
                        args.batch_size)
    save.save(os.path.join(args.model_save_path, 'Imagenet_result.nnp'),
              contents)
Ejemplo n.º 6
0
def CNN_run(args, ops, arch_dict):
    """
        Based on the given model architecture,
        construct CNN and execute training on CIFAR-10.
        input:
            args: arguments set by user.
            ops: operations used in the network.
            arch_dict: a dictionary containing architecture information.
    """

    data_iterator = data_iterator_cifar10
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch
    # CIFAR-10 test set has 10000 images.
    val_iter = 10000 // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}
    pred_valid, _ = construct_networks(args,
                                       ops,
                                       arch_dict,
                                       image_valid,
                                       test=True)
    loss_valid = loss_function(pred_valid, label_valid)

    # set dropout rate in advance (stored as a non-trainable parameter so
    # the schedule below can overwrite it each epoch)
    nn.parameter.get_parameter_or_create("drop_rate",
                                         shape=(1, 1, 1, 1),
                                         need_grad=False)
    initial_drop_rate = nn.Variable((1, 1, 1, 1)).apply(d=args.dropout_rate)
    nn.parameter.set_parameter("drop_rate", initial_drop_rate)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train, aux_logits = construct_networks(args,
                                                ops,
                                                arch_dict,
                                                image_train,
                                                test=False)
    loss_train = loss_function(pred_train, label_train, aux_logits,
                               args.auxiliary_weight)

    # prepare solvers (retain_state keeps momentum across repeated calls)
    model_params_dict = nn.get_parameters()
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(model_params_dict,
                                reset=False,
                                retain_state=True)

    # Training-loop
    for curr_epoch in range(args.epoch):
        print("epoch {}".format(curr_epoch))

        # Dropout rate grows linearly with epoch (scheduled drop path);
        # the small epsilon keeps the rate strictly positive.
        curr_dropout_rate = F.add_scalar(
            F.mul_scalar(initial_drop_rate, (curr_epoch / args.epoch)), 1e-8)
        nn.parameter.set_parameter("drop_rate", curr_dropout_rate)

        for i in range(one_epoch):
            image, label = tdata.next()
            # Normalize to [0, 1] then standardize with CIFAR statistics.
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            if args.cutout:
                image = cutout(image, args)
            input_image_train["image"].d = image
            input_image_train["label"].d = label
            loss_train.forward(clear_no_need_grad=True)

            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(one_epoch * curr_epoch + i, loss_train.d.copy())
            monitor_err.add(one_epoch * curr_epoch + i, e)

            if args.lr_control_model:
                new_lr = learning_rate_scheduler(one_epoch * curr_epoch + i,
                                                 max_iter, initial_model_lr, 0)
                solver_model.set_learning_rate(new_lr)

            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)

            # Unconditional gradient norm clipping when enabled.
            if args.with_grad_clip_model:
                for k, v in model_params_dict.items():
                    v.grad.copy_from(
                        F.clip_by_norm(v.grad, args.grad_clip_value_model))

            # update parameters
            solver_model.weight_decay(args.weight_decay_model)
            solver_model.update()

            if (one_epoch * curr_epoch + i) % args.model_save_interval == 0:
                nn.save_parameters(
                    os.path.join(
                        args.model_save_path,
                        'params_{}.h5'.format(one_epoch * curr_epoch + i)))

        # Validation during training.
        ve = 0.
        vloss = 0.
        for j in range(val_iter):
            image, label = vdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            input_image_valid["image"].d = image
            input_image_valid["label"].d = label
            loss_valid.forward(clear_no_need_grad=True)
            vloss += loss_valid.d.copy()
            ve += categorical_error(pred_valid.d.copy(), label)
        ve /= val_iter
        vloss /= val_iter
        # NOTE(review): `i` here is the leaked value from the inner training
        # loop (one_epoch - 1), i.e. the last iteration index of this epoch.
        monitor_vloss.add(one_epoch * curr_epoch + i, vloss)
        monitor_verr.add(one_epoch * curr_epoch + i, ve)

    return
Ejemplo n.º 7
0
def main(args):
    """Train the depth-estimation CNN.

    Builds the training graph, creates a solver ('adam' or 'sgd'),
    iterates over the training data for ``args.epochs`` epochs, logs
    losses and image tiles to nnabla monitors, and saves a parameter
    checkpoint after every epoch.

    Args:
        args: parsed command-line arguments (batch_size, img_height,
            img_width, optimizer, learning_rate, epochs, dataset_path,
            log_dir).

    Raises:
        ValueError: if ``args.optimizer`` is not 'adam' or 'sgd'.
    """
    from numpy.random import seed
    seed(46)  # fixed seed for reproducibility

    # Get context (cuDNN on device 0).
    from nnabla.ext_utils import get_extension_context
    ctx = get_extension_context('cudnn', device_id='0', type_config='float')
    nn.set_default_context(ctx)

    # Create CNN network
    # === TRAIN ===
    # Create input variables.
    image = nn.Variable([args.batch_size, 3, args.img_height, args.img_width])
    label = nn.Variable([args.batch_size, 1, args.img_height, args.img_width])
    # Create prediction graph.
    pred = depth_cnn_model(image, test=False)
    pred.persistent = True  # keep pred's buffer readable after backward
    # Create loss function.
    loss = l1_loss(pred, label)
    # === VAL ===
    #vimage = nn.Variable([args.batch_size, 3, args.img_height, args.img_width])
    #vlabel = nn.Variable([args.batch_size, 1, args.img_height, args.img_width])
    #vpred = depth_cnn_model(vimage, test=True)
    #vloss = l1_loss(vpred, vlabel)

    # Prepare monitors.
    monitor = Monitor(os.path.join(args.log_dir, 'nnmonitor'))
    monitors = {
        'train_epoch_loss':
        MonitorSeries('Train epoch loss', monitor, interval=1),
        'train_itr_loss':
        MonitorSeries('Train itr loss', monitor, interval=100),
        # 'val_epoch_loss': MonitorSeries('Val epoch loss', monitor, interval=1),
        'train_viz':
        MonitorImageTile('Train images', monitor, interval=1000, num_images=4)
    }

    # Create Solver. If training from checkpoint, load the info.
    if args.optimizer == "adam":
        solver = S.Adam(alpha=args.learning_rate, beta1=0.9, beta2=0.999)
    elif args.optimizer == "sgd":
        solver = S.Momentum(lr=args.learning_rate, momentum=0.9)
    else:
        # Fail fast with a clear message instead of a confusing NameError
        # when `solver` is referenced below.
        raise ValueError(
            "Unsupported optimizer '%s'; expected 'adam' or 'sgd'."
            % args.optimizer)
    solver.set_parameters(nn.get_parameters())

    # Initialize DataIterator
    data_dic = prepare_dataloader(args.dataset_path,
                                  datatype_list=['train', 'val'],
                                  batch_size=args.batch_size,
                                  img_size=(args.img_height, args.img_width))

    # Training loop.
    logger.info("Start training!!!")
    total_itr_index = 0
    for epoch in range(1, args.epochs + 1):
        ## === training === ##
        total_train_loss = 0
        index = 0
        while index < data_dic['train']['size']:
            # Preprocess
            image.d, label.d = data_dic['train']['itr'].next()
            loss.forward(clear_no_need_grad=True)
            # Initialize gradients
            solver.zero_grad()
            # Backward execution
            loss.backward(clear_buffer=True)
            # Update parameters by computed gradients
            if args.optimizer == 'sgd':
                solver.weight_decay(1e-4)
            solver.update()

            # Update log
            index += 1
            total_itr_index += 1
            total_train_loss += loss.d

            # Pass to monitor
            monitors['train_itr_loss'].add(total_itr_index, loss.d)

            # Visualization (re-run forward so pred.d is current)
            pred.forward(clear_buffer=True)
            train_viz = np.concatenate([
                image.d,
                convert_depth2colormap(label.d),
                convert_depth2colormap(pred.d)
            ],
                                       axis=3)
            monitors['train_viz'].add(total_itr_index, train_viz)

            # Logger
            logger.info("[{}] {}/{} Train Loss {} ({})".format(
                epoch, index, data_dic['train']['size'],
                total_train_loss / index, loss.d))

        # Pass training loss to a monitor.
        train_error = total_train_loss / data_dic['train']['size']
        monitors['train_epoch_loss'].add(epoch, train_error)

        # Save Parameter
        out_param_file = os.path.join(args.log_dir,
                                      'checkpoint' + str(epoch) + '.h5')
        nn.save_parameters(out_param_file)
Ejemplo n.º 8
0
 def __init__(self, learning_rate, momentum=0.9):
     """Hold two independent Momentum solvers with identical hyperparameters.

     ``solver`` handles ordinary parameters; ``solver_bn`` is dedicated to
     batch-normalization parameters.
     """
     self.solver, self.solver_bn = (
         S.Momentum(learning_rate, momentum),
         S.Momentum(learning_rate, momentum),
     )
def train():
    """
    Main script.

    Naive Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * Inplace allreduce (THIS IS THE MAIN difference from a single device training)
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error

    """

    args = get_args()
    if args.tiny_mode:
        n_train_samples = 100000
    else:
        # NOTE(review): ImageNet-1k is commonly quoted as 1281167 training
        # samples (see the sibling example below) — confirm this count.
        # It only affects the warmup-iteration estimate.
        n_train_samples = 1282167

    # Communicator and Context
    from nnabla.ext_utils import get_extension_context
    extension_module = "cudnn"
    ctx = get_extension_context(extension_module, type_config=args.type_config)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    device_id = mpi_rank
    ctx.device_id = str(device_id)
    nn.set_default_context(ctx)

    # Per-device RNG (seeded with the device id) so each worker draws a
    # different data shuffle below.
    rng = np.random.RandomState(device_id)
    if args.tiny_mode:
        # We use Tiny ImageNet from Stanford CS231N class.
        # (Tiny ImageNet, https://tiny-imagenet.herokuapp.com/)
        # Tiny ImageNet consists of 200 categories, each category has 500 images
        # in training set. The image size is 64x64. To adapt ResNet into 64x64
        # image inputs, the input image size of ResNet is set as 56x56, and
        # the stride in the first conv and the first max pooling are removed.
        # Please check README.
        data = data_iterator_tiny_imagenet(args.batch_size, 'train')
        vdata = data_iterator_tiny_imagenet(args.batch_size, 'val')
        num_classes = 200
    else:
        # We use ImageNet.
        # (ImageNet, https://imagenet.herokuapp.com/)
        # ImageNet consists of 1000 categories, each category has 1280 images
        # in training set. The image size is various. To adapt ResNet into
        # 320x320 image inputs, the input image size of ResNet is set as
        # 224x224. We need to get tar file and create cache file(320x320 images).
        # Please check README.
        data = data_iterator_imagenet(args.batch_size,
                                      args.train_cachefile_dir,
                                      rng=rng)
        vdata = data_iterator_imagenet(args.batch_size, args.val_cachefile_dir)
        # Each device validates a disjoint slice of the validation set.
        vdata = vdata.slice(rng=None,
                            num_of_slices=n_devices,
                            slice_pos=device_id)
        num_classes = 1000
    # Workaround to start with the same initialized weights for all workers.
    np.random.seed(313)
    t_model = get_model(args, num_classes, test=False, tiny=args.tiny_mode)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    # Detach pred so the top-n error graph does not backprop into the model.
    t_pred2 = t_model.pred.unlinked()
    t_e = F.mean(F.top_n_error(t_pred2, t_model.label))
    v_model = get_model(args, num_classes, test=True, tiny=args.tiny_mode)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.unlinked()
    v_e = F.mean(F.top_n_error(v_pred2, v_model.label))

    # Add parameters to communicator.
    comm.add_context_and_parameters((ctx, nn.get_parameters()))

    # Create Solver.
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Setting warmup: LR ramps linearly from base_lr up to
    # args.learning_rate (= base_lr * n_devices) over warmup_iter iterations.
    base_lr = args.learning_rate / n_devices
    warmup_iter = int(1. * n_train_samples / args.batch_size /
                      args.accum_grad / n_devices) * args.warmup_epoch
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter
    solver.set_learning_rate(base_lr)

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=1)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=1)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed("Validation time",
                                         monitor,
                                         interval=1)

    # Training loop.
    # vl / ve are scratch scalar Variables used only to all-reduce the local
    # validation stats into a cross-device mean.
    vl = nn.Variable()
    ve = nn.Variable()
    for i in range(int(args.max_iter / n_devices)):
        # Save parameters (rank-0 only).
        if i % (args.model_save_interval // n_devices) == 0 and device_id == 0:
            nn.save_parameters(
                os.path.join(args.model_save_path, 'param_%06d.h5' % i))

        # Validation
        if i % (args.val_interval // n_devices) == 0 and i != 0:
            ve_local = 0.
            vl_local = 0.
            val_iter_local = args.val_iter // n_devices
            for j in range(val_iter_local):
                images, labels = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                v_model.image.data.cast(np.uint8, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                vl_local += v_model.loss.d.copy()
                ve_local += v_e.d.copy()
            # Average locally, then all-reduce (division=True) to get the
            # mean over all devices.
            vl_local /= val_iter_local
            vl.d = vl_local
            comm.all_reduce(vl.data, division=True, inplace=True)
            ve_local /= val_iter_local
            ve.d = ve_local
            comm.all_reduce(ve.data, division=True, inplace=True)

            if device_id == 0:
                monitor_vloss.add(i * n_devices, vl.d.copy())
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        def accumulate_error(l, e, t_model, t_e):
            # Fold the current batch's host-side loss/error into the totals.
            l += t_model.loss.d
            e += t_e.d
            return l, e

        # Gradient accumulation loop
        for j in range(args.accum_grad):
            images, labels = data.next()
            if j != 0:
                # Update e and l according to previous results of forward
                # propagation.
                # The update of last iteration is performed
                # after solver update to avoid unnecessary CUDA synchronization.
                # This is performed after data.next() in order to overlap
                # the data loading and graph execution.
                # TODO: Move this to the bottom of the loop when prefetch
                # data loader is available.
                l, e = accumulate_error(l, e, t_model, t_e)
            t_model.image.d = images
            t_model.label.d = labels
            t_model.image.data.cast(np.uint8, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)

        # AllReduce gradients (summed, not divided) across devices.
        params = [x.grad for x in nn.get_parameters().values()]
        comm.all_reduce(params, division=False, inplace=False)

        # Update
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Accumulate errors after solver update
        l, e = accumulate_error(l, e, t_model, t_e)

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        # Synchronize by averaging the weights over devices using allreduce
        if (i + 1) % args.sync_weight_every_itr == 0:
            weights = [x.data for x in nn.get_parameters().values()]
            comm.all_reduce(weights, division=True, inplace=True)

        if device_id == 0:
            monitor_loss.add(i * n_devices, l / args.accum_grad)
            monitor_err.add(i * n_devices, e / args.accum_grad)
            monitor_time.add(i * n_devices)

        # Learning rate decay at scheduled iter
        if i * n_devices in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)

    if device_id == 0:
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         'param_%06d.h5' % (args.max_iter / n_devices)))
Ejemplo n.º 10
0
def train():
    """Train a ResNet (resnet23/resnet56) classifier on CIFAR-10.

    Configuration is read from the module-level ``args``: data source
    (optionally with shuffled labels persisted to ``args.output``),
    learning-rate schedule from ``./learning_rate.yaml``, checkpointing,
    monitoring, and final parameter / NNP export.

    Fixes over the previous revision:
    * the validation error is averaged over the number of batches actually
      evaluated (``val_iter``) instead of ``args.val_iter``;
    * the epoch training error is no longer overwritten by the error of
      only the last mini-batch.
    """
    bs_train, bs_valid = args.train_batch_size, args.val_batch_size
    extension_module = args.context
    ctx = get_extension_context(
        extension_module, device_id=args.device_id, type_config=args.type_config
    )
    nn.set_default_context(ctx)

    if args.input:
        # Load a previously saved dataset from disk.
        train_loader, val_loader, n_train_samples, n_val_samples = load_data(
            bs_train, bs_valid
        )

    else:
        train_data_source = data_source_cifar10(
            train=True, shuffle=True, label_shuffle=True
        )
        val_data_source = data_source_cifar10(train=False, shuffle=False)
        n_train_samples = len(train_data_source.labels)
        n_val_samples = len(val_data_source.labels)
        # Data Iterator
        train_loader = data_iterator(
            train_data_source, bs_train, None, False, False)
        val_loader = data_iterator(
            val_data_source, bs_valid, None, False, False)

        if args.shuffle_label:
            # Persist images plus both shuffled and raw labels so the
            # label-shuffling experiment is reproducible.
            if not os.path.exists(args.output):
                os.makedirs(args.output)
            np.save(os.path.join(args.output, "x_train.npy"),
                    train_data_source.images)
            np.save(
                os.path.join(args.output, "y_shuffle_train.npy"),
                train_data_source.labels,
            )
            np.save(os.path.join(args.output, "y_train.npy"),
                    train_data_source.raw_label)
            np.save(os.path.join(args.output, "x_val.npy"),
                    val_data_source.images)
            np.save(os.path.join(args.output, "y_val.npy"),
                    val_data_source.labels)

    if args.model == "resnet23":
        model_prediction = resnet23_prediction
    elif args.model == "resnet56":
        model_prediction = resnet56_prediction
    prediction = functools.partial(
        model_prediction, ncls=10, nmaps=64, act=F.relu, seed=args.seed)

    # Create training graphs
    test = False
    image_train = nn.Variable((bs_train, 3, 32, 32))
    label_train = nn.Variable((bs_train, 1))
    pred_train, _ = prediction(image_train, test)

    loss_train = loss_function(pred_train, label_train)

    # Create validation graph
    test = True
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    label_valid = nn.Variable((bs_valid, 1))
    pred_valid, _ = prediction(image_valid, test)
    loss_val = loss_function(pred_valid, label_valid)

    # Start from clean gradients.
    for param in nn.get_parameters().values():
        param.grad.zero()

    cfg = read_yaml("./learning_rate.yaml")
    print(cfg)
    lr_sched = create_learning_rate_scheduler(cfg.learning_rate_config)
    solver = S.Momentum(momentum=0.9, lr=lr_sched.get_lr())
    solver.set_parameters(nn.get_parameters())
    start_point = 0

    if args.checkpoint is not None:
        # load weights and solver state info from specified checkpoint file.
        start_point = load_checkpoint(args.checkpoint, solver)

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed

    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=1)
    monitor_err = MonitorSeries("Training error", monitor, interval=1)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=1)
    monitor_verr = MonitorSeries("Test error", monitor, interval=1)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=1)

    # save_nnp
    contents = save_nnp({"x": image_valid}, {"y": pred_valid}, bs_valid)
    save.save(
        os.path.join(args.model_save_path,
                     (args.model+"_epoch0_result.nnp")), contents
    )

    train_iter = math.ceil(n_train_samples / bs_train)
    val_iter = math.ceil(n_val_samples / bs_valid)

    # Training-loop
    for i in range(start_point, args.train_epochs):
        lr_sched.set_epoch(i)
        solver.set_learning_rate(lr_sched.get_lr())
        print("Learning Rate: ", lr_sched.get_lr())
        # Validation
        ve = 0.0
        vloss = 0.0
        print("## Validation")
        for j in range(val_iter):
            image, label = val_loader.next()
            image_valid.d = image
            label_valid.d = label
            loss_val.forward()
            vloss += loss_val.data.data.copy() * bs_valid
            ve += categorical_error(pred_valid.d, label)
        # FIX: average over the batches actually evaluated (was
        # args.val_iter, which silently skews the metric when it differs).
        ve /= val_iter
        vloss /= n_val_samples

        monitor_verr.add(i, ve)
        monitor_vloss.add(i, vloss)

        if i % args.model_save_interval == 0:
            # save checkpoint file
            save_checkpoint(args.model_save_path, i, solver)

        # Forward/Zerograd/Backward
        print("## Training")
        e = 0.0
        loss = 0.0
        for k in range(train_iter):
            image, label = train_loader.next()
            image_train.d = image
            label_train.d = label
            loss_train.forward()
            solver.zero_grad()
            loss_train.backward()
            solver.update()
            e += categorical_error(pred_train.d, label_train.d)
            loss += loss_train.data.data.copy() * bs_train
        # Epoch averages. FIX: do not overwrite `e` with the error of only
        # the last mini-batch afterwards (previous revision discarded the
        # averaged value here).
        e /= train_iter
        loss /= n_train_samples

        monitor_loss.add(i, loss)
        monitor_err.add(i, e)
        monitor_time.add(i)

    nn.save_parameters(
        os.path.join(args.model_save_path, "params_%06d.h5" %
                     (args.train_epochs))
    )

    # save_nnp_lastepoch
    contents = save_nnp({"x": image_valid}, {"y": pred_valid}, bs_valid)
    save.save(os.path.join(args.model_save_path,
              (args.model+"_result.nnp")), contents)
Ejemplo n.º 11
0
def train():
    '''
    Run D3Net Semantic Segmentation Training.

    Loads hyper-parameters (D3Net-L or D3Net-S) from a YAML config,
    builds the Cityscapes training iterator and the D3Net graph,
    then runs a single-node (optionally multi-GPU) training loop with a
    polynomial learning-rate schedule, periodic checkpointing and
    monitoring on rank 0.
    '''
    # Check NNabla version
    if get_nnabla_version_integer() < 12100:
        raise ValueError(
            'This code does not work with nnabla version less than v1.21.0 since [ignore index less than 0](https://github.com/sony/nnabla/pull/945) is added in v1.21.0 . Please update the nnabla version.')

    args = get_args()
    # Load D3Net Hyper parameters (D3Net-L or D3Net-S)
    with open(args.config_file) as file:
        hparams = yaml.load(file, Loader=yaml.FullLoader)

    # Get context.
    ctx = get_extension_context(args.context, device_id=0)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)

    # Change max_iter, learning_rate and weight_decay according no. of gpu devices for multi-gpu training.
    # Effective batch grows with n_procs and batch_size, so iterations
    # shrink and learning rates scale up by the same factor.
    default_batch_size = 8
    train_scale_factor = comm.n_procs * \
        (hparams['batch_size'] / default_batch_size)
    hparams['max_iter'] = int(hparams['max_iter'] // train_scale_factor)
    hparams['lr'] = hparams['lr'] * train_scale_factor
    hparams['min_lr'] = hparams['min_lr'] * train_scale_factor
    hparams['weight_decay'] = hparams['weight_decay'] * comm.n_procs

    # ---------------------
    # Create data iterators
    # ---------------------
    rng = np.random.RandomState()
    data = data_iterator_cityscapes(
        hparams['batch_size'], args.data_dir, rng=rng, train=True)

    if comm.n_procs > 1:
        # Each process trains on its own disjoint slice of the data.
        data = data.slice(rng=rng, num_of_slices=comm.n_procs,
                          slice_pos=comm.rank)

    if comm.rank == 0:
        if not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)

    # Create monitors
    monitor = M.Monitor(args.output_dir)
    monitor_training_loss = M.MonitorSeries(
        'Training loss', monitor, interval=args.log_interval)
    monitor_lr = M.MonitorSeries(
        'Learning rate', monitor, interval=args.log_interval)
    monitor_time = M.MonitorTimeElapsed(
        "Training time per iteration", monitor, interval=args.log_interval)

    # ---------------------
    # Create Training Graph
    # ---------------------
    # Create input variables
    image = nn.Variable(
        (hparams['batch_size'], 3, hparams['image_height'], hparams['image_width']))
    seg_gt = nn.Variable(
        (hparams['batch_size'], 1, hparams['image_height'], hparams['image_width']))

    # D3Net prediction/output
    seg_pred = d3net_segmentation(image, hparams, recompute=args.recompute)

    # Configure loss
    loss = F.mean(F.softmax_cross_entropy(seg_pred, seg_gt, axis=1))
    loss.persistent = True

    # Create Solver
    solver = S.Momentum(hparams['lr'], hparams['momentum'])
    solver.set_parameters(nn.get_parameters())

    # Initialize LR Scheduler (polynomial decay; LR is re-set every iteration)
    lr_scheduler = PolynomialScheduler(hparams)

    if args.pretrained is not None:
        # Initialize the D3Net backbone weights; the 'backbone' parameter
        # scope restricts loading to the backbone sub-graph.
        with nn.parameter_scope('backbone'):
            nn.load_parameters(args.pretrained)

    # -------------
    # Training loop
    # -------------
    for i in range(hparams['max_iter']):
        image.d, seg_gt.d = data.next()
        solver.zero_grad()
        lr = lr_scheduler.get_learning_rate(i)
        solver.set_learning_rate(lr)
        loss.forward(clear_no_need_grad=True)

        if comm.n_procs > 1:
            # Gradients are all-reduced during backward via callbacks,
            # overlapping communication with computation.
            all_reduce_callback = comm.get_all_reduce_callback()
            loss.backward(clear_buffer=True,
                          communicator_callbacks=all_reduce_callback)
        else:
            loss.backward(clear_buffer=True)
        solver.weight_decay(hparams['weight_decay'])
        solver.update()

        if comm.rank == 0:
            # Log monitors
            monitor_training_loss.add(i, loss.d.copy())
            monitor_lr.add(i, lr)
            monitor_time.add(i)

            if (i % hparams['save_interval']) == 0:
                # Save intermediate model parameters (rank 0 only); solver
                # states are overwritten in place each save.
                nn.save_parameters(os.path.join(
                    args.output_dir, "model_param_%08d.h5" % i))
                solver.save_states(os.path.join(
                    args.output_dir, "solver_states.h5"))

    if comm.rank == 0:
        # save final model parameters
        nn.save_parameters(os.path.join(args.output_dir, "final.h5"))
def train():
    """
    Main script.

    Naive Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * Inplace allreduce (THIS IS THE MAIN difference from a single device training)
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error

    """

    args = get_args()
    # NOTE(review): n_train_samples is defined but not referenced below
    # (no warmup schedule in this variant) — confirm before removing.
    n_train_samples = 1281167
    num_classes = 1000

    # Communicator and Context
    from nnabla.ext_utils import get_extension_context
    extension_module = "cudnn"
    ctx = get_extension_context(extension_module, type_config=args.type_config)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    device_id = mpi_rank
    ctx.device_id = str(device_id)
    nn.set_default_context(ctx)

    # Pipelines and Iterators for training (DALI does decode/augmentation
    # on-device; one pipeline per process, seeded per device).
    train_pipes = [
        TrainPipeline(args.batch_size,
                      args.num_threads,
                      device_id,
                      args.train_cachefile_dir,
                      args.train_list,
                      seed=device_id + 1,
                      num_gpu=n_devices,
                      random_area=args.random_area)
    ]
    train_pipes[0].build()
    data = DALIClassificationIterator(train_pipes,
                                      train_pipes[0].epoch_size("Reader") //
                                      n_devices,
                                      auto_reset=True,
                                      stop_at_epoch=False)
    # Pipelines and Iterators for validation
    val_pipes = [
        ValPipeline(args.batch_size,
                    args.num_threads,
                    device_id,
                    args.val_cachefile_dir,
                    args.val_list,
                    seed=device_id + 1,
                    num_gpu=n_devices)
    ]
    val_pipes[0].build()
    vdata = DALIClassificationIterator(val_pipes,
                                       val_pipes[0].epoch_size("Reader") //
                                       n_devices,
                                       auto_reset=True,
                                       stop_at_epoch=False)
    # Network for training
    t_model = get_model(args,
                        num_classes,
                        n_devices,
                        args.accum_grad,
                        test=False)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    # Detach pred so the top-n error graph does not backprop into the model.
    t_pred2 = t_model.pred.get_unlinked_variable(need_grad=False)
    t_e = F.mean(F.top_n_error(t_pred2, t_model.label))
    # Network for validation
    v_model = get_model(args,
                        num_classes,
                        n_devices,
                        args.accum_grad,
                        test=True)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.get_unlinked_variable(need_grad=False)
    v_e = F.mean(F.top_n_error(v_pred2, v_model.label))

    # Solver
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_learning_rate(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitors
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=1)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=1)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed("Validation time",
                                         monitor,
                                         interval=1)

    # Training loop
    # vl / ve are scratch scalar Variables used only to all-reduce the local
    # validation stats into a cross-device mean.
    vl = nn.Variable()
    ve = nn.Variable()
    for i in range(int(args.max_iter / n_devices)):
        # Save parameters (rank-0 only).
        if i % (args.model_save_interval // n_devices) == 0 and device_id == 0:
            nn.save_parameters(
                os.path.join(args.model_save_path, 'param_%06d.h5' % i))

        # Validation
        if i % (args.val_interval // n_devices) == 0 and i != 0:
            ve_local = 0.
            vl_local = 0.
            val_iter_local = args.val_iter // n_devices
            for j in range(val_iter_local):
                nextImage, nextLabel = vdata.next()
                # DALI hands back device arrays; bind them directly as NdArray.
                v_model.image.data = nextImage
                v_model.label.data = nextLabel
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                vl_local += v_model.loss.d.copy()
                ve_local += v_e.d.copy()
            # Average locally, then all-reduce (division=True) to get the
            # mean over all devices.
            vl_local /= val_iter_local
            vl.d = vl_local
            comm.all_reduce(vl.data, division=True, inplace=True)
            ve_local /= val_iter_local
            ve.d = ve_local
            comm.all_reduce(ve.data, division=True, inplace=True)

            if device_id == 0:
                monitor_vloss.add(i * n_devices, vl.d.copy())
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        def accumulate_error(l, e, t_model, t_e):
            # Fold the current batch's host-side loss/error into the totals.
            l += t_model.loss.d
            e += t_e.d
            return l, e

        # Gradient accumulation loop
        for j in range(args.accum_grad):
            nextImage, nextLabel = data.next()
            t_model.image.data = nextImage
            t_model.label.data = nextLabel
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)
            l, e = accumulate_error(l, e, t_model, t_e)

        # AllReduce gradients (summed, not divided) across devices.
        params = [x.grad for x in nn.get_parameters().values()]
        comm.all_reduce(params, division=False, inplace=False)

        # Update
        solver.weight_decay(args.weight_decay)
        solver.update()

        if device_id == 0:
            monitor_loss.add(i * n_devices, l / args.accum_grad)
            monitor_err.add(i * n_devices, e / args.accum_grad)
            monitor_time.add(i * n_devices)

        # Learning rate decay at scheduled iter
        if i * n_devices in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)

    if device_id == 0:
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         'param_%06d.h5' % (args.max_iter / n_devices)))
Ejemplo n.º 13
0
def infl_icml(model_info_dict, file_dir_dict, use_all_params, need_evaluate,
              alpha):
    """Estimate per-training-sample influence scores and save them to CSV.

    Structure suggests an influence-function estimator: first the
    validation-loss gradient ``u`` is computed, then an inverse
    Hessian-vector product is approximated by running SGD on an auxiliary
    quadratic objective in ``v`` (with damping ``alpha``), and finally
    each training sample is scored by the inner product of its loss
    gradient with ``v``.
    NOTE(review): method identification inferred from the code structure —
    confirm against the project documentation.

    Args:
        model_info_dict (dict): Expects keys 'seed', 'net_func',
            'batch_size', 'num_epochs' (the epoch whose weights are loaded).
        file_dir_dict (dict): Expects keys 'save_dir', 'infl_filename',
            'model_filename', 'train_csv', 'val_csv'.
        use_all_params: Forwarded to save_infl_for_analysis only.
        need_evaluate (bool): If True, also save results for analysis.
        alpha (float): Damping coefficient of the quadratic objective.
    """
    num_epochs = 2
    # params
    lr = 0.005
    seed = model_info_dict['seed']
    net_func = model_info_dict['net_func']
    batch_size = model_info_dict['batch_size']
    test_batch_size = 1000
    target_epoch = model_info_dict['num_epochs']
    # files and dirs
    save_dir = file_dir_dict['save_dir']
    infl_filename = file_dir_dict['infl_filename']
    final_model_name = file_dir_dict['model_filename']
    final_model_path = os.path.join(save_dir, 'epoch%02d' % (target_epoch - 1),
                                    'weights', final_model_name)
    input_dir_name = os.path.dirname(file_dir_dict['train_csv'])

    # setup
    trainset, valset, image_shape, n_classes, ntr, nval = init_dataset(
        file_dir_dict['train_csv'], file_dir_dict['val_csv'], seed)
    n_channels, _h, _w = image_shape
    resize_size = get_image_size((_h, _w))
    idx_train = get_indices(ntr, seed)
    idx_val = get_indices(nval, seed)

    # Load the trained weights whose influence is to be analyzed.
    nn.load_parameters(final_model_path)
    trained_params = nn.get_parameters(grad_only=False)

    test = True

    grad_model = functools.partial(setup_model,
                                   net_func=net_func,
                                   n_classes=n_classes,
                                   n_channels=n_channels,
                                   resize_size=resize_size,
                                   test=test,
                                   reduction='mean')
    solver = S.Momentum(lr=lr, momentum=0.9)
    solver.set_parameters(trained_params)
    # gradient of the validation loss w.r.t. the trained parameters
    u = compute_gradient(grad_model, solver, valset, test_batch_size, idx_val,
                         resize_size)

    # Hinv * u with SGD: optimize auxiliary variables v instead of the
    # model weights.
    seed_train = 0
    v = dict()
    for key, param in nn.get_parameters(grad_only=False).items():
        v[key] = nn.Variable(param.d.shape, need_grad=True)
        v[key].d = 0
        v[key].g = 0

    # Re-point the solver at v; from here on updates touch v only.
    solver.set_parameters(v)

    loss_train = []
    loss_fn = None
    for epoch in range(num_epochs):
        # training
        seed_train = 0
        np.random.seed(epoch)
        idx = get_batch_indices(ntr, batch_size, seed=epoch)
        for j, i in enumerate(idx):
            # Per-sample seeds keep data augmentation deterministic.
            seeds = list(range(seed_train, seed_train + i.size))
            seed_train += i.size
            X, y = get_batch_data(trainset,
                                  idx_train,
                                  i,
                                  resize_size,
                                  test=False,
                                  seeds=seeds)
            _, loss_fn, input_image = adjust_batch_size(
                grad_model, len(X), loss_fn)
            input_image["image"].d = X
            input_image["label"].d = y
            loss_fn.forward()

            # grad of the training loss w.r.t. the model parameters
            grad_params = nn.grad(loss_fn, [
                param for param in nn.get_parameters(grad_only=False).values()
            ])
            # scalar v^T grad(L); differentiating it again below yields a
            # Hessian-vector product.
            vg = 0
            for vv, g in zip(v.values(), grad_params):
                vg += F.sum(vv * g)

            for parameters in trained_params.values():
                parameters.grad.zero()

            vgrad_params = nn.grad(vg, [
                param for param in nn.get_parameters(grad_only=False).values()
            ])
            # Quadratic objective 0.5 v^T (H + alpha I) v - u^T v, whose
            # minimizer is (H + alpha I)^-1 u.
            loss_i = 0
            for vgp, vv, uu in zip(vgrad_params, v.values(), u.values()):
                loss_i += 0.5 * F.sum(vgp * vv + alpha * vv * vv) - F.sum(
                    uu * vv)
            loss_i.forward()

            solver.zero_grad()
            loss_i.backward(clear_buffer=True)
            solver.update()
            loss_train.append(loss_i.d.copy())

    # influence: score each training sample by grad(L_i)^T v
    infl_dict = dict()
    infl = np.zeros(ntr)
    for i in tqdm(range(ntr), desc='calc influence (3/3 steps)'):
        csv_idx = idx_train[i]
        file_name = trainset.get_filepath_to_data(csv_idx)
        file_name = os.path.join(input_dir_name, file_name)
        file_name = os.path.normpath(file_name)
        X, y = get_data(trainset, idx_train[i], resize_size, True, seed=i)
        _, loss_fn, input_image = adjust_batch_size(grad_model, len(X),
                                                    loss_fn)
        input_image["image"].d = X
        input_image["label"].d = y
        loss_fn.forward()
        for parameters in trained_params.values():
            parameters.grad.zero()
        loss_fn.backward(clear_buffer=True)
        infl_i = 0
        for j, param in enumerate(nn.get_parameters(grad_only=False).values()):
            infl_i += (param.g.copy() * list(v.values())[j].d.copy()).sum()
        infl[i] = -infl_i / ntr
        infl_dict[csv_idx] = [file_name, y, infl[i]]
    infl_list = [val + [key] for key, val in infl_dict.items()]
    # Sort ascending by influence score.
    infl_list = sorted(infl_list, key=lambda x: (x[-2]))

    # save
    header = ['x:image', 'y:label', 'influence', 'datasource_index']
    data_type = 'object,int,float,int'
    if need_evaluate:
        # NOTE(review): `epoch` here is the leftover loop variable
        # (num_epochs - 1) — confirm this is the intended value.
        save_infl_for_analysis(infl_list, use_all_params, save_dir,
                               infl_filename, epoch, header, data_type)
    save_to_csv(filename=infl_filename,
                header=header,
                list_to_save=infl_list,
                data_type=data_type)
Ejemplo n.º 14
0
def train():
    """
    Main script.

    Steps:

    * Parse command line arguments.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Computate error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error
    """
    # Parse args
    args = get_args()
    n_valid_samples = 10000
    bs_valid = args.batch_size
    extension_module = args.context
    ctx = get_extension_context(extension_module,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    # Dataset
    data_iterator = data_iterator_cifar10
    n_class = 10

    # Model architecture.
    # Use elif + an explicit error so an unknown --net fails fast here
    # instead of raising a NameError on `prediction` further below.
    if args.net == "resnet18":
        prediction = functools.partial(resnet18_prediction,
                                       ncls=n_class,
                                       nmaps=64,
                                       act=F.relu)
    elif args.net == "resnet34":
        prediction = functools.partial(resnet34_prediction,
                                       ncls=n_class,
                                       nmaps=64,
                                       act=F.relu)
    else:
        print("[ERROR] Unknown net: " + args.net)
        return

    # Create training graphs
    test = False
    if args.mixtype == "mixup":
        mdl = MixupLearning(args.batch_size, alpha=args.alpha)
    elif args.mixtype == "cutmix":
        mdl = CutmixLearning((args.batch_size, 3, 32, 32),
                             alpha=args.alpha,
                             cutmix_prob=1.0)
    elif args.mixtype == "vhmixup":
        mdl = VHMixupLearning((args.batch_size, 3, 32, 32), alpha=args.alpha)
    else:
        print("[ERROR] Unknown mixtype: " + args.mixtype)
        return
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    mix_image, mix_label = mdl.mix_data(single_image_augment(image_train),
                                        F.one_hot(label_train, (n_class, )))
    pred_train = prediction(mix_image, test)
    loss_train = mdl.loss(pred_train, mix_label)
    input_train = {"image": image_train, "label": label_train}

    # Create validation graph
    test = True
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    pred_valid = prediction(image_valid, test)
    input_valid = {"image": image_valid}

    # Solvers. Fail fast on an unsupported choice; `solver` would otherwise
    # be unbound at set_parameters below.
    if args.solver == "Adam":
        solver = S.Adam()
    elif args.solver == "Momentum":
        solver = S.Momentum(lr=args.learning_rate)
    else:
        print("[ERROR] Unknown solver: " + args.solver)
        return
    solver.set_parameters(nn.get_parameters())

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.save_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_verr = MonitorSeries("Test error", monitor, interval=1)

    # Data Iterator
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)

    print("Size of the training data: %d " % tdata.size)
    # Training-loop
    for i in range(args.max_iter):
        # Forward/Zerograd/Backward
        image, label = tdata.next()
        input_train["image"].d = image
        input_train["label"].d = label
        mdl.set_mix_ratio()
        loss_train.forward()
        solver.zero_grad()
        loss_train.backward()

        # Step-wise LR decay for Momentum at 1/2 and 3/4 of training.
        # Integer division is essential: `args.max_iter / 2` is a float on
        # Python 3, so `i == args.max_iter / 2` never fires for odd max_iter
        # (and /4*3 never fires unless max_iter is divisible by 4).
        if args.solver == "Momentum":
            if i == args.max_iter // 2:
                solver.set_learning_rate(args.learning_rate / 10.0)
            if i == args.max_iter * 3 // 4:
                solver.set_learning_rate(args.learning_rate / 10.0**2)
        solver.update()

        # Validation
        if (i + 1) % args.val_interval == 0 or i == 0:
            vdata._reset()  # always evaluate on the same fixed sample order
            vdata_pred = np.zeros((n_valid_samples, n_class))
            vdata_label = np.zeros((n_valid_samples, 1), dtype=np.int32)
            for j in range(0, n_valid_samples, args.batch_size):
                image, label = vdata.next()
                input_valid["image"].d = image
                pred_valid.forward()
                # Clip the last (possibly partial) batch to the set size.
                n = min(args.batch_size, n_valid_samples - j)
                vdata_pred[j:j + n] = pred_valid.d[:n]
                vdata_label[j:j + n] = label[:n]
            ve = categorical_error(vdata_pred, vdata_label)
            monitor_verr.add(i + 1, ve)

        if int((i + 1) % args.model_save_interval) == 0:
            nn.save_parameters(
                os.path.join(args.save_path, 'params_%06d.h5' % (i + 1)))

        # Monitering
        monitor_loss.add(i + 1, loss_train.d.copy())
        monitor_time.add(i + 1)

    nn.save_parameters(
        os.path.join(args.save_path, 'params_%06d.h5' % (args.max_iter)))
Ejemplo n.º 15
0
    # NOTE(review): this fragment is the tail of a training entry point whose
    # header is missing from this chunk; `args`, `seen`, `nsamples`,
    # `batch_size`, `learning_rate`, `max_epochs` and `train` must be defined
    # by the enclosing (unseen) scope -- confirm against the original file.
    init_width = args.width
    init_height = args.height
    # Resume epoch counting from the number of samples already processed.
    init_epoch = seen/nsamples

    yolo_x_nnabla, yolo_features_nnabla, yolo_vars, yolo_tvars, loss_nnabla = create_network(
        batch_size, init_height, init_width, args)

    from nnabla.ext_utils import get_extension_context
    ctx = get_extension_context("cudnn")
    nn.set_default_context(ctx)

    # Load parameters
    print("Load", args.weight, "...")
    nn.load_parameters(args.weight)
    print(nn.get_parameters())

    # Split parameters: only convolution kernels ("conv/W") are driven by the
    # first solver; everything else (biases, BN params, ...) by the second,
    # so per-group treatment (e.g. weight decay) can differ.
    param_convweights = {
        k: v for k, v in nn.get_parameters().items() if k.endswith("conv/W")}
    param_others = {k: v for k, v in nn.get_parameters().items()
                    if not k.endswith("conv/W")}

    solver_convweights = S.Momentum(learning_rate, args.momentum)
    solver_others = S.Momentum(learning_rate, args.momentum)
    solver_convweights.set_parameters(param_convweights)
    solver_others.set_parameters(param_others)
    print(init_epoch, max_epochs)

    # Continue training from the resumed epoch up to max_epochs.
    for epoch in range(int(init_epoch), int(max_epochs)):
        train(epoch)
Ejemplo n.º 16
0
def train():
    """
    Main script: DeepLabV3+ semantic-segmentation training.

    Loads pretrained weights (optionally dropping the classification head for
    fine-tuning), builds training and validation graphs — optionally sliced
    across devices for distributed training — and runs the training loop with
    gradient accumulation, linear warmup and polynomial LR decay. Snapshots
    and NNP exports are written by the rank-0 device only.
    """

    args = get_args()

    _ = nn.load_parameters(args.pretrained_model_path)
    if args.fine_tune:
        # Drop the classification head so it is re-initialized for the new
        # number of classes.
        nnabla.parameter.pop_parameter('decoder/logits/affine/conv/W')
        nnabla.parameter.pop_parameter('decoder/logits/affine/conv/b')

    n_train_samples = args.train_samples
    n_val_samples = args.val_samples
    distributed = args.distributed
    compute_acc = args.compute_acc

    if distributed:
        # Communicator and Context
        from nnabla.ext_utils import get_extension_context
        extension_module = "cudnn"
        ctx = get_extension_context(
            extension_module, type_config=args.type_config)
        comm = C.MultiProcessDataParalellCommunicator(ctx)
        comm.init()
        n_devices = comm.size
        mpi_rank = comm.rank
        device_id = mpi_rank
        ctx.device_id = str(device_id)
        nn.set_default_context(ctx)
    else:
        # Get context.
        from nnabla.ext_utils import get_extension_context
        extension_module = args.context
        if args.context is None:
            extension_module = 'cpu'
        logger.info("Running in %s" % extension_module)
        ctx = get_extension_context(
            extension_module, device_id=args.device_id, type_config=args.type_config)
        nn.set_default_context(ctx)
        n_devices = 1
        device_id = 0

    # training data
    data = data_iterator_segmentation(
            args.train_samples, args.batch_size, args.train_dir, args.train_label_dir, target_width=args.image_width, target_height=args.image_height)
    # validation data
    vdata = data_iterator_segmentation(args.val_samples, args.batch_size, args.val_dir,
                                       args.val_label_dir, target_width=args.image_width, target_height=args.image_height)

    if distributed:
        # Each worker sees a disjoint slice of the data.
        data = data.slice(
            rng=None, num_of_slices=n_devices, slice_pos=device_id)
        vdata = vdata.slice(
            rng=None, num_of_slices=n_devices, slice_pos=device_id)
    num_classes = args.num_class

    # Workaround to start with the same initialized weights for all workers.
    np.random.seed(313)
    t_model = get_model(
        args, test=False)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    t_pred2 = t_model.pred.unlinked()
    # Masked top-n error: ignore pixels excluded by the mask.
    t_e = F.sum(F.top_n_error(t_pred2, t_model.label, axis=1)
                * t_model.mask) / F.sum(t_model.mask)

    v_model = get_model(
        args, test=True)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.unlinked()
    # BUGFIX: normalize by the *validation* mask. The original divided by
    # F.sum(t_model.mask) (the training mask), skewing the validation error.
    v_e = F.sum(F.top_n_error(v_pred2, v_model.label, axis=1)
                * v_model.mask) / F.sum(v_model.mask)

    # Create Solver
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Load checkpoint
    start_point = 0
    if args.checkpoint is not None:
        # load weights and solver state info from specified checkpoint file.
        start_point = load_checkpoint(args.checkpoint, solver)

    # Setting warmup.
    base_lr = args.learning_rate / n_devices
    warmup_iter = int(1. * n_train_samples /
                      args.batch_size / args.accum_grad / n_devices) * args.warmup_epoch
    # Guard against ZeroDivisionError when warmup is disabled
    # (warmup_epoch == 0, or a tiny dataset makes warmup_iter == 0).
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter \
        if warmup_iter > 0 else 0.
    solver.set_learning_rate(base_lr)

    # Create monitor
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=1)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=1)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_miou = M.MonitorSeries("mean IOU", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed(
        "Validation time", monitor, interval=1)

    # save_nnp
    contents = save_nnp({'x': v_model.image}, {
                        'y': v_model.pred}, args.batch_size)
    save.save(os.path.join(args.model_save_path,
                           'Deeplabv3plus_result_epoch0.nnp'), contents, variable_batch_size=False)

    # Training loop
    for i in range(start_point, int(args.max_iter / n_devices)):
        # Save parameters
        if i % (args.model_save_interval // n_devices) == 0 and device_id == 0:
            save_checkpoint(args.model_save_path, i, solver)
        # Validation
        if i % (args.val_interval // n_devices) == 0 and i != 0:
            vmiou_local = 0.
            val_iter_local = n_val_samples // args.batch_size
            vl_local = nn.NdArray()
            vl_local.zero()
            ve_local = nn.NdArray()
            ve_local.zero()
            for j in range(val_iter_local):
                images, labels, masks = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                v_model.mask.d = masks
                v_model.image.data.cast(np.float32, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                vl_local += v_model.loss.data
                ve_local += v_e.data
                # Mean IOU computation
                if compute_acc:
                    vmiou_local += compute_miou(num_classes, labels,
                                                np.argmax(v_model.pred.d, axis=1), masks)

            vl_local /= val_iter_local
            ve_local /= val_iter_local
            if compute_acc:
                vmiou_local /= val_iter_local
                vmiou_ndarray = nn.NdArray.from_numpy_array(
                    np.array(vmiou_local))
            if distributed:
                # Average validation stats over all workers.
                comm.all_reduce(vl_local, division=True, inplace=True)
                comm.all_reduce(ve_local, division=True, inplace=True)
                if compute_acc:
                    comm.all_reduce(vmiou_ndarray, division=True, inplace=True)

            if device_id == 0:
                monitor_vloss.add(i * n_devices, vl_local.data.copy())
                monitor_verr.add(i * n_devices, ve_local.data.copy())
                if compute_acc:
                    monitor_miou.add(i * n_devices, vmiou_local)
                monitor_vtime.add(i * n_devices)

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        e_acc = nn.NdArray(t_e.shape)
        e_acc.zero()
        l_acc = nn.NdArray(t_model.loss.shape)
        l_acc.zero()
        # Gradient accumulation loop
        for j in range(args.accum_grad):
            images, labels, masks = data.next()
            t_model.image.d = images
            t_model.label.d = labels
            t_model.mask.d = masks
            t_model.image.data.cast(np.float32, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)
            e_acc += t_e.data
            l_acc += t_model.loss.data

        # AllReduce
        if distributed:
            params = [x.grad for x in nn.get_parameters().values()]
            comm.all_reduce(params, division=False, inplace=False)
            comm.all_reduce(l_acc, division=True, inplace=True)
            comm.all_reduce(e_acc, division=True, inplace=True)
        solver.scale_grad(1./args.accum_grad)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        if distributed:
            # Synchronize by averaging the weights over devices using allreduce
            if (i+1) % args.sync_weight_every_itr == 0:
                weights = [x.data for x in nn.get_parameters().values()]
                comm.all_reduce(weights, division=True, inplace=True)

        if device_id == 0:
            monitor_loss.add(
                i * n_devices, (l_acc / args.accum_grad).data.copy())
            monitor_err.add(
                i * n_devices, (e_acc / args.accum_grad).data.copy())
            monitor_time.add(i * n_devices)

        # Learning rate decay at scheduled iter --> changed to poly learning rate decay policy
        # if i in args.learning_rate_decay_at:
        solver.set_learning_rate(base_lr * ((1 - i / args.max_iter)**0.1))

    if device_id == 0:
        nn.save_parameters(os.path.join(args.model_save_path,
                                        'param_%06d.h5' % args.max_iter))

    contents = save_nnp({'x': v_model.image}, {
                        'y': v_model.pred}, args.batch_size)
    save.save(os.path.join(args.model_save_path,
                           'Deeplabv3plus_result.nnp'), contents, variable_batch_size=False)
Ejemplo n.º 17
0
def CNN_run(args, model):
    """
    Build training and validation graphs for the given architecture and run
    the full training loop.

    Every ``args.model_save_interval`` iterations the validation set is
    evaluated and a parameter snapshot is saved; a final evaluation and
    snapshot are performed after the loop.

    Args:
        args: parsed command-line arguments (batch size, epochs, learning
            rate, monitor/save paths, ...).
        model: architecture description passed through to
            ``construct_networks``.
    """

    data_iterator_train, data_iterator_valid, num_class = \
                get_data_iterator_and_num_class(args)

    channels, image_height, image_width = 3, args.height, args.width
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    # Iterations per epoch, total training iterations, validation iterations.
    one_epoch = data_iterator_train.size // batch_size
    max_iter = args.epoch * one_epoch
    val_iter = data_iterator_valid.size // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}

    pred_valid = construct_networks(args,
                                    image_valid,
                                    model,
                                    num_class,
                                    test=True)
    # Keep pred_valid's buffer alive so both loss and top-1 error can read it.
    pred_valid.persistent = True
    loss_valid = loss_function(pred_valid, label_valid)
    top_1e_valid = F.mean(F.top_n_error(pred_valid, label_valid))

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}

    pred_train = construct_networks(args,
                                    image_train,
                                    model,
                                    num_class,
                                    test=False)
    loss_train = loss_function(pred_train, label_train)
    top_1e_train = F.mean(F.top_n_error(pred_train, label_train))

    # prepare solvers
    solver = S.Momentum(initial_model_lr)
    solver.set_parameters(nn.get_parameters())

    # Training-loop
    for i in range(max_iter):
        image, label = data_iterator_train.next()
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        # Forward loss and error together in a single pass over the graph.
        nn.forward_all([loss_train, top_1e_train], clear_no_need_grad=True)

        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, top_1e_train.d.copy())

        if args.lr_control_model:
            # Scheduler-controlled learning-rate decay over the whole run.
            new_lr = learning_rate_scheduler(i, max_iter, initial_model_lr, 0)
            solver.set_learning_rate(new_lr)

        solver.zero_grad()
        loss_train.backward(clear_buffer=True)

        if args.with_grad_clip_model:
            # Clip each parameter gradient by norm before the update.
            for k, v in nn.get_parameters().items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        # update parameters
        solver.weight_decay(args.weight_decay_model)
        solver.update()

        if i % args.model_save_interval == 0:
            # Validation during training.
            ve = 0.
            vloss = 0.
            for j in range(val_iter):
                v_image, v_label = data_iterator_valid.next()
                input_image_valid["image"].d = v_image
                input_image_valid["label"].d = v_label
                nn.forward_all([loss_valid, top_1e_valid], clear_buffer=True)
                vloss += loss_valid.d.copy()
                ve += top_1e_valid.d.copy()

            ve /= val_iter
            vloss /= val_iter
            monitor_vloss.add(i, vloss)
            monitor_verr.add(i, ve)

            nn.save_parameters(
                os.path.join(args.model_save_path, 'params_{}.h5'.format(i)))

    # Final validation after the loop; `i` still holds the last loop index.
    ve = 0.
    vloss = 0.
    for j in range(val_iter):
        v_image, v_label = data_iterator_valid.next()
        input_image_valid["image"].d = v_image
        input_image_valid["label"].d = v_label
        nn.forward_all([loss_valid, top_1e_valid], clear_buffer=True)
        vloss += loss_valid.d.copy()
        ve += top_1e_valid.d.copy()

    ve /= val_iter
    vloss /= val_iter
    monitor_vloss.add(i, vloss)
    monitor_verr.add(i, ve)

    nn.save_parameters(
        os.path.join(args.model_save_path, 'params_{}.h5'.format(i)))

    return
Ejemplo n.º 18
0
def CNN_run(args, ops, alphas_dict):
    """
    Differentiable architecture search (DARTS-style) training loop.

    Alternates between updating model weights on the training split and
    updating architecture (alpha) parameters on the validation split. When
    ``args.second_order`` is set, the architecture gradient is corrected with
    a finite-difference approximation of the second-order term.

    Args:
        args: arguments set by user.
        ops: operations used in the network.
        alphas_dict: dictionary of architecture (alpha) parameter Variables.

    Returns:
        alphas_dict: the (updated) architecture parameter dictionary.
    """

    data_iterator = data_iterator_cifar10
    all_data = data_iterator(args.batch_size, True)
    # Split CIFAR-10 train set in half: first for weights, second for alphas.
    tdata = all_data.slice(rng=None, slice_start=0, slice_end=25000)
    vdata = all_data.slice(rng=None, slice_start=25000, slice_end=50000)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Validation loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=100)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train = construct_networks(args, ops, image_train, test=False)
    loss_train = loss_function(pred_train, label_train)

    # prepare solvers for model parameters
    # (everything whose name does not contain "alpha_")
    model_params_dict = \
        {k: v for k, v in nn.get_parameters().items() if "alpha_" not in k}
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(
        {
            k: v
            for k, v in nn.get_parameters().items()
            if k in model_params_dict.keys()
        },
        reset=False,
        retain_state=True)

    # prepare solvers for architecture parameters
    solver_archs = S.Adam(alpha=args.arch_lr, beta1=0.5, beta2=0.999)
    solver_archs.set_parameters(
        {
            k: v
            for k, v in nn.get_parameters().items() if k in alphas_dict.keys()
        },
        reset=False,
        retain_state=True)

    # Training-loop
    for i in range(max_iter):

        # Update Model Parameters.

        if args.second_order:
            # store the weights before update.
            original_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

            # gradients refuge
            accumulated_gradient = \
                {k: nn.Variable(v.shape).apply(d=0)
                 for k, v in alphas_dict.items()}

        image, label = tdata.next()
        # Normalize with the dataset statistics above.
        image = image / 255.0
        image = (image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_train.forward()

        e = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, e)

        if args.lr_control_model:
            new_lr = learning_rate_scheduler(i, max_iter, initial_model_lr, 0)
            solver_model.set_learning_rate(new_lr)

        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)

        if args.with_grad_clip_model:
            for k, v in model_params_dict.items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        solver_model.weight_decay(args.weight_decay_model)
        solver_model.update()  # weights update ( w -> w')

        if args.second_order:
            # Snapshot the just-updated weights w' for later restoration.
            updated_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

        # Update Architecture Parameters.

        ve, vloss = 0., 0.
        v_image, v_label = vdata.next()
        v_image = v_image / 255.0
        v_image = (v_image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = v_image
        input_image_train["label"].d = v_label
        # compute Loss_on_valid(w', alpha)
        loss_train.forward(clear_no_need_grad=True)

        ve = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_vloss.add(i, loss_train.d.copy())
        monitor_verr.add(i, ve)

        solver_archs.zero_grad()
        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)  # its gradient is stored

        if args.second_order:
            # First-order term of the alpha gradient.
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict,
                                                  coeff=1.)

            # grad_alpha_L_val(w', alpha).  Note that gradient stored into .data
            delta_gradient_w = {
                k: nn.Variable(v.shape).apply(data=nn.NdArray(
                    v.shape).copy_from(v.grad),
                                              need_grad=True)
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

            # Finite-difference step size, scaled by the gradient norm.
            epsilon = 0.01 / np.sum(
                [np.linalg.norm(v.d) for v in delta_gradient_w.values()])

            coeff = 1.0 * epsilon
            # w -> w+ (= w + epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            input_image_train["image"].d = image  # reuse the same data
            input_image_train["label"].d = label

            # compute Loss_on_train(w+, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient
            coeff = (-1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict, coeff)

            coeff = -1.0 * epsilon
            # w -> w- (= w - epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            # compute Loss_on_train(w-, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient again
            coeff = (+1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict, coeff)

            # replace the weights: alphas get the accumulated gradient,
            # model weights are restored to w'.
            for k, v in alphas_dict.items():
                nn.parameter.set_parameter(
                    k,
                    nn.Variable(v.shape).apply(data=v.data,
                                               grad=accumulated_gradient[k],
                                               need_grad=True))
            for k, v in model_params_dict.items():
                nn.parameter.set_parameter(
                    k,
                    nn.Variable(v.shape).apply(data=updated_weights[k].data,
                                               need_grad=True))

        solver_archs.weight_decay(args.weight_decay_archs)
        solver_archs.update()

        if i % 1000 == 0:
            # Periodically print the current architecture distribution.
            for k, v in alphas_dict.items():
                keynames = k.split("_")
                print("\nParameters for {} cell, node {} to {};".format(
                    keynames[1], keynames[2], keynames[3]))
                show_ops_and_prob(v.d, ops)

    return alphas_dict
Ejemplo n.º 19
0
Archivo: exp074.py Proyecto: kzky/works
def main(args):
    """
    Train cnn_model_003 on CIFAR-10 with a supervised objective plus a
    stochastic-regularization (uncertainty-weighted) unsupervised objective,
    evaluating on the test set once per epoch.

    Args:
        args: parsed command-line arguments (device_id, batch_size,
            batch_size_eval, context, lambda_).
    """
    # Settings
    device_id = args.device_id
    batch_size = args.batch_size
    batch_size_eval = args.batch_size_eval
    n_l_train_data = 4000
    n_train_data = 50000
    n_cls = 10
    learning_rate = 1. * 1e-3
    n_epoch = 300
    # Integer division is required: with `/` these are floats on Python 3
    # and `range(n_iter)` below raises TypeError.
    iter_epoch = n_train_data // batch_size
    n_iter = n_epoch * iter_epoch
    extension_module = args.context
    lambda_ = args.lambda_

    # Model
    ## supervised
    batch_size, m, h, w = batch_size, 3, 32, 32
    ctx = extension_context(extension_module, device_id=device_id)
    x_l = nn.Variable((batch_size, m, h, w))
    y_l = nn.Variable((batch_size, 1))
    pred, log_var = cnn_model_003(ctx, x_l)
    one = F.constant(1., log_var.shape)
    loss_ce = ce_loss(ctx, pred, y_l)
    reg_sigma = sigma_regularization(ctx, log_var, one)
    loss_supervised = loss_ce + er_loss(ctx, pred) + lambda_ * reg_sigma

    ## stochastic regularization: two augmented views of unlabeled data
    x_u0 = nn.Variable((batch_size, m, h, w))
    x_u1 = nn.Variable((batch_size, m, h, w))
    pred_x_u0, log_var0 = cnn_model_003(ctx, x_u0)
    pred_x_u1, log_var1 = cnn_model_003(ctx, x_u1)
    loss_sr = sr_loss_with_uncertainty(ctx,
                                       pred_x_u0, pred_x_u1, log_var0, log_var1)
    reg_sigma0 = sigma_regularization(ctx, log_var0, one)
    reg_sigma1 = sigma_regularization(ctx, log_var1, one)
    reg_sigmas = sigmas_regularization(ctx, log_var0, log_var1)
    loss_unsupervised = loss_sr + er_loss(ctx, pred_x_u0) + er_loss(ctx, pred_x_u1) \
                        + lambda_ * (reg_sigma0 + reg_sigma1) + lambda_ * reg_sigmas
    ## evaluate
    batch_size_eval, m, h, w = batch_size, 3, 32, 32
    x_eval = nn.Variable((batch_size_eval, m, h, w))
    pred_eval, _ = cnn_model_003(ctx, x_eval, test=True)

    # Solver
    with nn.context_scope(ctx):
        solver = S.Momentum(learning_rate)
        solver.set_parameters(nn.get_parameters())

    # Dataset
    ## separate dataset into labeled/unlabeled splits
    home = os.environ.get("HOME")
    fpath = os.path.join(home, "datasets/cifar10/cifar-10.npz")
    separator = Separator(n_l_train_data)
    separator.separate_then_save(fpath)

    l_train_path = os.path.join(home, "datasets/cifar10/l_cifar-10.npz")
    u_train_path = os.path.join(home, "datasets/cifar10/cifar-10.npz")
    test_path = os.path.join(home, "datasets/cifar10/cifar-10.npz")

    # data reader
    data_reader = Cifar10DataReader(l_train_path, u_train_path, test_path,
                                  batch_size=batch_size,
                                  n_cls=n_cls,
                                  da=True,
                                  shape=True)

    # Training loop
    print("# Training loop")
    epoch = 1
    st = time.time()
    for i in range(n_iter):
        # Get data and set it to the variables
        x_l0_data, x_l1_data, y_l_data = data_reader.get_l_train_batch()
        x_u0_data, x_u1_data, y_u_data = data_reader.get_u_train_batch()

        x_l.d, _, y_l.d = x_l0_data, x_l1_data, y_l_data
        x_u0.d, x_u1.d = x_u0_data, x_u1_data

        # Train: accumulate gradients of both objectives, then update once.
        loss_supervised.forward(clear_no_need_grad=True)
        loss_unsupervised.forward(clear_no_need_grad=True)
        solver.zero_grad()
        loss_supervised.backward(clear_buffer=True)
        loss_unsupervised.backward(clear_buffer=True)
        solver.update()

        # Evaluate once per epoch
        if (i+1) % iter_epoch == 0:
            # Get data and set it to the variables
            x_data, y_data = data_reader.get_test_batch()

            # Evaluation loop
            ve = 0.
            iter_val = 0
            for k in range(0, len(x_data), batch_size_eval):
                x_eval.d = get_test_data(x_data, k, batch_size_eval)
                label = get_test_data(y_data, k, batch_size_eval)
                pred_eval.forward(clear_buffer=True)
                ve += categorical_error(pred_eval.d, label)
                iter_val += 1
            msg = "Epoch:{},ElapsedTime:{},Acc:{:02f}".format(
                epoch,
                time.time() - st,
                (1. - ve / iter_val) * 100)
            print(msg)
            st = time.time()
            epoch += 1
            # Step LR decay at epochs 100 and 200.
            if epoch in [100, 200]:
                learning_rate /= 10.
                solver.set_learning_rate(learning_rate)
Ejemplo n.º 20
0
                                       with_file_cache=False)

# Build the LSTM language-model graph.
# NOTE(review): `batch_size`, `sentence_length`, `vocab_size`,
# `embedding_size` and `hidden` are defined earlier in the (truncated) script.
x = nn.Variable((batch_size, sentence_length))
t = nn.Variable((batch_size, sentence_length, 1))
h = PF.embed(x, vocab_size, embedding_size)
h = LSTM(h, hidden, return_sequences=True)
h = TimeDistributed(PF.affine)(h, hidden, name='hidden')
y = TimeDistributed(PF.affine)(h, vocab_size, name='output')

# Mask out padding positions (token id 0) so they contribute no loss.
mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask
# Per-sentence mean cross-entropy over non-pad tokens, averaged over batch.
count = F.sum(mask, axis=1)
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())

# Create monitor.
from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
monitor = Monitor('./tmp-lstmlm')
monitor_perplexity = MonitorSeries('perplexity', monitor, interval=1)
monitor_perplexity_valid = MonitorSeries('perplexity_valid',
                                         monitor,
                                         interval=1)

for epoch in range(max_epoch):
    train_loss_set = []
    for i in tqdm(range(num_train_batch)):
        x_batch, y_batch = train_data_iter.next()
        y_batch = y_batch.reshape(list(y_batch.shape) + [1])
Ejemplo n.º 21
0
def train():
    """
    Main script.

    Builds training and validation graphs for (Tiny) ImageNet
    classification with a ResNet model, then runs the training loop with
    gradient accumulation, periodic validation, parameter snapshots, and
    scheduled learning-rate decay.

    Side effects: writes parameter snapshots (``param_*.h5``) to
    ``args.model_save_path`` and monitor logs to ``args.monitor_path``.
    """

    args = get_args()

    # Get context.
    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = get_extension_context(extension_module,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    if args.tiny_mode:
        # We use Tiny ImageNet from Stanford CS231N class.
        # (Tiny ImageNet, https://tiny-imagenet.herokuapp.com/)
        # Tiny ImageNet consists of 200 categories, each category has 500 images
        # in training set. The image size is 64x64. To adapt ResNet into 64x64
        # image inputs, the input image size of ResNet is set as 56x56, and
        # the stride in the first conv and the first max pooling are removed.
        # Please check README.
        data = data_iterator_tiny_imagenet(args.batch_size, 'train')
        vdata = data_iterator_tiny_imagenet(args.batch_size, 'val')
        num_classes = 200
    else:
        # We use ImageNet.
        # (ImageNet, https://imagenet.herokuapp.com/)
        # ImageNet consists of 1000 categories, each category has 1280 images
        # in training set. The image size is various. To adapt ResNet into
        # 320x320 image inputs, the input image size of ResNet is set as
        # 224x224. We need to get tar file and create cache file(320x320 images).
        # Please check README.
        data = data_iterator_imagenet(args.batch_size,
                                      args.train_cachefile_dir)
        vdata = data_iterator_imagenet(args.batch_size, args.val_cachefile_dir)
        num_classes = 1000

    # Training graph, plus an unlinked head computing top-n error so the
    # error metric can be evaluated without re-running the backbone.
    t_model = get_model(args, num_classes, test=False, tiny=args.tiny_mode)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    t_pred2 = t_model.pred.unlinked()
    t_e = F.mean(F.top_n_error(t_pred2, t_model.label))
    # Validation graph (test mode: e.g. batch-norm in inference mode).
    v_model = get_model(args, num_classes, test=True, tiny=args.tiny_mode)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.unlinked()
    v_e = F.mean(F.top_n_error(v_pred2, v_model.label))

    # Create Solver.
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=10)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed("Validation time",
                                         monitor,
                                         interval=10)

    def _accumulate_error(loss_sum, err_sum, t_model, t_e):
        # Read host-side loss/error values (forces device->host sync);
        # defined once here instead of inside the training loop.
        loss_sum += t_model.loss.d
        err_sum += t_e.d
        return loss_sum, err_sum

    # Training loop.
    for i in range(args.max_iter):
        # Save parameters
        if i % args.model_save_interval == 0:
            nn.save_parameters(
                os.path.join(args.model_save_path, 'param_%06d.h5' % i))

        # Validation
        if i % args.val_interval == 0 and i != 0:

            # Clear all intermediate memory to save memory.
            # t_model.loss.clear_recursive()

            loss_sum = 0.0
            err_sum = 0.0
            for j in range(args.val_iter):
                images, labels = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                # Cast inputs to compact dtypes on the device to save memory.
                v_model.image.data.cast(np.uint8, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                loss_sum += v_model.loss.d
                err_sum += v_e.d
            monitor_vloss.add(i, loss_sum / args.val_iter)
            monitor_verr.add(i, err_sum / args.val_iter)
            monitor_vtime.add(i)

            # Clear all intermediate memory to save memory.
            # v_model.loss.clear_recursive()

        # Training
        loss_sum = 0.0
        err_sum = 0.0
        solver.zero_grad()

        # Gradient accumulation loop: gradients from args.accum_grad
        # mini-batches are summed before a single solver update.
        for j in range(args.accum_grad):
            images, labels = data.next()
            if j != 0:
                # Update err_sum and loss_sum according to previous results
                # of forward propagation.
                # The update of last iteration is performed
                # after solver update to avoid unnecessary CUDA synchronization.
                # This is performed after data.next() in order to overlap
                # the data loading and graph execution.
                # TODO: Move this to the bottom of the loop when prefetch
                # data loader is available.
                loss_sum, err_sum = _accumulate_error(
                    loss_sum, err_sum, t_model, t_e)
            t_model.image.d = images
            t_model.label.d = labels
            t_model.image.data.cast(np.uint8, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)

        solver.weight_decay(args.weight_decay)
        solver.update()

        # Accumulate errors after solver update
        loss_sum, err_sum = _accumulate_error(loss_sum, err_sum, t_model, t_e)

        monitor_loss.add(i, loss_sum / args.accum_grad)
        monitor_err.add(i, err_sum / args.accum_grad)
        monitor_time.add(i)

        # Learning rate decay at scheduled iter
        if i in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)

    # Final snapshot after the last iteration.
    nn.save_parameters(
        os.path.join(args.model_save_path, 'param_%06d.h5' % args.max_iter))