Example #1
def train():
    """
    Main script.

    Steps:

    * Parse command line arguments.
    * Specify a context for computation.
    * Initialize DataIterator for CIFAR10.
    * Construct a computation graph for training and validation.
    * Initialize a solver and set parameter variables to it.
    * Training loop
      * Compute error rate for validation data (periodically).
      * Get the next minibatch.
      * Execute forward propagation on the training graph.
      * Compute the training error.
      * Zero the parameter gradients.
      * Execute backprop.
      * The solver updates parameters using the gradients computed by backprop.
    """

    # define training parameters
    augmented_shift = True
    augmented_flip = True
    batch_size = 128
    vbatch_size = 100
    num_classes = 10
    weight_decay = 0.0002
    momentum = 0.9
    learning_rates = (cfg.initial_learning_rate,)*80 + \
        (cfg.initial_learning_rate / 10.,)*40 + \
        (cfg.initial_learning_rate / 100.,)*40
    print('lr={}'.format(learning_rates))
    print('weight_decay={}'.format(weight_decay))
    print('momentum={}'.format(momentum))

    # Create nnabla context.
    from nnabla.ext_utils import get_extension_context
    ctx = get_extension_context('cudnn', device_id=args.gpu)
    nn.set_default_context(ctx)

    # Initialize DataIterator for CIFAR10.
    logger.info("Get CIFAR10 Data ...")
    data = cifar_data.DataIterator(batch_size,
                                   augmented_shift=augmented_shift,
                                   augmented_flip=augmented_flip)
    vdata = cifar_data.DataIterator(vbatch_size, val=True)

    if cfg.weightfile is not None:
        logger.info(f"Loading weights from {cfg.weightfile}")
        nn.load_parameters(cfg.weightfile)

    # TRAIN
    # Create input variables.
    image = nn.Variable([batch_size, 3, 32, 32])
    label = nn.Variable([batch_size, 1])

    # Create prediction graph.
    pred, hidden = resnet_cifar10(image,
                                  num_classes=num_classes,
                                  cfg=cfg,
                                  test=False)
    pred.persistent = True

    # Compute initial network size
    num_weights, kbytes_weights = network_size_weights()
    kbytes_weights.forward()
    print(f"Initial network size (weights) is {float(kbytes_weights.d):.3f}KB "
          f"(total number of weights: {int(num_weights):d}).")

    num_activations, kbytes_activations = network_size_activations()
    kbytes_activations.forward()
    print(
        f"Initial network size (activations) is {float(kbytes_activations.d):.3f}KB "
        f"(total number of activations: {int(num_activations):d}).")

    # Create loss function.
    cost_lambda2 = nn.Variable(())
    cost_lambda2.d = cfg.initial_cost_lambda2
    cost_lambda2.persistent = True
    cost_lambda3 = nn.Variable(())
    cost_lambda3.d = cfg.initial_cost_lambda3
    cost_lambda3.persistent = True

    loss1 = F.mean(F.softmax_cross_entropy(pred, label))
    loss1.persistent = True

    if cfg.target_weight_kbytes > 0:
        loss2 = F.relu(kbytes_weights - cfg.target_weight_kbytes)**2
        loss2.persistent = True
    else:
        loss2 = nn.Variable(())
        loss2.d = 0
        loss2.persistent = True
    if cfg.target_activation_kbytes > 0:
        loss3 = F.relu(kbytes_activations - cfg.target_activation_kbytes)**2
        loss3.persistent = True
    else:
        loss3 = nn.Variable(())
        loss3.d = 0
        loss3.persistent = True

    loss = loss1 + cost_lambda2 * loss2 + cost_lambda3 * loss3

    # VALID
    # Create input variables.
    vimage = nn.Variable([vbatch_size, 3, 32, 32])
    vlabel = nn.Variable([vbatch_size, 1])
    # Create prediction graph.
    vpred, vhidden = resnet_cifar10(vimage,
                                    num_classes=num_classes,
                                    cfg=cfg,
                                    test=True)
    vpred.persistent = True

    # Create Solver.
    if cfg.optimizer == "adam":
        solver = S.Adam(alpha=learning_rates[0])
    else:
        solver = S.Momentum(learning_rates[0], momentum)

    solver.set_parameters(nn.get_parameters())

    # Training loop (epochs)
    logger.info("Start Training ...")
    i = 0
    best_v_err = 1.0

    # logs of the results
    iters = []
    res_train_err = []
    res_train_loss = []
    res_val_err = []

    # print all variables that exist
    for k in nn.get_parameters():
        print(k)

    res_n_b = collections.OrderedDict()
    res_n_w = collections.OrderedDict()
    res_n_a = collections.OrderedDict()
    res_d_b = collections.OrderedDict()
    res_d_w = collections.OrderedDict()
    res_d_a = collections.OrderedDict()
    res_xmin_b = collections.OrderedDict()
    res_xmin_w = collections.OrderedDict()
    res_xmin_a = collections.OrderedDict()
    res_xmax_b = collections.OrderedDict()
    res_xmax_w = collections.OrderedDict()
    res_xmax_a = collections.OrderedDict()

    # Register one log list per parametric quantization value, keyed by
    # parameter kind ('n', 'd', 'xmin', 'xmax') and quantizer scope
    # ('bquant', 'Wquant', 'Aquant').
    res_logs = {
        ('n', 'bquant'): res_n_b, ('n', 'Wquant'): res_n_w,
        ('n', 'Aquant'): res_n_a,
        ('d', 'bquant'): res_d_b, ('d', 'Wquant'): res_d_w,
        ('d', 'Aquant'): res_d_a,
        ('xmin', 'bquant'): res_xmin_b, ('xmin', 'Wquant'): res_xmin_w,
        ('xmin', 'Aquant'): res_xmin_a,
        ('xmax', 'bquant'): res_xmax_b, ('xmax', 'Wquant'): res_xmax_w,
        ('xmax', 'Aquant'): res_xmax_a,
    }
    for k in nn.get_parameters():
        parts = k.split('/')
        if len(parts) >= 3 and (parts[-1], parts[-3]) in res_logs:
            res_logs[(parts[-1], parts[-3])][k] = []

    for epoch in range(len(learning_rates)):
        train_loss = list()
        train_loss1 = list()
        train_loss2 = list()
        train_loss3 = list()
        train_err = list()

        # check whether we need to adapt the learning rate
        if epoch > 0 and learning_rates[epoch - 1] != learning_rates[epoch]:
            solver.set_learning_rate(learning_rates[epoch])

        # Training loop (iterations)
        start_epoch = True
        while data.current != 0 or start_epoch:
            start_epoch = False
            # Next batch
            image.d, label.d = data.next()

            # Training forward/backward
            solver.zero_grad()

            loss.forward()
            loss.backward()

            if weight_decay is not None:
                solver.weight_decay(weight_decay)

            # clip gradients of the quantization parameters
            if cfg.target_weight_kbytes > 0 or cfg.target_activation_kbytes > 0:
                clip_quant_grads()

            solver.update()
            e = categorical_error(pred.d, label.d)
            train_loss += [loss.d.copy()]
            train_loss1 += [loss1.d.copy()]
            train_loss2 += [loss2.d.copy()]
            train_loss3 += [loss3.d.copy()]
            train_err += [e]

            # clip parametric quantization values back into their valid range
            clip_quant_vals()

            # Intermediate validation (when the weight-size constraint is set
            # and all set constraints are fulfilled)
            kbytes_weights.forward()
            kbytes_activations.forward()
            if (cfg.target_weight_kbytes > 0
                    and float(kbytes_weights.d) <= cfg.target_weight_kbytes
                    and (cfg.target_activation_kbytes <= 0
                         or float(kbytes_activations.d)
                         <= cfg.target_activation_kbytes)):

                ve = list()
                start_epoch_ = True
                while vdata.current != 0 or start_epoch_:
                    start_epoch_ = False
                    vimage.d, vlabel.d = vdata.next()
                    vpred.forward()
                    ve += [categorical_error(vpred.d, vlabel.d)]

                v_err = np.array(ve).mean()
                if v_err < best_v_err:
                    best_v_err = v_err
                    nn.save_parameters(
                        os.path.join(cfg.params_dir, 'params_best.h5'))
                    print(f'Best validation error (fulfilling constraints): '
                          f'{best_v_err}')
                    sys.stdout.flush()
                    sys.stderr.flush()

            i += 1

        # Validation
        ve = list()
        start_epoch = True
        while vdata.current != 0 or start_epoch:
            start_epoch = False
            vimage.d, vlabel.d = vdata.next()
            vpred.forward()
            ve += [categorical_error(vpred.d, vlabel.d)]

        v_err = np.array(ve).mean()
        kbytes_weights.forward()
        kbytes_activations.forward()
        if ((v_err < best_v_err
             and (cfg.target_weight_kbytes <= 0
                  or float(kbytes_weights.d) <= cfg.target_weight_kbytes) and
             (cfg.target_activation_kbytes <= 0 or
              float(kbytes_activations.d) <= cfg.target_activation_kbytes))):
            best_v_err = v_err
            nn.save_parameters(os.path.join(cfg.params_dir, 'params_best.h5'))
            sys.stdout.flush()
            sys.stderr.flush()

        if cfg.target_weight_kbytes > 0:
            print(
                f"Current network size (weights) is {float(kbytes_weights.d):.3f}KB "
                f"(#params: {int(num_weights)}, "
                f"avg. bitwidth: {8. * 1024. * kbytes_weights.d / num_weights})"
            )
            sys.stdout.flush()
            sys.stderr.flush()
        if cfg.target_activation_kbytes > 0:
            print(
                f"Current network size (activations) is {float(kbytes_activations.d):.3f}KB"
            )
            sys.stdout.flush()
            sys.stderr.flush()

        # Print and log all parametric quantization values and gradients.
        params = nn.get_parameters()
        for k, v in params.items():
            parts = k.split('/')
            if parts[-1] in ('n', 'd', 'xmin', 'xmax'):
                print(k, v.d, v.g)
                sys.stdout.flush()
                sys.stderr.flush()
                if len(parts) >= 3 and (parts[-1], parts[-3]) in res_logs:
                    res_logs[(parts[-1], parts[-3])][k].append(v.d.item())

        # Print
        logger.info(f'epoch={epoch}(iter={i}); '
                    f'overall cost={np.array(train_loss).mean()}; '
                    f'cross-entropy cost={np.array(train_loss1).mean()}; '
                    f'weight-size cost={np.array(train_loss2).mean()}; '
                    f'activations-size cost={np.array(train_loss3).mean()}; '
                    f'TrainErr={np.array(train_err).mean()}; '
                    f'ValidErr={v_err}; BestValidErr={best_v_err}')
        sys.stdout.flush()
        sys.stderr.flush()

        # update the logs
        iters.append(i)
        res_train_err.append(np.array(train_err).mean())
        res_train_loss.append([
            np.array(train_loss).mean(),
            np.array(train_loss1).mean(),
            np.array(train_loss2).mean(),
            np.array(train_loss3).mean()
        ])
        res_val_err.append(v_err)
        res_ges = np.concatenate([
            np.array(iters)[:, np.newaxis],
            np.array(res_train_err)[:, np.newaxis],
            np.array(res_val_err)[:, np.newaxis],
            np.array(res_train_loss)
        ], axis=-1)

        # save the results
        np.savetxt(cfg.params_dir + '/results.csv',
                   np.array(res_ges),
                   fmt='%10.8f',
                   header='iter,train_err,val_err,loss,loss1,loss2,loss3',
                   comments='',
                   delimiter=',')

        for rs, res in zip([
                'res_n_b.csv', 'res_n_w.csv', 'res_n_a.csv', 'res_d_b.csv',
                'res_d_w.csv', 'res_d_a.csv', 'res_min_b.csv', 'res_min_w.csv',
                'res_min_a.csv', 'res_max_b.csv', 'res_max_w.csv',
                'res_max_a.csv'
        ], [
                res_n_b, res_n_w, res_n_a, res_d_b, res_d_w, res_d_a,
                res_xmin_b, res_xmin_w, res_xmin_a, res_xmax_b, res_xmax_w,
                res_xmax_a
        ]):
            res_mat = np.array([res[key] for key in res])
            if res_mat.ndim == 2 and res_mat.shape[0] > 1 and res_mat.shape[1] > 1:
                np.savetxt(
                    cfg.params_dir + '/' + rs,
                    np.array([[r, c, res_mat[r, c]] for r, c in product(
                        range(res_mat.shape[0]), range(res_mat.shape[1]))]),
                    fmt='%10.8f',
                    comments='',
                    delimiter=',')
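
A note on helpers: categorical_error, clip_quant_grads, and clip_quant_vals are defined elsewhere in this repository. For reference, here is a minimal sketch of what categorical_error typically computes in the nnabla examples (an assumption, not necessarily this repository's exact code; pred holds per-class scores, label holds integer class indices):

def categorical_error(pred, label):
    # Fraction of samples whose arg-max class differs from the label.
    pred_label = pred.argmax(1)
    return (pred_label != label.flat).mean()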
Example #2
def _create_optimizer(ctx, o, networks, datasets):
    class Optimizer:
        pass

    optimizer = Optimizer()

    optimizer.comm = current_communicator()
    comm_size = optimizer.comm.size if optimizer.comm else 1
    optimizer.start_iter = ((o.start_iter - 1) // comm_size + 1
                            if o.start_iter > 0 else 0)
    optimizer.end_iter = ((o.end_iter - 1) // comm_size + 1
                          if o.end_iter > 0 else 0)
    optimizer.name = o.name
    optimizer.order = o.order
    optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1
    optimizer.network = networks[o.network_name]
    optimizer.data_iterators = OrderedDict()
    for d in o.dataset_name:
        optimizer.data_iterators[d] = datasets[d].data_iterator

    optimizer.dataset_assign = OrderedDict()
    for d in o.data_variable:
        optimizer.dataset_assign[optimizer.network.variables[
            d.variable_name]] = d.data_name

    optimizer.generator_assign = OrderedDict()
    for g in o.generator_variable:
        optimizer.generator_assign[optimizer.network.variables[
            g.variable_name]] = _get_generator(g)

    optimizer.loss_variables = []
    for l in o.loss_variable:
        optimizer.loss_variables.append(
            optimizer.network.variables[l.variable_name])

    optimizer.parameter_learning_rate_multipliers = OrderedDict()
    for p in o.parameter_variable:
        param_variable_names = _get_matching_variable_names(
            p.variable_name, optimizer.network.variables.keys())
        for v_name in param_variable_names:
            optimizer.parameter_learning_rate_multipliers[
                optimizer.network.variables[v_name]] = p.learning_rate_multiplier

    with nn.context_scope(ctx):
        if o.solver.type == 'Adagrad':
            optimizer.solver = S.Adagrad(o.solver.adagrad_param.lr,
                                         o.solver.adagrad_param.eps)
            init_lr = o.solver.adagrad_param.lr
        elif o.solver.type == 'Adadelta':
            optimizer.solver = S.Adadelta(o.solver.adadelta_param.lr,
                                          o.solver.adadelta_param.decay,
                                          o.solver.adadelta_param.eps)
            init_lr = o.solver.adadelta_param.lr
        elif o.solver.type == 'Adam':
            optimizer.solver = S.Adam(o.solver.adam_param.alpha,
                                      o.solver.adam_param.beta1,
                                      o.solver.adam_param.beta2,
                                      o.solver.adam_param.eps)
            init_lr = o.solver.adam_param.alpha
        elif o.solver.type == 'Adamax':
            optimizer.solver = S.Adamax(o.solver.adamax_param.alpha,
                                        o.solver.adamax_param.beta1,
                                        o.solver.adamax_param.beta2,
                                        o.solver.adamax_param.eps)
            init_lr = o.solver.adamax_param.alpha
        elif o.solver.type == 'AdaBound':
            optimizer.solver = S.AdaBound(o.solver.adabound_param.alpha,
                                          o.solver.adabound_param.beta1,
                                          o.solver.adabound_param.beta2,
                                          o.solver.adabound_param.eps,
                                          o.solver.adabound_param.final_lr,
                                          o.solver.adabound_param.gamma)
            init_lr = o.solver.adabound_param.alpha
        elif o.solver.type == 'AMSGRAD':
            optimizer.solver = S.AMSGRAD(o.solver.amsgrad_param.alpha,
                                         o.solver.amsgrad_param.beta1,
                                         o.solver.amsgrad_param.beta2,
                                         o.solver.amsgrad_param.eps)
            init_lr = o.solver.amsgrad_param.alpha
        elif o.solver.type == 'AMSBound':
            optimizer.solver = S.AMSBound(o.solver.amsbound_param.alpha,
                                          o.solver.amsbound_param.beta1,
                                          o.solver.amsbound_param.beta2,
                                          o.solver.amsbound_param.eps,
                                          o.solver.amsbound_param.final_lr,
                                          o.solver.amsbound_param.gamma)
            init_lr = o.solver.amsbound_param.alpha
        elif o.solver.type == 'Eve':
            p = o.solver.eve_param
            optimizer.solver = S.Eve(p.alpha, p.beta1, p.beta2, p.beta3, p.k,
                                     p.k2, p.eps)
            init_lr = p.alpha
        elif o.solver.type == 'Momentum':
            optimizer.solver = S.Momentum(o.solver.momentum_param.lr,
                                          o.solver.momentum_param.momentum)
            init_lr = o.solver.momentum_param.lr
        elif o.solver.type == 'Nesterov':
            optimizer.solver = S.Nesterov(o.solver.nesterov_param.lr,
                                          o.solver.nesterov_param.momentum)
            init_lr = o.solver.nesterov_param.lr
        elif o.solver.type == 'RMSprop':
            optimizer.solver = S.RMSprop(o.solver.rmsprop_param.lr,
                                         o.solver.rmsprop_param.decay,
                                         o.solver.rmsprop_param.eps)
            init_lr = o.solver.rmsprop_param.lr
        elif o.solver.type == 'Sgd' or o.solver.type == 'SGD':
            optimizer.solver = S.Sgd(o.solver.sgd_param.lr)
            init_lr = o.solver.sgd_param.lr
        else:
            raise ValueError('Solver "' + o.solver.type +
                             '" is not supported.')

    parameters = {
        v.name: v.variable_instance
        for v, local_lr in
        optimizer.parameter_learning_rate_multipliers.items() if local_lr > 0.0
    }
    optimizer.solver.set_parameters(parameters)
    optimizer.parameters = OrderedDict(
        sorted(parameters.items(), key=lambda x: x[0]))

    optimizer.weight_decay = o.solver.weight_decay

    # Keep the following two lines for backward compatibility.
    optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0
    optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1
    optimizer.solver.set_states_from_protobuf(o)

    # Default scheduler: constant learning rate unless overridden below.
    optimizer.scheduler = ExponentialScheduler(init_lr, 1.0, 1)

    if o.solver.lr_scheduler_type == 'Polynomial':
        if o.solver.polynomial_scheduler_param.power != 0.0:
            optimizer.scheduler = PolynomialScheduler(
                init_lr,
                o.solver.polynomial_scheduler_param.max_iter // comm_size,
                o.solver.polynomial_scheduler_param.power)
    elif o.solver.lr_scheduler_type == 'Cosine':
        optimizer.scheduler = CosineScheduler(
            init_lr, o.solver.cosine_scheduler_param.max_iter // comm_size)
    elif o.solver.lr_scheduler_type == 'Exponential':
        if o.solver.exponential_scheduler_param.gamma != 1.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr, o.solver.exponential_scheduler_param.gamma,
                o.solver.exponential_scheduler_param.iter_interval //
                comm_size if
                o.solver.exponential_scheduler_param.iter_interval > comm_size
                else 1)
    elif o.solver.lr_scheduler_type == 'Step':
        if o.solver.step_scheduler_param.gamma != 1.0 and len(
                o.solver.step_scheduler_param.iter_steps) > 0:
            optimizer.scheduler = StepScheduler(
                init_lr, o.solver.step_scheduler_param.gamma, [
                    step // comm_size
                    for step in o.solver.step_scheduler_param.iter_steps
                ])
    elif o.solver.lr_scheduler_type == 'Custom':
        # ToDo
        raise NotImplementedError()
    elif o.solver.lr_scheduler_type == '':
        if o.solver.lr_decay_interval != 0 or o.solver.lr_decay != 0.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr, o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0,
                o.solver.lr_decay_interval //
                comm_size if o.solver.lr_decay_interval > comm_size else 1)
    else:
        raise ValueError('Learning Rate Scheduler "' +
                         o.solver.lr_scheduler_type + '" is not supported.')

    if o.solver.lr_warmup_scheduler_type == 'Linear':
        if o.solver.linear_warmup_scheduler_param.warmup_iter >= comm_size:
            optimizer.scheduler = LinearWarmupScheduler(
                optimizer.scheduler,
                o.solver.linear_warmup_scheduler_param.warmup_iter //
                comm_size)

    optimizer.forward_sequence = optimizer.network.get_forward_sequence(
        optimizer.loss_variables)
    optimizer.backward_sequence = optimizer.network.get_backward_sequence(
        optimizer.loss_variables,
        optimizer.parameter_learning_rate_multipliers)

    return optimizer
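
The scheduler attached to the returned optimizer is consumed by the training loop elsewhere. A minimal usage sketch, assuming nnabla's learning-rate scheduler interface (get_learning_rate(iter)); the helper name update_learning_rate is hypothetical:

def update_learning_rate(optimizer, iteration):
    # Query the scheduled learning rate for this iteration and push it
    # into the solver before the next parameter update.
    lr = optimizer.scheduler.get_learning_rate(iteration)
    optimizer.solver.set_learning_rate(lr)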
Example #3
def main():
    """
    Main script.

    Steps:
    * Get and set context.
    * Load dataset.
    * Initialize DataIterator.
    * Create networks:
      * Net for labeled data
      * Net for unlabeled data
      * Net for test data
    * Create solver.
    * Training loop:
      * Test
      * Training
        * by labeled data: calculate cross-entropy loss
        * by unlabeled data: estimate the adversarial direction,
          then calculate the LDS loss
    """

    args = get_args()

    # Get context.
    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = get_extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    shape_x = (1, 28, 28)
    n_h = args.n_units
    n_y = args.n_class

    # Load the MNIST dataset
    from mnist_data import MnistDataSource
    with MnistDataSource(train=True) as d:
        x_t = d.images
        t_t = d.labels
    with MnistDataSource(train=False) as d:
        x_v = d.images
        t_v = d.labels
    x_t = np.array(x_t / 256.0).astype(np.float32)
    x_t, t_t = x_t[:args.n_train], t_t[:args.n_train]
    x_v, t_v = x_v[:args.n_valid], t_v[:args.n_valid]

    # Create Semi-supervised Datasets
    x_l, t_l, x_u, _ = split_dataset(x_t, t_t, args.n_labeled, args.n_class)
    x_u = np.r_[x_l, x_u]
    x_v = np.array(x_v / 256.0).astype(np.float32)

    # Create DataIterators for datasets of labeled, unlabeled and validation
    di_l = DataIterator(args.batchsize_l, [x_l, t_l])
    di_u = DataIterator(args.batchsize_u, [x_u])
    di_v = DataIterator(args.batchsize_v, [x_v, t_v])

    # Create networks
    # feed-forward-net building function
    def forward(x, test=False):
        return mlp_net(x, n_h, n_y, test)

    # Net for learning labeled data
    xl = nn.Variable((args.batchsize_l,) + shape_x, need_grad=False)
    hl = forward(xl, test=False)
    tl = nn.Variable((args.batchsize_l, 1), need_grad=False)
    loss_l = F.mean(F.softmax_cross_entropy(hl, tl))

    # Net for learning unlabeled data
    xu = nn.Variable((args.batchsize_u,) + shape_x, need_grad=False)
    r = nn.Variable((args.batchsize_u,) + shape_x, need_grad=True)
    eps = nn.Variable((args.batchsize_u,) + shape_x, need_grad=False)
    loss_u, yu = vat(xu, r, eps, forward, distance)

    # Net for evaluating validation data
    xv = nn.Variable((args.batchsize_v,) + shape_x, need_grad=False)
    hv = forward(xv, test=True)
    tv = nn.Variable((args.batchsize_v, 1), need_grad=False)

    # Create solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitor training and validation stats.
    import nnabla.monitor as M
    monitor = M.Monitor(args.model_save_path)
    monitor_verr = M.MonitorSeries("Test error", monitor, interval=240)
    monitor_time = M.MonitorTimeElapsed("Elapsed time", monitor, interval=240)

    # Training Loop.
    t0 = time.time()

    for i in range(args.max_iter):

        # Validation Test
        if i % args.val_interval == 0:
            n_error = calc_validation_error(
                di_v, xv, tv, hv, args.val_iter)
            monitor_verr.add(i, n_error)

        #################################
        ## Training by Labeled Data #####
        #################################

        # input minibatch of labeled data into variables
        xl.d, tl.d = di_l.next()

        # initialize gradients
        solver.zero_grad()

        # forward, backward and update
        loss_l.forward(clear_no_need_grad=True)
        loss_l.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        #################################
        ## Training by Unlabeled Data ###
        #################################

        # input minibatch of unlabeled data into variables
        xu.d, = di_u.next()

        ##### Calculate Adversarial Noise #####

        # Sample random noise
        n = np.random.normal(size=xu.shape).astype(np.float32)

        # Normalize the noise vector and set it to the variable
        r.d = get_direction(n)

        # Set xi, the power-method scaling parameter.
        eps.data.fill(args.xi_for_vat)

        # Calculate y without noise, only once.
        yu.forward(clear_buffer=True)

        # Do power method iteration
        for k in range(args.n_iter_for_power_method):
            # Initialize gradient to receive value
            r.grad.zero()

            # forward, backward, without update
            loss_u.forward(clear_no_need_grad=True)
            loss_u.backward(clear_buffer=True)

            # Normalize the gradient vector and set it to the variable
            r.d = get_direction(r.g)

        ##### Calculate loss for unlabeled data #####

        # Clear remaining gradients
        solver.zero_grad()

        # Set epsilon, the adversarial noise scaling parameter.
        eps.data.fill(args.eps_for_vat)

        # forward, backward and update
        loss_u.forward(clear_no_need_grad=True)
        loss_u.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        ##### Learning rate update #####
        if i % args.iter_per_epoch == 0:
            solver.set_learning_rate(
                solver.learning_rate() * args.learning_rate_decay)
        monitor_time.add(i)

    # Evaluate the final model by the error rate with validation dataset
    valid_error = calc_validation_error(di_v, xv, tv, hv, args.val_iter)
    monitor_verr.add(i, valid_error)
    monitor_time.add(i)

    # Save the model.
    nnp_file = os.path.join(
        args.model_save_path, 'vat_%06d.nnp' % args.max_iter)
    runtime_contents = {
        'networks': [
            {'name': 'Validation',
             'batch_size': args.batchsize_v,
             'outputs': {'y': hv},
             'names': {'x': xv}}],
        'executors': [
            {'name': 'Runtime',
             'network': 'Validation',
             'data': ['x'],
             'output': ['y']}]}
    save.save(nnp_file, runtime_contents)

    from cpp_forward_check import check_cpp_forward
    check_cpp_forward(args.model_save_path, [xv.d], [xv], hv, nnp_file)
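
get_direction is imported from the VAT utilities and is not shown here. A minimal sketch consistent with its use above (an assumption: it L2-normalizes each per-sample perturbation so that eps alone controls the magnitude), for 4-D (N, C, H, W) arrays:

import numpy as np

def get_direction(d):
    # Normalize each sample's perturbation to unit L2 norm.
    norm = np.sqrt(np.sum(d ** 2, axis=(1, 2, 3), keepdims=True))
    return d / (norm + 1e-12)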
Example #4
def train():
    """
    Main script.

    Steps:

    * Parse command line arguments.
    * Specify a context for computation.
    * Initialize DataIterator for MNIST.
    * Construct a computation graph for training and validation.
    * Initialize a solver and set parameter variables to it.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically).
      * Get the next minibatch.
      * Zero the parameter gradients.
      * Execute forward propagation on the training graph.
      * Execute backprop.
      * The solver updates parameters using the gradients computed by backprop.
      * Compute the training error.
    """
    args = get_args(monitor_path='tmp.monitor.bnn')

    # Get context.
    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = get_extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Initialize DataIterator for MNIST.
    data = data_iterator_mnist(args.batch_size, True)
    vdata = data_iterator_mnist(args.batch_size, False)

    # Create CNN network for both training and testing.
    mnist_cnn_prediction = mnist_inq_lenet_prediction
    if args.net == 'inq':
        mnist_cnn_prediction = mnist_inq_lenet_prediction
    elif args.net == 'inq_resnet':
        mnist_cnn_prediction = mnist_inq_resnet_prediction

    # TRAIN
    # Create input variables.
    image = nn.Variable([args.batch_size, 1, 28, 28])
    label = nn.Variable([args.batch_size, 1])
    # Create prediction graph.
    pred = mnist_cnn_prediction(image / 255, test=False)
    pred.persistent = True
    # Create loss function.
    loss = F.mean(F.softmax_cross_entropy(pred, label))

    # TEST
    # Create input variables.
    vimage = nn.Variable([args.batch_size, 1, 28, 28])
    vlabel = nn.Variable([args.batch_size, 1])
    # Create prediction graph.
    vpred = mnist_cnn_prediction(vimage / 255, test=True)

    # Create Solver.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = M.MonitorSeries("Test error", monitor, interval=10)

    # Training loop.
    for i in range(args.max_iter):
        if i % args.val_interval == 0:
            # Validation
            ve = 0.0
            for j in range(args.val_iter):
                vimage.d, vlabel.d = vdata.next()
                vpred.forward(clear_buffer=True)
                ve += categorical_error(vpred.d, vlabel.d)
            monitor_verr.add(i, ve / args.val_iter)
        if i % args.model_save_interval == 0:
            nn.save_parameters(os.path.join(
                args.model_save_path, 'params_%06d.h5' % i))
        # Training forward
        image.d, label.d = data.next()
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        # Training backward & update
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()
        # Monitor
        e = categorical_error(pred.d, label.d)
        monitor_loss.add(i, loss.d.copy())
        monitor_err.add(i, e)
        monitor_time.add(i)

    parameter_file = os.path.join(
        args.model_save_path, 'params_%06d.h5' % args.max_iter)
    nn.save_parameters(parameter_file)
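
A short usage sketch (hypothetical, reusing the variables defined above): the saved parameter file can be loaded back and the test graph reused for inference.

# Restore the trained weights and classify one validation batch.
nn.load_parameters(parameter_file)
vimage.d, vlabel.d = vdata.next()
vpred.forward(clear_buffer=True)
predicted_labels = vpred.d.argmax(axis=1)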
Example #5
def main():

    # Get arguments
    args = get_args()
    data_file = "https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt"
    model_file = args.work_dir + "/model.h5"

    # Load Dataset
    itow, wtoi, dataset = load_ptbset(data_file)

    # Computation environment settings
    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = get_extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Create data provider
    n_word = len(wtoi)
    n_dim = args.embed_dim
    batchsize = args.batchsize
    half_window = args.half_window_length
    n_negative = args.n_negative_sample

    di = DataIteratorForEmbeddingLearning(
        batchsize=batchsize,
        half_window=half_window,
        n_negative=n_negative,
        dataset=dataset)

    # Create model
    # - Real batch size including context samples and negative samples
    size = batchsize * (1 + n_negative) * (2 * (half_window - 1))

    # Model for learning
    # - input variables
    xl = nn.Variable((size,))  # variable for word
    yl = nn.Variable((size,))  # variable for context

    # Embed layers for the word embedding function
    # - f_embed: maps word index x to y, an n_dim feature vector,
    #   for each sample in a minibatch
    hx = PF.embed(xl, n_word, n_dim, name="e1")  # feature vector for word
    hy = PF.embed(yl, n_word, n_dim, name="e1")  # feature vector for context
    hl = F.sum(hx * hy, axis=1)

    # -- Approximated likelihood of context prediction
    # pos: word context, neg: negative samples
    tl = nn.Variable([size, ], need_grad=False)
    loss = F.sigmoid_cross_entropy(hl, tl)
    loss = F.mean(loss)

    # Model for testing: searching for similar words
    xr = nn.Variable((1,), need_grad=False)
    hr = PF.embed(xr, n_word, n_dim, name="e1")  # feature vector for test

    # Create solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    monitor = M.Monitor(args.work_dir)
    monitor_loss = M.MonitorSeries(
        "Training loss", monitor, interval=args.monitor_interval)
    monitor_time = M.MonitorTimeElapsed(
        "Training time", monitor, interval=args.monitor_interval)

    # Do training
    max_epoch = args.max_epoch
    for epoch in range(max_epoch):

        # iteration per epoch
        for i in range(di.n_batch):

            # get minibatch
            xi, yi, ti = di.next()

            # learn
            solver.zero_grad()
            xl.d, yl.d, tl.d = xi, yi, ti
            loss.forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.update()

            # monitor
            itr = epoch * di.n_batch + i
            monitor_loss.add(itr, loss.d)
            monitor_time.add(itr)

    # Save model
    nn.save_parameters(model_file)

    # Evaluate by similarity
    max_check_words = args.max_check_words
    for i in range(max_check_words):

        # prediction
        xr.d = i
        hr.forward(clear_buffer=True)
        h = hr.d

        # similarity calculation (cosine similarity against all embeddings);
        # copy W so the parameter itself is not normalized in place
        w = nn.get_parameters()['e1/embed/W'].d.copy()
        s = np.sqrt((w * w).sum(1))
        w /= s.reshape((s.shape[0], 1))
        similarity = w.dot(h[0]) / s[i]

        # print similar words for inspection
        output_similar_words(itow, i, similarity)
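
output_similar_words is defined elsewhere in the repository. A minimal sketch of what such a helper could look like, given the cosine-similarity vector computed above (an assumption: itow maps word indices back to word strings):

import numpy as np

def output_similar_words(itow, word_index, similarity, top_k=5):
    # Rank words by similarity (descending), skipping the query word.
    ranked = np.argsort(-similarity)
    neighbors = [itow[j] for j in ranked if j != word_index][:top_k]
    print(itow[word_index], "->", ", ".join(neighbors))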
Example #6
def meta_train(args, shape_x, train_data, valid_data, test_data):

    # Build episode generators
    train_episode_generator = EpisodeGenerator(
        train_data[0], train_data[1], args.n_class_tr, args.n_shot_tr, args.n_query_tr)
    valid_episode_generator = EpisodeGenerator(
        valid_data[0], valid_data[1], args.n_class, args.n_shot, args.n_query)
    test_episode_generator = EpisodeGenerator(
        test_data[0], test_data[1], args.n_class, args.n_shot, args.n_query)

    # Build training model
    xs_t = nn.Variable((args.n_class_tr * args.n_shot_tr, ) + shape_x)
    xq_t = nn.Variable((args.n_class_tr * args.n_query_tr, ) + shape_x)
    hq_t = net(args.n_class_tr, xs_t, xq_t, args.embedding,
               args.net_type, args.metric, False)
    yq_t = nn.Variable((args.n_class_tr * args.n_query_tr, 1))
    loss_t = F.mean(F.softmax_cross_entropy(hq_t, yq_t))

    # Build evaluation model
    xs_v = nn.Variable((args.n_class * args.n_shot, ) + shape_x)
    xq_v = nn.Variable((args.n_class * args.n_query, ) + shape_x)
    hq_v = net(args.n_class, xs_v, xq_v, args.embedding,
               args.net_type, args.metric, True)
    yq_v = nn.Variable((args.n_class * args.n_query, 1))
    err_v = F.mean(F.top_n_error(hq_v, yq_v, n=1))

    # Setup solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitor outputs
    monitor = Monitor(args.work_dir)
    monitor_loss = MonitorSeries(
        "Training loss", monitor, interval=args.iter_per_epoch)
    monitor_valid_err = MonitorSeries(
        "Validation error", monitor, interval=args.iter_per_valid)
    monitor_test_err = MonitorSeries("Test error", monitor)
    monitor_test_conf = MonitorSeries("Test error confidence", monitor)

    # Output files
    param_file = args.work_dir + "/params.h5"
    tsne_file = args.work_dir + "/tsne.png"

    # Save NNP
    batch_size = 1
    contents = save_nnp({'x0': xs_v, 'x1': xq_v}, {'y': hq_v}, batch_size)
    save.save(os.path.join(args.work_dir, 'MetricMetaLearning_epoch0.nnp'),
              contents, variable_batch_size=False)

    # Training loop
    train_losses = []
    best_err = 1.0
    for i in range(args.max_iteration):

        # Decay learning rate
        if (i + 1) % args.lr_decay_interval == 0:
            solver.set_learning_rate(solver.learning_rate() * args.lr_decay)

        # Create an episode
        xs_t.d, xq_t.d, yq_t.d = train_episode_generator.next()

        # Training by the episode
        solver.zero_grad()
        loss_t.forward(clear_no_need_grad=True)
        loss_t.backward(clear_buffer=True)
        solver.update()
        train_losses.append(loss_t.d.copy())

        # Evaluation
        if (i + 1) % args.iter_per_valid == 0:
            train_loss = np.mean(train_losses)
            train_losses = []
            valid_errs = []
            for k in range(args.n_episode_for_valid):
                xs_v.d, xq_v.d, yq_v.d = valid_episode_generator.next()
                err_v.forward(clear_no_need_grad=True, clear_buffer=True)
                valid_errs.append(float(err_v.d))
            valid_err = np.mean(valid_errs)

            monitor_loss.add(i + 1, train_loss)
            monitor_valid_err.add(i + 1, valid_err * 100)
            if valid_err < best_err:
                best_err = valid_err
                nn.save_parameters(param_file)

    # Final evaluation
    nn.load_parameters(param_file)
    v_errs = []
    for k in range(args.n_episode_for_test):
        xs_v.d, xq_v.d, yq_v.d = test_episode_generator.next()
        err_v.forward(clear_no_need_grad=True, clear_buffer=True)
        v_errs.append(float(err_v.d))
    v_err_mean = np.mean(v_errs)
    v_err_std = np.std(v_errs)
    v_err_conf = 1.96 * v_err_std / np.sqrt(args.n_episode_for_test)
    monitor_test_err.add(0, v_err_mean * 100)
    monitor_test_conf.add(0, v_err_conf * 100)

    # Visualization
    n_class = 50
    n_sample = 20
    visualize_episode_generator = EpisodeGenerator(
        train_data[0], train_data[1], n_class, 0, n_sample)
    _, samples, labels = visualize_episode_generator.next()
    u = get_embeddings(samples, conv4)
    v = get_tsne(u)
    plot_tsne(v[:, 0], v[:, 1], labels[:, 0], tsne_file)

    # Save NNP
    contents = save_nnp({'x0': xs_v, 'x1': xq_v}, {'y': hq_v}, batch_size)
    save.save(os.path.join(args.work_dir, 'MetricMetaLearning.nnp'),
              contents, variable_batch_size=False)
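
get_tsne and plot_tsne come from the repository's visualization utilities. A minimal sketch of get_tsne under the assumption that it wraps scikit-learn's t-SNE:

from sklearn.manifold import TSNE

def get_tsne(u, perplexity=30.0):
    # Project the embedding matrix u (n_samples, n_dims) down to 2-D
    # coordinates for plotting.
    return TSNE(n_components=2, perplexity=perplexity).fit_transform(u)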
Example #7
def main():
    """
    Main script.
    Steps:
    * Setup calculation environment
    * Initialize data iterator.
    * Create Networks
    * Create Solver.
    * Training Loop.
    *   Training
    *   Test
    * Save
    """

    # Set args
    args = get_args(monitor_path='tmp.monitor.vae',
                    max_iter=60000,
                    model_save_path=None,
                    learning_rate=3e-4,
                    batch_size=100,
                    weight_decay=0)

    # Get context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(args.context,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    # Initialize data provider
    di_l = data_iterator_mnist(args.batch_size, True)
    di_t = data_iterator_mnist(args.batch_size, False)

    # Network
    shape_x = (1, 28, 28)
    shape_z = (50, )
    x = nn.Variable((args.batch_size, ) + shape_x)
    loss_l = vae(x, shape_z, test=False)
    loss_t = vae(x, shape_z, test=True)

    # Create solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitors for training and validation
    monitor = M.Monitor(args.model_save_path)
    monitor_training_loss = M.MonitorSeries("Training loss",
                                            monitor,
                                            interval=600)
    monitor_test_loss = M.MonitorSeries("Test loss", monitor, interval=600)
    monitor_time = M.MonitorTimeElapsed("Elapsed time", monitor, interval=600)

    # Training Loop.
    for i in range(args.max_iter):

        # Initialize gradients
        solver.zero_grad()

        # Forward, backward and update
        x.d, _ = di_l.next()
        loss_l.forward(clear_no_need_grad=True)
        loss_l.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Forward for test
        x.d, _ = di_t.next()
        loss_t.forward(clear_no_need_grad=True)

        # Monitor for logging
        monitor_training_loss.add(i, loss_l.d.copy())
        monitor_test_loss.add(i, loss_t.d.copy())
        monitor_time.add(i)

    # Save the model
    nn.save_parameters(
        os.path.join(args.model_save_path, 'params_%06d.h5' % args.max_iter))
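
The vae network builder is defined elsewhere; its core step is the reparameterization trick. A minimal sketch of that step in nnabla, assuming the encoder produces mu and logvar Variables (hypothetical names):

import nnabla.functions as F

def sample_latent(mu, logvar):
    # z = mu + sigma * eps with eps ~ N(0, 1); the graph stays
    # differentiable with respect to mu and logvar.
    eps = F.randn(shape=mu.shape)
    return mu + F.exp(0.5 * logvar) * eps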
Example #8
def _create_optimizer(ctx, o, networks, datasets):
    class Optimizer:
        pass

    optimizer = Optimizer()

    optimizer.name = o.name
    optimizer.order = o.order
    optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1
    optimizer.network = networks[o.network_name]
    optimizer.data_iterator = datasets[o.dataset_name].data_iterator

    optimizer.dataset_assign = OrderedDict()
    for d in o.data_variable:
        optimizer.dataset_assign[
            optimizer.network.variables[d.variable_name]] = d.data_name

    optimizer.generator_assign = OrderedDict()
    for g in o.generator_variable:
        optimizer.generator_assign[optimizer.network.variables[
            g.variable_name]] = _get_generator(g)

    optimizer.loss_variables = []
    for l in o.loss_variable:
        optimizer.loss_variables.append(
            optimizer.network.variables[l.variable_name])

    optimizer.parameter_learning_rate_multipliers = OrderedDict()
    for p in o.parameter_variable:
        param_variable_names = [
            v_name for v_name in optimizer.network.variables.keys()
            if v_name.startswith(p.variable_name)]
        for v_name in param_variable_names:
            optimizer.parameter_learning_rate_multipliers[
                optimizer.network.variables[v_name]] = p.learning_rate_multiplier

    with nn.context_scope(ctx):
        if o.solver.type == 'Adagrad':
            optimizer.solver = S.Adagrad(
                o.solver.adagrad_param.lr, o.solver.adagrad_param.eps)
        elif o.solver.type == 'Adadelta':
            optimizer.solver = S.Adadelta(
                o.solver.adadelta_param.lr, o.solver.adadelta_param.decay, o.solver.adadelta_param.eps)
        elif o.solver.type == 'Adam':
            optimizer.solver = S.Adam(o.solver.adam_param.alpha, o.solver.adam_param.beta1,
                                      o.solver.adam_param.beta2, o.solver.adam_param.eps)
        elif o.solver.type == 'Adamax':
            optimizer.solver = S.Adamax(o.solver.adamax_param.alpha, o.solver.adamax_param.beta1,
                                        o.solver.adamax_param.beta2, o.solver.adamax_param.eps)
        elif o.solver.type == 'Eve':
            p = o.solver.eve_param
            optimizer.solver = S.Eve(
                p.alpha, p.beta1, p.beta2, p.beta3, p.k, p.k2, p.eps)
        elif o.solver.type == 'Momentum':
            optimizer.solver = S.Momentum(
                o.solver.momentum_param.lr, o.solver.momentum_param.momentum)
        elif o.solver.type == 'Nesterov':
            optimizer.solver = S.Nesterov(
                o.solver.nesterov_param.lr, o.solver.nesterov_param.momentum)
        elif o.solver.type == 'RMSprop':
            optimizer.solver = S.RMSprop(
                o.solver.rmsprop_param.lr, o.solver.rmsprop_param.decay, o.solver.rmsprop_param.eps)
        elif o.solver.type == 'Sgd' or o.solver.type == 'SGD':
            optimizer.solver = S.Sgd(o.solver.sgd_param.lr)
        else:
            raise ValueError('Solver "' + o.solver.type +
                             '" is not supported.')

    optimizer.solver.set_parameters({
        v.name: v.variable_instance
        for v, local_lr in optimizer.parameter_learning_rate_multipliers.items()
        if local_lr > 0.0})

    optimizer.weight_decay = o.solver.weight_decay
    optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0
    optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1

    optimizer.forward_sequence = optimizer.network.get_forward_sequence(
        optimizer.loss_variables)
    optimizer.backward_sequence = optimizer.network.get_backward_sequence(
        optimizer.loss_variables, optimizer.parameter_learning_rate_multipliers)

    return optimizer
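
A short usage sketch of the returned object's legacy lr-decay fields (a hypothetical driver snippet; iteration counting and minibatch handling live elsewhere):

# Decay the learning rate every lr_decay_interval iterations.
if (iteration + 1) % optimizer.lr_decay_interval == 0:
    optimizer.solver.set_learning_rate(
        optimizer.solver.learning_rate() * optimizer.lr_decay)
optimizer.solver.weight_decay(optimizer.weight_decay)
optimizer.solver.update()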
Example #9
def train_and_eval():

    # Settings
    args = get_args()
    n_class = args.n_class
    n_shot = args.n_shot
    n_query = args.n_query
    n_class_tr = args.n_class_tr
    n_shot_tr = args.n_shot_tr
    if n_shot_tr == 0:
        n_shot_tr = n_shot
    n_query_tr = args.n_query_tr
    if n_query_tr == 0:
        n_query_tr = n_query

    dataset = args.dataset
    dataset_root = args.dataset_root

    init_type = args.init_type
    embedding = args.embedding
    net_type = args.net_type
    metric = args.metric

    max_iteration = args.max_iteration
    lr_decay_interval = args.lr_decay_interval
    lr_decay = args.lr_decay
    iter_per_epoch = args.iter_per_epoch
    iter_per_valid = args.iter_per_valid
    n_episode_for_valid = args.n_episode_for_valid
    n_episode_for_test = args.n_episode_for_test
    work_dir = args.work_dir

    # Set context
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(args.context,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    # Monitor outputs
    from nnabla.monitor import Monitor, MonitorSeries
    monitor = Monitor(args.work_dir)
    monitor_loss = MonitorSeries("Training loss",
                                 monitor,
                                 interval=iter_per_epoch)
    monitor_valid_err = MonitorSeries("Validation error",
                                      monitor,
                                      interval=iter_per_valid)
    monitor_test_err = MonitorSeries("Test error", monitor)
    monitor_test_conf = MonitorSeries("Test error confidence", monitor)

    # Output files
    param_file = work_dir + "/params.h5"
    tsne_file = work_dir + "/tsne.png"

    # Load data
    shape_x = (1, 28, 28)
    train_data, valid_data, test_data = load_omniglot(dataset_root +
                                                      "/omniglot/data/")
    train_episode_generator = EpisodeGenerator(n_class_tr, n_shot_tr,
                                               n_query_tr, shape_x, train_data)
    valid_episode_generator = EpisodeGenerator(n_class, n_shot, n_query,
                                               shape_x, valid_data)
    test_episode_generator = EpisodeGenerator(n_class, n_shot, n_query,
                                              shape_x, test_data)

    # Build training model
    xs_t = nn.Variable((n_class_tr * n_shot_tr, ) + shape_x)
    xq_t = nn.Variable((n_class_tr * n_query_tr, ) + shape_x)
    hq_t = net(n_class_tr, xs_t, xq_t, init_type, embedding, net_type, metric,
               False)
    yq_t = nn.Variable((n_class_tr * n_query_tr, 1))
    loss_t = F.mean(F.softmax_cross_entropy(hq_t, yq_t))

    # Build evaluation model
    xs_v = nn.Variable((n_class * n_shot, ) + shape_x)
    xq_v = nn.Variable((n_class * n_query, ) + shape_x)
    hq_v = net(n_class, xs_v, xq_v, init_type, embedding, net_type, metric,
               True)
    yq_v = nn.Variable((n_class * n_query, 1))
    err_v = F.mean(F.top_n_error(hq_v, yq_v, n=1))

    # Setup solver
    solver = S.Adam(1.0e-3)
    solver.set_parameters(nn.get_parameters())
    learning_rate_decay_activate = True

    # Training loop
    train_losses = []
    best_err = 1.0
    for i in range(max_iteration):

        # Decay learning rate
        if learning_rate_decay_activate and ((i + 1) % lr_decay_interval == 0):
            solver.set_learning_rate(solver.learning_rate() * lr_decay)

        # Create an episode
        xs_t.d, xq_t.d, yq_t.d = train_episode_generator.next()

        # Training by the episode
        solver.zero_grad()
        loss_t.forward(clear_no_need_grad=True)
        loss_t.backward(clear_buffer=True)
        solver.update()
        train_losses.append(loss_t.d.copy())

        # Evaluation
        if (i + 1) % iter_per_valid == 0:
            train_loss = np.mean(train_losses)
            train_losses = []
            valid_errs = []
            for k in range(n_episode_for_valid):
                xs_v.d, xq_v.d, yq_v.d = valid_episode_generator.next()
                err_v.forward(clear_no_need_grad=True, clear_buffer=True)
                valid_errs.append(float(err_v.d))
            valid_err = np.mean(valid_errs)

            monitor_loss.add(i + 1, train_loss)
            monitor_valid_err.add(i + 1, valid_err * 100)
            if valid_err < best_err:
                best_err = valid_err
                nn.save_parameters(param_file)

    # Final evaluation
    nn.load_parameters(param_file)
    v_errs = []
    for k in range(n_episode_for_test):
        xs_v.d, xq_v.d, yq_v.d = test_episode_generator.next()
        err_v.forward(clear_no_need_grad=True, clear_buffer=True)
        v_errs.append(float(err_v.d))
    v_err = np.mean(v_errs)
    v_err_conf = 1.96 * np.std(v_errs) / np.sqrt(n_episode_for_test)
    monitor_test_err.add(0, v_err * 100)
    monitor_test_conf.add(0, v_err_conf)

    # Visualization
    n_class = 50
    n_sample = 20
    batch = test_data[:n_class].reshape(n_class * n_sample, 1, 28, 28)
    label = []
    for i in range(n_class):
        label.extend(np.ones(n_sample) * (i % 50))
    u = get_embeddings(batch, conv4)
    v = get_tsne(u)
    plot_tsne(v[:, 0], v[:, 1], label, tsne_file)
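
EpisodeGenerator is defined elsewhere; its next() is assumed to return a support set, a query set, and query labels shaped to match the Variables above. A hypothetical sketch of that contract (not the repository's implementation), assuming data is indexable as data[class][sample]:

import numpy as np

class EpisodeGenerator:
    def __init__(self, n_class, n_shot, n_query, shape_x, data):
        self.n_class, self.n_shot, self.n_query = n_class, n_shot, n_query
        self.shape_x, self.data = shape_x, data

    def next(self):
        # Sample n_class classes, then n_shot support and n_query query
        # images per class, plus integer query labels.
        classes = np.random.choice(len(self.data), self.n_class,
                                   replace=False)
        xs, xq, yq = [], [], []
        for label, c in enumerate(classes):
            idx = np.random.choice(len(self.data[c]),
                                   self.n_shot + self.n_query, replace=False)
            xs.append(self.data[c][idx[:self.n_shot]])
            xq.append(self.data[c][idx[self.n_shot:]])
            yq.extend([label] * self.n_query)
        xs = np.concatenate(xs).reshape((-1,) + self.shape_x)
        xq = np.concatenate(xq).reshape((-1,) + self.shape_x)
        yq = np.array(yq, np.int32).reshape(-1, 1)
        return xs, xq, yq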
Example #10
def meta_train(exp_string, monitor, args):
    # Set monitors
    monitor_loss = MonitorSeries('Training loss',
                                 monitor,
                                 interval=args.print_interval,
                                 verbose=False)
    monitor_valid_err = MonitorSeries('Validation error',
                                      monitor,
                                      interval=args.test_print_interval,
                                      verbose=False)

    # Load data
    if args.datasource == 'omniglot':
        shape_x = (1, 28, 28)
        train_data, valid_data, _ = load_omniglot(
            os.path.join(args.dataset_root, 'omniglot/data/'), shape_x)
    else:
        raise ValueError('Unrecognized data source.')

    train_data_generator = DataGenerator(args.num_classes,
                                         args.train_num_shots,
                                         args.train_num_queries, shape_x,
                                         train_data, args.meta_batch_size)
    valid_data_generator = DataGenerator(args.num_classes, args.num_shots,
                                         args.num_queries, shape_x, valid_data,
                                         args.meta_batch_size)

    # Build training models
    # a: training data for inner gradient, b: test data for meta gradient
    inputa_t = nn.Variable((train_data_generator.num_classes *
                            train_data_generator.num_shots, ) +
                           train_data_generator.shape_x)
    inputb_t = nn.Variable((train_data_generator.num_classes *
                            train_data_generator.num_queries, ) +
                           train_data_generator.shape_x)
    labela_t = nn.Variable(
        (train_data_generator.num_classes * train_data_generator.num_shots, 1))
    labelb_t = nn.Variable(
        (train_data_generator.num_classes * train_data_generator.num_queries,
         1))

    # Build evaluation models
    # a: training data for inner gradient, b: test data for meta gradient
    inputa_v = nn.Variable((valid_data_generator.num_classes *
                            valid_data_generator.num_shots, ) +
                           valid_data_generator.shape_x)
    inputb_v = nn.Variable((valid_data_generator.num_classes *
                            valid_data_generator.num_queries, ) +
                           valid_data_generator.shape_x)
    labela_v = nn.Variable(
        (valid_data_generator.num_classes * valid_data_generator.num_shots, 1))
    labelb_v = nn.Variable(
        (valid_data_generator.num_classes * valid_data_generator.num_queries,
         1))

    with nn.parameter_scope('meta'):
        # Set weights
        _ = net(inputa_t, labela_t, True, args)  # only definition of weights
        weights = nn.get_parameters()

        # Setup solver
        solver = S.Adam(args.meta_lr)
        solver.set_parameters(weights)

    if args.num_updates > 1:
        print(
            "[WARNING]: The number of inner-loop updates is changed from "
            + str(args.num_updates) + " to 1")
        args.num_updates = 1

    print('Done initializing, starting training.')

    # Training loop
    for itr in range(1, args.metatrain_iterations + 1):
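        # One meta-iteration: inner_train_test is assumed to run the
        # inner-loop adaptation and the backward pass internally, so the
        # meta-gradients are ready for solver.update() below.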
        solver.zero_grad()
        lossesa, lossesb, accuraciesa, accuraciesb = inner_train_test(
            inputa_t, inputb_t, labela_t, labelb_t, train_data_generator, True,
            args)
        solver.update()

        # Evaluation
        if itr % args.print_interval == 0:
            preaccuracies = np.mean(accuraciesa, axis=0)
            postaccuracy = np.mean(accuraciesb, axis=0)
            print_str = 'Iteration {}:'.format(itr)
            for j in range(len(preaccuracies)):
                print_str += ' %.4f ->' % preaccuracies[j]
            print_str += ' %.4f (final accuracy on queries)' % postaccuracy
            print(print_str)
            monitor_loss.add(itr, np.mean(lossesb, axis=0))

        if itr % args.test_print_interval == 0:
            # Inner training & testing
            lossesa, lossesb, accuraciesa, accuraciesb = inner_train_test(
                inputa_v, inputb_v, labela_v, labelb_v, valid_data_generator,
                False, args)

            # Validation
            preaccuracies = np.mean(accuraciesa, axis=0)
            postaccuracy = np.mean(accuraciesb, axis=0)
            print_str = 'Validation results:'
            for j in range(len(preaccuracies)):
                print_str += ' %.4f ->' % preaccuracies[j]
            print_str += ' %.4f (final accuracy on queries)' % postaccuracy
            print(print_str)
            monitor_valid_err.add(itr, (1.0 - postaccuracy) * 100.0)

        if itr % args.save_interval == 0:
            nn.save_parameters(
                os.path.join(args.logdir, exp_string,
                             'params{}.h5'.format(itr)))

    # Save the final parameters if the last iteration was not a save point
    if itr % args.save_interval != 0:
        nn.save_parameters(
            os.path.join(args.logdir, exp_string, 'params{}.h5'.format(itr)))
Ejemplo n.º 11
def main():
    """
    Main script.

    Steps:

    * Get and set context.
    * Load dataset.
    * Initialize DataIterator.
    * Create networks
      * Net for labeled data
      * Net for unlabeled data
      * Net for test data
    * Create solver.
    * Training loop
      * Test
      * Training
        * by labeled data
          * Calculate supervised loss
        * by unlabeled data
          * Calculate virtual adversarial noise
          * Calculate unsupervised loss
    """

    args = get_args()

    # Get context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(args.context,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    shape_x = (1, 28, 28)
    n_h = args.n_units
    n_y = args.n_class

    # Load MNIST Dataset
    from mnist_data import load_mnist, data_iterator_mnist
    images, labels = load_mnist(train=True)
    rng = np.random.RandomState(706)
    inds = rng.permutation(len(images))

    def feed_labeled(i):
        j = inds[i]
        return images[j], labels[j]

    def feed_unlabeled(i):
        # Same source as feed_labeled; the labels are fetched here but
        # discarded by the unlabeled training loop below.
        j = inds[i]
        return images[j], labels[j]

    di_l = data_iterator_simple(feed_labeled,
                                args.n_labeled,
                                args.batchsize_l,
                                shuffle=True,
                                rng=rng,
                                with_file_cache=False)
    di_u = data_iterator_simple(feed_unlabeled,
                                args.n_train,
                                args.batchsize_u,
                                shuffle=True,
                                rng=rng,
                                with_file_cache=False)
    di_v = data_iterator_mnist(args.batchsize_v, train=False)

    # Create networks
    # feed-forward-net building function
    def forward(x, test=False):
        return mlp_net(x, n_h, n_y, test)

    # Net for learning labeled data
    xl = nn.Variable((args.batchsize_l, ) + shape_x, need_grad=False)
    yl = forward(xl, test=False)
    tl = nn.Variable((args.batchsize_l, 1), need_grad=False)
    loss_l = F.mean(F.softmax_cross_entropy(yl, tl))

    # Net for learning unlabeled data
    xu = nn.Variable((args.batchsize_u, ) + shape_x, need_grad=False)
    yu = forward(xu, test=False)
    y1 = yu.get_unlinked_variable()
    y1.need_grad = False

    noise = nn.Variable((args.batchsize_u, ) + shape_x, need_grad=True)
    r = noise / (F.sum(noise**2, [1, 2, 3], keepdims=True))**0.5  # unit L2 direction per sample
    r.persistent = True
    y2 = forward(xu + args.xi_for_vat * r, test=False)
    y3 = forward(xu + args.eps_for_vat * r, test=False)
    loss_k = F.mean(distance(y1, y2))
    loss_u = F.mean(distance(y1, y3))
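    # loss_k (at the small step xi) drives the power iteration below;
    # loss_u (at the attack radius eps) is the unsupervised VAT objective.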

    # Net for evaluating validation data
    xv = nn.Variable((args.batchsize_v, ) + shape_x, need_grad=False)
    hv = forward(xv, test=True)
    tv = nn.Variable((args.batchsize_v, 1), need_grad=False)
    err = F.mean(F.top_n_error(hv, tv, n=1))

    # Create solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitor training and validation stats.
    import nnabla.monitor as M
    monitor = M.Monitor(args.model_save_path)
    monitor_verr = M.MonitorSeries("Test error", monitor, interval=240)
    monitor_time = M.MonitorTimeElapsed("Elapsed time", monitor, interval=240)

    # Training Loop.

    for i in range(args.max_iter):

        # Validation Test
        if i % args.val_interval == 0:
            valid_error = calc_validation_error(di_v, xv, tv, err,
                                                args.val_iter)
            monitor_verr.add(i, valid_error)

        #################################
        ## Training by Labeled Data #####
        #################################

        # forward, backward and update
        xl.d, tl.d = di_l.next()
        xl.d = xl.d / 255
        solver.zero_grad()
        loss_l.forward(clear_no_need_grad=True)
        loss_l.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        #################################
        ## Training by Unlabeled Data ###
        #################################

        # Calculate y without noise, only once.
        xu.d, _ = di_u.next()
        xu.d = xu.d / 255
        yu.forward(clear_buffer=True)

        ##### Calculate Adversarial Noise #####
        # Do power method iteration
        noise.d = np.random.normal(size=xu.shape).astype(np.float32)
        for k in range(args.n_iter_for_power_method):
            r.grad.zero()
            loss_k.forward(clear_no_need_grad=True)
            loss_k.backward(clear_buffer=True)
            noise.data.copy_from(r.grad)
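            # The gradient of the divergence w.r.t. the direction r points
            # along the most sensitive input direction; copying it into noise
            # and re-normalizing is one power-method step toward the dominant
            # eigenvector of the local Hessian.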

        ##### Calculate loss for unlabeled data #####
        # forward, backward and update
        solver.zero_grad()
        loss_u.forward(clear_no_need_grad=True)
        loss_u.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        ##### Learning rate update #####
        if i % args.iter_per_epoch == 0:
            solver.set_learning_rate(solver.learning_rate() *
                                     args.learning_rate_decay)
        monitor_time.add(i)

    # Evaluate the final model by the error rate with validation dataset
    valid_error = calc_validation_error(di_v, xv, tv, err, args.val_iter)
    monitor_verr.add(i, valid_error)
    monitor_time.add(i)

    # Save the model.
    parameter_file = os.path.join(args.model_save_path,
                                  'params_%06d.h5' % args.max_iter)
    nn.save_parameters(parameter_file)
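
calc_validation_error is called above but defined elsewhere in this example;
a minimal sketch consistent with how it is called (an assumption, not the
original helper) could be:

def calc_validation_error(di_v, xv, tv, err, val_iter):
    # Average the top-1 error over val_iter validation minibatches.
    ve = 0.0
    for _ in range(val_iter):
        xv.d, tv.d = di_v.next()
        xv.d = xv.d / 255  # same normalization as the training inputs
        err.forward(clear_buffer=True)
        ve += float(err.d)
    return ve / val_iter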
Ejemplo n.º 12
def meta_train(args, train_data, valid_data, test_data):

    # Build episode generators
    shape_x = (1, 28, 28)
    train_episode_generator = EpisodeGenerator(args.n_class_tr, args.n_shot_tr,
                                               args.n_query_tr, shape_x,
                                               train_data)
    valid_episode_generator = EpisodeGenerator(args.n_class, args.n_shot,
                                               args.n_query, shape_x,
                                               valid_data)
    test_episode_generator = EpisodeGenerator(args.n_class, args.n_shot,
                                              args.n_query, shape_x, test_data)
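    # Each generator's next() yields one episode: support images, query
    # images, and query labels, shaped to match the Variables defined below.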

    # Build training model
    xs_t = nn.Variable((args.n_class_tr * args.n_shot_tr, ) + shape_x)
    xq_t = nn.Variable((args.n_class_tr * args.n_query_tr, ) + shape_x)
    hq_t = net(args.n_class_tr, xs_t, xq_t, args.embedding, args.net_type,
               args.metric, False)
    yq_t = nn.Variable((args.n_class_tr * args.n_query_tr, 1))
    loss_t = F.mean(F.softmax_cross_entropy(hq_t, yq_t))

    # Build evaluation model
    xs_v = nn.Variable((args.n_class * args.n_shot, ) + shape_x)
    xq_v = nn.Variable((args.n_class * args.n_query, ) + shape_x)
    hq_v = net(args.n_class, xs_v, xq_v, args.embedding, args.net_type,
               args.metric, True)
    yq_v = nn.Variable((args.n_class * args.n_query, 1))
    err_v = F.mean(F.top_n_error(hq_v, yq_v, n=1))
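    # top_n_error with n=1 marks each misclassified query with 1, so err_v is
    # the mean top-1 error rate over the episode.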

    # Setup solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitor outputs
    monitor = Monitor(args.work_dir)
    monitor_loss = MonitorSeries("Training loss",
                                 monitor,
                                 interval=args.iter_per_epoch)
    monitor_valid_err = MonitorSeries("Validation error",
                                      monitor,
                                      interval=args.iter_per_valid)
    monitor_test_err = MonitorSeries("Test error", monitor)
    monitor_test_conf = MonitorSeries("Test error confidence", monitor)

    # Output files
    param_file = os.path.join(args.work_dir, "params.h5")
    tsne_file = os.path.join(args.work_dir, "tsne.png")

    # Training loop
    train_losses = []
    best_err = 1.0
    for i in range(args.max_iteration):

        # Decay learning rate
        if (i + 1) % args.lr_decay_interval == 0:
            solver.set_learning_rate(solver.learning_rate() * args.lr_decay)

        # Create an episode
        xs_t.d, xq_t.d, yq_t.d = train_episode_generator.next()

        # Training by the episode
        solver.zero_grad()
        loss_t.forward(clear_no_need_grad=True)
        loss_t.backward(clear_buffer=True)
        solver.update()
        train_losses.append(loss_t.d.copy())

        # Evaluation
        if (i + 1) % args.iter_per_valid == 0:
            train_loss = np.mean(train_losses)
            train_losses = []
            valid_errs = []
            for k in range(args.n_episode_for_valid):
                xs_v.d, xq_v.d, yq_v.d = valid_episode_generator.next()
                err_v.forward(clear_no_need_grad=True, clear_buffer=True)
                valid_errs.append(float(err_v.d.copy()))
            valid_err = np.mean(valid_errs)

            monitor_loss.add(i + 1, train_loss)
            monitor_valid_err.add(i + 1, valid_err * 100)
            if valid_err < best_err:
                best_err = valid_err
                nn.save_parameters(param_file)

    # Final evaluation
    nn.load_parameters(param_file)
    v_errs = []
    for k in range(args.n_episode_for_test):
        xs_v.d, xq_v.d, yq_v.d = test_episode_generator.next()
        err_v.forward(clear_no_need_grad=True, clear_buffer=True)
        v_errs.append(float(err_v.d.copy()))
    v_err_mean = np.mean(v_errs)
    v_err_std = np.std(v_errs)
    v_err_conf = 1.96 * v_err_std / np.sqrt(args.n_episode_for_test)
    monitor_test_err.add(0, v_err_mean * 100)
    monitor_test_conf.add(0, v_err_conf * 100)

    # Visualization
    n_class = 50
    n_sample = 20
    batch = test_data[:n_class].reshape(n_class * n_sample, 1, 28, 28)
    label = []
    for i in range(n_class):
        label.extend(np.ones(n_sample) * i)  # class index repeated per sample
    u = get_embeddings(batch, conv4)
    v = get_tsne(u)
    plot_tsne(v[:, 0], v[:, 1], label, tsne_file)
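
get_embeddings, get_tsne, and plot_tsne are defined elsewhere; a minimal
sketch of the two plotting helpers (assuming scikit-learn and matplotlib;
names and signatures inferred from the calls above) might look like:

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def get_tsne(u, n_components=2):
    # Project the embedding matrix u (n_samples x dim) to 2-D with t-SNE.
    return TSNE(n_components=n_components).fit_transform(u)

def plot_tsne(x, y, label, path):
    # Scatter the 2-D points, colored by class label, and save to file.
    plt.figure()
    plt.scatter(x, y, c=label, s=5, cmap='tab20')
    plt.savefig(path)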