Example #1
def test_sink(seed):
    rng = np.random.RandomState(seed)
    v = nn.Variable((2, 3, 4), need_grad=True)
    h0 = F.tanh(v)
    h1 = F.sigmoid(v)
    v.d = rng.randn(*v.shape).astype(np.float32)

    # Create references
    v.grad.zero()
    h0.forward()
    h1.forward()
    h0.backward()
    h1.backward()  # v.grad is accumulated.
    h0d = h0.d.copy()
    h1d = h1.d.copy()
    vg = v.g.copy()

    # Reset values
    h0.data.zero()
    h1.data.zero()
    v.grad.zero()

    # Check if sink works
    dummy = F.sink(h0, h1, one_input_grad=True)
    dummy.forward()
    dummy.backward()
    assert np.all(h0d == h0.d)
    assert np.all(h1d == h1.d)
    assert np.all(vg == v.g)
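
A minimal sketch of the pattern this test verifies (the imports are assumptions, not part of the original snippet): F.sink gives one dummy terminal so a single forward()/backward() drives several branches that share an input, and with one_input_grad=True (the default) each branch receives an implicit output gradient of 1.

import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable((2, 3), need_grad=True)
x.d = np.random.randn(*x.shape)
ya = F.tanh(x)
yb = F.sigmoid(x)

top = F.sink(ya, yb)      # dummy terminal over both branches
x.grad.zero()
top.forward()
top.backward()            # x.g now holds the sum of both branch gradients
print(x.g)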
Example #2
def get_gru_grad(xs_np, h0_np, w0_np, w_np, b_np, dy, dh, num_layers=1, dropout=0.0, bidirectional=False, training=True, **kw):
    # Inputs are numpy arrays
    num_directions = 2 if bidirectional else 1
    seq_len = xs_np.shape[0]
    batch_size = xs_np.shape[1]
    hidden_size = h0_np.shape[3]

    xs = nn.Variable.from_numpy_array(xs_np, need_grad=True)
    h0 = nn.Variable.from_numpy_array(h0_np, need_grad=True)
    w0 = nn.Variable.from_numpy_array(w0_np, need_grad=True)
    w = None
    b = None
    with_bias = False
    if num_layers > 1:
        w = nn.Variable.from_numpy_array(w_np, need_grad=True)
    if type(b_np) == np.ndarray:
        b = nn.Variable.from_numpy_array(b_np, need_grad=True)
        with_bias = True
    xs.grad.zero()
    h0.grad.zero()
    w0.grad.zero()
    if num_layers > 1:
        w.grad.zero()
    if with_bias:
        b.grad.zero()

    ys, hn = create_fixed_length_gru(
        xs, h0, w0, w, b, num_layers, num_directions, with_bias)  # returns Variables

    dummy = F.sink(ys, hn, one_input_grad=False)
    dummy.forward()
    ys.g = np.reshape(dy, ys.shape)
    hn.g = dh
    dummy.backward()

    if num_layers > 1 and with_bias:
        return np.concatenate((xs.g.flat, h0.g.flat, w0.g.flat, w.g.flat, b.g.flat))
    elif num_layers > 1 and not with_bias:
        return np.concatenate((xs.g.flat, h0.g.flat, w0.g.flat, w.g.flat))
    elif num_layers == 1 and with_bias:
        return np.concatenate((xs.g.flat, h0.g.flat, w0.g.flat, b.g.flat))
    else:
        return np.concatenate((xs.g.flat, h0.g.flat, w0.g.flat))
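
A minimal sketch of the one_input_grad=False pattern used above (shapes are assumptions): the sink only groups the backward call, so the output gradients have to be written by hand before backward(), exactly as get_gru_grad does with ys.g and hn.g.

x = nn.Variable((2, 3), need_grad=True)
x.d = np.random.randn(*x.shape)
y1 = F.tanh(x)
y2 = F.sigmoid(x)

top = F.sink(y1, y2, one_input_grad=False)
top.forward()
y1.g = np.ones(y1.shape, dtype=np.float32)        # custom output gradients
y2.g = 0.5 * np.ones(y2.shape, dtype=np.float32)  # instead of the implicit ones
x.grad.zero()
top.backward()                                    # propagates the grads set above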
Example #3
    def test_clear_output_grad_prohibit_clear_input(self):
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1)
        y2 = F.add_scalar(xx1)
        y3 = F.sink(y1, y2)

        answer_grad = []
        answer_grad.append([True])  # y3
        answer_grad.append([False])  # y2
        answer_grad.append([False])  # y1
        answer_grad.append([True])  # xx1

        y3.forward(clear_no_need_grad=True)
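        # Reset the recorder so that only clear calls issued during backward are checked.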
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y3.backward(clear_buffer=True)

        self.check_grad_cleared_flags(answer_grad)
Example #4
def create_ema_op(params, ema_decay=0.9999):
    """
    Define exponential moving average update for trainable params.
    """
    def ema_update(p_ema, p_train):
        return F.assign(p_ema, ema_decay * p_ema + (1. - ema_decay) * p_train)

    ops = []
    with nn.parameter_scope("ema"):
        for name, p_train in params.items():
            p_ema = get_parameter_or_create(name,
                                            shape=p_train.shape,
                                            need_grad=False)
            p_ema.data.copy_from(p_train.data,
                                 use_current_context=False)  # initialize
            ops.append(ema_update(p_ema, p_train))

        ema_params = nn.get_parameters(grad_only=False)

    return F.sink(*ops), ema_params
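
A hedged usage sketch (the training loop, loss, solver, and num_iters below are assumptions, not from the source): because the assign ops are gathered under one sink, a single forward() refreshes every EMA parameter after each solver step.

ema_op, ema_params = create_ema_op(nn.get_parameters(), ema_decay=0.999)
for _ in range(num_iters):
    loss.forward()
    solver.zero_grad()
    loss.backward()
    solver.update()
    ema_op.forward()   # executes all F.assign ops, updating the "ema" parameters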
Example #5
    def __init__(self, num_actions, num_envs, batch_size, v_coeff, ent_coeff,
                 lr_scheduler):
        # inference graph
        self.infer_obs_t = nn.Variable((num_envs, 4, 84, 84))
        self.infer_pi_t,\
        self.infer_value_t = cnn_network(self.infer_obs_t, num_actions,
                                         'network')
        self.infer_t = F.sink(self.infer_pi_t, self.infer_value_t)

        # evaluation graph
        self.eval_obs_t = nn.Variable((1, 4, 84, 84))
        self.eval_pi_t, _ = cnn_network(self.eval_obs_t, num_actions,
                                        'network')

        # training graph
        self.obss_t = nn.Variable((batch_size, 4, 84, 84))
        self.acts_t = nn.Variable((batch_size, 1))
        self.rets_t = nn.Variable((batch_size, 1))
        self.advs_t = nn.Variable((batch_size, 1))

        pi_t, value_t = cnn_network(self.obss_t, num_actions, 'network')

        # value loss
        l2loss = F.squared_error(value_t, self.rets_t)
        self.value_loss = v_coeff * F.mean(l2loss)

        # policy loss
        log_pi_t = F.log(pi_t + 1e-20)
        a_one_hot = F.one_hot(self.acts_t, (num_actions, ))
        log_probs_t = F.sum(log_pi_t * a_one_hot, axis=1, keepdims=True)
        self.pi_loss = F.mean(log_probs_t * self.advs_t)

        # entropy regularization
        entropy = -ent_coeff * F.mean(F.sum(pi_t * log_pi_t, axis=1))

        self.loss = self.value_loss - self.pi_loss - entropy

        self.params = nn.get_parameters()
        self.solver = S.RMSprop(lr_scheduler(0.0), 0.99, 1e-5)
        self.solver.set_parameters(self.params)
        self.lr_scheduler = lr_scheduler
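
A hypothetical inference call against this graph (model and obs are assumptions): forwarding the sink fills both heads in one pass, so the policy and value can be read together.

model.infer_obs_t.d = obs                 # obs: (num_envs, 4, 84, 84) numpy array
model.infer_t.forward()
pi = model.infer_pi_t.d.copy()
value = model.infer_value_t.d.copy()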
Example #6
def forward_variable(inputs, outputs, side, feed=None):
    rng = np.random.RandomState(389)
    if feed is None:
        if isinstance(inputs, nn.Variable):
            inputs.d = rng.randn(*inputs.d.shape)
        else:
            for v in inputs:
                v.d = rng.randn(*v.d.shape)
    elif callable(feed):
        feed(inputs, rng)

    if isinstance(outputs, nn.Variable):
        outputs.forward()
        yield outputs.d.copy()
    else:
        y = F.sink(*outputs)
        for v in outputs:
            v.persistent = True
        y.forward()
        for v in outputs:
            yield v.d.copy()
Example #7
def test_instance_normalization_forward_backward(seed, x_shape, batch_axis,
                                                 channel_axis, output_stat):

    rng = np.random.RandomState(seed)
    input = np.array(rng.randn(*x_shape).astype(np.float32))
    eps = 1e-05

    stat_shape = tuple([
        x_shape[i] if i in _force_list(batch_axis) + [
            channel_axis,
        ] else 1 for i in range(len(x_shape))
    ])

    beta = rng.randn(*stat_shape).astype(np.float32)
    gamma = rng.randn(*stat_shape).astype(np.float32)

    x = nn.Variable.from_numpy_array(input)
    v_beta = nn.Variable.from_numpy_array(beta)
    v_gamma = nn.Variable.from_numpy_array(gamma)

    output = F.instance_normalization(x, v_beta, v_gamma, channel_axis,
                                      batch_axis, eps, output_stat)
    ref = ref_instance_normalization(input, beta, gamma, channel_axis,
                                     batch_axis, eps, output_stat)

    if output_stat:
        tmp = F.sink(*output)
        tmp.forward()
        tmp.backward()

        for o, r in zip(output, ref):
            assert o.shape == r.shape
            assert_allclose(o.d, r, atol=1e-2, rtol=1e-5)

    else:
        output.forward()
        output.backward()

        assert output.shape == ref.shape
        assert_allclose(output.d, ref, atol=1e-2, rtol=1e-5)
Example #8
def execute_fixed_length_rnn(xs_np,
                             h0_np,
                             w0_np,
                             w_np,
                             b_np,
                             num_layers=1,
                             nonlinearity='tanh',
                             dropout=0.0,
                             bidirectional=False,
                             training=True):
    # Inputs are numpy arrays
    num_directions = 2 if bidirectional else 1
    seq_len = xs_np.shape[0]
    batch_size = xs_np.shape[1]
    hidden_size = h0_np.shape[3]

    xs = nn.Variable.from_numpy_array(xs_np)
    h0 = nn.Variable.from_numpy_array(h0_np)
    w0 = nn.Variable.from_numpy_array(w0_np)
    w = None
    b = None
    with_bias = False
    if num_layers > 1:
        w = nn.Variable.from_numpy_array(w_np)
    if type(b_np) is np.ndarray:
        b = nn.Variable.from_numpy_array(b_np)
        with_bias = True

    ys, hn = create_fixed_length_rnn(xs, h0, w0, w, b, num_layers,
                                     nonlinearity, num_directions,
                                     with_bias)  # returns Variables

    dummy = F.sink(ys, hn)
    dummy.forward()

    # returns numpy arrays
    ys = F.reshape(ys, (seq_len, batch_size, num_directions * hidden_size))
    ys.forward()
    return ys.d, hn.d
Example #9
def test_group_normalization_forward_backward(seed, num_groups, x_shape,
                                              batch_axis, channel_axis,
                                              output_stat):

    rng = np.random.RandomState(seed)
    input = np.array(rng.randn(*x_shape).astype(np.float32))

    stat_shape = [1 for _ in range(len(x_shape))]
    stat_shape[channel_axis] = input.shape[channel_axis]

    beta = rng.randn(*stat_shape).astype(np.float32)
    gamma = rng.randn(*stat_shape).astype(np.float32)

    eps = 1e-05

    x = nn.Variable.from_numpy_array(input)
    v_beta = nn.Variable.from_numpy_array(beta)
    v_gamma = nn.Variable.from_numpy_array(gamma)

    output = F.group_normalization(x, v_beta, v_gamma, num_groups,
                                   channel_axis, batch_axis, eps, output_stat)
    ref = ref_group_normalization(input, beta, gamma, num_groups, channel_axis,
                                  batch_axis, eps, output_stat)

    if output_stat:
        tmp = F.sink(*output)
        tmp.forward()
        tmp.backward()

        for o, r in zip(output, ref):
            assert o.shape == r.shape
            assert_allclose(o.d, r, atol=1e-2, rtol=1e-5)

    else:
        output.forward()
        output.backward()

        assert output.shape == ref.shape
        assert_allclose(output.d, ref, atol=1e-2, rtol=1e-5)
Example #10
def test_weight_standardization_forward_backward(rng, w_shape, channel_axis, output_stat):
    input = np.array(rng.randn(*w_shape).astype(np.float32))
    eps = 1e-05

    x = nn.Variable.from_numpy_array(input)
    output = F.weight_standardization(x, channel_axis, eps, output_stat)
    ref = ref_weight_standardization(input, channel_axis, eps, output_stat)

    if output_stat:
        tmp = F.sink(*output)
        tmp.forward()
        tmp.backward()

        for o, r in zip(output, ref):
            assert o.shape == r.shape
            assert np.allclose(o.d, r, atol=1e-2, rtol=1e-5)

    else:
        output.forward()
        output.backward()

        assert np.allclose(output.d, ref, atol=1e-2, rtol=1e-5)
Example #11
def test_iterator_through_forward_sequence(module_func):
    func, in_shapes = module_func
    with nn.graph_def.graph() as g:
        inputs = [nn.ProtoVariable(shape) for shape in in_shapes]
        outputs = func(*inputs)

    inputs = [nn.Variable(shape) for shape in in_shapes]
    for i in inputs:
        i.d = np.random.random(i.shape)
    outputs_ref = func(*inputs)
    if not isinstance(outputs_ref, tuple):
        outputs_ref = (outputs_ref, )

    output = F.sink(*outputs_ref)
    forward_sequence = []

    def visit_func(f):
        if f.name != 'Sink':
            forward_sequence.append(f.name)

    output.visit(visit_func)

    for a, b in zip(g.default_graph().forward_sequence(), forward_sequence):
        assert a.type == b
Example #12
def test_layer_normalization_forward_backward(seed, x_shape, batch_axis,
                                              output_stat):
    rng = np.random.RandomState(seed)
    input = rng.randn(*x_shape).astype(np.float32)

    stat_shape = list(x_shape)
    for baxis in _force_list(batch_axis):
        stat_shape[baxis] = 1

    beta = rng.randn(*stat_shape).astype(np.float32)
    gamma = rng.randn(*stat_shape).astype(np.float32)
    eps = 1e-05

    x = nn.Variable.from_numpy_array(input)
    v_beta = nn.Variable.from_numpy_array(beta)
    v_gamma = nn.Variable.from_numpy_array(gamma)

    output = F.layer_normalization(x, v_beta, v_gamma, batch_axis, eps,
                                   output_stat)
    ref = ref_layer_normalization(input, beta, gamma, batch_axis, eps,
                                  output_stat)

    if output_stat:
        tmp = F.sink(*output)
        tmp.forward()
        tmp.backward()

        for o, r in zip(output, ref):
            assert o.shape == r.shape
            assert_allclose(o.d, r, atol=1e-2, rtol=1e-5)

    else:
        output.forward()
        output.backward()

        assert_allclose(output.d, ref, atol=1e-2, rtol=1e-5)
Example #13
    def _build(self):
        # inference
        self.infer_obs_t = nn.Variable((1,) + self.obs_shape)
        with nn.parameter_scope('trainable'):
            self.infer_policy_t = policy_network(self.infer_obs_t,
                                                 self.action_size, 'actor')

        # training
        self.obss_t = nn.Variable((self.batch_size,) + self.obs_shape)
        self.acts_t = nn.Variable((self.batch_size, self.action_size))
        self.rews_tp1 = nn.Variable((self.batch_size, 1))
        self.obss_tp1 = nn.Variable((self.batch_size,) + self.obs_shape)
        self.ters_tp1 = nn.Variable((self.batch_size, 1))

        # critic loss
        with nn.parameter_scope('trainable'):
            # critic functions
            q1_t = q_network(self.obss_t, self.acts_t, 'critic/1')
            q2_t = q_network(self.obss_t, self.acts_t, 'critic/2')
        with nn.parameter_scope('target'):
            # target functions
            policy_tp1 = policy_network(self.obss_tp1, self.action_size,
                                        'actor')
            smoothed_target = _smoothing_target(policy_tp1,
                                                self.target_reg_sigma,
                                                self.target_reg_clip)
            q1_tp1 = q_network(self.obss_tp1, smoothed_target, 'critic/1')
            q2_tp1 = q_network(self.obss_tp1, smoothed_target, 'critic/2')
        q_tp1 = F.minimum2(q1_tp1, q2_tp1)
        y = self.rews_tp1 + self.gamma * q_tp1 * (1.0 - self.ters_tp1)
        td1 = F.mean(F.squared_error(q1_t, y))
        td2 = F.mean(F.squared_error(q2_t, y))
        self.critic_loss = td1 + td2

        # actor loss
        with nn.parameter_scope('trainable'):
            policy_t = policy_network(self.obss_t, self.action_size, 'actor')
            q1_t_with_actor = q_network(self.obss_t, policy_t, 'critic/1')
            q2_t_with_actor = q_network(self.obss_t, policy_t, 'critic/2')
        q_t_with_actor = F.minimum2(q1_t_with_actor, q2_t_with_actor)
        self.actor_loss = -F.mean(q_t_with_actor)

        # get neural network parameters
        with nn.parameter_scope('trainable'):
            with nn.parameter_scope('critic'):
                critic_params = nn.get_parameters()
            with nn.parameter_scope('actor'):
                actor_params = nn.get_parameters()

        # setup optimizers
        self.critic_solver = S.Adam(self.critic_lr)
        self.critic_solver.set_parameters(critic_params)
        self.actor_solver = S.Adam(self.actor_lr)
        self.actor_solver.set_parameters(actor_params)

        with nn.parameter_scope('trainable'):
            trainable_params = nn.get_parameters()
        with nn.parameter_scope('target'):
            target_params = nn.get_parameters()

        # build target update
        update_targets = []
        sync_targets = []
        for key, src in trainable_params.items():
            dst = target_params[key]
            updated_dst = (1.0 - self.tau) * dst + self.tau * src
            update_targets.append(F.assign(dst, updated_dst))
            sync_targets.append(F.assign(dst, src))
        self.update_target_expr = F.sink(*update_targets)
        self.sync_target_expr = F.sink(*sync_targets)
Example #14
def forward_backward_all(*vv):
    y = F.sink(*vv)
    y.forward()
    y.backward()
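
A hypothetical call to the helper above (the two branches are assumptions): a single sink runs forward and backward over every output passed in.

x = nn.Variable((4, 8), need_grad=True)
x.d = np.random.randn(*x.shape)
h1 = F.relu(x)
h2 = F.tanh(x)
x.grad.zero()
forward_backward_all(h1, h2)   # x.g accumulates gradients from both branches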
Example #15
def train(args):
    """
    Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    Steps:
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Load checkpoint to resume previous training.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically).
      * Get the next minibatch.
      * Execute forward propagation.
      * Set parameter gradients to zero.
      * Execute backpropagation.
      * All-reduce the gradients.
      * The solver updates parameters using gradients computed by backprop and all-reduce.
      * Compute the training error.
    """
    # Create Communicator and Context
    comm = create_communicator(ignore_error=True)
    if comm:
        n_devices = comm.size
        mpi_rank = comm.rank
        device_id = comm.local_rank
    else:
        n_devices = 1
        mpi_rank = 0
        device_id = args.device_id

    if args.context == 'cpu':
        import nnabla_ext.cpu
        context = nnabla_ext.cpu.context()
    else:
        import nnabla_ext.cudnn
        context = nnabla_ext.cudnn.context(device_id=device_id)
    nn.set_default_context(context)

    n_train_samples = 50000
    n_valid_samples = 10000
    bs_valid = args.batch_size
    iter_per_epoch = int(n_train_samples / args.batch_size / n_devices)

    # Model
    rng = np.random.RandomState(313)
    comm_syncbn = comm if args.sync_bn else None
    if args.net == "cifar10_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=10,
                                       nmaps=64,
                                       act=F.relu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar10
    if args.net == "cifar100_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=100,
                                       nmaps=384,
                                       act=F.elu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar100

    # Create training graphs
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    pred_train = prediction(image_train, test=False)
    pred_train.persistent = True
    loss_train = (loss_function(pred_train, label_train) /
                  n_devices).apply(persistent=True)
    error_train = F.mean(F.top_n_error(pred_train, label_train,
                                       axis=1)).apply(persistent=True)
    loss_error_train = F.sink(loss_train, error_train)

    # Create validation graphs
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    label_valid = nn.Variable((bs_valid, 1))
    pred_valid = prediction(image_valid, test=True)
    error_valid = F.mean(F.top_n_error(pred_valid, label_valid, axis=1))

    # Solvers
    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())
    base_lr = args.learning_rate
    warmup_iter = iter_per_epoch * args.warmup_epoch
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter
    solver.set_learning_rate(base_lr)

    # load checkpoint if file exist.
    start_point = 0
    if args.use_latest_checkpoint:
        files = glob.glob(f'{args.model_save_path}/checkpoint_*.json')
        if len(files) != 0:
            index = max([
                int(n) for n in
                [re.sub(r'.*checkpoint_(\d+).json', '\\1', f) for f in files]
            ])
            # load weights and solver state info from specified checkpoint file.
            start_point = load_checkpoint(
                f'{args.model_save_path}/checkpoint_{index}.json', solver)
        print(f'checkpoint is loaded. start iteration from {start_point}')

    # Create monitor
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=1)
    monitor_vtime = MonitorTimeElapsed("Validation time", monitor, interval=1)

    # Data Iterator

    # If the data does not exist, it will try to download it from the server
    # and prepare it. When executing multiple processes on the same host, it is
    # necessary to execute initial data preparation by the representative
    # process (rank is 0) on the host.

    # Download dataset by rank-0 process
    if single_or_rankzero():
        rng = np.random.RandomState(mpi_rank)
        _, tdata = data_iterator(args.batch_size, True, rng)
        vsource, vdata = data_iterator(bs_valid, False)

    # Wait for data to be prepared without watchdog
    if comm:
        comm.barrier()

    # Prepare dataset for remaining process
    if not single_or_rankzero():
        rng = np.random.RandomState(mpi_rank)
        _, tdata = data_iterator(args.batch_size, True, rng)
        vsource, vdata = data_iterator(bs_valid, False)

    # Training-loop
    ve = nn.Variable()
    for i in range(start_point // n_devices, args.epochs * iter_per_epoch):
        # Validation
        if i % iter_per_epoch == 0:
            ve_local = 0.
            k = 0
            idx = np.random.permutation(n_valid_samples)
            val_images = vsource.images[idx]
            val_labels = vsource.labels[idx]
            for j in range(int(n_valid_samples / n_devices * mpi_rank),
                           int(n_valid_samples / n_devices * (mpi_rank + 1)),
                           bs_valid):
                image = val_images[j:j + bs_valid]
                label = val_labels[j:j + bs_valid]
                if len(image) != bs_valid:  # note that smaller batch is ignored
                    continue
                image_valid.d = image
                label_valid.d = label
                error_valid.forward(clear_buffer=True)
                ve_local += error_valid.d.copy()
                k += 1
            ve_local /= k
            ve.d = ve_local
            if comm:
                comm.all_reduce(ve.data, division=True, inplace=True)

            # Monitoring error and elapsed time
            if single_or_rankzero():
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)

        # Save model
        if single_or_rankzero():
            if i % (args.model_save_interval // n_devices) == 0:
                iter = i * n_devices
                nn.save_parameters(
                    os.path.join(args.model_save_path,
                                 'params_%06d.h5' % iter))
                if args.use_latest_checkpoint:
                    save_checkpoint(args.model_save_path, iter, solver)

        # Forward/Zerograd
        image, label = tdata.next()
        image_train.d = image
        label_train.d = label
        loss_error_train.forward(clear_no_need_grad=True)
        solver.zero_grad()

        # Backward/AllReduce
        backward_and_all_reduce(
            loss_error_train,
            comm,
            with_all_reduce_callback=args.with_all_reduce_callback)

        # Solvers update
        solver.update()

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        # Monitoring loss, error and elapsed time
        if single_or_rankzero():
            monitor_loss.add(i * n_devices, loss_train.d.copy())
            monitor_err.add(i * n_devices, error_train.d.copy())
            monitor_time.add(i * n_devices)

    # Save nnp last epoch
    if single_or_rankzero():
        runtime_contents = {
            'networks': [{
                'name': 'Validation',
                'batch_size': args.batch_size,
                'outputs': {
                    'y': pred_valid
                },
                'names': {
                    'x': image_valid
                }
            }],
            'executors': [{
                'name': 'Runtime',
                'network': 'Validation',
                'data': ['x'],
                'output': ['y']
            }]
        }
        iter = args.epochs * iter_per_epoch
        nn.save_parameters(
            os.path.join(args.model_save_path, 'params_%06d.h5' % iter))
        nnabla.utils.save.save(
            os.path.join(args.model_save_path, f'{args.net}_result.nnp'),
            runtime_contents)
    if comm:
        comm.barrier()
Example #16
def train():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-file", type=str)
    parser.add_argument("--valid-file", type=str)
    parser.add_argument("--num-training-examples", type=int, default=50)
    parser.add_argument("--accum-grad", type=int, default=1)
    parser.add_argument("--valid-interval", type=int, default=200)
    parser.add_argument("--threshold", type=float, default=0.95)
    parser.add_argument("--context", type=str, default="cpu")
    parser.add_argument("--device-id", type=int, default=0)

    args = parser.parse_args()

    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    ctx = get_extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # prepare data iterators
    tdata = data_iterator(
        BAbI15DataSource(args.train_file,
                         args.num_training_examples,
                         shuffle=True), 1, False, False, False)
    vdata = data_iterator(
        BAbI15DataSource(args.valid_file, 1000, shuffle=True), 1, False, False,
        False)

    # prepare monitors
    monitor = M.Monitor("./bAbI15")
    tloss = M.MonitorSeries("Training Loss", monitor, interval=10)
    terror = M.MonitorSeries("Training Error", monitor, interval=10)
    verror = M.MonitorSeries("Validation Error", monitor, interval=1)

    # prepare solver
    solver = S.Adam()
    solver_initialized = False

    cnt = 0
    while True:
        l = 0.0
        e = 0.0

        solver.zero_grad()
        for _ in range(args.accum_grad):
            # read next data
            x = tdata.next()
            V = x[1][0][0]
            E = x[2][0][0]
            ans = x[3][0][0]

            # construct GGNN
            output = predict(V, E)
            output = F.reshape(output, (1, output.shape[0]))

            # initialize solver
            if not solver_initialized:
                solver.set_parameters(nn.get_parameters())
                solver_initialized = True
                solver.zero_grad()

            # calculate loss/error
            label = nn.Variable((1, 1))
            label.data.data[0, 0] = ans
            output2 = output.unlinked()
            loss = F.mean(F.softmax_cross_entropy(output, label))
            error = F.mean(F.top_n_error(output2, label))
            F.sink(loss, error).forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)

            l += loss.data.data
            e += error.data.data

        # dump log
        tloss.add(cnt, l / args.accum_grad)
        terror.add(cnt, e / args.accum_grad)
        l = 0.0
        e = 0.0

        solver.update()

        cnt += 1
        if cnt % args.valid_interval == 0:
            # validation
            validation_error = 0
            correct_example = None
            wrong_example = None
            for _ in range(vdata.size):
                x = vdata.next()
                id2str = x[0][0][0]
                V = x[1][0][0]
                E = x[2][0][0]
                ans = x[3][0][0]

                output = predict(V, E)
                output = F.reshape(output, (1, output.shape[0]))

                # calculate error
                label = nn.Variable((1, 1))
                label.data.data[0, 0] = ans
                error = F.top_n_error(output, label)
                error.forward(clear_no_need_grad=True)

                if error.data.data > 0.5:
                    if wrong_example is None:
                        wrong_example = (id2str, V, E, ans, output.data.data)
                else:
                    if correct_example is None:
                        correct_example = (id2str, V, E, ans, output.data.data)
                validation_error += error.data.data
            validation_error /= vdata.size
            verror.add(cnt, validation_error)
            accuracy = 1 - validation_error
            if accuracy >= args.threshold:

                def show(example):
                    for i, j in example[2]["is"]:
                        print("{} is {}.".format(example[0][i], example[0][j]))
                    for i, j in example[2]["has_fear"]:
                        print("{} are afraid of {}.".format(
                            example[0][i], example[0][j]))
                    i = np.argmax(example[1])
                    print("What is {} afraid of?".format(example[0][i]))
                    i = np.argmax(example[4])
                    print("Expected: {}, Actual: {}".format(
                        example[0][example[3]], example[0][i]))

                if correct_example is not None:
                    show(correct_example)
                if wrong_example is not None:
                    show(wrong_example)

                break
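
A minimal sketch of the loss/error pattern used in the loop above (shapes and values are assumptions): the error branch is computed on an unlinked copy so backward() only flows through the loss, while one sink forwards both in a single pass.

x = nn.Variable((1, 10), need_grad=True)
x.d = np.random.randn(*x.shape)
label = nn.Variable((1, 1))
label.d = np.array([[3]])
pred = F.identity(x)
loss = F.mean(F.softmax_cross_entropy(pred, label))
error = F.mean(F.top_n_error(pred.unlinked(), label))  # no gradient through this branch
F.sink(loss, error).forward()
loss.backward()   # gradients reach x only through the loss branch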
Example #17
def train():
    """
    Naive Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically).
      * Get the next minibatch.
      * Execute forward propagation.
      * Set parameter gradients to zero.
      * Execute backpropagation.
      * All-reduce the gradients.
      * The solver updates parameters using gradients computed by backprop and all-reduce.
      * Compute the training error.
    """
    # Parse args
    args = get_args()
    n_train_samples = 50000
    n_valid_samples = 10000
    bs_valid = args.batch_size

    # Create Communicator and Context
    extension_module = "cudnn"
    ctx = get_extension_context(extension_module, type_config=args.type_config)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    mpi_local_rank = comm.local_rank
    device_id = mpi_local_rank
    ctx.device_id = str(device_id)
    nn.set_default_context(ctx)

    # Model
    rng = np.random.RandomState(313)
    comm_syncbn = comm if args.sync_bn else None
    if args.net == "cifar10_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=10,
                                       nmaps=32,
                                       act=F.relu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar10
    if args.net == "cifar100_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=100,
                                       nmaps=384,
                                       act=F.elu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar100

    # Create training graphs
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    pred_train = prediction(image_train, test=False)
    pred_train.persistent = True
    loss_train = (loss_function(pred_train, label_train) /
                  n_devices).apply(persistent=True)
    error_train = F.mean(F.top_n_error(pred_train, label_train,
                                       axis=1)).apply(persistent=True)
    loss_error_train = F.sink(loss_train, error_train)
    input_image_train = {"image": image_train, "label": label_train}

    # Create validation graph
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    label_valid = nn.Variable((args.batch_size, 1))
    pred_valid = prediction(image_valid, test=True)
    error_valid = F.mean(F.top_n_error(pred_valid, label_valid, axis=1))
    input_image_valid = {"image": image_valid, "label": label_valid}

    # Solvers
    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())
    base_lr = args.learning_rate
    warmup_iter = int(
        1. * n_train_samples / args.batch_size / n_devices) * args.warmup_epoch
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter
    solver.set_learning_rate(base_lr)

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=1)
    monitor_vtime = MonitorTimeElapsed("Validation time", monitor, interval=1)

    # Data Iterator
    rng = np.random.RandomState(device_id)
    _, tdata = data_iterator(args.batch_size, True, rng)
    vsource, vdata = data_iterator(args.batch_size, False)

    # loss_error_train.forward()

    # Training-loop
    ve = nn.Variable()
    for i in range(int(args.max_iter / n_devices)):
        # Validation
        if i % int(n_train_samples / args.batch_size / n_devices) == 0:
            ve_local = 0.
            k = 0
            idx = np.random.permutation(n_valid_samples)
            val_images = vsource.images[idx]
            val_labels = vsource.labels[idx]
            for j in range(int(n_valid_samples / n_devices * mpi_rank),
                           int(n_valid_samples / n_devices * (mpi_rank + 1)),
                           bs_valid):
                image = val_images[j:j + bs_valid]
                label = val_labels[j:j + bs_valid]
                if len(image) != bs_valid:  # note that smaller batch is ignored
                    continue
                input_image_valid["image"].d = image
                input_image_valid["label"].d = label
                error_valid.forward(clear_buffer=True)
                ve_local += error_valid.d.copy()
                k += 1
            ve_local /= k
            ve.d = ve_local
            comm.all_reduce(ve.data, division=True, inplace=True)

            # Save model
            if device_id == 0:
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)
                if i % int(args.model_save_interval / n_devices) == 0:
                    nn.save_parameters(
                        os.path.join(args.model_save_path,
                                     'params_%06d.h5' % i))

        # Forward/Zerograd
        image, label = tdata.next()
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_error_train.forward(clear_no_need_grad=True)
        solver.zero_grad()

        # Backward/AllReduce
        backward_and_all_reduce(
            loss_error_train,
            comm,
            with_all_reduce_callback=args.with_all_reduce_callback)

        # Solvers update
        solver.update()

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        if device_id == 0:  # loss and error locally, and elapsed time
            monitor_loss.add(i * n_devices, loss_train.d.copy())
            monitor_err.add(i * n_devices, error_train.d.copy())
            monitor_time.add(i * n_devices)

        # exit(0)

    if device_id == 0:
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         'params_%06d.h5' % (args.max_iter / n_devices)))
Example #18
def clear_no_need_grad_tester(rng,
                              func,
                              inputs,
                              func_args=[],
                              func_kwargs={},
                              backward=None,
                              atol_f=1e-6,
                              ctx=None,
                              func_name=None,
                              insert_identity=[],
                              auto_forward=False):
    if ctx is None:
        ctx = nn.Context()
    if backward is None:
        backward = [True for _ in inputs]
    if True not in backward:
        return

    state_rng = None
    if rng is not None:
        state_rng = rng.get_state()
    else:
        rng = np.random.RandomState(313)

    def create_variables(inputs, backward):
        vinputs = []
        for i, b in zip(inputs, backward):
            if i is None:
                vinputs += [None]
                continue
            vinputs += [nn.Variable(i.shape, need_grad=b)]
            vinputs[-1].data.cast(i.dtype)[...] = i
        return vinputs

    vinputs = create_variables(inputs, backward)
    vinputs_clear_buffer = create_variables(inputs, backward)
    vinputs_identity_clear_buffer = []
    if not insert_identity:
        insert_identity = [True] * len(vinputs)

    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        for idx, i in enumerate(vinputs_clear_buffer):
            if i is None:
                vinputs_identity_clear_buffer += [None]
            elif insert_identity[idx]:
                vinputs_identity_clear_buffer += [F.identity(i)]
            else:
                vinputs_identity_clear_buffer += [i]

    # Checking forward(clear_no_need_grad=True)
    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        o = func(*(vinputs + func_args), **func_kwargs)
        o = force_tuple(o)
        F.sink(*o).forward(clear_no_need_grad=False)

        o_clear_buffer = func(*(vinputs_identity_clear_buffer + func_args),
                              **func_kwargs)
        o_clear_buffer = force_tuple(o_clear_buffer)
        o_identity_clear_buffer = list(
            map(lambda x: F.identity(x)
                if x is not None else None, o_clear_buffer))
        o_identity_clear_buffer = list(
            filter(lambda x: x is not None, o_identity_clear_buffer))

        F.sink(*o_identity_clear_buffer).forward(clear_no_need_grad=True)

    for i in range(len(o)):
        if o[i] is None:
            continue
        ref = o[i].d
        res = o_identity_clear_buffer[i].d
        assert_allclose(
            ref,
            res,
            atol=atol_f,
            err_msg="{} forward(clear_no_need_grad=True) test fails".format(
                func_name))

    vinputs = list(filter(lambda x: x is not None, vinputs))
    vinputs_clear_buffer = list(
        filter(lambda x: x is not None, vinputs_clear_buffer))

    for i in range(len(vinputs)):
        vinputs[i].grad.zero()
        vinputs_clear_buffer[i].grad.zero()

    for i in range(len(o)):
        if o[i] is None:
            continue
        o[i].g = randn(rng, *o[i].shape)
        o_identity_clear_buffer[i].g = o[i].g

    F.sink(*o).backward()
    F.sink(*o_identity_clear_buffer).backward(clear_buffer=True)

    for i in range(len(vinputs)):
        ref = vinputs[i].g
        res = vinputs_clear_buffer[i].g
        assert_allclose(
            ref,
            res,
            atol=atol_f,
            err_msg="{} forward(clear_no_need_grad=True) and backward test fails"
            .format(func_name))

    if state_rng:
        rng.set_state(state_rng)
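
A hypothetical invocation of the tester above (it assumes the same test module, where the force_tuple and randn helpers are available): check that Tanh produces identical results with and without buffer clearing.

rng = np.random.RandomState(0)
inputs = [rng.randn(2, 3).astype(np.float32)]
clear_no_need_grad_tester(rng, F.tanh, inputs, func_name='Tanh')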
Example #19
def _create_optimizer(ctx, o, networks, datasets, renamed):
    class Optimizer:
        pass

    optimizer = Optimizer()

    optimizer.comm = current_communicator()
    comm_size = optimizer.comm.size if optimizer.comm else 1
    optimizer.start_iter = (o.start_iter - 1) // comm_size + \
        1 if o.start_iter > 0 else 0
    optimizer.end_iter = (o.end_iter - 1) // comm_size + \
        1 if o.end_iter > 0 else 0
    optimizer.name = o.name
    optimizer.order = o.order
    optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1
    optimizer.network = networks[o.network_name]
    optimizer.data_iterators = OrderedDict()
    for d in o.dataset_name:
        optimizer.data_iterators[d] = datasets[d].data_iterator

    optimizer.dataset_assign = OrderedDict()
    for d in o.data_variable:
        optimizer.dataset_assign[
            optimizer.network.variables[renamed.get(d.variable_name, d.variable_name)]] = d.data_name

    optimizer.generator_assign = OrderedDict()
    for g in o.generator_variable:
        optimizer.generator_assign[optimizer.network.variables[
            renamed.get(g.variable_name, g.variable_name)]] = _get_generator(g)

    # for debugging
    # optimizer.net_variables = optimizer.network.variables
    # optimizer.net_variables.update(optimizer.network.parameters)

    optimizer.loss_variables = []
    for l in o.loss_variable:
        optimizer.loss_variables.append(
            optimizer.network.variables[renamed.get(l.variable_name, l.variable_name)])

    optimizer.parameter_learning_rate_multipliers = OrderedDict()
    for p in o.parameter_variable:
        param_variable_names = _get_matching_variable_names(
            p.variable_name, list(itertools.chain(optimizer.network.parameters.keys(),
                                                  optimizer.network.variables.keys())))
        for v_name in param_variable_names:
            if v_name in optimizer.network.parameters:
                optimizer.parameter_learning_rate_multipliers[
                    optimizer.network.parameters[v_name]] = p.learning_rate_multiplier
            elif v_name in optimizer.network.variables:
                optimizer.parameter_learning_rate_multipliers[
                    optimizer.network.variables[v_name]] = p.learning_rate_multiplier

    with nn.context_scope(ctx):
        if o.solver.type == 'Adagrad':
            optimizer.solver = S.Adagrad(
                o.solver.adagrad_param.lr, o.solver.adagrad_param.eps)
            init_lr = o.solver.adagrad_param.lr
        elif o.solver.type == 'Adadelta':
            optimizer.solver = S.Adadelta(
                o.solver.adadelta_param.lr, o.solver.adadelta_param.decay, o.solver.adadelta_param.eps)
            init_lr = o.solver.adadelta_param.lr
        elif o.solver.type == 'AdaBelief':
            optimizer.solver = S.AdaBelief(o.solver.adabelief_param.alpha, o.solver.adabelief_param.beta1,
                                           o.solver.adabelief_param.beta2, o.solver.adabelief_param.eps,
                                           o.solver.adabelief_param.wd,
                                           o.solver.adabelief_param.amsgrad,
                                           o.solver.adabelief_param.weight_decouple,
                                           o.solver.adabelief_param.fixed_decay,
                                           o.solver.adabelief_param.rectify)
            init_lr = o.solver.adabelief_param.alpha
        elif o.solver.type == 'Adam':
            optimizer.solver = S.Adam(o.solver.adam_param.alpha, o.solver.adam_param.beta1,
                                      o.solver.adam_param.beta2, o.solver.adam_param.eps)
            init_lr = o.solver.adam_param.alpha
        elif o.solver.type == 'AdamW':
            optimizer.solver = S.AdamW(o.solver.adamw_param.alpha, o.solver.adamw_param.beta1,
                                       o.solver.adamw_param.beta2, o.solver.adamw_param.eps,
                                       o.solver.adamw_param.wd)
            init_lr = o.solver.adamw_param.alpha
        elif o.solver.type == 'Adamax':
            optimizer.solver = S.Adamax(o.solver.adamax_param.alpha, o.solver.adamax_param.beta1,
                                        o.solver.adamax_param.beta2, o.solver.adamax_param.eps)
            init_lr = o.solver.adamax_param.alpha
        elif o.solver.type == 'AdaBound':
            optimizer.solver = S.AdaBound(o.solver.adabound_param.alpha, o.solver.adabound_param.beta1,
                                          o.solver.adabound_param.beta2, o.solver.adabound_param.eps,
                                          o.solver.adabound_param.final_lr, o.solver.adabound_param.gamma)
            init_lr = o.solver.adabound_param.alpha
        elif o.solver.type == 'AMSGRAD':
            optimizer.solver = S.AMSGRAD(o.solver.amsgrad_param.alpha, o.solver.amsgrad_param.beta1,
                                         o.solver.amsgrad_param.beta2, o.solver.amsgrad_param.eps)
            init_lr = o.solver.amsgrad_param.alpha
        elif o.solver.type == 'AMSBound':
            optimizer.solver = S.AMSBound(o.solver.amsbound_param.alpha, o.solver.amsbound_param.beta1,
                                          o.solver.amsbound_param.beta2, o.solver.amsbound_param.eps,
                                          o.solver.amsbound_param.final_lr, o.solver.amsbound_param.gamma)
            init_lr = o.solver.amsbound_param.alpha
        elif o.solver.type == 'Eve':
            p = o.solver.eve_param
            optimizer.solver = S.Eve(
                p.alpha, p.beta1, p.beta2, p.beta3, p.k, p.k2, p.eps)
            init_lr = p.alpha
        elif o.solver.type == 'Lars':
            optimizer.solver = S.Lars(o.solver.lars_param.lr, o.solver.lars_param.momentum,
                                      o.solver.lars_param.coefficient, o.solver.lars_param.eps)
            init_lr = o.solver.lars_param.lr
        elif o.solver.type == 'Momentum':
            optimizer.solver = S.Momentum(
                o.solver.momentum_param.lr, o.solver.momentum_param.momentum)
            init_lr = o.solver.momentum_param.lr
        elif o.solver.type == 'Nesterov':
            optimizer.solver = S.Nesterov(
                o.solver.nesterov_param.lr, o.solver.nesterov_param.momentum)
            init_lr = o.solver.nesterov_param.lr
        elif o.solver.type == 'RMSprop':
            optimizer.solver = S.RMSprop(
                o.solver.rmsprop_param.lr, o.solver.rmsprop_param.decay, o.solver.rmsprop_param.eps)
            init_lr = o.solver.rmsprop_param.lr
        elif o.solver.type == 'RMSpropGraves':
            optimizer.solver = S.RMSpropGraves(
                o.solver.rmsprop_graves_param.lr, o.solver.rmsprop_graves_param.decay,
                o.solver.rmsprop_graves_param.momentum, o.solver.rmsprop_graves_param.eps)
            init_lr = o.solver.rmsprop_graves_param.lr
        elif o.solver.type == 'Sgd' or o.solver.type == 'SGD':
            optimizer.solver = S.Sgd(o.solver.sgd_param.lr)
            init_lr = o.solver.sgd_param.lr
        elif o.solver.type == 'SgdW':
            optimizer.solver = S.SgdW(o.solver.sgdw_param.lr, o.solver.sgdw_param.momentum,
                                      o.solver.sgdw_param.wd)
            init_lr = o.solver.sgdw_param.lr
        else:
            raise ValueError('Solver "' + o.solver.type +
                             '" is not supported.')

    parameters = {v.name: v.variable_instance for v,
                  local_lr in optimizer.parameter_learning_rate_multipliers.items() if local_lr > 0.0}
    optimizer.solver.set_parameters(parameters)
    optimizer.parameters = OrderedDict(
        sorted(parameters.items(), key=lambda x: x[0]))

    optimizer.weight_decay = o.solver.weight_decay

    # keep following 2 lines for backward compatibility
    optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0
    optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1
    optimizer.solver.set_states_from_protobuf(o)

    optimizer.comm = current_communicator()
    comm_size = optimizer.comm.size if optimizer.comm else 1
    optimizer.scheduler = ExponentialScheduler(init_lr, 1.0, 1)

    if o.solver.lr_scheduler_type == 'Polynomial':
        if o.solver.polynomial_scheduler_param.power != 0.0:
            optimizer.scheduler = PolynomialScheduler(
                init_lr, o.solver.polynomial_scheduler_param.max_iter // comm_size, o.solver.polynomial_scheduler_param.power)
    elif o.solver.lr_scheduler_type == 'Cosine':
        optimizer.scheduler = CosineScheduler(
            init_lr, o.solver.cosine_scheduler_param.max_iter // comm_size)
    elif o.solver.lr_scheduler_type == 'Exponential':
        if o.solver.exponential_scheduler_param.gamma != 1.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr, o.solver.exponential_scheduler_param.gamma, o.solver.exponential_scheduler_param.iter_interval // comm_size if o.solver.exponential_scheduler_param.iter_interval > comm_size else 1)
    elif o.solver.lr_scheduler_type == 'Step':
        if o.solver.step_scheduler_param.gamma != 1.0 and len(o.solver.step_scheduler_param.iter_steps) > 0:
            optimizer.scheduler = StepScheduler(
                init_lr, o.solver.step_scheduler_param.gamma, [step // comm_size for step in o.solver.step_scheduler_param.iter_steps])
    elif o.solver.lr_scheduler_type == 'Custom':
        # ToDo
        raise NotImplementedError()
    elif o.solver.lr_scheduler_type == '':
        if o.solver.lr_decay_interval != 0 or o.solver.lr_decay != 0.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr, o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0, o.solver.lr_decay_interval // comm_size if o.solver.lr_decay_interval > comm_size else 1)
    else:
        raise ValueError('Learning Rate Scheduler "' + o.solver.lr_scheduler_type +
                         '" is not supported.')

    if o.solver.lr_warmup_scheduler_type == 'Linear':
        if o.solver.linear_warmup_scheduler_param.warmup_iter >= comm_size:
            optimizer.scheduler = LinearWarmupScheduler(
                optimizer.scheduler, o.solver.linear_warmup_scheduler_param.warmup_iter // comm_size)

    for v in optimizer.loss_variables:
        v.variable_instance.grad.fill(1.0 / v.variable_instance.size)

    if len(optimizer.loss_variables) == 1:
        optimizer.target = optimizer.loss_variables[0].variable_instance
    else:
        optimizer.target = F.sink(
            *[v.variable_instance for v in optimizer.loss_variables], one_input_grad=False)

    return optimizer
Example #20
    def GRU(self, network, func):
        def register_parameters(h, w, b, with_bias):
            hidden_size = h.shape[1]
            w0, w1, w2 = (np.squeeze(wd, 0)
                          for wd in np.split(w, w.shape[0], axis=0))
            w0_nn = nn.Variable.from_numpy_array(np.transpose(w0, (1, 0)))
            w1_nn = nn.Variable.from_numpy_array(np.transpose(w1, (1, 0)))
            w2_0 = w2[:, :w2.shape[1] - hidden_size]
            w2_1 = w2[:, w2.shape[1] - hidden_size:]
            w2_0_nn = nn.Variable.from_numpy_array(np.transpose(w2_0, (1, 0)))
            w2_1_nn = nn.Variable.from_numpy_array(np.transpose(w2_1, (1, 0)))

            w_dict = {
                self._get_unique_name("@{}/gru/w0_nn".format(func_model_name)):
                w0_nn,
                self._get_unique_name("@{}/gru/w1_nn".format(func_model_name)):
                w1_nn,
                self._get_unique_name("@{}/gru/w2_0_nn".format(func_model_name)):
                w2_0_nn,
                self._get_unique_name("@{}/gru/w2_1_nn".format(func_model_name)):
                w2_1_nn
            }
            params.update(w_dict)
            names.update(w_dict)

            b0 = b1 = b2 = b3 = None
            if with_bias:
                b_dict = {
                    self._get_unique_name("@{}/gru/b{}_nn".format(
                        func_model_name, i)):
                    nn.Variable.from_numpy_array(np.squeeze(b_item, 0))
                    for i, b_item in enumerate(np.split(b, b.shape[0], axis=0))
                }
                b0, b1, b2, b3 = b_dict.values()
                names.update(b_dict)
                params.update(b_dict)

            parameters_dict = {
                'w0_nn': w0_nn,
                'w1_nn': w1_nn,
                'w2_0_nn': w2_0_nn,
                'w2_1_nn': w2_1_nn,
                'b0': b0,
                'b1': b1,
                'b2': b2,
                'b3': b3,
            }
            return parameters_dict

        def gru(x, h, parameters_dict):
            xh = F.concatenate(*(x, h), axis=1)
            w0_nn = parameters_dict.get('w0_nn', None)
            w1_nn = parameters_dict.get('w1_nn', None)
            w2_0_nn = parameters_dict.get('w2_0_nn', None)
            w2_1_nn = parameters_dict.get('w2_1_nn', None)
            b0 = parameters_dict.get('b0', None)
            b1 = parameters_dict.get('b1', None)
            b2 = parameters_dict.get('b2', None)
            b3 = parameters_dict.get('b3', None)

            r_t = F.sigmoid(F.affine(xh, w0_nn, b0))
            z_t = F.sigmoid(F.affine(xh, w1_nn, b1))

            n_t = F.tanh(
                F.affine(x, w2_0_nn, b2) + r_t * F.affine(h, w2_1_nn, b3))
            h_t = (1 - z_t) * n_t + z_t * h

            return h_t

        def create_fixed_length_gru(xs0, h0, w0, w, b, num_layers,
                                    num_directions, with_bias):
            # xs : [T, B, I]
            # h0 : [L, D, B, H]
            # c0 : [L, D, B, H]
            # w0 : [D, 3, H, I+H]
            # w : [L-1, D, 3, H, D * H + H]
            # b : [L, D, 3, H]

            batch_size = xs0.shape[1]
            hidden_size = h0.shape[3]

            if xs0.shape[0] == 1:
                xs = [xs0[0]]
            else:
                xs = F.split(xs0, axis=0)
            hn = []
            for i in range(num_layers):
                wi = w0
                if i > 0:
                    wi = w[i - 1]
                # wi : [D, 3, H, ?]
                # Forward direction
                hif = h0[i, 0]  # [B, H]
                wif = wi[0]
                bif = None
                if with_bias:
                    bif = b[i, 0]
                p_dict = register_parameters(hif, wif, bif, with_bias)
                hs = []
                for j, x in enumerate(xs):
                    # x : [B, I]
                    hif = gru(x, hif, p_dict)
                    hs.append(hif)
                hn.append(hif)

                if num_directions == 1:
                    xs = hs
                    continue

                # Backward direction
                hib = h0[i, 1]  # [B, H]
                wib = wi[1]
                bib = None
                if with_bias:
                    bib = b[i, 1]
                p_dict = register_parameters(hib, wib, bib, with_bias)
                for k, x in enumerate(reversed(xs)):
                    j = len(xs) - 1 - k
                    # x : [B, I]
                    hib = gru(x, hib, p_dict)
                    hs[j] = F.concatenate(hs[j], hib, axis=1)
                hn.append(hib)
                xs = hs

            ys = xs  # list of [B, HD]
            ys = F.stack(*ys, axis=0)  # [T, B, HD]
            hn = F.reshape(F.stack(*hn, axis=0),
                           (num_layers, num_directions, batch_size,
                            hidden_size))  # LD list of [B, H] --> [L, D, B, H]
            return ys, hn

        num_layers = func.gru_param.num_layers
        drop_out = func.gru_param.dropout  # unused
        bidirectional = func.gru_param.bidirectional
        training = func.gru_param.training  # unused
        num_directions = 2 if bidirectional else 1

        xs_nn = nn.Variable(self._variables[func.input[0]].shape.dim[:])
        h0_nn = nn.Variable(self._variables[func.input[1]].shape.dim[:])
        w0_np = self._get_parameter(func.input[2])
        w_np = None
        b_np = None
        with_bias = False
        if num_layers > 1:
            w_np = self._get_parameter(func.input[3])
            if len(func.input) == 5:
                b_np = self._get_parameter(func.input[4])
                with_bias = True
        else:
            if len(func.input) == 4:
                b_np = self._get_parameter(func.input[3])
                with_bias = True

        nn.graph_def.reset_default_graph()
        names = {func.input[0]: xs_nn, func.input[1]: h0_nn}
        params = {}

        func_model_name = self._get_unique_name("gru")

        ys, hn = create_fixed_length_gru(xs_nn, h0_nn, w0_np, w_np, b_np,
                                         num_layers, num_directions,
                                         with_bias)  # returns Variables
        names.update({func.output[0]: ys, func.output[1]: hn})

        output = F.sink(ys, hn)
        pg = ProtoGenerator(func_model_name, names)
        output.visit(pg)

        for _, proto_v in pg.variables.items():
            self._variables[proto_v.name] = proto_v

        for pv_name, pv in params.items():
            if pv_name in self._variables:
                self._variables[pv_name].type = "Parameter"
            parameter = self._nnp.protobuf.parameter.add()
            parameter.variable_name = pv_name
            parameter.shape.dim.extend(pv.shape)
            parameter.data.extend(np.array(pv.d).flatten().tolist())
            parameter.need_grad = pv.need_grad
            self._parameters[pv_name] = parameter

        for proto_f in pg.functions:
            self._default_resolver(network, proto_f)
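In the snippet above, F.sink(ys, hn) only gives the two GRU outputs a single root so that ProtoGenerator can visit the whole graph in one pass. Below is a minimal standalone sketch of that traversal pattern; the print callback is illustrative, not the ProtoGenerator used above.

import nnabla as nn
import nnabla.functions as F

x = nn.Variable((2, 3))
y1 = F.relu(x)
y2 = F.tanh(x)

root = F.sink(y1, y2)  # dummy root covering both outputs
root.visit(lambda f: print(f.info.type_name))  # visits ReLU, Tanh and Sink once each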
Exemple #21
0
    def _build(self):
        # infer variable
        self.infer_obs_t = infer_obs_t = nn.Variable((1, 4, 84, 84))
        # inference output
        self.infer_q_t,\
        self.infer_probs_t, _ = self.q_function(infer_obs_t, self.num_actions,
                                                self.min_v, self.max_v,
                                                self.num_bins, 'q_func')
        self.infer_t = F.sink(self.infer_q_t, self.infer_probs_t)

        # train variables
        self.obss_t = nn.Variable((self.batch_size, 4, 84, 84))
        self.acts_t = nn.Variable((self.batch_size, 1))
        self.rews_tp1 = nn.Variable((self.batch_size, 1))
        self.obss_tp1 = nn.Variable((self.batch_size, 4, 84, 84))
        self.ters_tp1 = nn.Variable((self.batch_size, 1))

        # training output
        q_t, probs_t, dists = self.q_function(self.obss_t, self.num_actions,
                                              self.min_v, self.max_v,
                                              self.num_bins, 'q_func')
        q_tp1, probs_tp1, _ = self.q_function(self.obss_tp1, self.num_actions,
                                              self.min_v, self.max_v,
                                              self.num_bins, 'target_q_func')

        expand_last = lambda x: F.reshape(x, x.shape + (1, ))
        flat = lambda x: F.reshape(x, (-1, 1))

        # extract selected dimension
        a_t_one_hot = expand_last(F.one_hot(self.acts_t, (self.num_actions, )))
        probs_t_selected = F.max(probs_t * a_t_one_hot, axis=1)
        # extract max dimension
        _, indices = F.max(q_tp1, axis=1, keepdims=True, with_index=True)
        a_tp1_one_hot = expand_last(F.one_hot(indices, (self.num_actions, )))
        probs_tp1_best = F.max(probs_tp1 * a_tp1_one_hot, axis=1)

        # clipping reward
        clipped_rews_tp1 = clip_by_value(self.rews_tp1, -1.0, 1.0)

        disc_q_tp1 = F.reshape(dists, (1, -1)) * (1.0 - self.ters_tp1)
        t_z = clip_by_value(clipped_rews_tp1 + self.gamma * disc_q_tp1,
                            self.min_v, self.max_v)

        # update indices
        b = (t_z - self.min_v) / ((self.max_v - self.min_v) /
                                  (self.num_bins - 1))
        l = F.floor(b)
        l_mask = F.reshape(F.one_hot(flat(l), (self.num_bins, )),
                           (-1, self.num_bins, self.num_bins))
        u = F.ceil(b)
        u_mask = F.reshape(F.one_hot(flat(u), (self.num_bins, )),
                           (-1, self.num_bins, self.num_bins))

        m_l = expand_last(probs_tp1_best * (1 - (b - l)))
        m_u = expand_last(probs_tp1_best * (b - l))
        m = F.sum(m_l * l_mask + m_u * u_mask, axis=1)
        m.need_grad = False
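        # The lines above perform the categorical (C51-style) projection: each
        # target atom's probability mass is split between the neighbouring bins
        # l = floor(b) and u = ceil(b) with weights (1 - (b - l)) and (b - l),
        # then summed into m.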

        self.loss = -F.mean(F.sum(m * F.log(probs_t_selected + 1e-10), axis=1))

        # optimizer
        self.solver = S.RMSprop(self.lr, 0.95, 1e-2)

        # weights and biases
        with nn.parameter_scope('q_func'):
            self.params = nn.get_parameters()
        with nn.parameter_scope('target_q_func'):
            self.target_params = nn.get_parameters()

        # set q function parameters to solver
        self.solver.set_parameters(self.params)
Exemple #22
0
def _get_network_sink(outputs):
    import nnabla.functions as F
    outputs = [o for o in outputs.values()]
    return F.sink(*outputs)
Exemple #23
0
def inner_train_test(inputa, inputb, labela, labelb, data_generator,
                     meta_training, args):
    lossesa, lossesb, accuraciesa, accuraciesb = [], [], [], []
    if meta_training:
        num_updates = args.num_updates
        update_lr = args.train_update_lr
    else:
        num_updates = args.test_num_updates
        update_lr = args.update_lr

    # Training
    for inp in data_generator.next():
        inputa.d, inputb.d, labela.d, labelb.d = inp

        # Initialize network
        with nn.parameter_scope('meta'):
            resulta = net(inputa, labela, True, args)
            resultb = net(inputb, labelb, True, args)
            fast_weights = nn.get_parameters()

        # For saving training accuracies
        resulta[0].persistent = True
        resulta[1].persistent = True
        task_lossa_var = [
            resulta[0],
        ]
        task_accuracya_var = [
            resulta[1],
        ]

        # Inner loop
        for j in range(num_updates):
            grad_list = nn.grad(resulta[0], fast_weights.values())
            for ind, key in enumerate(fast_weights.keys()):
                if grad_list[ind] is None:
                    continue
                if args.first_order or not meta_training:
                    grad_list[ind].need_grad = False
                fast_weights[key] = fast_weights[key] - \
                    update_lr * grad_list[ind]

            resulta = net(inputa, labela, True, args, fast_weights)
            resulta[0].persistent = True
            resulta[1].persistent = True
            task_lossa_var.append(resulta[0])
            task_accuracya_var.append(resulta[1])

        # Loss on queries is calculated only at the end of the inner loop
        # Following the original implementation,
        # we always use batch stats for batch normalization even in a test phase
        resultb = net(inputb, labelb, True, args, fast_weights)

        # Forward calculation
        result_all = F.sink(resulta[0], resulta[1], resultb[0], resultb[1])
        result_all.forward()

        if meta_training:
            # Backward calculation
            lossb = resultb[0] / data_generator.batch_size
            lossb.backward()  # gradients on weights are automatically accumulated

        task_lossa = []
        task_accuracya = []
        for j in range(num_updates + 1):
            task_accuracya_var[j].forward()
            task_lossa.append(task_lossa_var[j].d)
            task_accuracya.append(task_accuracya_var[j].d)

        lossesa.append(task_lossa)
        lossesb.append(resultb[0].d)
        accuraciesa.append(task_accuracya)
        accuraciesb.append(resultb[1].d)

    return lossesa, lossesb, accuraciesa, accuraciesb
Exemple #24
0
    def sample_loop(self, model, shape, sampler,
                    noise=None,
                    dump_interval=-1,
                    progress=False,
                    without_auto_forward=False):
        """
        Iteratively sample data from the model from t=T to t=0.
        T is specified as the length of betas given to __init__().

        Args:
            model (callable):
                A callable that takes x_t and t and predicts noise (and sigma-related parameters).
            shape (list-like object): A data shape.
            sampler (callable): A function to sample x_{t-1} given x_{t} and t. Typically, self.p_sample or self.ddim_sample.
            noise (np.ndarray): An initial noise for x_T. If None, random Gaussian noise (F.randn or np.random.randn, depending on the mode) is used.
            dump_interval (int):
                If > 0, all intermediate results at every `dump_interval` steps will be returned as a list.
                e.g. if dump_interval = 10, the predicted results at steps {10, 20, 30, ...} will be returned.
            progress (bool): If True, tqdm will be used to show the sampling progress.
            without_auto_forward (bool): If True, build the graph once and advance it with F.assign updates instead of using auto-forward mode.

        Returns:
            - x_0 (np.ndarray): the final sampled result of x_0
            - samples (a list of (step, np.ndarray)): the sampled results at every `dump_interval`
            - pred_x_starts (a list of (step, np.ndarray)): the predicted x_0 from each x_t at every `dump_interval`
        """
        T = self.num_timesteps
        indices = list(range(T))[::-1]

        samples = []
        pred_x_starts = []

        if progress:
            from tqdm.auto import tqdm
            indices = tqdm(indices)

        if without_auto_forward:
            if noise is None:
                noise = np.random.randn(*shape)
            else:
                assert isinstance(noise, np.ndarray)
                assert noise.shape == shape

            x_t = nn.Variable.from_numpy_array(noise)
            t = nn.Variable.from_numpy_array([T - 1 for _ in range(shape[0])])

            # build graph
            y, pred_x_start = sampler(model, x_t, t)
            up_x_t = F.assign(x_t, y)
            up_t = F.assign(t, t - 1)
            update = F.sink(up_x_t, up_t)

            cnt = 0
            for step in indices:
                y.forward(clear_buffer=True)
                update.forward(clear_buffer=True)

                cnt += 1
                if dump_interval > 0 and cnt % dump_interval == 0:
                    samples.append((step, y.d.copy()))
                    pred_x_starts.append((step, pred_x_start.d.copy()))
        else:
            with nn.auto_forward():
                if noise is None:
                    x_t = F.randn(shape=shape)
                else:
                    assert isinstance(noise, np.ndarray)
                    assert noise.shape == shape
                    x_t = nn.Variable.from_numpy_array(noise)
                cnt = 0
                for step in indices:
                    t = F.constant(step, shape=(shape[0], ))
                    x_t, pred_x_start = sampler(
                        model, x_t, t, no_noise=step == 0)
                    cnt += 1
                    if dump_interval > 0 and cnt % dump_interval == 0:
                        samples.append((step, x_t.d.copy()))
                        pred_x_starts.append((step, pred_x_start.d.copy()))

        assert x_t.shape == shape
        return x_t.d.copy(), samples, pred_x_starts
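The without_auto_forward branch above builds the sampling graph once and then advances the state by forwarding F.assign nodes bundled in an F.sink. A minimal standalone sketch of that update pattern follows; the graph here is a toy counter, not the diffusion sampler.

import numpy as np
import nnabla as nn
import nnabla.functions as F

# Persistent state variable and a graph that computes its next value.
x = nn.Variable.from_numpy_array(np.zeros((1,), dtype=np.float32))
y = F.add_scalar(x, 1.0)       # next state: x + 1
step = F.sink(F.assign(x, y))  # forwarding `step` writes y back into x

for _ in range(3):
    step.forward()             # re-evaluates y from the current x, then assigns

print(x.d)  # [3.]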
Exemple #25
0
def train():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-file", type=str)
    parser.add_argument("--valid-file", type=str)
    parser.add_argument("--num-training-examples", type=int, default=250)
    parser.add_argument("--accum-grad", type=int, default=1)
    parser.add_argument("--valid-interval", type=int, default=200)
    parser.add_argument("--threshold", type=float, default=0.95)
    parser.add_argument("--context", type=str, default="cpu")
    parser.add_argument("--device-id", type=int, default=0)

    args = parser.parse_args()

    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    ctx = get_extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # prepare data iterators
    tdata = data_iterator(
        BAbI19DataSource(args.train_file,
                         args.num_training_examples,
                         shuffle=True), 1, False, False, False)
    vdata = data_iterator(
        BAbI19DataSource(args.valid_file, 1000, shuffle=True), 1, False, False,
        False)

    # prepare monitors
    monitor = M.Monitor("./bAbI19")
    tloss = M.MonitorSeries("Training Loss", monitor, interval=10)
    terror = M.MonitorSeries("Training Error", monitor, interval=10)
    verror = M.MonitorSeries("Validation Error", monitor, interval=1)

    # prepare solver
    solver = S.Adam()
    solver_initialized = False

    cnt = 0
    while True:
        l = 0.0
        e = 0.0

        solver.zero_grad()
        for _ in range(args.accum_grad):
            # read next data
            x = tdata.next()
            V = x[1][0][0]
            E = x[2][0][0]
            ans = x[3][0][0]

            # construct GGNN
            ## convert to nn.Variable
            x = nn.Variable(V.shape)
            x.data.data = V
            h = nn.Variable((len(V), 6))
            h.data.data = utils.h_0(V, 6)

            outputs = predict(V, E, len(ans))
            losses = []
            errors = []
            for a, output in zip(ans, outputs):
                label = nn.Variable((1, 1))
                label.data.data[0, 0] = a

                losses.append(F.mean(F.softmax_cross_entropy(output, label)))
                output2 = output.unlinked()
                errors.append(F.mean(F.top_n_error(output2, label)))

            # initialize solver
            if not solver_initialized:
                solver.set_parameters(nn.get_parameters())
                solver_initialized = True
                solver.zero_grad()

            # calculate loss/error
            loss = F.mean(F.stack(*losses))
            error = F.mean(F.stack(*errors))
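            # F.sink bundles loss and error so that a single forward pass computes both;
            # the backward pass is then run from the loss branch only.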
            F.sink(loss, error).forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)

            l += loss.data.data
            e += error.data.data

        # dump log
        tloss.add(cnt, l / args.accum_grad)
        terror.add(cnt, e / args.accum_grad)
        l = 0.0
        e = 0.0

        solver.update()

        cnt += 1
        if cnt % args.valid_interval == 0:
            # validation
            validation_error = 0
            correct_example = None
            wrong_example = None
            for _ in range(vdata.size):
                x = vdata.next()
                id2str = x[0][0][0]
                V = x[1][0][0]
                E = x[2][0][0]
                ans = x[3][0][0]

                # construct GGNN
                ## convert to nn.Variable
                x = nn.Variable(V.shape)
                x.data.data = V
                h = nn.Variable((len(V), 6))
                h.data.data = utils.h_0(V, 6)

                outputs = predict(V, E, len(ans))
                errors = []
                actual = []
                for a, output in zip(ans, outputs):
                    label = nn.Variable((1, 1))
                    label.data.data[0, 0] = a

                    errors.append(F.mean(F.top_n_error(output, label)))
                    actual.append(output.data.data)

                error = F.mean(F.stack(*errors))
                error.forward(clear_no_need_grad=True)

                x = 0 if error.data.data == 0 else 1

                if x > 0.5:
                    if wrong_example is None:
                        wrong_example = (id2str, V, E, ans, actual)
                else:
                    if correct_example is None:
                        correct_example = (id2str, V, E, ans, actual)
                validation_error += x
            validation_error /= vdata.size
            verror.add(cnt, validation_error)
            accuracy = 1 - validation_error
            if accuracy >= args.threshold:

                def show(example):
                    if "s" in example[2]:
                        for i, j in example[2]["s"]:
                            print("The {} is south the {}.".format(
                                example[0][i], example[0][j]))
                    if "n" in example[2]:
                        for i, j in example[2]["n"]:
                            print("The {} is north the {}.".format(
                                example[0][i], example[0][j]))
                    if "w" in example[2]:
                        for i, j in example[2]["w"]:
                            print("The {} is west the {}.".format(
                                example[0][i], example[0][j]))
                    if "e" in example[2]:
                        for i, j in example[2]["e"]:
                            print("The {} is east the {}.".format(
                                example[0][i], example[0][j]))
                    i = np.argmax(example[1][:, 0])
                    j = np.argmax(example[1][:, 1])
                    print("What is the path from {} to {}?".format(
                        example[0][i], example[0][j]))

                    for (expected, actual) in zip(example[3], example[4]):
                        i = np.argmax(actual[0])
                        print("Expected: {}, Actual: {}".format(
                            id2classes[expected], id2classes[i]))

                if correct_example is not None:
                    show(correct_example)
                if wrong_example is not None:
                    show(wrong_example)

                break
Exemple #26
0
def _executors(info):
    renamed = info.renamed_variables
    proto, networks = info.proto, info.networks

    class Executor:
        pass
    executors = OrderedDict()

    for e in proto.executor:
        executor = Executor()

        executor.network = networks[e.network_name]
        executor.num_evaluations = e.num_evaluations if e.num_evaluations > 0 else 1
        executor.repeat_evaluation_type = e.repeat_evaluation_type
        executor.need_back_propagation = e.need_back_propagation
        executor.no_image_normalization = e.no_image_normalization

        executor.dataset_assign = OrderedDict()
        for d in e.data_variable:
            executor.dataset_assign[executor.network.variables[
                renamed.get(d.variable_name, d.variable_name)]] = d.data_name

        executor.generator_assign = OrderedDict()
        for g in e.generator_variable:
            executor.generator_assign[executor.network.variables[
                renamed.get(g.variable_name, g.variable_name)]] = _get_generator(g)

        executor.output_assign = OrderedDict()
        for o in e.output_variable:
            executor.output_assign[executor.network.variables[
                renamed.get(o.variable_name, o.variable_name)]] = [o.type, o.data_name]

        executor.parameters = OrderedDict()
        for p in e.parameter_variable:
            param_variable_names = _get_matching_variable_names(
                p.variable_name, list(itertools.chain(executor.network.parameters.keys(),
                                                      executor.network.variables.keys())))
            for v_name in param_variable_names:
                if v_name in executor.network.parameters:
                    executor.parameters[
                        executor.network.parameters[v_name]] = v_name
                if v_name in executor.network.variables:
                    executor.parameters[
                        executor.network.variables[v_name]] = v_name

        executor.forward_target = F.sink(*[v.variable_instance
                                           for v in executor.output_assign.keys()])

        if executor.need_back_propagation:
            executor.loss_variables = []
            for l in e.loss_variable:
                executor.loss_variables.append(executor.network.variables[
                    l.variable_name])

            executor.parameter_learning_rate_multipliers = OrderedDict()
            for p in e.parameter_variable:
                param_variable_names = _get_matching_variable_names(
                    p.variable_name, list(itertools.chain(executor.network.parameters.keys(),
                                                          executor.network.variables.keys())))
                for v_name in param_variable_names:
                    if v_name in executor.network.parameters:
                        executor.parameter_learning_rate_multipliers[
                            executor.network.parameters[v_name]] = p.learning_rate_multiplier
                    elif v_name in executor.network.variables:
                        executor.parameter_learning_rate_multipliers[
                            executor.network.variables[v_name]] = p.learning_rate_multiplier

            executor.backward_target = F.sink(
                *[v.variable_instance for v in executor.loss_variables])

        executors[e.name] = executor

    return executors
Exemple #28
0
    def _build(self):
        # inference graph
        self.infer_obs_t = nn.Variable((1, ) + self.obs_shape)
        with nn.parameter_scope('trainable'):
            infer_dist = policy_network(self.infer_obs_t, self.action_size,
                                        'actor')
        self.infer_act_t, _ = _squash_action(infer_dist)
        self.deterministic_act_t = infer_dist.mean()

        # training graph
        self.obss_t = nn.Variable((self.batch_size, ) + self.obs_shape)
        self.acts_t = nn.Variable((self.batch_size, self.action_size))
        self.rews_tp1 = nn.Variable((self.batch_size, 1))
        self.obss_tp1 = nn.Variable((self.batch_size, ) + self.obs_shape)
        self.ters_tp1 = nn.Variable((self.batch_size, 1))

        with nn.parameter_scope('trainable'):
            dist = policy_network(self.obss_t, self.action_size, 'actor')
            squashed_act_t, log_prob_t = _squash_action(dist)
            v_t = v_network(self.obss_t, 'value')
            q_t1 = q_network(self.obss_t, self.acts_t, 'critic/1')
            q_t2 = q_network(self.obss_t, self.acts_t, 'critic/2')
            q_t1_with_actor = q_network(self.obss_t, squashed_act_t,
                                        'critic/1')
            q_t2_with_actor = q_network(self.obss_t, squashed_act_t,
                                        'critic/2')

        with nn.parameter_scope('target'):
            v_tp1 = v_network(self.obss_tp1, 'value')

        # value loss
        q_t = F.minimum2(q_t1_with_actor, q_t2_with_actor)
        v_target = q_t - log_prob_t
        v_target.need_grad = False
        self.value_loss = 0.5 * F.mean(F.squared_error(v_t, v_target))

        # q function loss
        scaled_rews_tp1 = self.rews_tp1 * self.reward_scale
        q_target = scaled_rews_tp1 + self.gamma * v_tp1 * (1.0 - self.ters_tp1)
        q_target.need_grad = False
        q1_loss = 0.5 * F.mean(F.squared_error(q_t1, q_target))
        q2_loss = 0.5 * F.mean(F.squared_error(q_t2, q_target))
        self.critic_loss = q1_loss + q2_loss

        # policy function loss
        mean_loss = 0.5 * F.mean(dist.mean()**2)
        logstd_loss = 0.5 * F.mean(F.log(dist.stddev())**2)
        policy_reg_loss = self.policy_reg * (mean_loss + logstd_loss)
        self.objective_loss = F.mean(log_prob_t - q_t)
        self.actor_loss = self.objective_loss + policy_reg_loss

        # trainable parameters
        with nn.parameter_scope('trainable'):
            with nn.parameter_scope('value'):
                value_params = nn.get_parameters()
            with nn.parameter_scope('critic'):
                critic_params = nn.get_parameters()
            with nn.parameter_scope('actor'):
                actor_params = nn.get_parameters()
        # target parameters
        with nn.parameter_scope('target/value'):
            target_params = nn.get_parameters()

        # target update
        update_targets = []
        sync_targets = []
        for key, src in value_params.items():
            dst = target_params[key]
            updated_dst = (1.0 - self.tau) * dst + self.tau * src
            update_targets.append(F.assign(dst, updated_dst))
            sync_targets.append(F.assign(dst, src))
        self.update_target_expr = F.sink(*update_targets)
        self.sync_target_expr = F.sink(*sync_targets)

        # setup solvers
        self.value_solver = S.Adam(self.value_lr)
        self.value_solver.set_parameters(value_params)
        self.critic_solver = S.Adam(self.critic_lr)
        self.critic_solver.set_parameters(critic_params)
        self.actor_solver = S.Adam(self.actor_lr)
        self.actor_solver.set_parameters(actor_params)
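The target update above is the usual Polyak averaging pattern, with every F.assign bundled into one F.sink so that a single forward call updates all target parameters. A minimal standalone sketch, with illustrative parameter names and tau:

import nnabla as nn
import nnabla.functions as F
from nnabla.parameter import get_parameter_or_create
from nnabla.initializer import ConstantInitializer

tau = 0.005
src = get_parameter_or_create('online/w', (3,), ConstantInitializer(1.0))
dst = get_parameter_or_create('target/w', (3,), ConstantInitializer(0.0), need_grad=False)

# dst <- (1 - tau) * dst + tau * src, executed by one forward call.
update_target = F.sink(F.assign(dst, (1.0 - tau) * dst + tau * src))
update_target.forward()
print(dst.d)  # [0.005 0.005 0.005]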
Exemple #29
0
def backward_function_tester(rng,
                             func,
                             ref_func,
                             inputs,
                             func_args=[],
                             func_kwargs={},
                             atol_f=1e-6,
                             atol_b=1e-3,
                             atol_accum=1e-3,
                             dstep=1e-3,
                             backward=None,
                             ctx=None,
                             func_name=None,
                             ref_grad=None,
                             disable_half_test=False,
                             atol_half=1e-1):
    """Backward function tester

    In the forward test, it compares the results of nn.grad and `func`.backward.
    In the backward test, it compares the analytical gradients with numerical gradients computed using `grad_outputs`.
    """
    # TODO: half

    from scipy.optimize import approx_fprime

    if ctx is None:
        ctx = nn.Context()
    if backward is None:
        backward = [True if i is not None else False for i in inputs]

    # TODO: Remove set_default_context after adding ctx to BackwardFunction.
    nn.set_default_context(ctx)

    # Create Variables
    def create_variables(inputs, backward):
        vinputs = []
        for i, b in zip(inputs, backward):
            if i is None:
                vinputs += [None]
                continue
            vinputs += [nn.Variable(i.shape, need_grad=b)]
            vinputs[-1].data.cast(i.dtype)[...] = i
        return vinputs

    # Create grad_outputs
    def create_grad_outputs(outputs):
        grad_outputs = []
        for o in outputs:
            if o.shape == ():
                go = nn.NdArray.from_numpy_array(np.array(randn(rng)))
                #go = nn.NdArray.from_numpy_array(np.array(1.0))
            else:
                go = nn.NdArray.from_numpy_array(randn(rng, *o.shape))
                #go = nn.NdArray.from_numpy_array(np.ones(o.shape))

            grad_outputs.append(go)
        return grad_outputs

    # Fill grads
    def fill_grads(vinputs, grads):
        for vi, gd in zip(vinputs, grads):
            if vi is None:
                continue
            vi.g = gd

    # Zero grads
    def zero_grads(vinputs):
        for vi in vinputs:
            if vi is None:
                continue
            vi.grad.zero()
        return

    # Gradient penalty on grads
    def gradient_penalty2(grads):
        gp2 = 0.0
        for g in grads:
            gp2 += F.sum(g**2.0)
        return gp2

    # Product sum

    def prod_sum(inputs0, inputs1):
        out = 0.0
        for inp0, inp1 in zip(inputs0, inputs1):
            out += inp0 * nn.Variable(inp1.shape).apply(data=inp1)
        return out

    # Set inputs for the numerical gradients

    def set_inputs(inputs0, vinputs):
        begin = 0
        for i in vinputs:
            end = begin + i.size
            if i.need_grad == True:
                i.d = inputs0[begin:end].reshape(i.shape)
            begin = end

    # Gradient penalty on grads used for computing numerical gradients
    def obj_func(inputs0, gp2, vinputs):
        set_inputs(inputs0, vinputs)
        gp2.forward()
        return gp2.d.copy()

    # # Half test
    # if not disable_half_test:
    #     finputs = create_variables(inputs, backward)
    #     hinputs = create_variables(inputs, backward)
    #     half_test(rng, func, finputs, hinputs, func_args,
    #               func_kwargs, backward, ctx, func_name, atol=atol_half)

    # Create input variables
    vinputs = create_variables(inputs, backward)
    # --- Forward test --- #
    # Zero grads
    zero_grads(vinputs)
    # Forward/Backward on the forward graph
    voutputs = [
        F.sigmoid(x)
        for x in force_list(func(*(vinputs + func_args), **func_kwargs))
    ]
    agrad_outputs = create_grad_outputs(voutputs)
    o = prod_sum(voutputs, agrad_outputs)
    o.forward()
    o.backward()  # clear_buffer=True)
    # Grads
    vinputs = list(filter(lambda vi: vi is not None, vinputs))
    grads = nn.grad(voutputs, vinputs, agrad_outputs)
    grads = list(filter(lambda x: x is not None, grads))
    o = F.sink(*grads)
    o.forward()
    # Check forward
    for vi, go in zip(vinputs, grads):
        if vi.need_grad is False:
            continue
        fgrads = vi.g
        bgrads = go.d
        assert_allclose(fgrads, bgrads, atol=atol_f)

    # TODO: 1. Pass function argument directly to backward functions.
    # TODO: 2. Should be changed to a simpler form by directly testing BackwardFunction

    # --- Backward (accum = False) test --- #
    # Zero grads
    zero_grads(vinputs)
    # Compute analytical grads
    gp2 = gradient_penalty2(grads)
    gp2.forward()
    gp2.backward(clear_buffer=True)
    analytical_grads = np.concatenate(
        [vi.g.copy().flatten() for vi in vinputs])
    analytical_grads0 = analytical_grads
    # Compute numerical grads
    inputs0 = np.concatenate(
        [inp.flatten() for inp in inputs if inp is not None])
    numerical_grads = approx_fprime(inputs0, obj_func, dstep, gp2, vinputs)
    # Check backward
    assert_allclose(analytical_grads, numerical_grads, atol=atol_b)

    # --- Backward (accum = True) test --- #
    # Random grads
    rand_grads = [randn(rng, *vi.shape) for vi in vinputs]
    fill_grads(vinputs, rand_grads)
    # Compute analytical grads
    gp2.forward()
    gp2.backward(clear_buffer=True)

    analytical_grads = np.concatenate(
        [vi.g.copy().flatten() for vi in vinputs])
    rand_grads = np.concatenate([
        rg.flatten() if isinstance(rg, np.ndarray) else np.array(rg).reshape(
            (1, )) for rg in rand_grads
    ])
    analytical_grads -= rand_grads
    # Check backward
    assert_allclose(analytical_grads, analytical_grads0, atol=atol_accum)
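The core of the backward test above is a comparison between backpropagated gradients and finite-difference gradients of a scalar objective. A minimal standalone sketch of that check, using F.sigmoid as a stand-in for the function under test (an assumption, not the tester's actual target):

import numpy as np
import nnabla as nn
import nnabla.functions as F
from scipy.optimize import approx_fprime

rng = np.random.RandomState(0)
x = nn.Variable((2, 3), need_grad=True)
x.d = rng.randn(*x.shape).astype(np.float32)
y = F.sum(F.sigmoid(x) ** 2.0)  # scalar objective, analogous to gradient_penalty2

# Analytical gradient via backprop
x.grad.zero()
y.forward()
y.backward()
analytical = x.g.flatten().copy()

# Numerical gradient via forward differences
def objective(flat_x):
    x.d = flat_x.reshape(x.shape)
    y.forward()
    return float(y.d)

numerical = approx_fprime(x.d.flatten().astype(np.float64), objective, 1e-3)
print(np.allclose(analytical, numerical, atol=1e-3))  # expected: True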
Exemple #30
0
    def _build(self):
        # inference graph
        self.infer_obs_t = nn.Variable((1, ) + self.obs_shape)
        with nn.parameter_scope('trainable'):
            infer_dist = policy_network(self.infer_obs_t, self.action_size,
                                        'actor')
        self.infer_act_t, _ = _squash_action(infer_dist)
        self.deterministic_act_t = infer_dist.mean()

        # training graph
        self.obss_t = nn.Variable((self.batch_size, ) + self.obs_shape)
        self.acts_t = nn.Variable((self.batch_size, self.action_size))
        self.rews_tp1 = nn.Variable((self.batch_size, 1))
        self.obss_tp1 = nn.Variable((self.batch_size, ) + self.obs_shape)
        self.ters_tp1 = nn.Variable((self.batch_size, 1))

        with nn.parameter_scope('trainable'):
            self.log_temp = get_parameter_or_create('temp', [1, 1],
                                                    ConstantInitializer(0.0))
            dist_t = policy_network(self.obss_t, self.action_size, 'actor')
            dist_tp1 = policy_network(self.obss_tp1, self.action_size, 'actor')
            squashed_act_t, log_prob_t = _squash_action(dist_t)
            squashed_act_tp1, log_prob_tp1 = _squash_action(dist_tp1)
            q1_t = q_network(self.obss_t, self.acts_t, 'critic/1')
            q2_t = q_network(self.obss_t, self.acts_t, 'critic/2')
            q1_t_with_actor = q_network(self.obss_t, squashed_act_t,
                                        'critic/1')
            q2_t_with_actor = q_network(self.obss_t, squashed_act_t,
                                        'critic/2')

        with nn.parameter_scope('target'):
            q1_tp1 = q_network(self.obss_tp1, squashed_act_tp1, 'critic/1')
            q2_tp1 = q_network(self.obss_tp1, squashed_act_tp1, 'critic/2')

        # q function loss
        q_tp1 = F.minimum2(q1_tp1, q2_tp1)
        entropy_tp1 = F.exp(self.log_temp) * log_prob_tp1
        mask = (1.0 - self.ters_tp1)
        q_target = self.rews_tp1 + self.gamma * (q_tp1 - entropy_tp1) * mask
        q_target.need_grad = False
        q1_loss = 0.5 * F.mean(F.squared_error(q1_t, q_target))
        q2_loss = 0.5 * F.mean(F.squared_error(q2_t, q_target))
        self.critic_loss = q1_loss + q2_loss

        # policy function loss
        q_t = F.minimum2(q1_t_with_actor, q2_t_with_actor)
        entropy_t = F.exp(self.log_temp) * log_prob_t
        self.actor_loss = F.mean(entropy_t - q_t)

        # temperature loss
        temp_target = log_prob_t - self.action_size
        temp_target.need_grad = False
        self.temp_loss = -F.mean(F.exp(self.log_temp) * temp_target)

        # trainable parameters
        with nn.parameter_scope('trainable'):
            with nn.parameter_scope('critic'):
                critic_params = nn.get_parameters()
            with nn.parameter_scope('actor'):
                actor_params = nn.get_parameters()
        # target parameters
        with nn.parameter_scope('target/critic'):
            target_params = nn.get_parameters()

        # target update
        update_targets = []
        sync_targets = []
        for key, src in critic_params.items():
            dst = target_params[key]
            updated_dst = (1.0 - self.tau) * dst + self.tau * src
            update_targets.append(F.assign(dst, updated_dst))
            sync_targets.append(F.assign(dst, src))
        self.update_target_expr = F.sink(*update_targets)
        self.sync_target_expr = F.sink(*sync_targets)

        # setup solvers
        self.critic_solver = S.Adam(self.critic_lr)
        self.critic_solver.set_parameters(critic_params)
        self.actor_solver = S.Adam(self.actor_lr)
        self.actor_solver.set_parameters(actor_params)
        self.temp_solver = S.Adam(self.temp_lr)
        self.temp_solver.set_parameters({'temp': self.log_temp})
Exemple #31
0
def backward_function_tester(rng,
                             func,
                             inputs=None,
                             func_args=[],
                             func_kwargs={},
                             atol_f=1e-4,
                             atol_b=1e-3,
                             atol_accum=5e-2,
                             dstep=1e-3,
                             backward=None,
                             backward_b=None,
                             ctx=None,
                             non_accum_check=False,
                             skip_backward_check=False,
                             insert_identity=[],
                             auto_forward=False):
    """ Automatic testing of backward function and backward pass of `func` by comparing it.
    The backward pass of `func` is the reference; therefore, 
    the backward pass of `func` must be tested first!

    Syntax of `ref_func`: inputs, parameters
    """

    if ctx is None:
        ctx = nn.Context()
    if backward is None:
        backward = [True for _ in inputs]

    def create_variables(inputs, backward):
        vinputs = []
        for i, b in zip(inputs, backward):
            if i is None:
                vinputs += [None]
                continue
            vinp = nn.Variable(i.shape, need_grad=b)
            vinp.grad.zero()  # always zero grads so that they do not accumulate
            vinputs += [vinp]
            vinputs[-1].data.cast(i.dtype)[...] = i
        return vinputs

    vinputs = create_variables(inputs, backward)
    vinputs_for_clear_buffer = create_variables(inputs, backward)
    vinputs_for_nn_grad = create_variables(inputs, backward)

    vinputs_identity = []
    vinputs_identity_for_clear_buffer = []
    vinputs_identity_for_nn_grad = []
    if not insert_identity:
        insert_identity = [True] * len(vinputs)

    for idx, i in enumerate(
            zip(vinputs, vinputs_for_clear_buffer, vinputs_for_nn_grad)):
        with nn.auto_forward(auto_forward):
            i0, i1, i2 = i
            if i0 is None:
                vinputs_identity += [None]
                vinputs_identity_for_clear_buffer += [None]
                vinputs_identity_for_nn_grad += [None]
            elif insert_identity[idx]:
                vinputs_identity += [F.identity(i0)]
                vinputs_identity_for_clear_buffer += [F.identity(i1)]
                vinputs_identity_for_nn_grad += [F.identity(i2)]
            else:
                vinputs_identity += [i0]
                vinputs_identity_for_clear_buffer += [i1]
                vinputs_identity_for_nn_grad += [i2]

    # Forward and backward of the forward function with no buffer clear
    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        outputs0 = func(*(vinputs_identity + func_args), **func_kwargs)
        outputs0 = force_list(outputs0)
        F.sink(*outputs0).forward(clear_no_need_grad=False)
    grad_voutputs = []
    for output in outputs0:
        ograd = rng.randn(*output.shape)
        grad_voutputs.append(
            nn.Variable.from_numpy_array(ograd).apply(need_grad=True))
        output.g = ograd
    F.sink(*outputs0, one_input_grad=False).backward()
    vinputs = list(filter(lambda x: x is not None, vinputs))
    vinputs_identity = list(filter(lambda x: x is not None, vinputs_identity))
    vinputs_for_clear_buffer = list(
        filter(lambda x: x is not None, vinputs_for_clear_buffer))
    grad_inputs0 = [inp.g.copy() for inp in vinputs]

    # Forward and backward of the forward function with clear redundant buffer
    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        outputs_for_clear_buffer = func(
            *(vinputs_identity_for_clear_buffer + func_args), **func_kwargs)
        outputs_for_clear_buffer = force_list(outputs_for_clear_buffer)
        outputs_for_clear_buffer = list(
            map(lambda x: F.identity(x)
                if x is not None else None, outputs_for_clear_buffer))
        F.sink(*outputs_for_clear_buffer).forward(clear_no_need_grad=True)

    for o, ref_o in zip(outputs_for_clear_buffer, outputs0):
        o.g = ref_o.g

    # Check backward
    F.sink(*outputs_for_clear_buffer,
           one_input_grad=False).backward(clear_buffer=True)

    grad_inputs_for_clear_buffer = [
        inp.g.copy() for inp in vinputs_for_clear_buffer
    ]
    for grad_ref, grad_res in zip(grad_inputs0, grad_inputs_for_clear_buffer):
        if grad_ref is None or grad_res is None:
            continue
        assert_allclose(
            grad_ref,
            grad_res,
            atol=atol_f,
            err_msg=
            "backward(clear_buffer=True) and backward(clear_buffer=False) results differ."
        )

    # Forward of the backward function
    from nnabla.backward_functions import registry
    func_name = output.parent.info.type_name
    func_backward = registry[func_name]
    grad_vinputs = grad_voutputs + vinputs
    grad_vinputs_identity = grad_voutputs + vinputs_identity
    func_info_args = output.parent.info.args
    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        ograds0 = func_backward(grad_vinputs_identity, **func_info_args)
        ograds0 = force_list(ograds0)
        ograds0_ = list(filter(lambda o: o is not None, ograds0))
        F.sink(*ograds0_).forward(clear_no_need_grad=True)
    outputs1 = []
    for ograd in ograds0:
        outputs1.append(ograd.d.copy() if ograd is not None else None)

    # Check num of returned elements
    assert_allclose(
        len(vinputs),
        len(outputs1),
        err_msg="Length of the outputs ({}) does not match "
        "the length of the inputs ({}) to the backward function".format(
            len(outputs1), len(vinputs)))

    # Check forward
    for i, elm in enumerate(zip(grad_inputs0, outputs1)):
        grad_ref, grad_res = elm
        if grad_ref is None or grad_res is None:
            continue
        assert_allclose(
            grad_ref,
            grad_res,
            atol=atol_f,
            err_msg=
            "Forward of the backward function ({}) fails at {}-th output.".
            format(func_backward.__name__, i))

    # Check the same results between backward_function and nn.grad
    vinputs = [v for b, v in zip(backward, vinputs) if b]
    vinputs = list(filter(lambda x: x is not None, vinputs))

    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        outputs0_for_nn_grad = func(
            *(vinputs_identity_for_nn_grad + func_args), **func_kwargs)
        outputs0_for_nn_grad = force_list(outputs0_for_nn_grad)
        vinputs_identity_for_nn_grad = [
            v for b, v in zip(backward, vinputs_identity_for_nn_grad) if b
        ]
        vinputs_identity_for_nn_grad = list(
            filter(lambda x: x is not None, vinputs_identity_for_nn_grad))

        ograds1 = nn.grad(outputs0_for_nn_grad,
                          vinputs_identity_for_nn_grad,
                          grad_outputs=[g.d.copy() for g in grad_voutputs])
        F.sink(*ograds1).forward(clear_no_need_grad=True)
    ograds0 = list(filter(lambda o: o is not None, ograds0))
    ograds1 = list(filter(lambda o: o is not None, ograds1))
    for i in range(len(ograds0)):
        if ograds0[i].parent is None:
            continue
        assert_allclose(ograds0[i].d,
                        ograds1[i].d,
                        atol=atol_f,
                        err_msg="nn.grad and backward_function results differ.")

    # Check backward
    # needed since we sometimes do need_grad=False for optimization, e.g., mask.
    def set_inputs(inputs0, vinputs):
        begin = 0
        for i in vinputs:
            end = begin + i.size
            i.d = inputs0[begin:end].reshape(i.shape)
            begin = end

    def obj_func(inputs0, voutput, vinputs):
        set_inputs(inputs0, vinputs)
        voutput.forward()
        y = voutput.d.copy()
        return y

    initial_grads = []
    for grad_vinput in grad_vinputs:
        if grad_vinput is None:
            continue
        g = np.asarray(rng.randn(*grad_vinput.shape))
        initial_grads.append(g)
    grad_inputs1 = np.concatenate(
        [v.d.flatten() for v in grad_vinputs if v is not None])

    for i, ograd in enumerate(ograds0):
        # We can skip this check if the backward is implemented as a composite of
        # functions, because for some functions the numerical gradient then differs
        # significantly from the analytical one.
        if skip_backward_check:
            continue

        if ograd is None or not backward[i]:
            continue
        for ig, v in zip(initial_grads, grad_vinputs):
            v.g = ig

        # Analytical grad. This must be computed first since approx_fprime
        # destroys the input values.
        rgrad = rng.randn()
        with nn.auto_forward(auto_forward):
            sum_ograd = F.sum(ograd) * rgrad
        sum_ograd.forward(clear_no_need_grad=True)
        sum_ograd.backward()
        analytical_grads = np.concatenate(
            [v.g.flatten() for v in grad_vinputs])
        analytical_grads -= np.concatenate(
            [g.flatten() for g in initial_grads])
        # numerical grad
        from scipy.optimize import approx_fprime
        numerical_grads = approx_fprime(grad_inputs1, obj_func, dstep,
                                        sum_ograd, grad_vinputs)

        # grad_vinputs: dy_1, ..., dy_n, x_1, ..., x_n
        # grad_voutputs: dy_1, ..., dy_n
        seps = [0] + np.cumsum([int(np.prod(v.shape))
                                for v in grad_vinputs]).tolist()
        ngrads = len(grad_voutputs)
        ninputs = len(grad_vinputs)
        backward_b = [True] * ninputs if backward_b is None else backward_b
        for k, sep in enumerate(zip(seps[:-1], seps[1:])):
            if k >= ngrads and not backward[k - ngrads] or not backward_b[k]:
                continue
            s0, s1 = sep
            analytical_grad = analytical_grads[s0:s1]
            numerical_grad = numerical_grads[s0:s1]
            assert_allclose(
                analytical_grad,
                numerical_grad,
                atol=atol_accum,
                err_msg=
                "Backward (accum) of the backward function ({}) wrt {}-th / {} input fails."
                .format(func_backward.__name__, k, ninputs))

    # Some backward functions, such as AffineDataGrad and AffineFilterGrad, are not checked
    # for non-accum behavior anywhere else, so check their non-accum backward method here.
    if non_accum_check:
        # for any outputs, parents are the same function.
        parent = outputs0[0].parent
        inputs = parent.inputs
        # Accum
        initial_grads = np.concatenate(
            [inp.g.flatten() for inp, b in zip(inputs, backward) if b])
        accum = [True] * len(inputs)
        parent.backward(inputs, outputs0, accum=accum)
        accum_grads = np.concatenate(
            [inp.g.flatten() for inp, b in zip(inputs, backward) if b])
        non_accum_grads0 = accum_grads - initial_grads
        # Non-accum
        accum = [False] * len(inputs)
        parent.backward(inputs, outputs0, accum=accum)
        non_accum_grads1 = np.concatenate(
            [inp.g.flatten() for inp, b in zip(inputs, backward) if b])
        # Check
        assert_allclose(
            non_accum_grads0,
            non_accum_grads1,
            atol=atol_b,
            err_msg="Backward (non-accum) of the backward function ({}) fails."
            .format(func_backward.__name__))
Exemple #32
0
def test_sink(seed):
    rng = np.random.RandomState(seed)
    v = nn.Variable((2, 3, 4), need_grad=True)
    h0 = F.tanh(v)
    h1 = F.sigmoid(v)
    v.d = rng.randn(*v.shape).astype(np.float32)

    # Create references
    v.grad.zero()
    h0.forward()
    h1.forward()
    h0.backward()
    h1.backward()  # v.grad is accumulated.
    h0d = h0.d.copy()
    h1d = h1.d.copy()
    vg = v.g.copy()

    # Reset values
    h0.data.zero()
    h1.data.zero()
    v.grad.zero()

    # Check if sink works
    dummy = F.sink(h0, h1, one_input_grad=True)
    dummy.forward()
    dummy.backward()
    assert np.all(h0d == h0.d)
    assert np.all(h1d == h1.d)
    assert np.all(vg == v.g)

    # Check if clear_buffer still keeps h0 and h1 even though they are not
    # leaf variables.
    # This is achieved by the prohibit_clear_input_buffers function defined in sink.hpp.
    dummy = F.sink(h0, h1, one_input_grad=True)
    dummy.forward(clear_buffer=True)
    assert np.all(h0d == h0.d)
    assert np.all(h1d == h1.d)
    # Also checking backward when clear buffers.
    v.grad.zero()
    dummy = F.sink(h0, h1, one_input_grad=True)
    dummy.forward(clear_no_need_grad=True)
    dummy.backward(clear_buffer=True)
    assert np.all(h0d == h0.d)
    assert np.all(h1d == h1.d)
    assert np.all(vg == v.g)

    # Check if one_input_grad=False works
    dummy = F.sink(h0, h1, one_input_grad=False)
    g0 = rng.randn(*h0.shape).astype(np.float32)
    g1 = rng.randn(*h1.shape).astype(np.float32)
    h0.g = g0
    h1.g = g1
    dummy.forward()
    # Compute reference
    v.grad.zero()
    h0.backward(grad=g0)
    h1.backward(grad=g1)
    gv = v.g.copy()
    # Compute with sink
    v.grad.zero()
    dummy.backward()
    assert_allclose(v.g, gv)