def sigma_regularization(ctx, log_var, one):
    """Penalize the averaged standard deviation for deviating from ``one``.

    Args:
        ctx: Context handed to ``nn.context_scope``.
        log_var: Log-variance variable; its standard deviation is averaged
            over ``axis=1``.
        one: Target the averaged standard deviation is compared against.

    Returns:
        Mean squared error between the averaged standard deviation and ``one``.
    """
    with nn.context_scope(ctx):
        # sigma = sqrt(exp(log_var))
        sigma = F.pow_scalar(F.exp(log_var), 0.5)
        mean_sigma = F.mean(sigma, axis=1)
        return F.mean(F.squared_error(mean_sigma, one))
def vat(x, r, eps, predict, distance):
    """Compute the LDS loss, e.g. KL(p(y|x) || p(y|x+n)).

    Args:
        x(`~nnabla.Variable`): N-D array
        r(`~nnabla.Variable`): N-D array of randn/grad
        eps(`~nnabla.Variable`): Scaling factor, xi for power iteration,
            epsilon for loss
        predict: feed-forward-net building function
        distance: distance function, e.g. KL(p(y|x) || p(y|x+n))

    Returns:
        tuple: ``(loss, y)`` where ``loss`` is the mean of ``distance`` and
        ``y`` is the prediction for the unperturbed input, returned so the
        caller does not have to compute it again.
    """
    # Prediction for the clean input.
    clean_out = predict(x)

    # Detached copy so that no gradient flows back through this path.
    detached_out = clean_out.unlinked()

    # Prediction for the perturbed input x + eps * r.
    perturbed_out = predict(x + eps * r)

    # Average the distance between both predictions.
    lds = F.mean(distance(detached_out, perturbed_out))
    return lds, clean_out
def test_graph_logreg(seed):
    """Compare analytical and numerical gradients of a logistic regression graph."""
    rng = np.random.RandomState(seed)
    x = nn.Variable([2, 3, 4], need_grad=True)
    w = nn.Variable([12, 5], need_grad=True)
    b = nn.Variable([5], need_grad=True)
    t = nn.Variable([2, 1])
    inputs = [x, w, b]

    # Fill the inputs in the order x, w, b, then the labels.
    for var in inputs:
        var.d = rng.randn(*var.shape)
    t.d = rng.randint(0, 5, size=t.shape)

    nn.set_default_context(nn.Context())

    # Forward propagation happens while the graph is being defined.
    with nn.auto_forward():
        logits = F.affine(x, w, b, 1)
        L = F.mean(F.softmax_cross_entropy(logits, t, 1))

    # Gradients are always accumulated, so reset them before backprop.
    for var in inputs:
        var.g = 0
    L.backward(clear_buffer=True)
    x.g = rng.randn(*x.shape)

    from nbla_test_utils import \
        compute_analytical_and_numerical_grad_graph as grads
    agrad, ngrad = grads(L, inputs, 1e-3)
    assert np.allclose(ngrad, agrad, atol=1e-2)
def ce_loss_with_uncertainty(ctx, pred, y_l, log_var):
    """Softmax cross entropy on predictions perturbed by learned noise.

    Args:
        ctx: Context used for the loss computation.
        pred: Prediction variable.
        y_l: Label variable.
        log_var: Log-variance variable; fixes the shape and scale of the noise.

    Returns:
        Mean softmax cross entropy of ``pred + sigma * noise`` against ``y_l``.
    """
    # The noise is built outside the context scope, as in the original layout.
    noise = F.randn(0., 1., log_var.shape)
    sigma = F.pow_scalar(F.exp(log_var), 0.5)
    noisy_pred = pred + sigma * noise
    with nn.context_scope(ctx):
        return F.mean(F.softmax_cross_entropy(noisy_pred, y_l))
def kl_divergence(ctx, pred, label, log_var):
    """Cross-entropy style loss between a tempered target and ``softmax(pred)``.

    Args:
        ctx: Context handed to ``nn.context_scope`` and to
            ``softmax_with_temperature``.
        pred: Prediction variable; softmax is taken over ``axis=1``.
        label: Variable passed to ``softmax_with_temperature`` as the target.
        log_var: Log-variance variable; its standard deviation is passed to
            ``softmax_with_temperature``.

    Returns:
        Negative mean over samples of ``sum(target * log(softmax(pred)), axis=1)``.
    """
    with nn.context_scope(ctx):
        sigma = F.pow_scalar(F.exp(log_var), 0.5)
        soft_target = softmax_with_temperature(ctx, label, sigma)
        log_prob = F.log(F.softmax(pred, axis=1))
        per_sample = F.sum(soft_target * log_prob, axis=1)
        return -F.mean(per_sample)
def sigmas_regularization(ctx, log_var0, log_var1):
    """Mean squared error between the standard deviations of two log-variances.

    Args:
        ctx: Context handed to ``nn.context_scope``.
        log_var0: First log-variance variable.
        log_var1: Second log-variance variable.

    Returns:
        ``mean((sqrt(exp(log_var0)) - sqrt(exp(log_var1)))**2)`` as built by
        ``F.squared_error`` and ``F.mean``.
    """
    with nn.context_scope(ctx):
        sigma0 = F.pow_scalar(F.exp(log_var0), 0.5)
        sigma1 = F.pow_scalar(F.exp(log_var1), 0.5)
        return F.mean(F.squared_error(sigma0, sigma1))
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1):
    """Uncertainty-weighted squared error between two predictions.

    Args:
        ctx: Context used for the final loss expression.
        pred0: First prediction variable.
        pred1: Second prediction variable.
        log_var0: Log-variance paired with ``pred0``.
        log_var1: Log-variance paired with ``pred1``.

    Returns:
        ``0.5 * mean(err * (1/v0 + 1/v1) + (v0/v1 + v1/v0))`` with
        ``v = exp(log_var)`` and ``err`` the squared error of the predictions.
    """
    # TODO: squared error/absolute error
    var0 = F.exp(log_var0)
    var1 = F.exp(log_var1)
    sq_err = F.squared_error(pred0, pred1)
    with nn.context_scope(ctx):
        weighted_err = sq_err * (1 / var0 + 1 / var1)
        ratio_term = var0 / var1 + var1 / var0
        return F.mean(weighted_err + ratio_term) * 0.5
def test_graph_model(model, seed):
    """Compare analytical and numerical gradients for a small model graph.

    Args:
        model: One of ``"mlp"``, ``"recurrent"`` or ``"convolution"``; any
            other value raises ``ValueError``.
        seed: Seed for the ``RandomState`` that fills the input, the labels
            and the initial input gradient.
    """
    np.random.seed(313)
    rng = np.random.RandomState(seed)
    x = nn.Variable([2, 3, 4, 4], need_grad=True)
    t = nn.Variable([2, 1])
    x.d = rng.randn(*x.shape)
    t.d = rng.randint(0, 5, size=t.shape)

    nn.set_default_context(nn.Context())

    # Build the network; start from an empty parameter registry.
    nn.clear_parameters()
    if model == "mlp":
        with nn.parameter_scope('fc1'):
            z = PF.affine(x, 3)
        z2 = F.relu(z, inplace=True)
        with nn.parameter_scope('fc2'):
            z3 = PF.affine(z2, 5)
    elif model == "recurrent":
        with nn.parameter_scope('fc1'):
            z = PF.affine(x, 3)
            z2 = F.relu(z, inplace=True)
        h = z2
        # The same 'fc2' scope is entered on both iterations.
        for _ in range(2):
            with nn.parameter_scope('fc2'):
                h = PF.affine(h, 3)
                h = F.relu(h, inplace=True)
        with nn.parameter_scope('fc3'):
            z3 = PF.affine(h, 5)
    elif model == "convolution":
        with nn.parameter_scope('conv1'):
            z = PF.convolution(x, 3, (2, 2))
            z2 = F.relu(z, inplace=True)
        with nn.parameter_scope('fc2'):
            z3 = PF.affine(z2, 5)
    else:
        raise ValueError()
    l = F.softmax_cross_entropy(z3, t, 1)
    L = F.mean(l)

    # Forwardprop
    L.forward(clear_no_need_grad=True)

    # Backprop
    # Diff should be initialized since they are always accumulated
    x.grad.zero()
    L.backward(clear_buffer=True)
    x.g = rng.randn(*x.shape)
    parameters = nn.get_parameters()
    for param in parameters.values():
        param.grad.zero()
    inputs = [x] + list(parameters.values())

    from nbla_test_utils import \
        compute_analytical_and_numerical_grad_graph as grads
    agrad, ngrad = grads(L, inputs, 1e-3)
    assert np.allclose(ngrad, agrad, atol=1.05e-2)
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1):
    """Gaussian-KL style consistency loss between two predictions.

    The per-element loss is
    ``log(s1 / s0) + (var0 / var1 + (pred0 - pred1)**2 / var1) * 0.5`` with
    ``var = exp(log_var)`` and ``s = sqrt(var)``. This equals the KL
    divergence between two univariate Gaussians up to the additive
    constant ``-1/2``.

    Args:
        ctx: Context used for the final loss expression.
        pred0: First prediction variable (mean of the first Gaussian).
        pred1: Second prediction variable (mean of the second Gaussian).
        log_var0: Log-variance paired with ``pred0``.
        log_var1: Log-variance paired with ``pred1``.

    Returns:
        Mean of the per-element loss.
    """
    var0 = F.exp(log_var0)
    var1 = F.exp(log_var1)
    s0 = F.pow_scalar(var0, 0.5)
    # Fixed: s1 was derived from var0, which made log(s1 / s0) always zero.
    s1 = F.pow_scalar(var1, 0.5)
    squared_error = F.squared_error(pred0, pred1)
    with nn.context_scope(ctx):
        loss = F.log(s1 / s0) + (var0 / var1 + squared_error / var1) * 0.5
        loss_sr = F.mean(loss)
    return loss_sr
def sr_loss_with_uncertainty(ctx, pred0, pred1,
                             log_v0, log_v1,
                             log_s0, log_s1):
    """Uncertainty-weighted squared error with two pairs of log-scale inputs.

    Args:
        ctx: Context used for the final loss expression.
        pred0: First prediction variable.
        pred1: Second prediction variable.
        log_v0: Log-scale variable weighting the squared error (first branch).
        log_v1: Log-scale variable weighting the squared error (second branch).
        log_s0: Second log-scale variable of the first branch; only enters
            through the ratio term.
        log_s1: Second log-scale variable of the second branch; only enters
            through the ratio term.

    Returns:
        ``0.5 * mean(err * (1/v0 + 1/v1) + (v0/v1 + v1/v0) + (s0/s1 + s1/s0))``
        where each lower-case symbol is ``exp`` of the matching ``log_*`` input.
    """
    v0 = F.exp(log_v0)
    v1 = F.exp(log_v1)
    sq_err = F.squared_error(pred0, pred1)
    s0 = F.exp(log_s0)
    s1 = F.exp(log_s1)
    with nn.context_scope(ctx):
        weighted_err = sq_err * (1 / v0 + 1 / v1)
        v_ratio = v0 / v1 + v1 / v0
        s_ratio = s0 / s1 + s1 / s0
        return F.mean(weighted_err + v_ratio + s_ratio) * 0.5
def sr_loss_with_uncertainty_and_coef(ctx, pred0, pred1, log_var0, log_var1):
    """Uncertainty-weighted squared error with extra coefficient factors.

    The coefficients come from ``srwu_learned_coef`` and
    ``sigmas_learned_coef`` and are excluded from backpropagation by
    setting ``need_grad = False`` on them.

    Args:
        ctx: Context passed to the coefficient helpers and used for the loss.
        pred0: First prediction variable.
        pred1: Second prediction variable.
        log_var0: Log-variance paired with ``pred0``.
        log_var1: Log-variance paired with ``pred1``.

    Returns:
        ``0.5 * mean(err * (c0/v0 + c1/v1) + (sc0*v0/v1 + sc1*v1/v0))``
        with ``v = exp(log_var)``.
    """
    c0 = srwu_learned_coef(ctx, log_var0)
    c1 = srwu_learned_coef(ctx, log_var1)
    sc0 = sigmas_learned_coef(ctx, log_var0, log_var1)
    sc1 = sigmas_learned_coef(ctx, log_var1, log_var0)
    for coef in (c0, c1, sc0, sc1):
        coef.need_grad = False

    # TODO: squared error/absolute error
    var0 = F.exp(log_var0)
    var1 = F.exp(log_var1)
    sq_err = F.squared_error(pred0, pred1)
    with nn.context_scope(ctx):
        weighted_err = sq_err * (c0 / var0 + c1 / var1)
        ratio_term = sc0 * var0 / var1 + sc1 * var1 / var0
        return F.mean(weighted_err + ratio_term) * 0.5
def test_forward_backward():
    """Smoke test: one forward and backward pass through ``cnn_model_003``."""
    data_shape = (4, 3, 32, 32)  # batch_size, m, h, w
    ctx = extension_context("cpu", device_id=0)

    images = np.random.randn(*data_shape)
    labels = (np.random.rand(data_shape[0], 1) * 10).astype(np.int32)

    x_l = nn.Variable(images.shape)
    y_l = nn.Variable(labels.shape)
    x_l.d = images
    y_l.d = labels

    pred = cnn_model_003(ctx, x_l)
    with nn.context_scope(ctx):
        loss = F.mean(F.softmax_cross_entropy(pred, y_l))

    loss.forward()
    loss.backward()
def get_model(args, num_classes, test=False, tiny=False):
    """Create the computation graph and its input variables.

    Args:
        args: Provides ``batch_size``, ``num_layers`` and ``shortcut_type``.
        num_classes: Number of classes passed to the ResNet builder.
        test: Forwarded to ``model_resnet.resnet_imagenet``.
        tiny: Tiny ImageNet mode if True (64 pixel inputs, 56 pixel network
            input instead of 320 / 224).

    Returns:
        A ``Model`` namedtuple with fields ``image``, ``label``, ``pred``,
        ``loss`` and ``hidden``.
    """
    data_size, nn_in_size = (64, 56) if tiny else (320, 224)

    image = nn.Variable([args.batch_size, 3, data_size, data_size])
    label = nn.Variable([args.batch_size, 1])
    pimage = image_preprocess(image, nn_in_size)
    pred, hidden = model_resnet.resnet_imagenet(
        pimage, num_classes, args.num_layers, args.shortcut_type,
        test=test, tiny=tiny)
    loss = F.mean(F.softmax_cross_entropy(pred, label))

    Model = namedtuple('Model', ['image', 'label', 'pred', 'loss', 'hidden'])
    return Model(image=image, label=label, pred=pred, loss=loss,
                 hidden=hidden)
def test_graph_clear_buffer(seed):
    """Check that gradients do not depend on the buffer-clearing options.

    The same graph is run with every combination of ``clear_no_need_grad``
    (forward) and ``clear_buffer`` (backward). The gradient of the first
    registered parameter must be identical across all combinations.

    Args:
        seed: Seed for the ``RandomState`` that fills the input and labels.
    """
    import os
    import shutil
    import tempfile

    np.random.seed(313)
    rng = np.random.RandomState(seed)
    x = nn.Variable([2, 3, 4, 4])
    t = nn.Variable([2, 1])
    x.d = rng.randn(*x.shape)
    t.d = rng.randint(0, 5, size=t.shape)

    # Network definition
    nn.set_default_context(nn.Context())
    nn.clear_parameters()
    x1 = x + 1
    x2 = x1 - 1
    with nn.parameter_scope('conv1'):
        z = PF.convolution(x2, 3, (2, 2))
        z2 = F.relu(z, inplace=True)
    with nn.parameter_scope('fc2'):
        z3 = PF.affine(z2, 5)
    l = F.softmax_cross_entropy(z3, t, 1)
    L = F.mean(l)

    # Save the initial parameters so that every run starts from the same
    # state. The temporary directory is removed even if an assertion fails
    # (it used to be leaked).
    tmpd = tempfile.mkdtemp()
    try:
        param_path = os.path.join(tmpd, 'parameter.h5')
        nn.save_parameters(param_path)

        reference = None
        for cnng in [False, True]:
            for cb in [False, True]:
                _ = nn.load_parameters(param_path)
                for v in nn.get_parameters().values():
                    v.grad.zero()
                L.forward(clear_no_need_grad=cnng)
                L.backward(clear_buffer=cb)
                g = list(nn.get_parameters().values())[0].g.copy()
                if reference is None:
                    # The first combination provides the reference gradient.
                    reference = g
                else:
                    assert np.all(reference == g)
    finally:
        shutil.rmtree(tmpd, ignore_errors=True)
len(x_valid), batch_size, shuffle=True, with_file_cache=False) x = nn.Variable((batch_size, sentence_length)) t = nn.Variable((batch_size, sentence_length, 1)) h = PF.embed(x, vocab_size, embedding_size) h = LSTM(h, hidden, return_sequences=True) h = TimeDistributed(PF.affine)(h, hidden, name='hidden') y = TimeDistributed(PF.affine)(h, vocab_size, name='output') mask = F.sum(F.sign(t), axis=2) # do not predict 'pad'. entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask count = F.sum(mask, axis=1) loss = F.mean(F.div2(F.sum(entropy, axis=1), count)) # Create solver. solver = S.Momentum(1e-2, momentum=0.9) solver.set_parameters(nn.get_parameters()) # Create monitor. from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed monitor = Monitor('./tmp-lstmlm') monitor_perplexity = MonitorSeries('perplexity', monitor, interval=1) monitor_perplexity_valid = MonitorSeries('perplexity_valid', monitor, interval=1) for epoch in range(max_epoch): train_loss_set = []
def sr_loss(ctx, pred0, pred1):
    """Mean squared error between the softmax outputs of two predictions.

    Args:
        ctx: Context handed to ``nn.context_scope``.
        pred0: First prediction variable.
        pred1: Second prediction variable.

    Returns:
        ``mean(squared_error(softmax(pred0), softmax(pred1)))``.
    """
    with nn.context_scope(ctx):
        prob0 = F.softmax(pred0)
        prob1 = F.softmax(pred1)
        return F.mean(F.squared_error(prob0, prob1))
def train():
    """Train the OpenUnmix model with per-epoch validation and early stopping.

    Steps:
        * Parse arguments and set the default context.
        * Create data iterators for the training and validation sources.
        * Build the training graph and the validation graph (each once).
        * For every epoch: train over all batches, validate, save a
          checkpoint, save the best parameters separately, and stop early
          when ``utils.EarlyStopping`` says so.
    """
    parser, args = get_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Initialize the data iterators for the training/validation sources.
    train_source, valid_source, args = data.load_datasources(
        parser, args, rng=RandomState(42))

    train_iter = data_iterator(train_source, args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)
    valid_iter = data_iterator(valid_source, args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    scaler_mean, scaler_std = get_statistics(args, train_source)

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate,
                                         args.nfft, args.bandwidth)

    unmix = model.OpenUnmix(input_mean=scaler_mean,
                            input_scale=scaler_std,
                            nb_channels=args.nb_channels,
                            hidden_size=args.hidden_size,
                            n_fft=args.nfft,
                            n_hop=args.nhop,
                            max_bin=max_bin,
                            sample_rate=train_source.sample_rate)

    # Create input variables.
    audio_shape = [args.batch_size] + list(train_source._get_data(0)[0].shape)
    mixture_audio = nn.Variable(audio_shape)
    target_audio = nn.Variable(audio_shape)

    vmixture_audio = nn.Variable(audio_shape)
    vtarget_audio = nn.Variable(audio_shape)

    # Create the training graph.
    pred_spec = unmix(mixture_audio, test=False)
    pred_spec.persistent = True
    target_spec = model.Spectrogram(
        *model.STFT(target_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
        mono=(unmix.nb_channels == 1))
    loss = F.mean(F.squared_error(pred_spec, target_spec), axis=1)

    # Create Solver.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # Create the validation graph once. It only depends on the fixed
    # validation input variables, so rebuilding it for every validation
    # batch (as was done before) is loop-invariant work. It is built after
    # the solver setup so the set of solver parameters is unchanged.
    vpred_spec = unmix(vmixture_audio, test=True)
    vpred_spec.persistent = True
    vtarget_spec = model.Spectrogram(
        *model.STFT(vtarget_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
        mono=(unmix.nb_channels == 1))
    vloss = F.mean(F.squared_error(vpred_spec, vtarget_spec), axis=1)

    # Training loop.
    t = tqdm.trange(1, args.epochs + 1, disable=args.quiet)
    es = utils.EarlyStopping(patience=args.patience)

    for epoch in t:
        # TRAINING
        t.set_description("Training Epoch")
        b = tqdm.trange(0, train_source._size // args.batch_size,
                        disable=args.quiet)
        losses = utils.AverageMeter()
        for batch in b:
            mixture_audio.d, target_audio.d = train_iter.next()
            b.set_description("Training Batch")
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.weight_decay(args.weight_decay)
            solver.update()
            losses.update(loss.d.copy().mean())
            b.set_postfix(train_loss=losses.avg)

        # VALIDATION
        vlosses = utils.AverageMeter()
        for batch in range(valid_source._size):
            # Feed the next validation batch into the shared input variables.
            vmixture_audio.d, vtarget_audio.d = valid_iter.next()
            vloss.forward(clear_buffer=True)
            vlosses.update(vloss.d.copy().mean())

        t.set_postfix(train_loss=losses.avg, val_loss=vlosses.avg)

        stop = es.step(vlosses.avg)
        is_best = vlosses.avg == es.best

        # Save the current model.
        nn.save_parameters(
            os.path.join(args.output, 'checkpoint_%s.h5' % args.target))
        if is_best:
            # Keep a separate copy of the best parameters seen so far.
            nn.save_parameters(
                os.path.join(args.output, '%s.h5' % args.target))

        if stop:
            print("Apply Early Stopping")
            break
def _select_augmented(p_aug, rnd, batch_aug, batch):
    """Take ``batch_aug`` where ``p_aug > rnd`` and ``batch`` elsewhere."""
    use_aug = F.greater(F.tile(p_aug, batch.shape[0]), rnd)
    return F.where(use_aug, batch_aug, batch)


def augment(batch, aug_list, p_aug=1.0):
    """Apply the augmentations named in ``aug_list`` with probability ``p_aug``.

    Args:
        batch: Image batch variable; axes 2 and 3 are treated as the spatial
            axes and axis 1 as the channel axis.
        aug_list: Container of augmentation names. Recognised names are
            ``"flip"``, ``"lrflip"``, ``"translation"``, ``"color"`` and
            ``"cutout"``.
        p_aug: Probability of applying each augmentation. A Python number is
            wrapped into a one-element variable; otherwise a one-element
            variable is expected.

    Returns:
        The augmented batch variable.
    """
    # Accept ints as well as floats (e.g. p_aug=1); a plain int used to
    # reach F.tile unconverted.
    if isinstance(p_aug, (int, float)):
        p_aug = nn.Variable.from_numpy_array(p_aug * np.ones((1,)))

    n_samples = batch.shape[0]

    if "flip" in aug_list:
        rnd = F.rand(shape=[n_samples, ])
        batch_aug = F.random_flip(batch, axes=(2, 3))
        batch = _select_augmented(p_aug, rnd, batch_aug, batch)

    if "lrflip" in aug_list:
        rnd = F.rand(shape=[n_samples, ])
        batch_aug = F.random_flip(batch, axes=(3,))
        batch = _select_augmented(p_aug, rnd, batch_aug, batch)

    if "translation" in aug_list and batch.shape[2] >= 8:
        rnd = F.rand(shape=[n_samples, ])
        # Currently nnabla does not support random_shift with
        # border_mode="noise". Instead, a mask with a zero border is shifted
        # together with the batch and multiplied in afterwards.
        # The mask uses the batch's own channel count (was hard-coded to 3).
        mask = np.ones((1, batch.shape[1], batch.shape[2], batch.shape[3]))
        mask[:, :, :, 0] = 0
        mask[:, :, :, -1] = 0
        mask[:, :, 0, :] = 0
        mask[:, :, -1, :] = 0
        batch_int = F.concatenate(
            batch, nn.Variable.from_numpy_array(mask), axis=0)
        batch_int_aug = F.random_shift(
            batch_int,
            shifts=(batch.shape[2] // 8, batch.shape[3] // 8),
            border_mode="nearest")
        batch_aug = F.slice(batch_int_aug, start=(0, 0, 0, 0),
                            stop=batch.shape)
        mask_var = F.slice(batch_int_aug, start=(n_samples, 0, 0, 0),
                           stop=batch_int_aug.shape)
        batch_aug = batch_aug * F.broadcast(mask_var, batch_aug.shape)
        batch = _select_augmented(p_aug, rnd, batch_aug, batch)

    if "color" in aug_list:
        rnd = F.rand(shape=[n_samples, ])
        per_sample = [n_samples, 1, 1, 1]
        # from 0.5 to 1.5
        rnd_contrast = 1.0 + 0.5 * (2.0 * F.rand(shape=per_sample) - 1.0)
        # from -0.5 to 0.5
        rnd_brightness = 0.5 * (2.0 * F.rand(shape=per_sample) - 1.0)
        # from 0.0 to 2.0
        rnd_saturation = 2.0 * F.rand(shape=per_sample)

        # Brightness
        batch_aug = batch + rnd_brightness
        # Saturation
        mean_s = F.mean(batch_aug, axis=1, keepdims=True)
        batch_aug = rnd_saturation * (batch_aug - mean_s) + mean_s
        # Contrast
        mean_c = F.mean(batch_aug, axis=(1, 2, 3), keepdims=True)
        batch_aug = rnd_contrast * (batch_aug - mean_c) + mean_c
        batch = _select_augmented(p_aug, rnd, batch_aug, batch)

    if "cutout" in aug_list and batch.shape[2] >= 16:
        batch = F.random_erase(batch, prob=p_aug.d[0],
                               replacements=(0.0, 0.0))

    return batch
def recon_loss(ctx, pred, x_l):
    """Mean squared error between a reconstruction and its target.

    Args:
        ctx: Context handed to ``nn.context_scope``.
        pred: Reconstructed variable.
        x_l: Target variable.

    Returns:
        ``mean(squared_error(pred, x_l))``.
    """
    with nn.context_scope(ctx):
        per_element = F.squared_error(pred, x_l)
        return F.mean(per_element)
def ce_loss(ctx, pred, y_l):
    """Mean softmax cross entropy of ``pred`` against the labels ``y_l``.

    Args:
        ctx: Context handed to ``nn.context_scope``.
        pred: Prediction variable.
        y_l: Label variable.

    Returns:
        ``mean(softmax_cross_entropy(pred, y_l))``.
    """
    with nn.context_scope(ctx):
        per_sample = F.softmax_cross_entropy(pred, y_l)
        return F.mean(per_sample)
def _build(self):
    """Build the inference graph, the training losses, target updates and solvers.

    Reads ``obs_shape``, ``action_size``, ``batch_size``, ``reward_scale``,
    ``gamma``, ``policy_reg``, ``tau`` and the three learning rates from
    ``self``. Stores the input variables, the value/critic/actor losses, the
    target update expressions and the three Adam solvers on ``self``.
    """
    # inference graph
    self.infer_obs_t = nn.Variable((1, ) + self.obs_shape)
    with nn.parameter_scope('trainable'):
        infer_dist = policy_network(self.infer_obs_t, self.action_size,
                                    'actor')
    self.infer_act_t, _ = _squash_action(infer_dist)
    self.deterministic_act_t = infer_dist.mean()

    # training graph
    self.obss_t = nn.Variable((self.batch_size, ) + self.obs_shape)
    self.acts_t = nn.Variable((self.batch_size, self.action_size))
    self.rews_tp1 = nn.Variable((self.batch_size, 1))
    self.obss_tp1 = nn.Variable((self.batch_size, ) + self.obs_shape)
    self.ters_tp1 = nn.Variable((self.batch_size, 1))

    with nn.parameter_scope('trainable'):
        dist = policy_network(self.obss_t, self.action_size, 'actor')
        squashed_act_t, log_prob_t = _squash_action(dist)
        v_t = v_network(self.obss_t, 'value')
        # Both critics are evaluated on the stored actions and on the
        # actions produced by the current policy.
        q_t1 = q_network(self.obss_t, self.acts_t, 'critic/1')
        q_t2 = q_network(self.obss_t, self.acts_t, 'critic/2')
        q_t1_with_actor = q_network(self.obss_t, squashed_act_t, 'critic/1')
        q_t2_with_actor = q_network(self.obss_t, squashed_act_t, 'critic/2')

    with nn.parameter_scope('target'):
        v_tp1 = v_network(self.obss_tp1, 'value')

    # value loss
    q_t = F.minimum2(q_t1_with_actor, q_t2_with_actor)
    v_target = q_t - log_prob_t
    # The target is marked as not needing gradients.
    v_target.need_grad = False
    self.value_loss = 0.5 * F.mean(F.squared_error(v_t, v_target))

    # q function loss
    scaled_rews_tp1 = self.rews_tp1 * self.reward_scale
    # The bootstrap term is multiplied by (1 - ters_tp1).
    q_target = scaled_rews_tp1 + self.gamma * v_tp1 * (1.0 - self.ters_tp1)
    q_target.need_grad = False
    q1_loss = 0.5 * F.mean(F.squared_error(q_t1, q_target))
    q2_loss = 0.5 * F.mean(F.squared_error(q_t2, q_target))
    self.critic_loss = q1_loss + q2_loss

    # policy function loss
    # Regularizer on the distribution mean and on the log standard deviation.
    mean_loss = 0.5 * F.mean(dist.mean()**2)
    logstd_loss = 0.5 * F.mean(F.log(dist.stddev())**2)
    policy_reg_loss = self.policy_reg * (mean_loss + logstd_loss)
    self.objective_loss = F.mean(log_prob_t - q_t)
    self.actor_loss = self.objective_loss + policy_reg_loss

    # trainable parameters
    with nn.parameter_scope('trainable'):
        with nn.parameter_scope('value'):
            value_params = nn.get_parameters()
        with nn.parameter_scope('critic'):
            critic_params = nn.get_parameters()
        with nn.parameter_scope('actor'):
            actor_params = nn.get_parameters()
    # target parameters
    with nn.parameter_scope('target/value'):
        target_params = nn.get_parameters()

    # target update
    update_targets = []
    sync_targets = []
    for key, src in value_params.items():
        dst = target_params[key]
        # Soft update: dst <- (1 - tau) * dst + tau * src.
        updated_dst = (1.0 - self.tau) * dst + self.tau * src
        update_targets.append(F.assign(dst, updated_dst))
        # Hard sync: dst <- src.
        sync_targets.append(F.assign(dst, src))
    self.update_target_expr = F.sink(*update_targets)
    self.sync_target_expr = F.sink(*sync_targets)

    # setup solvers
    self.value_solver = S.Adam(self.value_lr)
    self.value_solver.set_parameters(value_params)
    self.critic_solver = S.Adam(self.critic_lr)
    self.critic_solver.set_parameters(critic_params)
    self.actor_solver = S.Adam(self.actor_lr)
    self.actor_solver.set_parameters(actor_params)
def train():
    """
    Main script.

    Steps:

    * Parse command line arguments.
    * Specify a context for computation.
    * Initialize DataIterator for MNIST.
    * Construct a computation graph for training and validation.
    * Initialize a solver and set parameter variables to it.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute the error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop on the training graph.
      * Compute training error
      * Set parameter gradients zero
      * Execute backprop.
      * Solver updates parameters by using gradients computed by backprop.
    * Run a final validation, save the parameters and export an ``.nnp``
      file whose output is the softmax of the validation prediction.

    Raises:
        ValueError: If ``args.net`` is neither ``'lenet'`` nor ``'resnet'``.
    """
    args = get_args()

    from numpy.random import seed
    seed(0)

    # Get context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(
        args.context, device_id=args.device_id, type_config=args.type_config)
    nn.set_default_context(ctx)

    # Create CNN network for both training and testing.
    if args.net == 'lenet':
        mnist_cnn_prediction = mnist_lenet_prediction
    elif args.net == 'resnet':
        mnist_cnn_prediction = mnist_resnet_prediction
    else:
        raise ValueError("Unknown network type {}".format(args.net))

    # TRAIN
    # Create input variables.
    image = nn.Variable([args.batch_size, 1, 28, 28])
    label = nn.Variable([args.batch_size, 1])
    # Create prediction graph.
    pred = mnist_cnn_prediction(image, test=False, aug=args.augment_train)
    # pred.d is read after backward(clear_buffer=True) to compute the
    # training error, so it is marked persistent.
    pred.persistent = True
    # Create loss function.
    loss = F.mean(F.softmax_cross_entropy(pred, label))

    # TEST
    # Create input variables.
    vimage = nn.Variable([args.batch_size, 1, 28, 28])
    vlabel = nn.Variable([args.batch_size, 1])
    # Create prediction graph.
    vpred = mnist_cnn_prediction(vimage, test=True, aug=args.augment_test)

    # Create Solver.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=10)

    # Initialize DataIterator for MNIST.
    from numpy.random import RandomState
    data = data_iterator_mnist(args.batch_size, True, rng=RandomState(1223))
    vdata = data_iterator_mnist(args.batch_size, False)
    # Training loop.
    for i in range(args.max_iter):
        if i % args.val_interval == 0:
            # Validation
            ve = 0.0
            for j in range(args.val_iter):
                vimage.d, vlabel.d = vdata.next()
                vpred.forward(clear_buffer=True)
                vpred.data.cast(np.float32, ctx)
                ve += categorical_error(vpred.d, vlabel.d)
            monitor_verr.add(i, ve / args.val_iter)
        if i % args.model_save_interval == 0:
            nn.save_parameters(os.path.join(
                args.model_save_path, 'params_%06d.h5' % i))
        # Training forward
        image.d, label.d = data.next()
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()
        loss.data.cast(np.float32, ctx)
        pred.data.cast(np.float32, ctx)
        e = categorical_error(pred.d, label.d)
        monitor_loss.add(i, loss.d.copy())
        monitor_err.add(i, e)
        monitor_time.add(i)

    # Final validation pass; it is logged under the last loop index ``i``.
    # NOTE(review): ``i`` is undefined here if args.max_iter is 0 — confirm
    # that callers always pass a positive value.
    ve = 0.0
    for j in range(args.val_iter):
        vimage.d, vlabel.d = vdata.next()
        vpred.forward(clear_buffer=True)
        ve += categorical_error(vpred.d, vlabel.d)
    monitor_verr.add(i, ve / args.val_iter)

    parameter_file = os.path.join(
        args.model_save_path,
        '{}_params_{:06}.h5'.format(args.net, args.max_iter))
    nn.save_parameters(parameter_file)

    # append F.Softmax to the prediction graph so users see intuitive outputs
    runtime_contents = {
        'networks': [{
            'name': 'Validation',
            'batch_size': args.batch_size,
            'outputs': {
                'y': F.softmax(vpred)
            },
            'names': {
                'x': vimage
            }
        }],
        'executors': [{
            'name': 'Runtime',
            'network': 'Validation',
            'data': ['x'],
            'output': ['y']
        }]
    }
    save.save(
        os.path.join(args.model_save_path,
                     '{}_result.nnp'.format(args.net)), runtime_contents)
def train(args):
    """
    Main script.

    Builds the siamese training and validation graphs with a contrastive
    loss, optionally resumes from ``args.checkpoint``, runs the training
    loop with periodic validation and checkpointing, and finally saves the
    parameters.

    Args:
        args: Parsed options. The fields read here are ``context``,
            ``device_id``, ``type_config``, ``batch_size``,
            ``learning_rate``, ``checkpoint``, ``monitor_path``,
            ``max_iter``, ``val_interval``, ``val_iter``,
            ``model_save_interval``, ``model_save_path`` and
            ``weight_decay``.
    """
    # Get context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(
        args.context, device_id=args.device_id, type_config=args.type_config)
    nn.set_default_context(ctx)

    # Create CNN network for both training and testing.
    margin = 1.0  # Margin for contrastive loss.

    # TRAIN
    # Create input variables.
    image0 = nn.Variable([args.batch_size, 1, 28, 28])
    image1 = nn.Variable([args.batch_size, 1, 28, 28])
    label = nn.Variable([args.batch_size])
    # Create prediction graph.
    pred = mnist_lenet_siamese(image0, image1, test=False)
    # Create loss function.
    loss = F.mean(contrastive_loss(pred, label, margin))

    # TEST
    # Create input variables.
    vimage0 = nn.Variable([args.batch_size, 1, 28, 28])
    vimage1 = nn.Variable([args.batch_size, 1, 28, 28])
    vlabel = nn.Variable([args.batch_size])
    # Create prediction graph.
    vpred = mnist_lenet_siamese(vimage0, vimage1, test=True)
    vloss = F.mean(contrastive_loss(vpred, vlabel, margin))

    # Create Solver.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    start_point = 0
    if args.checkpoint is not None:
        # load weights and solver state info from specified checkpoint file.
        start_point = load_checkpoint(args.checkpoint, solver)

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_vloss = M.MonitorSeries("Test loss", monitor, interval=10)

    # Initialize DataIterator for MNIST.
    # The same RandomState instance is handed to both iterators.
    rng = np.random.RandomState(313)
    data = siamese_data_iterator(args.batch_size, True, rng)
    vdata = siamese_data_iterator(args.batch_size, False, rng)

    # Training loop.
    for i in range(start_point, args.max_iter):
        if i % args.val_interval == 0:
            # Validation
            ve = 0.0
            for j in range(args.val_iter):
                vimage0.d, vimage1.d, vlabel.d = vdata.next()
                vloss.forward(clear_buffer=True)
                ve += vloss.d
            monitor_vloss.add(i, ve / args.val_iter)
        if i % args.model_save_interval == 0:
            # save checkpoint file
            save_checkpoint(args.model_save_path, i, solver)
        image0.d, image1.d, label.d = data.next()
        solver.zero_grad()
        # Training forward, backward and update
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()
        monitor_loss.add(i, loss.d.copy())
        monitor_time.add(i)

    parameter_file = os.path.join(
        args.model_save_path, 'params_%06d.h5' % args.max_iter)
    nn.save_parameters(parameter_file)
def classification_svd():
    """Fine-tune the SVD-decomposed ("slim") LeNet on MNIST.

    Steps:
        * Parse arguments and set the default context.
        * Build the slim training/validation graphs and fill their
          parameters via ``decompose_network_and_set_params``.
        * Train with Adam on the parameters under the ``slim`` scope.
        * Periodically validate and save the parameters whenever the mean
          validation error improves on the best value seen so far.
        * Run a final validation and save the final parameters.
    """
    args = get_args()

    # Get context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(
        args.context, device_id=args.device_id, type_config=args.type_config)
    nn.set_default_context(ctx)

    # Create CNN network for both training and testing.
    mnist_cnn_prediction = mnist_lenet_prediction_slim

    # TRAIN
    reference = "reference"
    slim = "slim"
    rrate = 0.5  # reduction rate
    # Create input variables.
    image = nn.Variable([args.batch_size, 1, 28, 28])
    label = nn.Variable([args.batch_size, 1])
    # Create the "slim" prediction graph.
    model_load_path = args.model_load_path
    pred = mnist_cnn_prediction(image, scope=slim, rrate=rrate, test=False)
    # pred.d is read after backward(clear_buffer=True), so keep its buffer.
    pred.persistent = True

    # Decompose and set parameters
    decompose_network_and_set_params(model_load_path, reference, slim, rrate)
    loss = F.mean(F.softmax_cross_entropy(pred, label))

    # TEST
    # Create input variables.
    vimage = nn.Variable([args.batch_size, 1, 28, 28])
    vlabel = nn.Variable([args.batch_size, 1])
    # Create the "slim" validation prediction graph.
    vpred = mnist_cnn_prediction(vimage, scope=slim, rrate=rrate, test=True)

    # Create Solver. Only the parameters under the slim scope are trained.
    solver = S.Adam(args.learning_rate)
    with nn.parameter_scope(slim):
        solver.set_parameters(nn.get_parameters())

    # Create monitor.
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=10)

    # Initialize DataIterator for MNIST.
    data = data_iterator_mnist(args.batch_size, True)
    vdata = data_iterator_mnist(args.batch_size, False)
    best_ve = 1.0
    # Training loop.
    for i in range(args.max_iter):
        if i % args.val_interval == 0:
            # Validation
            ve = 0.0
            for j in range(args.val_iter):
                vimage.d, vlabel.d = vdata.next()
                vpred.forward(clear_buffer=True)
                ve += categorical_error(vpred.d, vlabel.d)
            # Compare the mean error, not the sum over val_iter batches.
            # best_ve starts at 1.0 (an error rate), so the summed value
            # could block every save when val_iter > 1.
            mean_ve = ve / args.val_iter
            monitor_verr.add(i, mean_ve)
            if mean_ve < best_ve:
                nn.save_parameters(os.path.join(
                    args.model_save_path, 'params_%06d.h5' % i))
                best_ve = mean_ve
        # Training forward
        image.d, label.d = data.next()
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()
        e = categorical_error(pred.d, label.d)
        monitor_loss.add(i, loss.d.copy())
        monitor_err.add(i, e)
        monitor_time.add(i)

    # Final validation pass, logged under the last loop index.
    ve = 0.0
    for j in range(args.val_iter):
        vimage.d, vlabel.d = vdata.next()
        vpred.forward(clear_buffer=True)
        ve += categorical_error(vpred.d, vlabel.d)
    monitor_verr.add(i, ve / args.val_iter)

    parameter_file = os.path.join(
        args.model_save_path, 'params_{:06}.h5'.format(args.max_iter))
    nn.save_parameters(parameter_file)
def sigma_regularization(ctx, log_var, one):
    """Mean absolute deviation of the standard deviation from ``one``.

    Args:
        ctx: Context handed to ``nn.context_scope``.
        log_var: Log-variance variable.
        one: Target the standard deviation is compared against.

    Returns:
        ``mean(abs(sqrt(exp(log_var)) - one))``.
    """
    with nn.context_scope(ctx):
        sigma = F.pow_scalar(F.exp(log_var), 0.5)
        deviation = F.abs(sigma - one)
        return F.mean(deviation)
def sr_loss(ctx, pred0, pred1):
    """Mean absolute difference between two predictions.

    Args:
        ctx: Context handed to ``nn.context_scope``.
        pred0: First prediction variable.
        pred1: Second prediction variable.

    Returns:
        ``mean(abs(pred0 - pred1))``.
    """
    with nn.context_scope(ctx):
        abs_diff = F.abs(pred0 - pred1)
        return F.mean(abs_diff)
def train(args, train_dataset, tokenizer):
    """Fine-tune the sequence-classification model and evaluate it.

    Args:
        args: Parsed options (model sizes, batch sizes, solver settings,
            output locations, task name, ...).
        train_dataset: Data source wrapped by ``data_iterator`` for training.
        tokenizer: Passed to ``BERTDataSource`` when building the
            evaluation data.

    Returns:
        The ``results`` dict filled from ``compute_metrics`` during
        evaluation.
    """
    # Load the pretrained model
    nn.load_parameters(args.pretrained_model)
    # Drop final layer for task-specific fine-tuning
    nn.parameter.pop_parameter('affine_seq_class/affine/W')
    nn.parameter.pop_parameter('affine_seq_class/affine/b')

    train_dataloader = data_iterator(
        train_dataset, batch_size=args.train_batch_size)

    global_step = 0
    train_loss = 0.0
    model = BertForSequenceClassification()

    # Input variables for the training graph.
    input_ids = nn.Variable((args.train_batch_size, args.max_seq_length))
    attention_mask = nn.Variable((args.train_batch_size, args.max_seq_length))
    token_type_ids = nn.Variable((args.train_batch_size, args.max_seq_length))
    labels = nn.Variable((args.train_batch_size, ))

    # Input variables for the evaluation graph.
    input_ids_eval = nn.Variable((args.eval_batch_size, args.max_seq_length))
    attention_mask_eval = nn.Variable(
        (args.eval_batch_size, args.max_seq_length))
    token_type_ids_eval = nn.Variable(
        (args.eval_batch_size, args.max_seq_length))
    labels_eval = nn.Variable((args.eval_batch_size, ))

    # GELU unless 'relu' is requested explicitly.
    activation = F.gelu
    if args.activation == 'relu':
        activation = F.relu
    loss, _, train_error = model(args,
                                 input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=labels,
                                 num_labels=args.num_labels,
                                 vocab_size=args.vocab_size,
                                 num_embed_dim=args.num_embed_dim,
                                 num_pos_ids=args.num_position_ids,
                                 num_attention_layers=args.num_attention_layers,
                                 num_attention_embed_dim=args.num_attention_embed_dim,
                                 num_attention_heads=args.num_attention_heads,
                                 num_attention_dim_feedforward=args.num_attention_dim_feedforward,
                                 attention_activation=activation,
                                 pool_outmap=args.num_pool_outmap,
                                 embed_dropout_prob=args.embed_dropout,
                                 attention_dropout_prob=args.attention_dropout,
                                 dropout_prob=args.last_dropout,
                                 test=False)

    # loss.d is read after backward(clear_buffer=True).
    loss.persistent = True
    if args.solver == 'Adam':
        solver = S.Adam(args.learning_rate, eps=args.adam_epsilon)
    else:
        solver = S.AdamW(args.learning_rate, eps=args.adam_epsilon)
    solver.set_parameters(nn.get_parameters())

    monitor = Monitor(args.output_dir)
    monitor_loss = MonitorSeries(
        "Training Loss", monitor, interval=10)
    monitor_eloss = MonitorSeries(
        "Evaluation Loss", monitor, interval=10)
    monitor_train_error = MonitorSeries(
        "Training Error Rate", monitor, interval=10)
    monitor_lr = MonitorSeries(
        "learning Rate", monitor, interval=10)

    total_steps = train_dataloader.size // args.train_batch_size
    var_linear = total_steps * args.num_train_epochs
    var_warmup = total_steps * (args.num_train_epochs - 1)
    for epoch in range(args.num_train_epochs):
        logger.info("Starting Epoch %d out of %d",
                    epoch+1, args.num_train_epochs)
        for it in range(total_steps):
            batch = train_dataloader.next()
            input_ids.d = batch[0]
            attention_mask.d = batch[1]
            token_type_ids.d = batch[2]
            labels.d = batch[3]

            learning_rate_linear = lr_linear(global_step, var_linear)
            learning_rate = args.learning_rate * learning_rate_linear

            # The value above is overridden in every case below: the first
            # epoch ramps linearly with global_step, later epochs use
            # lr_linear over the remaining steps.
            if epoch == 0:
                learning_rate = args.learning_rate * (global_step/total_steps)
            if epoch > 0:
                learning_rate_linear = lr_linear(
                    (global_step-total_steps), var_warmup)
                learning_rate = args.learning_rate * learning_rate_linear

            solver.zero_grad()
            nn.forward_all([loss, train_error], clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.weight_decay(args.weight_decay)
            solver.clip_grad_by_norm(args.max_grad_norm)
            solver.set_learning_rate(learning_rate)
            solver.update()

            monitor_loss.add(
                (train_dataloader.size//args.train_batch_size)*epoch+it,
                loss.d.copy())
            monitor_train_error.add(
                (train_dataloader.size//args.train_batch_size)*epoch+it,
                train_error.d.copy())
            monitor_lr.add(global_step, learning_rate)
            global_step += 1
            # NOTE(review): train_loss is accumulated but never read in
            # this function.
            train_loss += F.mean(loss.data)

        # NOTE(review): the evaluation below is laid out as running once per
        # epoch (its output is tagged with epoch + 1 and the results file is
        # opened in append mode) — confirm the nesting against the original
        # file layout.
        eval_task_names = (
            "mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
        eval_outputs_dirs = (args.output_dir, args.output_dir +
                             '-MM') if args.task_name == "mnli" else (args.output_dir,)

        results = {}
        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
            print(eval_task)
            eval_dataset = BERTDataSource(
                args, tokenizer, evaluate=True, shuffle=False)
            if not os.path.exists(eval_output_dir):
                os.makedirs(eval_output_dir)

            eval_dataloader = data_iterator(
                eval_dataset, batch_size=args.eval_batch_size)
            total_eval_steps = eval_dataloader.size // args.eval_batch_size
            eval_loss = 0.0
            nb_eval_steps = 0
            preds = None
            out_label_ids = None
            # The evaluation graph is rebuilt for every evaluation task.
            tmp_eval_loss, logits, eval_error = model(args,
                                                      input_ids=input_ids_eval,
                                                      attention_mask=attention_mask_eval,
                                                      token_type_ids=token_type_ids_eval,
                                                      labels=labels_eval,
                                                      num_labels=args.num_labels,
                                                      vocab_size=args.vocab_size,
                                                      num_embed_dim=args.num_embed_dim,
                                                      num_pos_ids=args.num_position_ids,
                                                      num_attention_layers=args.num_attention_layers,
                                                      num_attention_embed_dim=args.num_attention_embed_dim,
                                                      num_attention_heads=args.num_attention_heads,
                                                      num_attention_dim_feedforward=args.num_attention_dim_feedforward,
                                                      attention_activation=activation,
                                                      pool_outmap=args.num_pool_outmap,
                                                      embed_dropout_prob=args.embed_dropout,
                                                      attention_dropout_prob=args.attention_dropout,
                                                      dropout_prob=args.last_dropout,
                                                      test=True)

            tmp_eval_loss.persistent = True
            # eval_loss becomes a graph node here; it is re-evaluated for
            # every batch via eval_loss.forward() below.
            eval_loss += F.mean(tmp_eval_loss)
            for it in range(total_eval_steps):
                print(it, " ", total_eval_steps)
                batch_eval = eval_dataloader.next()
                input_ids_eval.d = batch_eval[0]
                attention_mask_eval.d = batch_eval[1]
                token_type_ids_eval.d = batch_eval[2]
                labels_eval.d = batch_eval[3]
                nb_eval_steps += 1
                eval_loss.forward()
                monitor_eloss.add(it, eval_loss.d.copy())

                # Collect logits and labels across batches.
                if preds is None:
                    preds = logits.d.copy()
                    out_label_ids = labels_eval.d.copy()
                else:
                    preds = np.append(preds, logits.d.copy(), axis=0)
                    out_label_ids = np.append(
                        out_label_ids, labels_eval.d.copy(), axis=0)
            # NOTE(review): this divides the value of the last batch by the
            # number of steps rather than averaging over batches; the result
            # is not used afterwards.
            eval_loss = eval_loss.d / nb_eval_steps
            if args.output_mode == "classification":
                preds = np.argmax(preds, axis=1)
            elif args.output_mode == "regression":
                preds = np.squeeze(preds)

            result = compute_metrics(eval_task, preds, out_label_ids)
            results.update(result)

            output_eval_file = os.path.join(
                eval_output_dir, "", "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                logger.info("***** Evaluation results {} *****".format(""))
                for key in sorted(result.keys()):
                    logger.info("%d  %s = %s\n", epoch +
                                1, key, str(result[key]))
                    writer.write("%d %s = %s\n" %
                                 (epoch+1, key, str(result[key])))
        print("results", results)
    return results
def _build(self):
    """Construct every graph, loss and solver used by this object.

    Built in order: a batch-size-1 inference graph, the batched training
    graph (policy outputs plus two critics under 'trainable' and two critics
    under 'target'), the critic / actor / temperature losses, the
    expressions that write 'trainable/critic' parameters into
    'target/critic', and one Adam solver per loss.

    NOTE(review): the layout looks like soft actor-critic with twin critics
    and a learned temperature -- confirm against the enclosing class.
    """
    # inference graph
    # A single observation is fed at a time (leading dimension of 1).
    self.infer_obs_t = nn.Variable((1, ) + self.obs_shape)
    with nn.parameter_scope('trainable'):
        infer_dist = policy_network(self.infer_obs_t, self.action_size,
                                    'actor')
    # Only the first value returned by _squash_action is kept here.
    self.infer_act_t, _ = _squash_action(infer_dist)
    self.deterministic_act_t = infer_dist.mean()

    # training graph
    # Placeholders for one batch; the *_tp1 variables are presumably the
    # values at the following time step -- verify against the caller.
    self.obss_t = nn.Variable((self.batch_size, ) + self.obs_shape)
    self.acts_t = nn.Variable((self.batch_size, self.action_size))
    self.rews_tp1 = nn.Variable((self.batch_size, 1))
    self.obss_tp1 = nn.Variable((self.batch_size, ) + self.obs_shape)
    self.ters_tp1 = nn.Variable((self.batch_size, 1))

    with nn.parameter_scope('trainable'):
        # Parameter named 'temp' of shape [1, 1], initialised to 0.0.
        # It is always used through F.exp(...) below.
        self.log_temp = get_parameter_or_create('temp', [1, 1],
                                                ConstantInitializer(0.0))
        dist_t = policy_network(self.obss_t, self.action_size, 'actor')
        dist_tp1 = policy_network(self.obss_tp1, self.action_size, 'actor')
        squashed_act_t, log_prob_t = _squash_action(dist_t)
        squashed_act_tp1, log_prob_tp1 = _squash_action(dist_tp1)
        # Critics evaluated on the actions supplied through acts_t ...
        q1_t = q_network(self.obss_t, self.acts_t, 'critic/1')
        q2_t = q_network(self.obss_t, self.acts_t, 'critic/2')
        # ... and on the actions produced by the policy itself.
        q1_t_with_actor = q_network(self.obss_t, squashed_act_t, 'critic/1')
        q2_t_with_actor = q_network(self.obss_t, squashed_act_t, 'critic/2')

    with nn.parameter_scope('target'):
        # Same critic names, but created under the 'target' scope.
        q1_tp1 = q_network(self.obss_tp1, squashed_act_tp1, 'critic/1')
        q2_tp1 = q_network(self.obss_tp1, squashed_act_tp1, 'critic/2')

    # q function loss
    # Element-wise minimum of the two target critics.
    q_tp1 = F.minimum2(q1_tp1, q2_tp1)
    entropy_tp1 = F.exp(self.log_temp) * log_prob_tp1
    # ters_tp1 == 1 zeroes the bootstrapped term.
    mask = (1.0 - self.ters_tp1)
    q_target = self.rews_tp1 + self.gamma * (q_tp1 - entropy_tp1) * mask
    # NOTE(review): intended to keep the critic loss from back-propagating
    # into the target expression -- confirm that setting need_grad on an
    # intermediate variable has that effect in the nnabla version in use.
    q_target.need_grad = False
    q1_loss = 0.5 * F.mean(F.squared_error(q1_t, q_target))
    q2_loss = 0.5 * F.mean(F.squared_error(q2_t, q_target))
    self.critic_loss = q1_loss + q2_loss

    # policy function loss
    q_t = F.minimum2(q1_t_with_actor, q2_t_with_actor)
    entropy_t = F.exp(self.log_temp) * log_prob_t
    self.actor_loss = F.mean(entropy_t - q_t)

    # temperature loss
    temp_target = log_prob_t - self.action_size
    temp_target.need_grad = False
    self.temp_loss = -F.mean(F.exp(self.log_temp) * temp_target)

    # trainable parameters
    with nn.parameter_scope('trainable'):
        with nn.parameter_scope('critic'):
            critic_params = nn.get_parameters()
        with nn.parameter_scope('actor'):
            actor_params = nn.get_parameters()
    # target parameters
    with nn.parameter_scope('target/critic'):
        target_params = nn.get_parameters()

    # target update
    update_targets = []
    sync_targets = []
    for key, src in critic_params.items():
        # Parameters are matched by their key inside the critic scope.
        dst = target_params[key]
        # Blend: tau weights the trainable value, (1 - tau) the old target.
        updated_dst = (1.0 - self.tau) * dst + self.tau * src
        update_targets.append(F.assign(dst, updated_dst))
        # Plain copy of the trainable value into the target.
        sync_targets.append(F.assign(dst, src))
    # F.sink groups the per-parameter assignments into one expression each.
    self.update_target_expr = F.sink(*update_targets)
    self.sync_target_expr = F.sink(*sync_targets)

    # setup solvers
    self.critic_solver = S.Adam(self.critic_lr)
    self.critic_solver.set_parameters(critic_params)
    self.actor_solver = S.Adam(self.actor_lr)
    self.actor_solver.set_parameters(actor_params)
    self.temp_solver = S.Adam(self.temp_lr)
    self.temp_solver.set_parameters({'temp': self.log_temp})
def sr_loss(ctx, pred0, pred1):
    """Return the mean squared error between ``pred0`` and ``pred1``.

    The loss graph is created inside the nnabla context ``ctx``.
    """
    with nn.context_scope(ctx):
        squared_diff = F.squared_error(pred0, pred1)
        return F.mean(squared_diff)
def cifar10_resnet32_loss(pred, label):
    """Return the batch mean of the softmax cross entropy of ``pred``
    against ``label``."""
    per_sample = F.softmax_cross_entropy(pred, label)
    return F.mean(per_sample)
def train():
    """
    Naive Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * AllReduce for gradients
      * Solver updates parameters by using gradients computed by backprop
        and all reduce.
      * Compute training error

    Raises:
        ValueError: if ``args.net`` is not one of the supported networks.
    """
    # Parse args
    args = get_args()
    n_train_samples = 50000
    n_valid_samples = 10000
    bs_valid = args.batch_size

    # Create Communicator and Context
    extension_module = "cudnn"
    ctx = get_extension_context(extension_module,
                                type_config=args.type_config)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    mpi_local_rank = comm.local_rank
    device_id = mpi_local_rank
    ctx.device_id = str(device_id)
    nn.set_default_context(ctx)

    # Model
    rng = np.random.RandomState(313)
    comm_syncbn = comm if args.sync_bn else None
    if args.net == "cifar10_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=10,
                                       nmaps=32,
                                       act=F.relu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar10
    elif args.net == "cifar100_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=100,
                                       nmaps=384,
                                       act=F.elu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar100
    else:
        # Fail here instead of with a NameError further down.
        raise ValueError("Unsupported network: {!r}".format(args.net))

    # Create training graphs
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    pred_train = prediction(image_train, test=False)
    pred_train.persistent = True
    # The loss is pre-divided by the device count so that the summed
    # (all-reduced) gradients correspond to the average over devices.
    loss_train = (loss_function(pred_train, label_train) /
                  n_devices).apply(persistent=True)
    error_train = F.mean(F.top_n_error(pred_train, label_train,
                                       axis=1)).apply(persistent=True)
    loss_error_train = F.sink(loss_train, error_train)
    input_image_train = {"image": image_train, "label": label_train}

    # Create validation graph
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    # Same batch size as image_valid (the original used args.batch_size,
    # which has the same value but was inconsistent).
    label_valid = nn.Variable((bs_valid, 1))
    pred_valid = prediction(image_valid, test=True)
    error_valid = F.mean(F.top_n_error(pred_valid, label_valid, axis=1))
    input_image_valid = {"image": image_valid, "label": label_valid}

    # Solvers
    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())
    base_lr = args.learning_rate
    iter_per_epoch = int(1. * n_train_samples / args.batch_size / n_devices)
    warmup_iter = iter_per_epoch * args.warmup_epoch
    if warmup_iter > 0:
        # Linear ramp from base_lr to base_lr * n_devices over warmup_iter.
        warmup_slope = base_lr * (n_devices - 1) / warmup_iter
        solver.set_learning_rate(base_lr)
    else:
        # No warm-up requested: start at the value the ramp would end at
        # instead of dividing by zero.
        warmup_slope = 0.
        solver.set_learning_rate(base_lr * n_devices)

    # Modulo operands must never be zero.
    valid_interval = max(1, iter_per_epoch)
    save_interval = max(1, int(args.model_save_interval / n_devices))

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=1)
    monitor_vtime = MonitorTimeElapsed("Validation time", monitor,
                                       interval=1)

    # Data Iterator
    rng = np.random.RandomState(device_id)
    _, tdata = data_iterator(args.batch_size, True, rng)
    vsource, vdata = data_iterator(args.batch_size, False)

    # Training-loop
    ve = nn.Variable()
    for i in range(int(args.max_iter / n_devices)):
        # Validation
        if i % valid_interval == 0:
            ve_local = 0.
            k = 0
            idx = np.random.permutation(n_valid_samples)
            val_images = vsource.images[idx]
            val_labels = vsource.labels[idx]
            # Each rank evaluates its own slice of the shuffled data.
            for j in range(int(n_valid_samples / n_devices * mpi_rank),
                           int(n_valid_samples / n_devices * (mpi_rank + 1)),
                           bs_valid):
                image = val_images[j:j + bs_valid]
                label = val_labels[j:j + bs_valid]
                if len(image) != bs_valid:
                    # note that smaller batch is ignored
                    continue
                input_image_valid["image"].d = image
                input_image_valid["label"].d = label
                error_valid.forward(clear_buffer=True)
                ve_local += error_valid.d.copy()
                k += 1
            if k > 0:
                # Guard: a rank may see no full batch at all.
                ve_local /= k
            ve.d = ve_local
            # Collective call: every rank must reach it.
            comm.all_reduce(ve.data, division=True, inplace=True)

            # Save model
            if device_id == 0:
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)
                if i % save_interval == 0:
                    nn.save_parameters(
                        os.path.join(args.model_save_path,
                                     'params_%06d.h5' % i))

        # Forward/Zerograd
        image, label = tdata.next()
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_error_train.forward(clear_no_need_grad=True)
        solver.zero_grad()

        # Backward/AllReduce
        backward_and_all_reduce(
            loss_error_train,
            comm,
            with_all_reduce_callback=args.with_all_reduce_callback)

        # Solvers update
        solver.update()

        # Linear Warmup
        if warmup_iter > 0 and i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        if device_id == 0:  # loss and error locally, and elapsed time
            monitor_loss.add(i * n_devices, loss_train.d.copy())
            monitor_err.add(i * n_devices, error_train.d.copy())
            monitor_time.add(i * n_devices)

    if device_id == 0:
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         'params_%06d.h5' % (args.max_iter / n_devices)))
def train(args):
    """
    Train the siamese network and export the result.

    Steps: select the computation context, build a training and a test graph
    that both end in the mean contrastive loss, run ``args.max_iter``
    Adam iterations with periodic validation and checkpointing, then save
    the final parameters, write an NNP file describing the test network and
    run the C++ forward check on it.
    """
    # Select the computation context (falls back to 'cpu').
    from nnabla.contrib.context import extension_context
    backend = 'cpu' if args.context is None else args.context
    logger.info("Running in %s" % backend)
    nn.set_default_context(
        extension_context(backend, device_id=args.device_id))

    contrastive_margin = 1.0  # Margin for contrastive loss.

    def make_inputs():
        # Two image placeholders and one label placeholder per batch.
        first = nn.Variable([args.batch_size, 1, 28, 28])
        second = nn.Variable([args.batch_size, 1, 28, 28])
        target = nn.Variable([args.batch_size])
        return first, second, target

    # Training graph.
    image0, image1, label = make_inputs()
    pred = mnist_lenet_siamese(image0, image1, test=False)
    loss = F.mean(contrastive_loss(pred, label, contrastive_margin))

    # Test graph.
    vimage0, vimage1, vlabel = make_inputs()
    vpred = mnist_lenet_siamese(vimage0, vimage1, test=True)
    vloss = F.mean(contrastive_loss(vpred, vlabel, contrastive_margin))

    # Solver over every parameter created so far.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitors.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor,
                                        interval=100)
    monitor_vloss = M.MonitorSeries("Test loss", monitor, interval=10)

    # Data iterators share one random state.
    rng = np.random.RandomState(313)
    data = siamese_data_iterator(args.batch_size, True, rng)
    vdata = siamese_data_iterator(args.batch_size, False, rng)

    for step in range(args.max_iter):
        if step % args.val_interval == 0:
            # Average the test loss over val_iter batches.
            val_total = 0.0
            for _ in range(args.val_iter):
                vimage0.d, vimage1.d, vlabel.d = vdata.next()
                vloss.forward(clear_buffer=True)
                val_total += vloss.d
            monitor_vloss.add(step, val_total / args.val_iter)
        if step % args.model_save_interval == 0:
            checkpoint = os.path.join(args.model_save_path,
                                      'params_%06d.h5' % step)
            nn.save_parameters(checkpoint)

        # One optimisation step.
        image0.d, image1.d, label.d = data.next()
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()
        monitor_loss.add(step, loss.d.copy())
        monitor_time.add(step)

    # Final parameters.
    nn.save_parameters(os.path.join(
        args.model_save_path, 'params_%06d.h5' % args.max_iter))

    # NNP export of the test network.
    nnp_file = os.path.join(
        args.model_save_path, 'siamese_%06d.nnp' % (args.max_iter))
    validation_network = {
        'name': 'Validation',
        'batch_size': args.batch_size,
        'outputs': {'y': vpred},
        'names': {'x0': vimage0, 'x1': vimage1},
    }
    runtime_executor = {
        'name': 'Runtime',
        'network': 'Validation',
        'data': ['x0', 'x1'],
        'output': ['y'],
    }
    runtime_contents = {
        'networks': [validation_network],
        'executors': [runtime_executor],
    }
    save.save(nnp_file, runtime_contents)

    from cpp_forward_check import check_cpp_forward
    check_cpp_forward(args.model_save_path,
                      [vimage0.d, vimage1.d],
                      [vimage0, vimage1],
                      vpred,
                      nnp_file)
def ce_soft(pred, label):
    """Return the batch-mean cross entropy between ``softmax(label)`` and
    ``softmax(pred)``, both taken along axis 1."""
    neg_target = -F.softmax(label, axis=1)
    log_pred = F.log(F.softmax(pred, axis=1))
    # Sum over axis 1 gives one value per sample.
    per_sample = F.sum(neg_target * log_pred, axis=1)
    return F.mean(per_sample)
def loss_function(pred, label):
    """Return the batch mean of the softmax cross entropy of ``pred``
    against ``label``."""
    cross_entropy = F.softmax_cross_entropy(pred, label)
    return F.mean(cross_entropy)
def train():
    """Train the CIFAR-10 ResNet-23 classifier with SSL regularization.

    Builds a training graph (softmax cross entropy plus
    ``ssl_regularization`` over all parameters) and a test graph, then runs
    ``args.max_iter`` Adam iterations. Every ``args.val_interval``
    iterations the test error is measured and the parameters are saved when
    it improves on the best value so far. After training the test error is
    measured once more and the final parameters are saved.

    Raises:
        ValueError: if ``args.net`` is not supported, or if the batch size
            is larger than the validation set (no full batch available).
    """
    args = get_args()

    # Get context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(
        args.context, device_id=args.device_id, type_config=args.type_config)
    nn.set_default_context(ctx)

    # Create CNN network for both training and testing.
    if args.net == "cifar10_resnet23_prediction":
        model_prediction = cifar10_resnet23_prediction
    else:
        # Previously an unknown name surfaced later as a NameError.
        raise ValueError("Unsupported network: {!r}".format(args.net))

    # TRAIN
    maps = 64
    data_iterator = data_iterator_cifar10
    c = 3
    h = w = 32
    n_valid = 10000

    n_valid_batches = int(n_valid / args.batch_size)
    if n_valid_batches == 0:
        raise ValueError(
            "batch_size ({}) exceeds the number of validation samples ({})"
            .format(args.batch_size, n_valid))

    # Create input variables.
    image = nn.Variable([args.batch_size, c, h, w])
    label = nn.Variable([args.batch_size, 1])
    # Create model_prediction graph.
    pred = model_prediction(image, maps=maps, test=False)
    pred.persistent = True  # kept so the training error can be read later
    # Create loss function.
    loss = F.mean(F.softmax_cross_entropy(pred, label))

    # SSL Regularization
    loss += ssl_regularization(nn.get_parameters(),
                               args.filter_decay, args.channel_decay)

    # TEST
    # Create input variables.
    vimage = nn.Variable([args.batch_size, c, h, w])
    vlabel = nn.Variable([args.batch_size, 1])
    # Create prediction graph.
    vpred = model_prediction(vimage, maps=maps, test=True)

    # Create Solver.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=1)

    # Initialize DataIterator
    data = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)

    def validation_error():
        # Mean categorical error over the full validation batches.
        total = 0.0
        for _ in range(n_valid_batches):
            vimage.d, vlabel.d = vdata.next()
            vpred.forward(clear_buffer=True)
            total += categorical_error(vpred.d, vlabel.d)
        return total / n_valid_batches

    best_ve = 1.0
    # Training loop.
    for i in range(args.max_iter):
        if i % args.val_interval == 0:
            # Validation
            ve = validation_error()
            monitor_verr.add(i, ve)
            if ve < best_ve:
                nn.save_parameters(os.path.join(
                    args.model_save_path, 'params_%06d.h5' % i))
                best_ve = ve
        # Training forward
        image.d, label.d = data.next()
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()
        e = categorical_error(pred.d, label.d)
        monitor_loss.add(i, loss.d.copy())
        monitor_err.add(i, e)
        monitor_time.add(i)

    # Final validation, logged at the index of the last iteration
    # (0 when no iteration ran, instead of failing on an unbound name).
    last_iter = max(args.max_iter - 1, 0)
    monitor_verr.add(last_iter, validation_error())

    parameter_file = os.path.join(
        args.model_save_path, 'params_{:06}.h5'.format(args.max_iter))
    nn.save_parameters(parameter_file)
def vae(x, shape_z, test=False):
    """
    Build the negative evidence lower bound (ELBO) loss of a VAE with a
    Bernoulli decoder.

    Args:
        x(`~nnabla.Variable`): N-D array
        shape_z(tuple of int): size of z
        test : False (default) samples z with the reparameterization trick,
            True uses the posterior mean without randomness

    Returns:
        ~nnabla.Variable: Elbo loss

    """
    # ---- Encoder: two fully connected Elu layers -------------------------
    scaled_x = x / 256.
    n_samples = x.shape[0]

    enc = PF.affine(scaled_x, (500, ), name='fc1')
    enc = F.elu(enc)
    enc = PF.affine(enc, (500, ), name='fc2')
    enc = F.elu(enc)

    # Parameters of the Gaussian over z.
    mu = PF.affine(enc, shape_z, name='fc_mu')
    logvar = PF.affine(enc, shape_z, name='fc_logvar')
    sigma = F.exp(0.5 * logvar)

    # ---- Latent sample ----------------------------------------------------
    if test:
        # No randomness at test time.
        z = mu
    else:
        # Reparameterization trick.
        noise = F.randn(mu=0, sigma=1, shape=(n_samples, ) + shape_z)
        z = mu + sigma * noise

    # ---- Decoder: two fully connected Elu layers --------------------------
    dec = PF.affine(z, (500, ), name='fc3')
    dec = F.elu(dec)
    dec = PF.affine(dec, (500, ), name='fc4')
    dec = F.elu(dec)

    # Per-pixel Bernoulli logits (fed to sigmoid_cross_entropy below).
    prob = PF.affine(dec, (1, 28, 28), name='fc5')

    # ---- ELBO terms --------------------------------------------------------
    # Binarized input used as the reconstruction target.
    binarized_x = F.greater_equal_scalar(scaled_x, 0.5)

    # E_q(z|x)[log(q(z|x))], constants that cancel in the sum are dropped.
    logqz = 0.5 * F.sum(1.0 + logvar, axis=1)

    # E_q(z|x)[log(p(z))], constants that cancel in the sum are dropped.
    logpz = 0.5 * F.sum(mu * mu + sigma * sigma, axis=1)

    # E_q(z|x)[log(p(x|z))]
    pixel_ce = F.sigmoid_cross_entropy(prob, binarized_x)
    logpx = F.sum(pixel_ce, axis=(1, 2, 3))

    # Negative evidence lower bound, averaged over the batch.
    return F.mean(logpx + logpz - logqz)
def main():
    """Learn word embeddings on the PTB training text and list similar words.

    Downloads/loads the dataset, builds a model in which a word and a
    context index share one embedding table ("e1") and their dot product is
    trained with sigmoid cross entropy, trains it with Adam for
    ``args.max_epoch`` epochs, saves the parameters to ``model.h5`` inside
    ``args.work_dir`` and finally prints, for the first
    ``args.max_check_words`` words, the words ranked by cosine similarity.
    """
    import os

    # Get arguments
    args = get_args()
    data_file = "https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt"
    # os.path.join works whether or not work_dir ends with a separator
    # (plain concatenation produced e.g. "workmodel.h5").
    model_file = os.path.join(args.work_dir, "model.h5")

    # Load Dataset
    itow, wtoi, dataset = load_ptbset(data_file)

    # Computation environment settings
    from nnabla.contrib.context import extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Create data provider
    n_word = len(wtoi)
    n_dim = args.embed_dim
    batchsize = args.batchsize
    half_window = args.half_window_length
    n_negative = args.n_negative_sample

    di = DataIteratorForEmbeddingLearning(
        batchsize=batchsize,
        half_window=half_window,
        n_negative=n_negative,
        dataset=dataset)

    # Create model
    # - Real batch size including context samples and negative samples
    size = batchsize * (1 + n_negative) * (2 * (half_window - 1))

    # Model for learning
    # - input variables
    xl = nn.Variable((size,))  # variable for word
    yl = nn.Variable((size,))  # variable for context

    # Embed layers for word embedding function
    # - f_embed : word index x to get y, the n_dim vector
    # -- for each sample in a minibatch
    hx = PF.embed(xl, n_word, n_dim, name="e1")  # feature vector for word
    hy = PF.embed(yl, n_word, n_dim, name="e1")  # feature vector for context
    hl = F.sum(hx * hy, axis=1)

    # -- Approximated likelihood of context prediction
    # pos: word context, neg negative samples
    tl = nn.Variable([size, ], need_grad=False)
    loss = F.sigmoid_cross_entropy(hl, tl)
    loss = F.mean(loss)

    # Model for test of searching similar words
    xr = nn.Variable((1,), need_grad=False)
    hr = PF.embed(xr, n_word, n_dim, name="e1")  # feature vector for test

    # Create solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    monitor = M.Monitor(args.work_dir)
    monitor_loss = M.MonitorSeries(
        "Training loss", monitor, interval=args.monitor_interval)
    monitor_time = M.MonitorTimeElapsed(
        "Training time", monitor, interval=args.monitor_interval)

    # Do training
    max_epoch = args.max_epoch
    for epoch in range(max_epoch):

        # iteration per epoch
        for i in range(di.n_batch):

            # get minibatch
            xi, yi, ti = di.next()

            # learn
            solver.zero_grad()
            xl.d, yl.d, tl.d = xi, yi, ti
            loss.forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.update()

            # monitor
            itr = epoch * di.n_batch + i
            monitor_loss.add(itr, loss.d)
            monitor_time.add(itr)

    # Save model
    nn.save_parameters(model_file)

    # Evaluate by similarity
    max_check_words = args.max_check_words

    # `.d` is a reference to the parameter's array, so normalise a copy:
    # dividing in place would overwrite the trained embedding. The row norms
    # do not depend on the query word, so they are computed once.
    w = nn.get_parameters()['e1/embed/W'].d.copy()
    s = np.sqrt((w * w).sum(1))
    w /= s.reshape((s.shape[0], 1))

    for i in range(max_check_words):

        # prediction
        xr.d = i
        hr.forward(clear_buffer=True)
        h = hr.d

        # cosine similarity: unit-norm rows dotted with the query vector,
        # divided by the query's own norm
        similarity = w.dot(h[0]) / s[i]

        # for understanding
        output_similar_words(itow, i, similarity)
def train():
    """Train on the bAbI-15 data until the validation accuracy reaches
    ``--threshold``.

    Command-line options are parsed here. Each outer iteration accumulates
    gradients over ``--accum-grad`` single-sample graphs, logs the averaged
    loss/error and applies one Adam update. Every ``--valid-interval``
    updates the whole validation iterator is evaluated; when
    ``1 - validation_error`` is at least the threshold, one correct and one
    wrong example (if any were seen) are printed and the loop ends.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-file", type=str)
    parser.add_argument("--valid-file", type=str)
    parser.add_argument("--num-training-examples", type=int, default=50)
    parser.add_argument("--accum-grad", type=int, default=1)
    parser.add_argument("--valid-interval", type=int, default=200)
    parser.add_argument("--threshold", type=float, default=0.95)
    parser.add_argument("--context", type=str, default="cpu")
    parser.add_argument("--device-id", type=int, default=0)

    args = parser.parse_args()

    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    ctx = get_extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # prepare data iterators
    # (batch size 1; see data_iterator for the meaning of the three flags)
    tdata = data_iterator(
        BAbI15DataSource(args.train_file, args.num_training_examples,
                         shuffle=True), 1, False, False, False)
    vdata = data_iterator(
        BAbI15DataSource(args.valid_file, 1000, shuffle=True),
        1, False, False, False)

    # prepare monitors
    monitor = M.Monitor("./bAbI15")
    tloss = M.MonitorSeries("Training Loss", monitor, interval=10)
    terror = M.MonitorSeries("Training Error", monitor, interval=10)
    verror = M.MonitorSeries("Validation Error", monitor, interval=1)

    # prepare solver
    solver = S.Adam()
    # Parameters only exist after the first call to predict(), so the
    # solver is bound to them lazily inside the loop.
    solver_initialized = False

    cnt = 0  # number of solver updates performed so far
    while True:
        l = 0.0
        e = 0.0

        solver.zero_grad()
        for _ in range(args.accum_grad):
            # read next data
            # x[0] is not used during training; each field is unwrapped
            # with [0][0] (presumably batch dimension of 1 -- verify
            # against BAbI15DataSource).
            x = tdata.next()
            V = x[1][0][0]
            E = x[2][0][0]
            ans = x[3][0][0]

            # construct GGNN
            # A new graph is built for every sample.
            output = predict(V, E)
            output = F.reshape(output, (1, output.shape[0]))

            # initialize solver
            if not solver_initialized:
                solver.set_parameters(nn.get_parameters())
                solver_initialized = True
                solver.zero_grad()

            # calculate loss/error
            label = nn.Variable((1, 1))
            label.data.data[0, 0] = ans
            # The error is computed on an unlinked copy of the output so
            # it takes no part in backpropagation.
            output2 = output.unlinked()
            loss = F.mean(F.softmax_cross_entropy(output, label))
            error = F.mean(F.top_n_error(output2, label))
            F.sink(loss, error).forward(clear_no_need_grad=True)
            # Gradients accumulate across the accum_grad samples.
            loss.backward(clear_buffer=True)

            l += loss.data.data
            e += error.data.data

        # dump log
        tloss.add(cnt, l / args.accum_grad)
        terror.add(cnt, e / args.accum_grad)
        l = 0.0
        e = 0.0

        solver.update()

        cnt += 1
        if cnt % args.valid_interval == 0:
            # validation
            validation_error = 0
            # First correctly / wrongly answered example seen in this pass.
            correct_example = None
            wrong_example = None
            for _ in range(vdata.size):
                x = vdata.next()
                id2str = x[0][0][0]
                V = x[1][0][0]
                E = x[2][0][0]
                ans = x[3][0][0]

                output = predict(V, E)
                output = F.reshape(output, (1, output.shape[0]))

                # calculate error
                label = nn.Variable((1, 1))
                label.data.data[0, 0] = ans
                error = F.top_n_error(output, label)
                error.forward(clear_no_need_grad=True)

                if error.data.data > 0.5:
                    if wrong_example is None:
                        wrong_example = (id2str, V, E, ans,
                                         output.data.data)
                else:
                    if correct_example is None:
                        correct_example = (id2str, V, E, ans,
                                           output.data.data)
                validation_error += error.data.data
            validation_error /= vdata.size
            verror.add(cnt, validation_error)
            accuracy = 1 - validation_error
            if accuracy >= args.threshold:
                def show(example):
                    # example = (id2str, V, E, ans, output values)
                    for i, j in example[2]["is"]:
                        print("{} is {}.".format(
                            example[0][i], example[0][j]))
                    for i, j in example[2]["has_fear"]:
                        print("{} are afraid of {}.".format(
                            example[0][i], example[0][j]))
                    # The queried node is the arg-max of V.
                    i = np.argmax(example[1])
                    print("What is {} afraid of?".format(example[0][i]))
                    # The predicted node is the arg-max of the output.
                    i = np.argmax(example[4])
                    print("Expected: {}, Actual: {}".format(
                        example[0][example[3]], example[0][i]))
                if correct_example is not None:
                    show(correct_example)
                if wrong_example is not None:
                    show(wrong_example)

                break
def ce_loss_soft(ctx, pred, target):
    """Return the batch-mean cross entropy between ``softmax(target)`` and
    ``softmax(pred)``, built inside the nnabla context ``ctx``."""
    with nn.context_scope(ctx):
        # TODO: decide whether the loss should be divided further
        target_prob = F.softmax(target)
        log_pred = F.log(F.softmax(pred))
        per_sample = F.sum(target_prob * log_pred, axis=1)
        return -F.mean(per_sample)
def main():
    """Train JSI-net, either alone or together with its two discriminators.

    With ``conf.jsigan`` false only the reconstruction (mean squared error)
    loss is optimised, from epoch 0 to ``conf.adv_weight_point`` with a
    stair-decayed learning rate. With ``conf.jsigan`` true the two
    discriminators and the generator are updated in turn from
    ``conf.adv_weight_point`` to ``conf.epoch``. Rank 0 writes the monitors
    and saves the "jsinet" parameters after every epoch.
    """
    conf = get_config()
    extension_module = conf.nnabla_context.context
    ctx = get_extension_context(extension_module,
                                device_id=conf.nnabla_context.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    print("#GPU Count: ", comm.n_procs)

    data_iterator_train = jsi_iterator(conf.batch_size, conf, train=True)
    if conf.scaling_factor == 1:
        d_t = nn.Variable((conf.batch_size, 80, 80, 3), need_grad=True)
        l_t = nn.Variable((conf.batch_size, 80, 80, 3), need_grad=True)
    else:
        # Variable shapes must be integers; true division would produce a
        # float dimension under Python 3.
        low_res = int(160 // conf.scaling_factor)
        d_t = nn.Variable((conf.batch_size, low_res, low_res, 3),
                          need_grad=True)
        l_t = nn.Variable((conf.batch_size, 160, 160, 3), need_grad=True)

    if comm.n_procs > 1:
        # Each process reads its own slice of the data.
        data_iterator_train = data_iterator_train.slice(
            rng=None, num_of_slices=comm.n_procs, slice_pos=comm.rank)

    monitor_path = './nnmonitor' + \
        str(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    monitor = Monitor(monitor_path)
    jsi_monitor = setup_monitor(conf, monitor)

    with nn.parameter_scope("jsinet"):
        nn.load_parameters(conf.pre_trained_model)
        net = model(d_t, conf.scaling_factor)
        net.pred.persistent = True
    rec_loss = F.mean(F.squared_error(net.pred, l_t))
    rec_loss.persistent = True
    g_final_loss = rec_loss

    if conf.jsigan:
        net_gan = gan_model(l_t, net.pred, conf)
        d_final_fm_loss = net_gan.d_adv_loss
        d_final_fm_loss.persistent = True
        d_final_detail_loss = net_gan.d_detail_adv_loss
        d_final_detail_loss.persistent = True
        g_final_loss = conf.rec_lambda * rec_loss + conf.adv_lambda * (
            net_gan.g_adv_loss + net_gan.g_detail_adv_loss
        ) + conf.fm_lambda * (net_gan.fm_loss + net_gan.fm_detail_loss)
        g_final_loss.persistent = True

    max_iter = data_iterator_train._size // (conf.batch_size)
    if comm.rank == 0:
        print("max_iter", data_iterator_train._size, max_iter)

    iteration = 0
    if not conf.jsigan:
        start_epoch = 0
        end_epoch = conf.adv_weight_point
        lr = conf.learning_rate * comm.n_procs
    else:
        start_epoch = conf.adv_weight_point
        end_epoch = conf.epoch
        lr = conf.learning_rate * comm.n_procs
        # Only the discriminator solvers use weight decay.
        w_d = conf.weight_decay * comm.n_procs

    # Set generator parameters
    with nn.parameter_scope("jsinet"):
        solver_jsinet = S.Adam(alpha=lr, beta1=0.9, beta2=0.999, eps=1e-08)
        solver_jsinet.set_parameters(nn.get_parameters())

    if conf.jsigan:
        solver_disc_fm = S.Adam(alpha=lr, beta1=0.9, beta2=0.999, eps=1e-08)
        solver_disc_detail = S.Adam(alpha=lr, beta1=0.9, beta2=0.999,
                                    eps=1e-08)
        with nn.parameter_scope("Discriminator_FM"):
            solver_disc_fm.set_parameters(nn.get_parameters())
        with nn.parameter_scope("Discriminator_Detail"):
            solver_disc_detail.set_parameters(nn.get_parameters())

    def _backward(loss):
        # Backprop; with several processes the gradients are all-reduced
        # through the communicator callback.
        if comm.n_procs > 1:
            all_reduce_callback = comm.get_all_reduce_callback()
            loss.backward(clear_buffer=True,
                          communicator_callbacks=all_reduce_callback)
        else:
            loss.backward(clear_buffer=True)

    lr_stair_decay_points = [200, 225]

    for epoch in range(start_epoch, end_epoch):
        for index in range(max_iter):
            d_t.d, l_t.d = data_iterator_train.next()

            if not conf.jsigan:
                # JSI-net -> Generator
                lr_net = get_learning_rate(lr, iteration,
                                           lr_stair_decay_points,
                                           conf.lr_decreasing_factor)
                g_final_loss.forward(clear_no_need_grad=True)
                solver_jsinet.zero_grad()
                _backward(g_final_loss)
                solver_jsinet.set_learning_rate(lr_net)
                solver_jsinet.update()
            else:
                # GAN part (discriminator + generator)
                # Constant until gan_lr_linear_decay_point, then a linear
                # decay towards zero at end_epoch.
                lr_gan = lr if epoch < conf.gan_lr_linear_decay_point \
                    else lr * (end_epoch - epoch) / (
                        end_epoch - conf.gan_lr_linear_decay_point)
                lr_gan = lr_gan * conf.gan_ratio

                # The generator output is held fixed while the
                # discriminators are updated.
                net.pred.need_grad = False

                # Discriminator_FM
                solver_disc_fm.zero_grad()
                d_final_fm_loss.forward(clear_no_need_grad=True)
                _backward(d_final_fm_loss)
                solver_disc_fm.set_learning_rate(lr_gan)
                solver_disc_fm.weight_decay(w_d)
                solver_disc_fm.update()

                # Discriminator_Detail
                solver_disc_detail.zero_grad()
                d_final_detail_loss.forward(clear_no_need_grad=True)
                _backward(d_final_detail_loss)
                solver_disc_detail.set_learning_rate(lr_gan)
                solver_disc_detail.weight_decay(w_d)
                solver_disc_detail.update()

                # Generator
                net.pred.need_grad = True
                solver_jsinet.zero_grad()
                g_final_loss.forward(clear_no_need_grad=True)
                _backward(g_final_loss)
                solver_jsinet.set_learning_rate(lr_gan)
                solver_jsinet.update()

            iteration += 1
            if comm.rank == 0:
                train_psnr = compute_psnr(net.pred.d, l_t.d, 1.)
                jsi_monitor['psnr'].add(iteration, train_psnr)
                jsi_monitor['rec_loss'].add(iteration, rec_loss.d.copy())
                jsi_monitor['time'].add(iteration)

            if comm.rank == 0:
                if conf.jsigan:
                    jsi_monitor['g_final_loss'].add(
                        iteration, g_final_loss.d.copy())
                    jsi_monitor['g_adv_loss'].add(
                        iteration, net_gan.g_adv_loss.d.copy())
                    jsi_monitor['g_detail_adv_loss'].add(
                        iteration, net_gan.g_detail_adv_loss.d.copy())
                    jsi_monitor['d_final_fm_loss'].add(
                        iteration, d_final_fm_loss.d.copy())
                    jsi_monitor['d_final_detail_loss'].add(
                        iteration, d_final_detail_loss.d.copy())
                    jsi_monitor['fm_loss'].add(
                        iteration, net_gan.fm_loss.d.copy())
                    jsi_monitor['fm_detail_loss'].add(
                        iteration, net_gan.fm_detail_loss.d.copy())
                    jsi_monitor['lr'].add(iteration, lr_gan)

        if comm.rank == 0:
            # exist_ok avoids the check-then-create race.
            os.makedirs(conf.output_dir, exist_ok=True)
            with nn.parameter_scope("jsinet"):
                nn.save_parameters(
                    os.path.join(conf.output_dir,
                                 "model_param_%04d.h5" % epoch))
def sigma_regularization(ctx, log_var, one):
    """Return the mean squared error between ``exp(log_var) ** 0.5`` and
    ``one``, built inside the nnabla context ``ctx``."""
    with nn.context_scope(ctx):
        sigma = F.pow_scalar(F.exp(log_var), 0.5)
        return F.mean(F.squared_error(sigma, one))
def cifar10_resnet23_loss(pred, label):
    """Return the batch mean of the softmax cross entropy of ``pred``
    against ``label``."""
    sample_losses = F.softmax_cross_entropy(pred, label)
    return F.mean(sample_losses)
train_data_iter = data_iterator_simple(load_train_func, len(x_train), batch_size, shuffle=True, with_file_cache=False) valid_data_iter = data_iterator_simple(load_valid_func, len(x_valid), batch_size, shuffle=True, with_file_cache=False) x = nn.Variable([batch_size, window_size * 2]) with nn.parameter_scope('W_in'): h = PF.embed(x, vocab_size, embedding_size) h = F.mean(h, axis=1) h = expand_dims(h, axis=-1) # (batch_size, embedding_size, 1) t = nn.Variable([batch_size, 1]) t_neg = nn.Variable([batch_size, k]) with nn.parameter_scope('W_out'): _t = PF.embed(t, vocab_size, embedding_size) # (batch_size, 1, embedding_size) _t_neg = PF.embed(t_neg, vocab_size, embedding_size) # (batch_size, k, embedding_size) t_score = F.sigmoid(F.reshape(F.batch_matmul(_t, h), shape=(batch_size, 1))) t_neg_score = F.sigmoid( F.reshape(F.batch_matmul(_t_neg, h), shape=(batch_size, k))) t_loss = F.binary_cross_entropy(t_score, F.constant(1, shape=(batch_size, 1))) t_neg_loss = F.binary_cross_entropy(t_neg_score,
def train(args):
    """
    Train a vectorizer/generator auto-encoder jointly with a GAN.

    Three losses are built and optimized in turn at every iteration, each
    with its own Adam solver:

    * ``loss_vec``: mean squared error between ``generator(vectorizer(x1))``
      and ``x1``; updated by ``solver_vec`` (parameter scopes "vec" and
      "gen").
    * ``loss_gen``: mean sigmoid cross entropy of the discriminator output on
      the generated image against the label 1; updated by ``solver_gen``
      (scope "gen").
    * ``loss_dis``: mean sigmoid cross entropy on the unlinked generated
      image against 0 plus on the real image against 1; updated by
      ``solver_dis`` (scope "dis").

    Generator and discriminator parameters are saved every
    ``args.model_save_interval`` iterations and once more after the loop.

    Args:
        args: Parsed command line arguments. The attributes read here are
            ``context``, ``device_id``, ``batch_size``, ``learning_rate``,
            ``weight_decay``, ``max_iter``, ``model_save_interval``,
            ``model_save_path`` and ``monitor_path``.
    """

    # Get context.
    from nnabla.contrib.context import extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # TRAIN
    # Fake path: encode the input image and decode it again.
    x1 = nn.Variable([args.batch_size, 1, 28, 28])
    z = vectorizer(x1)
    fake = generator(z)
    fake.persistent = True  # Not to clear at backward
    pred_fake = discriminator(fake)
    loss_gen = F.mean(
        F.sigmoid_cross_entropy(pred_fake, F.constant(1, pred_fake.shape)))
    loss_vec = F.mean(F.squared_error(fake, x1))
    # Cut the graph so that the discriminator loss does not backprop into
    # the generator/vectorizer.
    fake_dis = fake.unlinked()
    pred_fake_dis = discriminator(fake_dis)
    loss_dis = F.mean(
        F.sigmoid_cross_entropy(pred_fake_dis,
                                F.constant(0, pred_fake_dis.shape)))

    # Real path
    x = nn.Variable([args.batch_size, 1, 28, 28])
    pred_real = discriminator(x)
    loss_dis += F.mean(
        F.sigmoid_cross_entropy(pred_real, F.constant(1, pred_real.shape)))

    # Create Solver.
    solver_gen = S.Adam(args.learning_rate, beta1=0.5)
    solver_dis = S.Adam(args.learning_rate, beta1=0.5)
    solver_vec = S.Adam(args.learning_rate, beta1=0.5)

    # solver_vec has to update both the vectorizer and the generator.
    # Solver.set_parameters() resets the registered parameters by default, so
    # calling it once per scope leaves only the last scope registered.
    # Gather both scopes into a single dict first; the scope name is used as
    # a key prefix so equal relative names in "vec" and "gen" cannot collide.
    params_vec = {}
    for scope in ("vec", "gen"):
        with nn.parameter_scope(scope):
            for name, param in nn.get_parameters().items():
                params_vec["%s/%s" % (scope, name)] = param
    solver_vec.set_parameters(params_vec)

    with nn.parameter_scope("gen"):
        solver_gen.set_parameters(nn.get_parameters())
    with nn.parameter_scope("dis"):
        solver_dis.set_parameters(nn.get_parameters())

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss_gen = M.MonitorSeries("Generator loss", monitor, interval=10)
    monitor_loss_dis = M.MonitorSeries("Discriminator loss", monitor,
                                       interval=10)
    monitor_loss_vec = M.MonitorSeries("Vectorizer loss", monitor,
                                       interval=10)
    monitor_time = M.MonitorTimeElapsed("Time", monitor, interval=100)
    # NOTE: `x + 1 / 2.` parses as `x + 0.5` (not `(x + 1) / 2`).
    monitor_fake = M.MonitorImageTile("Fake images", monitor,
                                      normalize_method=lambda x: x + 1 / 2.)
    monitor_vec1 = M.MonitorImageTile("vec images1", monitor,
                                      normalize_method=lambda x: x + 1 / 2.)
    monitor_vec2 = M.MonitorImageTile("vec images2", monitor,
                                      normalize_method=lambda x: x + 1 / 2.)

    data = iterator.simple_data_iterator(load_kanji_data(), args.batch_size,
                                         True)

    # Training loop.
    for i in range(args.max_iter):
        if i % args.model_save_interval == 0:
            with nn.parameter_scope("gen"):
                nn.save_parameters(
                    os.path.join(args.model_save_path,
                                 "generator_param_%06d.h5" % i))
            with nn.parameter_scope("dis"):
                nn.save_parameters(
                    os.path.join(args.model_save_path,
                                 "discriminator_param_%06d.h5" % i))

        # Training forward
        image, _ = data.next()

        # Shift/scale pixels; maps [0, 255] to [-0.5, 0.5]
        # (assumes the iterator yields values in [0, 255] -- TODO confirm).
        x1.d = image / 255. - 0.5

        # Vectorizer (auto-encoder) update.
        solver_vec.zero_grad()
        loss_vec.forward(clear_no_need_grad=True)
        loss_vec.backward(clear_buffer=True)
        solver_vec.weight_decay(args.weight_decay)
        solver_vec.update()
        monitor_vec1.add(i, fake)
        monitor_vec2.add(i, x1)
        monitor_loss_vec.add(i, loss_vec.d.copy())

        x.d = image / 255. - 0.5
        # NOTE(review): z is the output of vectorizer(x1), so this value is
        # presumably overwritten by the next forward pass -- confirm intent.
        z.d = np.random.randn(*z.shape)

        # Generator update.
        solver_gen.zero_grad()
        loss_gen.forward(clear_no_need_grad=True)
        loss_gen.backward(clear_buffer=True)
        solver_gen.weight_decay(args.weight_decay)
        solver_gen.update()
        monitor_fake.add(i, fake)
        monitor_loss_gen.add(i, loss_gen.d.copy())

        # Discriminator update.
        solver_dis.zero_grad()
        loss_dis.forward(clear_no_need_grad=True)
        loss_dis.backward(clear_buffer=True)
        solver_dis.weight_decay(args.weight_decay)
        solver_dis.update()
        monitor_loss_dis.add(i, loss_dis.d.copy())
        monitor_time.add(i)

    # Final snapshot, named after the last loop index.
    with nn.parameter_scope("gen"):
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         "generator_param_%06d.h5" % i))
    with nn.parameter_scope("dis"):
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         "discriminator_param_%06d.h5" % i))
def __init__(self, solver,
             tinput=None, tlabel=None, tpred=None, tdata=None,
             vinput=None, vlabel=None, vpred=None, vdata=None,
             monitor_path=None, model_save_path=None,
             max_epoch=1, iter_per_epoch=None,
             val_iter=None):
    """Build monitors, an Updater, an Evaluator and a Trainer for regression.

    The training and validation losses are the mean squared error between
    the prediction and the label variables. The resulting ``Trainer`` is
    stored in ``self.trainer``.

    Args:
        solver: Solver handed to the ``Updater``.
        tinput, tlabel: Training input/label variables; their ``.d`` is
            filled from ``tdata.next()`` before each update.
        tpred: Training prediction variable (made persistent here).
        tdata: Training data iterator; ``size`` and ``batch_size`` are read
            when ``iter_per_epoch`` is not given.
        vinput, vlabel, vpred, vdata: Validation counterparts of the above.
        monitor_path: Path passed to ``Monitor``.
        model_save_path: Path passed to ``Trainer``.
        max_epoch: Number of epochs passed to ``Trainer``.
        iter_per_epoch: Iterations per epoch; defaults to
            ``tdata.size // tdata.batch_size``.
        val_iter: Validation iterations; defaults to
            ``vdata.size // vdata.batch_size``.
    """
    # Monitors
    monitor = Monitor(monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_vloss = MonitorSeries("Valid loss", monitor, interval=1)
    monitor_time = MonitorTimeElapsed(
        "Training time", monitor, interval=10)

    # Loss
    tpred = tpred.apply(persistent=True)
    tloss = F.mean(F.squared_error(tpred, tlabel))
    vpred = vpred.apply(persistent=True)
    vloss = F.mean(F.squared_error(vpred, vlabel))

    # Updater
    def tdata_feeder():
        tinput.d, tlabel.d = tdata.next()

    def update_callback_on_finish(i):
        monitor_loss.add(i, tloss.d)
        monitor_time.add(i)

    # No `forward_callback_on_finish` is passed: this method defines no such
    # callback (regression has no extra error graph to forward), and
    # referencing the undefined name raised NameError. The Updater's
    # default callback is used instead.
    updater = Updater(solver, tloss,
                      data_feeder=tdata_feeder,
                      update_callback_on_finish=update_callback_on_finish)

    # Evaluator
    def vdata_feeder():
        vinput.d, vlabel.d = vdata.next()

    def vloss_callback_on_finish(i, v):
        monitor_vloss.add(i, v)

    val_iter = val_iter if val_iter is not None \
        else vdata.size // vdata.batch_size
    evaluator = Evaluator(vloss,
                          data_feeder=vdata_feeder,
                          val_iter=val_iter,
                          callback_on_finish=vloss_callback_on_finish)

    # Trainer
    iter_per_epoch = iter_per_epoch if iter_per_epoch is not None \
        else tdata.size // tdata.batch_size
    self.trainer = Trainer(updater, evaluator, model_save_path,
                           max_epoch=max_epoch,
                           iter_per_epoch=iter_per_epoch)
def main():
    """
    Semi-supervised MNIST training with virtual adversarial training (VAT).

    Outline:
        * Read the arguments and set the default computation context.
        * Load MNIST, scale pixels by 1/256 and split the training set into
          a labeled part and an unlabeled part (the unlabeled iterator also
          contains the labeled images).
        * Build three graphs that share parameters: a cross entropy loss on
          labeled data, the LDS loss from `vat` on unlabeled data and a
          test-mode prediction for validation.
        * Per iteration:
            * periodically measure the validation error,
            * one update on a labeled minibatch,
            * estimate the adversarial direction by power iteration, then
              one update on the LDS loss,
            * decay the learning rate whenever
              `i % args.iter_per_epoch == 0`.
        * Evaluate once more, save the validation network as .nnp and run
          `check_cpp_forward` on it.
    """
    args = get_args()

    # Pick the backend; fall back to CPU when none was requested.
    from nnabla.contrib.context import extension_context
    extension_module = 'cpu' if args.context is None else args.context
    logger.info("Running in %s" % extension_module)
    nn.set_default_context(
        extension_context(extension_module, device_id=args.device_id))

    image_shape = (1, 28, 28)
    hidden_units = args.n_units
    n_classes = args.n_class

    # MNIST images and labels for training and validation.
    from mnist_data import MnistDataSource
    with MnistDataSource(train=True) as source:
        train_images = source.images
        train_labels = source.labels
    with MnistDataSource(train=False) as source:
        valid_images = source.images
        valid_labels = source.labels
    train_images = np.array(train_images / 256.0).astype(np.float32)
    train_images = train_images[:args.n_train]
    train_labels = train_labels[:args.n_train]
    valid_images = valid_images[:args.n_valid]
    valid_labels = valid_labels[:args.n_valid]

    # Semi-supervised split; labeled images are reused as unlabeled ones.
    labeled_images, labeled_targets, unlabeled_images, _ = split_dataset(
        train_images, train_labels, args.n_labeled, args.n_class)
    unlabeled_images = np.r_[labeled_images, unlabeled_images]
    valid_images = np.array(valid_images / 256.0).astype(np.float32)

    # One iterator each for labeled, unlabeled and validation data.
    labeled_iter = DataIterator(
        args.batchsize_l, [labeled_images, labeled_targets])
    unlabeled_iter = DataIterator(args.batchsize_u, [unlabeled_images])
    valid_iter = DataIterator(
        args.batchsize_v, [valid_images, valid_labels])

    def build_net(x, test=False):
        # Network builder handed to `vat` as the prediction function.
        return mlp_net(x, hidden_units, n_classes, test)

    # Graph for labeled data.
    xl = nn.Variable((args.batchsize_l,) + image_shape, need_grad=False)
    logits_l = build_net(xl, test=False)
    tl = nn.Variable((args.batchsize_l, 1), need_grad=False)
    loss_l = F.mean(F.softmax_cross_entropy(logits_l, tl))

    # Graph for unlabeled data; `r` needs a gradient because the power
    # iteration below reads it back.
    unlabeled_shape = (args.batchsize_u,) + image_shape
    xu = nn.Variable(unlabeled_shape, need_grad=False)
    r = nn.Variable(unlabeled_shape, need_grad=True)
    eps = nn.Variable(unlabeled_shape, need_grad=False)
    loss_u, yu = vat(xu, r, eps, build_net, distance)

    # Graph for validation data.
    xv = nn.Variable((args.batchsize_v,) + image_shape, need_grad=False)
    hv = build_net(xv, test=True)
    tv = nn.Variable((args.batchsize_v, 1), need_grad=False)

    # A single Adam solver over every parameter created above.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitors for the validation error and the elapsed time.
    import nnabla.monitor as M
    monitor = M.Monitor(args.model_save_path)
    monitor_verr = M.MonitorSeries("Test error", monitor, interval=240)
    monitor_time = M.MonitorTimeElapsed("Elapsed time", monitor, interval=240)

    t0 = time.time()  # start timestamp (not read again in this function)

    for i in range(args.max_iter):

        # Periodic validation.
        if i % args.val_interval == 0:
            monitor_verr.add(
                i,
                calc_validation_error(valid_iter, xv, tv, hv, args.val_iter))

        # ---- Supervised step on a labeled minibatch ----
        xl.d, tl.d = labeled_iter.next()
        solver.zero_grad()
        loss_l.forward(clear_no_need_grad=True)
        loss_l.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # ---- Unsupervised step on an unlabeled minibatch ----
        xu.d, = unlabeled_iter.next()

        # Start the power method from a normalized random direction, scaled
        # by xi.
        noise = np.random.normal(size=xu.shape).astype(np.float32)
        r.d = get_direction(noise)
        eps.data.fill(args.xi_for_vat)

        # The clean prediction is computed only once per minibatch.
        yu.forward(clear_buffer=True)

        # Power iteration: the gradient w.r.t. r becomes the new direction.
        for _ in range(args.n_iter_for_power_method):
            r.grad.zero()
            loss_u.forward(clear_no_need_grad=True)
            loss_u.backward(clear_buffer=True)
            r.d = get_direction(r.g)

        # Drop the gradients left over from the power iteration, switch the
        # scale to epsilon and do the actual update on the LDS loss.
        solver.zero_grad()
        eps.data.fill(args.eps_for_vat)
        loss_u.forward(clear_no_need_grad=True)
        loss_u.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # ---- Learning rate decay ----
        if i % args.iter_per_epoch == 0:
            decayed_rate = solver.learning_rate() * args.learning_rate_decay
            solver.set_learning_rate(decayed_rate)
        monitor_time.add(i)

    # Final validation error, logged under the last loop index.
    final_error = calc_validation_error(
        valid_iter, xv, tv, hv, args.val_iter)
    monitor_verr.add(i, final_error)
    monitor_time.add(i)

    # Export the validation network.
    nnp_file = os.path.join(
        args.model_save_path, 'vat_%06d.nnp' % args.max_iter)
    validation_network = {
        'name': 'Validation',
        'batch_size': args.batchsize_v,
        'outputs': {'y': hv},
        'names': {'x': xv}}
    runtime_executor = {
        'name': 'Runtime',
        'network': 'Validation',
        'data': ['x'],
        'output': ['y']}
    runtime_contents = {
        'networks': [validation_network],
        'executors': [runtime_executor]}
    save.save(nnp_file, runtime_contents)

    from cpp_forward_check import check_cpp_forward
    check_cpp_forward(args.model_save_path, [xv.d], [xv], hv, nnp_file)
def train(args):
    """
    Train a generator/discriminator pair (GAN) and export both networks.

    The generator loss is the mean sigmoid cross entropy of the discriminator
    output on generated images against the label 1. The discriminator loss
    is the same measure against 0 on the unlinked generated images plus
    against 1 on real images. Each network has its own Adam solver. Both
    parameter sets are saved every `args.model_save_interval` iterations; at
    the end the graphs are written to an .nnp file, which is passed to
    `check_cpp_forward`.

    Args:
        args: Parsed command line arguments (context, device_id, batch_size,
            learning_rate, weight_decay, max_iter, model_save_interval,
            model_save_path and monitor_path are read).
    """
    # Pick the backend; fall back to CPU when none was requested.
    from nnabla.contrib.context import extension_context
    backend = args.context if args.context is not None else 'cpu'
    logger.info("Running in %s" % backend)
    nn.set_default_context(
        extension_context(backend, device_id=args.device_id))

    def _mean_bce(logits, target):
        # Mean sigmoid cross entropy of `logits` against a constant target.
        return F.mean(F.sigmoid_cross_entropy(
            logits, F.constant(target, logits.shape)))

    # Fake path: noise -> generator -> discriminator.
    z = nn.Variable([args.batch_size, 100, 1, 1])
    fake = generator(z)
    fake.persistent = True  # keep the buffer; it is read after backward
    pred_fake = discriminator(fake)
    loss_gen = _mean_bce(pred_fake, 1)

    # Same images for the discriminator, detached from the generator graph.
    fake_dis = fake.unlinked()
    pred_fake_dis = discriminator(fake_dis)
    loss_dis = _mean_bce(pred_fake_dis, 0)

    # Real path.
    x = nn.Variable([args.batch_size, 1, 28, 28])
    pred_real = discriminator(x)
    loss_dis += _mean_bce(pred_real, 1)

    # One Adam solver per network, bound to that network's scope.
    solver_gen = S.Adam(args.learning_rate, beta1=0.5)
    solver_dis = S.Adam(args.learning_rate, beta1=0.5)
    for scope, scoped_solver in (("gen", solver_gen), ("dis", solver_dis)):
        with nn.parameter_scope(scope):
            scoped_solver.set_parameters(nn.get_parameters())

    # Monitors.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss_gen = M.MonitorSeries("Generator loss", monitor, interval=10)
    monitor_loss_dis = M.MonitorSeries(
        "Discriminator loss", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Time", monitor, interval=100)
    # The image tiles are shifted by one half before being written.
    monitor_fake = M.MonitorImageTile(
        "Fake images", monitor, normalize_method=lambda img: img + 0.5)

    data = data_iterator_mnist(args.batch_size, True)

    checkpoint_files = (
        ("gen", "generator_param_%06d.h5"),
        ("dis", "discriminator_param_%06d.h5"),
    )

    for i in range(args.max_iter):
        # Periodic checkpoint of both networks.
        if i % args.model_save_interval == 0:
            for scope, file_pattern in checkpoint_files:
                with nn.parameter_scope(scope):
                    nn.save_parameters(os.path.join(
                        args.model_save_path, file_pattern % i))

        # Feed a real minibatch and fresh Gaussian noise.
        image, _ = data.next()
        x.d = image / 255. - 0.5
        z.d = np.random.randn(*z.shape)

        # Generator step.
        solver_gen.zero_grad()
        loss_gen.forward(clear_no_need_grad=True)
        loss_gen.backward(clear_buffer=True)
        solver_gen.weight_decay(args.weight_decay)
        solver_gen.update()
        monitor_fake.add(i, fake)
        monitor_loss_gen.add(i, loss_gen.d.copy())

        # Discriminator step.
        solver_dis.zero_grad()
        loss_dis.forward(clear_no_need_grad=True)
        loss_dis.backward(clear_buffer=True)
        solver_dis.weight_decay(args.weight_decay)
        solver_dis.update()
        monitor_loss_dis.add(i, loss_dis.d.copy())
        monitor_time.add(i)

    # Export both networks with one executor each.
    nnp = os.path.join(
        args.model_save_path, 'dcgan_%06d.nnp' % args.max_iter)
    generator_network = {
        'name': 'Generator',
        'batch_size': args.batch_size,
        'outputs': {'G': fake},
        'names': {'z': z}}
    discriminator_network = {
        'name': 'Discriminator',
        'batch_size': args.batch_size,
        'outputs': {'D': pred_real},
        'names': {'x': x}}
    generator_executor = {
        'name': 'Generator',
        'network': 'Generator',
        'data': ['z'],
        'output': ['G']}
    discriminator_executor = {
        'name': 'Discriminator',
        'network': 'Discriminator',
        'data': ['x'],
        'output': ['D']}
    runtime_contents = {
        'networks': [generator_network, discriminator_network],
        'executors': [generator_executor, discriminator_executor]}

    save.save(nnp, runtime_contents)
    from cpp_forward_check import check_cpp_forward
    check_cpp_forward(args.model_save_path, [z.d], [z], fake, nnp, "Generator")
def kl_divergence(ctx, pred, label):
    """
    Build the loss -mean_i( sum_c softmax(label)_ic * log softmax(pred)_ic ).

    Both inputs are passed through a softmax along axis 1. Only the cross
    term between the two distributions is computed; the entropy of
    `softmax(label)` is not subtracted.

    Args:
        ctx: Context under which the functions are created.
        pred: Prediction variable, softmax-normalized along axis 1.
        label: Target variable, softmax-normalized along axis 1.

    Returns:
        The loss variable.
    """
    with nn.context_scope(ctx):
        target_prob = F.softmax(label, axis=1)
        log_pred_prob = F.log(F.softmax(pred, axis=1))
        per_sample = F.sum(target_prob * log_pred_prob, axis=1)
        return -F.mean(per_sample)
def train():
    """
    Train an MNIST classifier and monitor its training/validation error.

    Outline:
        * Read the arguments (monitor path defaults to 'tmp.monitor.bnn')
          and set the default computation context.
        * Create the MNIST training and validation iterators.
        * Pick the network builder from `args.net`: 'inq_resnet' selects
          `mnist_inq_resnet_prediction`, anything else
          `mnist_inq_lenet_prediction`.
        * Build a training graph (softmax cross entropy loss) and a
          test-mode validation graph; inputs are divided by 255 in-graph.
        * Per iteration: periodic validation and parameter snapshot, then
          zero-grad / forward / backward / weight decay / update, then
          logging of loss, training error and elapsed time.
        * Save the final parameters.
    """
    args = get_args(monitor_path='tmp.monitor.bnn')

    # Pick the backend; fall back to CPU when none was requested.
    from nnabla.contrib.context import extension_context
    backend = 'cpu' if args.context is None else args.context
    logger.info("Running in %s" % backend)
    nn.set_default_context(
        extension_context(backend, device_id=args.device_id))

    # MNIST iterators (training / validation).
    data = data_iterator_mnist(args.batch_size, True)
    vdata = data_iterator_mnist(args.batch_size, False)

    # Network builder; the LeNet variant is the default.
    build_prediction = mnist_inq_lenet_prediction
    if args.net == 'inq_resnet':
        build_prediction = mnist_inq_resnet_prediction

    # Training graph.
    image = nn.Variable([args.batch_size, 1, 28, 28])
    label = nn.Variable([args.batch_size, 1])
    pred = build_prediction(image / 255, test=False)
    pred.persistent = True  # read after backward for the training error
    loss = F.mean(F.softmax_cross_entropy(pred, label))

    # Validation graph.
    vimage = nn.Variable([args.batch_size, 1, 28, 28])
    vlabel = nn.Variable([args.batch_size, 1])
    vpred = build_prediction(vimage / 255, test=True)

    # Solver over every parameter created so far.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitors.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = M.MonitorSeries("Test error", monitor, interval=10)

    def _summed_validation_error():
        # Sum of the categorical error over `args.val_iter` minibatches.
        total = 0.0
        for _ in range(args.val_iter):
            vimage.d, vlabel.d = vdata.next()
            vpred.forward(clear_buffer=True)
            total += categorical_error(vpred.d, vlabel.d)
        return total

    for i in range(args.max_iter):
        # Periodic validation.
        if i % args.val_interval == 0:
            monitor_verr.add(i, _summed_validation_error() / args.val_iter)

        # Periodic snapshot.
        if i % args.model_save_interval == 0:
            snapshot = os.path.join(
                args.model_save_path, 'params_%06d.h5' % i)
            nn.save_parameters(snapshot)

        # One optimization step.
        image.d, label.d = data.next()
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Training statistics.
        train_error = categorical_error(pred.d, label.d)
        monitor_loss.add(i, loss.d.copy())
        monitor_err.add(i, train_error)
        monitor_time.add(i)

    # Final parameters.
    parameter_file = os.path.join(
        args.model_save_path, 'params_%06d.h5' % args.max_iter)
    nn.save_parameters(parameter_file)
def test_recording_to_training(ctx, func_name, seed, precision_mode, graph_ref, graph_act):
    """Check the graph structure after a recording-to-training conversion.

    The graph from `graph_act` is trained on random data, run through the
    batch-normalization folding modifiers, converted with
    `QuantizeNonQNNToRecordingModifier`, trained again, stripped of the
    recorder functions and converted with
    `QuantizeRecordingToTrainingModifier`. The result is compared to the
    graph from `graph_ref` with `structure_tester`; values are not compared.

    Args:
        ctx: Context the graph is built under.
        func_name: Not used in the body (presumably supplied by the test
            parametrization -- TODO confirm).
        seed: Seed for both `np.random` and the local `RandomState`.
        precision_mode: Stored in `cfg.precision_mode`.
        graph_ref: Builder of the reference graph, called as
            `graph_ref(x, cfg, test=True)`.
        graph_act: Builder of the graph under test, called as
            `graph_act(x, test=False, w_bias=True)`.
    """
    # NOTE: `value_tester` is imported but not used below.
    from .graph_converter_test_utils import structure_tester, value_tester

    cfg = QATConfig()
    cfg.bn_folding = True
    cfg.bn_self_folding = True
    cfg.channel_last = False
    cfg.precision_mode = precision_mode
    cfg.skip_inputs_layers = []
    cfg.skip_outputs_layers = []

    # Random number
    np.random.seed(seed)
    rng = np.random.RandomState(seed)

    # Graph
    with nn.context_scope(ctx):
        # NOTE: x_data is not used afterwards; the loops below feed
        # np.random data instead.
        x_data = rng.randn(batch_size, 3, 32, 32)
        gt_label = nn.Variable((batch_size, 1))
        x = nn.Variable((batch_size, 3, 32, 32))

        y_tgt = graph_act(x, test=False, w_bias=True)
        loss = F.mean(F.softmax_cross_entropy(y_tgt, gt_label))
        solver = S.Adam(0.001)
        solver.set_parameters(nn.get_parameters(grad_only=True))
        # train the float32 network
        # NOTE(review): no solver.zero_grad() between iterations, so
        # gradients presumably accumulate -- confirm this is acceptable for
        # a structure-only test.
        for i in range(100):
            input_data = np.random.random((batch_size, 3, 32, 32))
            input_label = np.random.randint(0, 10, size=(batch_size, 1))
            gt_label.d = input_label
            x.d = input_data
            loss.forward()
            loss.backward()
            solver.update()

        # BN folding & BN self folding
        modifiers = []
        if cfg.bn_folding:
            modifiers.append(
                GC.BatchNormalizationFoldingModifier(
                    opposite=False, channel_last=cfg.channel_last))
            modifiers.append(
                GC.BatchNormalizationFoldingModifier(
                    opposite=True, channel_last=cfg.channel_last))
        # Go through BN self folding
        if cfg.bn_self_folding:
            modifiers.append(GC.BatchNormalizationSelfFoldingModifier())
        if len(modifiers) > 0:
            y_tgt_without_bn = GC.GraphConverter(modifiers).convert(y_tgt)
            # Rewire in place so that `loss`, built on y_tgt, uses the
            # converted graph.
            y_tgt.rewire_on(y_tgt_without_bn)

        # convert to recording
        funcrankrecorder = FunctionsRankRecorder()
        y_tgt.visit(funcrankrecorder)
        modifiers = [
            GC.QuantizeNonQNNToRecordingModifier(
                funcrankrecorder.functions_ranks, config=cfg)
        ]
        y_act_rec = GC.GraphConverter(modifiers).convert(y_tgt)
        y_tgt.rewire_on(y_act_rec)
        y_tgt.need_grad = False
        # solver.clear_parameters()
        # Register the parameters again after the conversion.
        solver.set_parameters(nn.get_parameters(grad_only=True))
        # Second training pass on random data, now with the recorder
        # functions in the graph.
        for i in range(100):
            input_data = np.random.random((batch_size, 3, 32, 32))
            input_label = np.random.randint(0, 10, size=(batch_size, 1))
            gt_label.d = input_label
            x.d = input_data
            loss.forward()
            loss.backward()
            solver.update()

        # Remove recorder
        modifiers = []
        modifiers.append(
            GC.RemoveFunctionModifier(rm_funcs=[
                cfg.recorder_activation().name(),
                cfg.recorder_weight().name()
            ]))
        y_tgt = GC.GraphConverter(modifiers).convert(y_tgt)

        # Collect functions rank
        funcrankrecorder = FunctionsRankRecorder()
        y_tgt.visit(funcrankrecorder)

        # convert to training
        modifiers = [
            GC.QuantizeRecordingToTrainingModifier(
                funcrankrecorder.functions_ranks, config=cfg)
        ]
        y_act = GC.GraphConverter(modifiers).convert(y_tgt)
        y_act.forward()

        # Ref Graph
        y_ref = graph_ref(x, cfg, test=True)

        # Test: only the graph structure is compared.
        structure_tester(y_ref, y_act)
def recon_loss(x, y):
    """Return the mean of the element-wise absolute error between x and y."""
    abs_diff = F.absolute_error(x, y)
    return F.mean(abs_diff)
def train():
    """
    Train the reference LeNet on MNIST and keep the best checkpoints.

    The prediction graphs are created by `mnist_lenet_prediction` under the
    scope name "reference". Every `args.val_interval` iterations the
    validation error is measured, and the parameters are saved whenever it
    is lower than the best value seen so far (starting from 1.0). After the
    loop the validation error is measured once more and the final
    parameters are saved.
    """
    args = get_args()

    # Pick the backend; fall back to CPU when none was requested.
    from nnabla.contrib.context import extension_context
    backend = 'cpu' if args.context is None else args.context
    logger.info("Running in %s" % backend)
    nn.set_default_context(
        extension_context(backend, device_id=args.device_id))

    build_prediction = mnist_lenet_prediction
    reference = "reference"

    # Training graph.
    image = nn.Variable([args.batch_size, 1, 28, 28])
    label = nn.Variable([args.batch_size, 1])
    pred = build_prediction(image, scope=reference, test=False)
    pred.persistent = True  # read after backward for the training error
    loss = F.mean(F.softmax_cross_entropy(pred, label))

    # Validation graph (test mode, same scope).
    vimage = nn.Variable([args.batch_size, 1, 28, 28])
    vlabel = nn.Variable([args.batch_size, 1])
    vpred = build_prediction(vimage, scope=reference, test=True)

    # Solver over every parameter created so far.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitors.
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=10)

    # MNIST iterators (training / validation).
    data = data_iterator_mnist(args.batch_size, True)
    vdata = data_iterator_mnist(args.batch_size, False)

    def _summed_validation_error():
        # Sum of the categorical error over `args.val_iter` minibatches.
        total = 0.0
        for _ in range(args.val_iter):
            vimage.d, vlabel.d = vdata.next()
            vpred.forward(clear_buffer=True)
            total += categorical_error(vpred.d, vlabel.d)
        return total

    best_ve = 1.0

    for i in range(args.max_iter):
        # Periodic validation; checkpoint only on improvement.
        if i % args.val_interval == 0:
            mean_ve = _summed_validation_error() / args.val_iter
            monitor_verr.add(i, mean_ve)
            if mean_ve < best_ve:
                checkpoint = os.path.join(
                    args.model_save_path, 'params_%06d.h5' % i)
                nn.save_parameters(checkpoint)
                best_ve = mean_ve

        # One optimization step.
        image.d, label.d = data.next()
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Training statistics.
        train_error = categorical_error(pred.d, label.d)
        monitor_loss.add(i, loss.d.copy())
        monitor_err.add(i, train_error)
        monitor_time.add(i)

    # Final validation, logged under the last loop index.
    final_total = _summed_validation_error()
    monitor_verr.add(i, final_total / args.val_iter)

    # Final parameters.
    parameter_file = os.path.join(args.model_save_path,
                                  'params_{:06}.h5'.format(args.max_iter))
    nn.save_parameters(parameter_file)
def train():
    """
    Train an MNIST classifier (LeNet by default, ResNet on request).

    Outline:
        * Read the arguments and set the default computation context.
        * Pick the network builder: `mnist_resnet_prediction` when
          `args.net == 'resnet'`, otherwise `mnist_lenet_prediction`.
        * Build the training graph with a softmax cross entropy loss and a
          test-mode validation graph.
        * Create an Adam solver, the monitors and the MNIST iterators.
        * Per iteration: periodic validation and parameter snapshot, then
          zero-grad / forward / backward / weight decay / update, then
          logging of loss, training error and elapsed time.
        * Run a last validation and save the final parameters to a file
          whose name is prefixed with `args.net`.
    """
    args = get_args()

    # Pick the backend; fall back to CPU when none was requested.
    from nnabla.contrib.context import extension_context
    backend = 'cpu' if args.context is None else args.context
    logger.info("Running in %s" % backend)
    nn.set_default_context(
        extension_context(backend, device_id=args.device_id))

    # Network builder.
    build_prediction = mnist_lenet_prediction
    if args.net == 'resnet':
        build_prediction = mnist_resnet_prediction

    # Training graph.
    image = nn.Variable([args.batch_size, 1, 28, 28])
    label = nn.Variable([args.batch_size, 1])
    pred = build_prediction(image, test=False)
    pred.persistent = True  # read after backward for the training error
    loss = F.mean(F.softmax_cross_entropy(pred, label))

    # Validation graph.
    vimage = nn.Variable([args.batch_size, 1, 28, 28])
    vlabel = nn.Variable([args.batch_size, 1])
    vpred = build_prediction(vimage, test=True)

    # Solver over every parameter created so far.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitors.
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=10)

    # MNIST iterators (training / validation).
    data = data_iterator_mnist(args.batch_size, True)
    vdata = data_iterator_mnist(args.batch_size, False)

    def _log_validation_error(step):
        # Average the categorical error over `args.val_iter` minibatches and
        # record it under `step`.
        total = 0.0
        for _ in range(args.val_iter):
            vimage.d, vlabel.d = vdata.next()
            vpred.forward(clear_buffer=True)
            total += categorical_error(vpred.d, vlabel.d)
        monitor_verr.add(step, total / args.val_iter)

    for i in range(args.max_iter):
        # Periodic validation.
        if i % args.val_interval == 0:
            _log_validation_error(i)

        # Periodic snapshot.
        if i % args.model_save_interval == 0:
            snapshot = os.path.join(
                args.model_save_path, 'params_%06d.h5' % i)
            nn.save_parameters(snapshot)

        # One optimization step.
        image.d, label.d = data.next()
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Training statistics.
        train_error = categorical_error(pred.d, label.d)
        monitor_loss.add(i, loss.d.copy())
        monitor_err.add(i, train_error)
        monitor_time.add(i)

    # Final validation, logged under the last loop index.
    _log_validation_error(i)

    # Final parameters.
    parameter_file = os.path.join(
        args.model_save_path,
        '{}_params_{:06}.h5'.format(args.net, args.max_iter))
    nn.save_parameters(parameter_file)