def train(): """ Main script. Steps: * Parse command line arguments. * Specify a context for computation. * Initialize DataIterator for CIFAR10. * Construct a computation graph for training and validation. * Initialize a solver and set parameter variables to it. * Training loop * Computate error rate for validation data (periodically) * Get a next minibatch. * Execute forwardprop on the training graph. * Compute training error * Set parameter gradients zero * Execute backprop. * Solver updates parameters by using gradients computed by backprop. """ # define training parameters augmented_shift = True augmented_flip = True batch_size = 128 vbatch_size = 100 num_classes = 10 weight_decay = 0.0002 momentum = 0.9 learning_rates = (cfg.initial_learning_rate,)*80 + \ (cfg.initial_learning_rate / 10.,)*40 + \ (cfg.initial_learning_rate / 100.,)*40 print('lr={}'.format(learning_rates)) print('weight_decay={}'.format(weight_decay)) print('momentum={}'.format(momentum)) # create nabla context from nnabla.ext_utils import get_extension_context ctx = get_extension_context('cudnn', device_id=args.gpu) nn.set_default_context(ctx) # Initialize DataIterator for CIFAR10. logger.info("Get CIFAR10 Data ...") data = cifar_data.DataIterator(batch_size, augmented_shift=augmented_shift, augmented_flip=augmented_flip) vdata = cifar_data.DataIterator(vbatch_size, val=True) if cfg.weightfile is not None: logger.info(f"Loading weights from {cfg.weightfile}") nn.load_parameters(cfg.weightfile) # TRAIN # Create input variables. image = nn.Variable([batch_size, 3, 32, 32]) label = nn.Variable([batch_size, 1]) # Create prediction graph. pred, hidden = resnet_cifar10(image, num_classes=num_classes, cfg=cfg, test=False) pred.persistent = True # Compute initial network size num_weights, kbytes_weights = network_size_weights() kbytes_weights.forward() print(f"Initial network size (weights) is {float(kbytes_weights.d):.3f}KB " f"(total number of weights: {int(num_weights):d}).") num_activations, kbytes_activations = network_size_activations() kbytes_activations.forward() print( f"Initial network size (activations) is {float(kbytes_activations.d):.3f}KB " f"(total number of activations: {int(num_activations):d}).") # Create loss function. cost_lambda2 = nn.Variable(()) cost_lambda2.d = cfg.initial_cost_lambda2 cost_lambda2.persistent = True cost_lambda3 = nn.Variable(()) cost_lambda3.d = cfg.initial_cost_lambda3 cost_lambda3.persistent = True loss1 = F.mean(F.softmax_cross_entropy(pred, label)) loss1.persistent = True if cfg.target_weight_kbytes > 0: loss2 = F.relu(kbytes_weights - cfg.target_weight_kbytes)**2 loss2.persistent = True else: loss2 = nn.Variable(()) loss2.d = 0 loss2.persistent = True if cfg.target_activation_kbytes > 0: loss3 = F.relu(kbytes_activations - cfg.target_activation_kbytes)**2 loss3.persistent = True else: loss3 = nn.Variable(()) loss3.d = 0 loss3.persistent = True loss = loss1 + cost_lambda2 * loss2 + cost_lambda3 * loss3 # VALID # Create input variables. vimage = nn.Variable([vbatch_size, 3, 32, 32]) vlabel = nn.Variable([vbatch_size, 1]) # Create predition graph. vpred, vhidden = resnet_cifar10(vimage, num_classes=num_classes, cfg=cfg, test=True) vpred.persistent = True # Create Solver. 
if cfg.optimizer == "adam": solver = S.Adam(alpha=learning_rates[0]) else: solver = S.Momentum(learning_rates[0], momentum) solver.set_parameters(nn.get_parameters()) # Training loop (epochs) logger.info("Start Training ...") i = 0 best_v_err = 1.0 # logs of the results iters = [] res_train_err = [] res_train_loss = [] res_val_err = [] # print all variables that exist for k in nn.get_parameters(): print(k) res_n_b = collections.OrderedDict() res_n_w = collections.OrderedDict() res_n_a = collections.OrderedDict() res_d_b = collections.OrderedDict() res_d_w = collections.OrderedDict() res_d_a = collections.OrderedDict() res_xmin_b = collections.OrderedDict() res_xmin_w = collections.OrderedDict() res_xmin_a = collections.OrderedDict() res_xmax_b = collections.OrderedDict() res_xmax_w = collections.OrderedDict() res_xmax_a = collections.OrderedDict() for k in nn.get_parameters(): if (k.split('/')[-1] == 'n') and (k.split('/')[-3] == 'bquant'): res_n_b[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'n') and (k.split('/')[-3] == 'Wquant'): res_n_w[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'n') and (k.split('/')[-3] == 'Aquant'): res_n_a[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'd') and (k.split('/')[-3] == 'bquant'): res_d_b[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'd') and (k.split('/')[-3] == 'Wquant'): res_d_w[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'd') and (k.split('/')[-3] == 'Aquant'): res_d_a[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'xmin') and (k.split('/')[-3] == 'bquant'): res_xmin_b[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'xmin') and (k.split('/')[-3] == 'Wquant'): res_xmin_w[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'xmin') and (k.split('/')[-3] == 'Aquant'): res_xmin_a[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'xmax') and (k.split('/')[-3] == 'bquant'): res_xmax_b[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'xmax') and (k.split('/')[-3] == 'Wquant'): res_xmax_w[k] = [] for k in nn.get_parameters(): if (k.split('/')[-1] == 'xmax') and (k.split('/')[-3] == 'Aquant'): res_xmax_a[k] = [] for epoch in range(len(learning_rates)): train_loss = list() train_loss1 = list() train_loss2 = list() train_loss3 = list() train_err = list() # check whether we need to adapt the learning rate if epoch > 0 and learning_rates[epoch - 1] != learning_rates[epoch]: solver.set_learning_rate(learning_rates[epoch]) # Training loop (iterations) start_epoch = True while data.current != 0 or start_epoch: start_epoch = False # Next batch image.d, label.d = data.next() # Training forward/backward solver.zero_grad() loss.forward() loss.backward() if weight_decay is not None: solver.weight_decay(weight_decay) # scale gradients if cfg.target_weight_kbytes > 0 or cfg.target_activation_kbytes > 0: clip_quant_grads() solver.update() e = categorical_error(pred.d, label.d) train_loss += [loss.d] train_loss1 += [loss1.d] train_loss2 += [loss2.d] train_loss3 += [loss3.d] train_err += [e] # make sure that parametric values are clipped to correct values (if outside) clip_quant_vals() # Intermediate Validation (when constraint is set and fulfilled) kbytes_weights.forward() kbytes_activations.forward() if ((cfg.target_weight_kbytes > 0 and (cfg.target_weight_kbytes <= 0 or float(kbytes_weights.d) <= cfg.target_weight_kbytes) and (cfg.target_activation_kbytes <= 0 or float( kbytes_activations.d) <= cfg.target_activation_kbytes))): 
ve = list() start_epoch_ = True while vdata.current != 0 or start_epoch_: start_epoch_ = False vimage.d, vlabel.d = vdata.next() vpred.forward() ve += [categorical_error(vpred.d, vlabel.d)] v_err = np.array(ve).mean() if v_err < best_v_err: best_v_err = v_err nn.save_parameters( os.path.join(cfg.params_dir, 'params_best.h5')) print( f'Best validation error (fulfilling constraints: {best_v_err}' ) sys.stdout.flush() sys.stderr.flush() i += 1 # Validation ve = list() start_epoch = True while vdata.current != 0 or start_epoch: start_epoch = False vimage.d, vlabel.d = vdata.next() vpred.forward() ve += [categorical_error(vpred.d, vlabel.d)] v_err = np.array(ve).mean() kbytes_weights.forward() kbytes_activations.forward() if ((v_err < best_v_err and (cfg.target_weight_kbytes <= 0 or float(kbytes_weights.d) <= cfg.target_weight_kbytes) and (cfg.target_activation_kbytes <= 0 or float(kbytes_activations.d) <= cfg.target_activation_kbytes))): best_v_err = v_err nn.save_parameters(os.path.join(cfg.params_dir, 'params_best.h5')) sys.stdout.flush() sys.stderr.flush() if cfg.target_weight_kbytes > 0: print( f"Current network size (weights) is {float(kbytes_weights.d):.3f}KB " f"(#params: {int(num_weights)}, " f"avg. bitwidth: {8. * 1024. * kbytes_weights.d / num_weights})" ) sys.stdout.flush() sys.stderr.flush() if cfg.target_activation_kbytes > 0: print( f"Current network size (activations) is {float(kbytes_activations.d):.3f}KB" ) sys.stdout.flush() sys.stderr.flush() for k in nn.get_parameters(): if k.split('/')[-1] == 'n': print(f'{k}', f'{nn.get_parameters()[k].d}', f'{nn.get_parameters()[k].g}') sys.stdout.flush() sys.stderr.flush() if k.split('/')[-3] == 'bquant': res_n_b[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-3] == 'Wquant': res_n_w[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-3] == 'Aquant': res_n_a[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-1] == 'd': print(f'{k}', f'{nn.get_parameters()[k].d}', f'{nn.get_parameters()[k].g}') sys.stdout.flush() sys.stderr.flush() if k.split('/')[-3] == 'bquant': res_d_b[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-3] == 'Wquant': res_d_w[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-3] == 'Aquant': res_d_a[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-1] == 'xmin': print(f'{k}', f'{nn.get_parameters()[k].d}', f'{nn.get_parameters()[k].g}') sys.stdout.flush() sys.stderr.flush() if k.split('/')[-3] == 'bquant': res_xmin_b[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-3] == 'Wquant': res_xmin_w[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-3] == 'Aquant': res_xmin_a[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-1] == 'xmax': print(f'{k}', f'{nn.get_parameters()[k].d}', f'{nn.get_parameters()[k].g}') sys.stdout.flush() sys.stderr.flush() if k.split('/')[-3] == 'bquant': res_xmax_b[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-3] == 'Wquant': res_xmax_w[k].append(np.asscalar(nn.get_parameters()[k].d)) elif k.split('/')[-3] == 'Aquant': res_xmax_a[k].append(np.asscalar(nn.get_parameters()[k].d)) # Print logger.info(f'epoch={epoch}(iter={i}); ' f'overall cost={np.array(train_loss).mean()}; ' f'cross-entropy cost={np.array(train_loss1).mean()}; ' f'weight-size cost={np.array(train_loss2).mean()}; ' f'activations-size cost={np.array(train_loss3).mean()}; ' f'TrainErr={np.array(train_err).mean()}; ' f'ValidErr={v_err}; 
BestValidErr={best_v_err}') sys.stdout.flush() sys.stderr.flush() # update the logs iters.append(i) res_train_err.append(np.array(train_err).mean()) res_train_loss.append([ np.array(train_loss).mean(), np.array(train_loss1).mean(), np.array(train_loss2).mean(), np.array(train_loss3).mean() ]) res_val_err.append(np.array(v_err).mean()) res_ges = np.concatenate([ np.array(iters)[:, np.newaxis], np.array(res_train_err)[:, np.newaxis], np.array(res_val_err)[:, np.newaxis], np.array(res_train_loss) ], axis=-1) # save the results np.savetxt(cfg.params_dir + '/results.csv', np.array(res_ges), fmt='%10.8f', header='iter,train_err,val_err,loss,loss1,loss2,loss3', comments='', delimiter=',') for rs, res in zip([ 'res_n_b.csv', 'res_n_w.csv', 'res_n_a.csv', 'res_d_b.csv', 'res_d_w.csv', 'res_d_a.csv', 'res_min_b.csv', 'res_min_w.csv', 'res_min_a.csv', 'res_max_b.csv', 'res_max_w.csv', 'res_max_a.csv' ], [ res_n_b, res_n_w, res_n_a, res_d_b, res_d_w, res_d_a, res_xmin_b, res_xmin_w, res_xmin_a, res_xmax_b, res_xmax_w, res_xmax_a ]): res_mat = np.array([res[i] for i in res]) if res_mat.shape[0] > 1 and res_mat.shape[1] > 1: np.savetxt( cfg.params_dir + '/' + rs, np.array([[i, j, res_mat[i, j]] for i, j in product( range(res_mat.shape[0]), range(res_mat.shape[1]))]), fmt='%10.8f', comments='', delimiter=',')
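
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): `categorical_error(pred, label)` used above is
# imported from a helper module that is not shown here.  The sketch below
# illustrates what such a helper typically computes (fraction of argmax
# mismatches).  The name `_categorical_error_sketch` and its exact behaviour
# are assumptions for illustration, not the repository's implementation.
def _categorical_error_sketch(pred, label):
    import numpy as np
    # pred: (batch, n_classes) logits, label: (batch, 1) integer class ids
    pred_label = np.asarray(pred).argmax(axis=1)
    return float((pred_label != np.asarray(label).flatten()).mean())
# ---------------------------------------------------------------------------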
def _create_optimizer(ctx, o, networks, datasets):
    class Optimizer:
        pass

    optimizer = Optimizer()

    optimizer.comm = current_communicator()
    comm_size = optimizer.comm.size if optimizer.comm else 1
    optimizer.start_iter = (o.start_iter - 1) // comm_size + \
        1 if o.start_iter > 0 else 0
    optimizer.end_iter = (o.end_iter - 1) // comm_size + \
        1 if o.end_iter > 0 else 0
    optimizer.name = o.name
    optimizer.order = o.order
    optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1
    optimizer.network = networks[o.network_name]
    optimizer.data_iterators = OrderedDict()
    for d in o.dataset_name:
        optimizer.data_iterators[d] = datasets[d].data_iterator

    optimizer.dataset_assign = OrderedDict()
    for d in o.data_variable:
        optimizer.dataset_assign[optimizer.network.variables[
            d.variable_name]] = d.data_name

    optimizer.generator_assign = OrderedDict()
    for g in o.generator_variable:
        optimizer.generator_assign[optimizer.network.variables[
            g.variable_name]] = _get_generator(g)

    optimizer.loss_variables = []
    for l in o.loss_variable:
        optimizer.loss_variables.append(
            optimizer.network.variables[l.variable_name])

    optimizer.parameter_learning_rate_multipliers = OrderedDict()
    for p in o.parameter_variable:
        param_variable_names = _get_matching_variable_names(
            p.variable_name, optimizer.network.variables.keys())
        for v_name in param_variable_names:
            optimizer.parameter_learning_rate_multipliers[
                optimizer.network.variables[v_name]] = p.learning_rate_multiplier

    with nn.context_scope(ctx):
        if o.solver.type == 'Adagrad':
            optimizer.solver = S.Adagrad(o.solver.adagrad_param.lr,
                                         o.solver.adagrad_param.eps)
            init_lr = o.solver.adagrad_param.lr
        elif o.solver.type == 'Adadelta':
            optimizer.solver = S.Adadelta(o.solver.adadelta_param.lr,
                                          o.solver.adadelta_param.decay,
                                          o.solver.adadelta_param.eps)
            init_lr = o.solver.adadelta_param.lr
        elif o.solver.type == 'Adam':
            optimizer.solver = S.Adam(o.solver.adam_param.alpha,
                                      o.solver.adam_param.beta1,
                                      o.solver.adam_param.beta2,
                                      o.solver.adam_param.eps)
            init_lr = o.solver.adam_param.alpha
        elif o.solver.type == 'Adamax':
            optimizer.solver = S.Adamax(o.solver.adamax_param.alpha,
                                        o.solver.adamax_param.beta1,
                                        o.solver.adamax_param.beta2,
                                        o.solver.adamax_param.eps)
            init_lr = o.solver.adamax_param.alpha
        elif o.solver.type == 'AdaBound':
            optimizer.solver = S.AdaBound(o.solver.adabound_param.alpha,
                                          o.solver.adabound_param.beta1,
                                          o.solver.adabound_param.beta2,
                                          o.solver.adabound_param.eps,
                                          o.solver.adabound_param.final_lr,
                                          o.solver.adabound_param.gamma)
            init_lr = o.solver.adabound_param.alpha
        elif o.solver.type == 'AMSGRAD':
            optimizer.solver = S.AMSGRAD(o.solver.amsgrad_param.alpha,
                                         o.solver.amsgrad_param.beta1,
                                         o.solver.amsgrad_param.beta2,
                                         o.solver.amsgrad_param.eps)
            init_lr = o.solver.amsgrad_param.alpha
        elif o.solver.type == 'AMSBound':
            optimizer.solver = S.AMSBound(o.solver.amsbound_param.alpha,
                                          o.solver.amsbound_param.beta1,
                                          o.solver.amsbound_param.beta2,
                                          o.solver.amsbound_param.eps,
                                          o.solver.amsbound_param.final_lr,
                                          o.solver.amsbound_param.gamma)
            init_lr = o.solver.amsbound_param.alpha
        elif o.solver.type == 'Eve':
            p = o.solver.eve_param
            optimizer.solver = S.Eve(p.alpha, p.beta1, p.beta2,
                                     p.beta3, p.k, p.k2, p.eps)
            init_lr = p.alpha
        elif o.solver.type == 'Momentum':
            optimizer.solver = S.Momentum(o.solver.momentum_param.lr,
                                          o.solver.momentum_param.momentum)
            init_lr = o.solver.momentum_param.lr
        elif o.solver.type == 'Nesterov':
            optimizer.solver = S.Nesterov(o.solver.nesterov_param.lr,
                                          o.solver.nesterov_param.momentum)
            init_lr = o.solver.nesterov_param.lr
        elif o.solver.type == 'RMSprop':
            optimizer.solver = S.RMSprop(o.solver.rmsprop_param.lr,
                                         o.solver.rmsprop_param.decay,
                                         o.solver.rmsprop_param.eps)
            init_lr = o.solver.rmsprop_param.lr
        elif o.solver.type == 'Sgd' or o.solver.type == 'SGD':
            optimizer.solver = S.Sgd(o.solver.sgd_param.lr)
            init_lr = o.solver.sgd_param.lr
        else:
            raise ValueError('Solver "' + o.solver.type +
                             '" is not supported.')

    parameters = {
        v.name: v.variable_instance
        for v, local_lr in optimizer.parameter_learning_rate_multipliers.items()
        if local_lr > 0.0
    }
    optimizer.solver.set_parameters(parameters)
    optimizer.parameters = OrderedDict(
        sorted(parameters.items(), key=lambda x: x[0]))

    optimizer.weight_decay = o.solver.weight_decay

    # keep following 2 lines for backward compatibility
    optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0
    optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1
    optimizer.solver.set_states_from_protobuf(o)

    optimizer.comm = current_communicator()
    comm_size = optimizer.comm.size if optimizer.comm else 1
    optimizer.scheduler = ExponentialScheduler(init_lr, 1.0, 1)

    if o.solver.lr_scheduler_type == 'Polynomial':
        if o.solver.polynomial_scheduler_param.power != 0.0:
            optimizer.scheduler = PolynomialScheduler(
                init_lr,
                o.solver.polynomial_scheduler_param.max_iter // comm_size,
                o.solver.polynomial_scheduler_param.power)
    elif o.solver.lr_scheduler_type == 'Cosine':
        optimizer.scheduler = CosineScheduler(
            init_lr, o.solver.cosine_scheduler_param.max_iter // comm_size)
    elif o.solver.lr_scheduler_type == 'Exponential':
        if o.solver.exponential_scheduler_param.gamma != 1.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr,
                o.solver.exponential_scheduler_param.gamma,
                o.solver.exponential_scheduler_param.iter_interval // comm_size
                if o.solver.exponential_scheduler_param.iter_interval > comm_size else 1)
    elif o.solver.lr_scheduler_type == 'Step':
        if o.solver.step_scheduler_param.gamma != 1.0 and len(
                o.solver.step_scheduler_param.iter_steps) > 0:
            optimizer.scheduler = StepScheduler(
                init_lr,
                o.solver.step_scheduler_param.gamma,
                [step // comm_size
                 for step in o.solver.step_scheduler_param.iter_steps])
    elif o.solver.lr_scheduler_type == 'Custom':
        # ToDo
        raise NotImplementedError()
    elif o.solver.lr_scheduler_type == '':
        if o.solver.lr_decay_interval != 0 or o.solver.lr_decay != 0.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr,
                o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0,
                o.solver.lr_decay_interval // comm_size
                if o.solver.lr_decay_interval > comm_size else 1)
    else:
        raise ValueError('Learning Rate Scheduler "' + o.solver.lr_scheduler_type +
                         '" is not supported.')

    if o.solver.lr_warmup_scheduler_type == 'Linear':
        if o.solver.linear_warmup_scheduler_param.warmup_iter >= comm_size:
            optimizer.scheduler = LinearWarmupScheduler(
                optimizer.scheduler,
                o.solver.linear_warmup_scheduler_param.warmup_iter // comm_size)

    optimizer.forward_sequence = optimizer.network.get_forward_sequence(
        optimizer.loss_variables)
    optimizer.backward_sequence = optimizer.network.get_backward_sequence(
        optimizer.loss_variables, optimizer.parameter_learning_rate_multipliers)

    return optimizer
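
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): the training loop that consumes the object built by
# `_create_optimizer` is not shown here.  A minimal, assumed usage pattern for
# the scheduler/solver pair could look like the following; `_apply_update_sketch`
# is a hypothetical helper, not part of the actual runtime.
def _apply_update_sketch(optimizer, iteration):
    # Query the scheduled learning rate for this iteration and hand it to the
    # solver before the parameter update.
    lr = optimizer.scheduler.get_learning_rate(iteration)
    optimizer.solver.set_learning_rate(lr)
    if optimizer.weight_decay > 0.0:
        optimizer.solver.weight_decay(optimizer.weight_decay)
    optimizer.solver.update()
# ---------------------------------------------------------------------------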
def main(): """ Main script. Steps: * Get and set context. * Load Dataset * Initialize DataIterator. * Create Networks * Net for Labeled Data * Net for Unlabeled Data * Net for Test Data * Create Solver. * Training Loop. * Test * Training * by Labeled Data * Calculate Cross Entropy Loss * by Unlabeled Data * Estimate Adversarial Direction * Calculate LDS Loss """ args = get_args() # Get context. from nnabla.contrib.context import extension_context extension_module = args.context if args.context is None: extension_module = 'cpu' logger.info("Running in %s" % extension_module) ctx = extension_context(extension_module, device_id=args.device_id) nn.set_default_context(ctx) shape_x = (1, 28, 28) n_h = args.n_units n_y = args.n_class # Load MNist Dataset from mnist_data import MnistDataSource with MnistDataSource(train=True) as d: x_t = d.images t_t = d.labels with MnistDataSource(train=False) as d: x_v = d.images t_v = d.labels x_t = np.array(x_t / 256.0).astype(np.float32) x_t, t_t = x_t[:args.n_train], t_t[:args.n_train] x_v, t_v = x_v[:args.n_valid], t_v[:args.n_valid] # Create Semi-supervised Datasets x_l, t_l, x_u, _ = split_dataset(x_t, t_t, args.n_labeled, args.n_class) x_u = np.r_[x_l, x_u] x_v = np.array(x_v / 256.0).astype(np.float32) # Create DataIterators for datasets of labeled, unlabeled and validation di_l = DataIterator(args.batchsize_l, [x_l, t_l]) di_u = DataIterator(args.batchsize_u, [x_u]) di_v = DataIterator(args.batchsize_v, [x_v, t_v]) # Create networks # feed-forward-net building function def forward(x, test=False): return mlp_net(x, n_h, n_y, test) # Net for learning labeled data xl = nn.Variable((args.batchsize_l,) + shape_x, need_grad=False) hl = forward(xl, test=False) tl = nn.Variable((args.batchsize_l, 1), need_grad=False) loss_l = F.mean(F.softmax_cross_entropy(hl, tl)) # Net for learning unlabeled data xu = nn.Variable((args.batchsize_u,) + shape_x, need_grad=False) r = nn.Variable((args.batchsize_u,) + shape_x, need_grad=True) eps = nn.Variable((args.batchsize_u,) + shape_x, need_grad=False) loss_u, yu = vat(xu, r, eps, forward, distance) # Net for evaluating valiation data xv = nn.Variable((args.batchsize_v,) + shape_x, need_grad=False) hv = forward(xv, test=True) tv = nn.Variable((args.batchsize_v, 1), need_grad=False) # Create solver solver = S.Adam(args.learning_rate) solver.set_parameters(nn.get_parameters()) # Monitor trainig and validation stats. import nnabla.monitor as M monitor = M.Monitor(args.model_save_path) monitor_verr = M.MonitorSeries("Test error", monitor, interval=240) monitor_time = M.MonitorTimeElapsed("Elapsed time", monitor, interval=240) # Training Loop. 
t0 = time.time() for i in range(args.max_iter): # Validation Test if i % args.val_interval == 0: n_error = calc_validation_error( di_v, xv, tv, hv, args.val_iter) monitor_verr.add(i, n_error) ################################# ## Training by Labeled Data ##### ################################# # input minibatch of labeled data into variables xl.d, tl.d = di_l.next() # initialize gradients solver.zero_grad() # forward, backward and update loss_l.forward(clear_no_need_grad=True) loss_l.backward(clear_buffer=True) solver.weight_decay(args.weight_decay) solver.update() ################################# ## Training by Unlabeled Data ### ################################# # input minibatch of unlabeled data into variables xu.d, = di_u.next() ##### Calculate Adversarial Noise ##### # Sample random noise n = np.random.normal(size=xu.shape).astype(np.float32) # Normalize noise vector and input to variable r.d = get_direction(n) # Set xi, the power-method scaling parameter. eps.data.fill(args.xi_for_vat) # Calculate y without noise, only once. yu.forward(clear_buffer=True) # Do power method iteration for k in range(args.n_iter_for_power_method): # Initialize gradient to receive value r.grad.zero() # forward, backward, without update loss_u.forward(clear_no_need_grad=True) loss_u.backward(clear_buffer=True) # Normalize gradinet vector and input to variable r.d = get_direction(r.g) ##### Calculate loss for unlabeled data ##### # Clear remained gradients solver.zero_grad() # Set epsilon, the adversarial noise scaling parameter. eps.data.fill(args.eps_for_vat) # forward, backward and update loss_u.forward(clear_no_need_grad=True) loss_u.backward(clear_buffer=True) solver.weight_decay(args.weight_decay) solver.update() ##### Learning rate update ##### if i % args.iter_per_epoch == 0: solver.set_learning_rate( solver.learning_rate() * args.learning_rate_decay) monitor_time.add(i) # Evaluate the final model by the error rate with validation dataset valid_error = calc_validation_error(di_v, xv, tv, hv, args.val_iter) monitor_verr.add(i, valid_error) monitor_time.add(i) # Save the model. nnp_file = os.path.join( args.model_save_path, 'vat_%06d.nnp' % args.max_iter) runtime_contents = { 'networks': [ {'name': 'Validation', 'batch_size': args.batchsize_v, 'outputs': {'y': hv}, 'names': {'x': xv}}], 'executors': [ {'name': 'Runtime', 'network': 'Validation', 'data': ['x'], 'output': ['y']}]} save.save(nnp_file, runtime_contents) from cpp_forward_check import check_cpp_forward check_cpp_forward(args.model_save_path, [xv.d], [xv], hv, nnp_file)
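
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): `get_direction` normalizes a perturbation per sample
# before it is written into `r.d`.  The helper below shows the usual per-sample
# L2 normalization; `_get_direction_sketch` is an assumed illustration, not the
# repository's definition.
def _get_direction_sketch(d):
    import numpy as np
    # d: (batch, C, H, W); normalize each sample to unit L2 norm.
    flat = np.asarray(d).reshape(d.shape[0], -1)
    norm = np.sqrt((flat ** 2).sum(axis=1, keepdims=True)) + 1e-12
    return (flat / norm).reshape(d.shape).astype(np.float32)
# ---------------------------------------------------------------------------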
def train(): """ Main script. Steps: * Parse command line arguments. * Specify a context for computation. * Initialize DataIterator for MNIST. * Construct a computation graph for training and validation. * Initialize a solver and set parameter variables to it. * Create monitor instances for saving and displaying training stats. * Training loop * Computate error rate for validation data (periodically) * Get a next minibatch. * Set parameter gradients zero * Execute forwardprop on the training graph. * Execute backprop. * Solver updates parameters by using gradients computed by backprop. * Compute training error """ args = get_args(monitor_path='tmp.monitor.bnn') # Get context. from nnabla.contrib.context import extension_context extension_module = args.context if args.context is None: extension_module = 'cpu' logger.info("Running in %s" % extension_module) ctx = extension_context(extension_module, device_id=args.device_id) nn.set_default_context(ctx) # Initialize DataIterator for MNIST. data = data_iterator_mnist(args.batch_size, True) vdata = data_iterator_mnist(args.batch_size, False) # Create CNN network for both training and testing. mnist_cnn_prediction = mnist_inq_lenet_prediction if args.net == 'inq': mnist_cnn_prediction = mnist_inq_lenet_prediction elif args.net == 'inq_resnet': mnist_cnn_prediction = mnist_inq_resnet_prediction # TRAIN # Create input variables. image = nn.Variable([args.batch_size, 1, 28, 28]) label = nn.Variable([args.batch_size, 1]) # Create predition graph. pred = mnist_cnn_prediction(image / 255, test=False) pred.persistent = True # Create loss function. loss = F.mean(F.softmax_cross_entropy(pred, label)) # TEST # Create input variables. vimage = nn.Variable([args.batch_size, 1, 28, 28]) vlabel = nn.Variable([args.batch_size, 1]) # Create predition graph. vpred = mnist_cnn_prediction(vimage / 255, test=True) # Create Solver. solver = S.Adam(args.learning_rate) solver.set_parameters(nn.get_parameters()) # Create monitor. import nnabla.monitor as M monitor = M.Monitor(args.monitor_path) monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10) monitor_err = M.MonitorSeries("Training error", monitor, interval=10) monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=100) monitor_verr = M.MonitorSeries("Test error", monitor, interval=10) # Training loop. for i in range(args.max_iter): if i % args.val_interval == 0: # Validation ve = 0.0 for j in range(args.val_iter): vimage.d, vlabel.d = vdata.next() vpred.forward(clear_buffer=True) ve += categorical_error(vpred.d, vlabel.d) monitor_verr.add(i, ve / args.val_iter) if i % args.model_save_interval == 0: nn.save_parameters(os.path.join( args.model_save_path, 'params_%06d.h5' % i)) # Training forward image.d, label.d = data.next() solver.zero_grad() loss.forward(clear_no_need_grad=True) # Training backward & update loss.backward(clear_buffer=True) solver.weight_decay(args.weight_decay) solver.update() # Monitor e = categorical_error(pred.d, label.d) monitor_loss.add(i, loss.d.copy()) monitor_err.add(i, e) monitor_time.add(i) parameter_file = os.path.join( args.model_save_path, 'params_%06d.h5' % args.max_iter) nn.save_parameters(parameter_file)
def main():
    # Get arguments
    args = get_args()
    data_file = "https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt"
    model_file = args.work_dir + "model.h5"

    # Load Dataset
    itow, wtoi, dataset = load_ptbset(data_file)

    # Computation environment settings
    from nnabla.contrib.context import extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Create data provider
    n_word = len(wtoi)
    n_dim = args.embed_dim
    batchsize = args.batchsize
    half_window = args.half_window_length
    n_negative = args.n_negative_sample

    di = DataIteratorForEmbeddingLearning(
        batchsize=batchsize,
        half_window=half_window,
        n_negative=n_negative,
        dataset=dataset)

    # Create model
    # - Real batch size including context samples and negative samples
    size = batchsize * (1 + n_negative) * (2 * (half_window - 1))

    # Model for learning
    # - input variables
    xl = nn.Variable((size,))  # variable for word
    yl = nn.Variable((size,))  # variable for context

    # Embed layers for word embedding function
    # - f_embed : word index x to get y, the n_dim vector
    # -- for each sample in a minibatch
    hx = PF.embed(xl, n_word, n_dim, name="e1")  # feature vector for word
    hy = PF.embed(yl, n_word, n_dim, name="e1")  # feature vector for context
    hl = F.sum(hx * hy, axis=1)

    # -- Approximated likelihood of context prediction
    # pos: word context, neg: negative samples
    tl = nn.Variable([size, ], need_grad=False)
    loss = F.sigmoid_cross_entropy(hl, tl)
    loss = F.mean(loss)

    # Model for test of searching similar words
    xr = nn.Variable((1,), need_grad=False)
    hr = PF.embed(xr, n_word, n_dim, name="e1")  # feature vector for test

    # Create solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    monitor = M.Monitor(args.work_dir)
    monitor_loss = M.MonitorSeries(
        "Training loss", monitor, interval=args.monitor_interval)
    monitor_time = M.MonitorTimeElapsed(
        "Training time", monitor, interval=args.monitor_interval)

    # Do training
    max_epoch = args.max_epoch
    for epoch in range(max_epoch):

        # iteration per epoch
        for i in range(di.n_batch):

            # get minibatch
            xi, yi, ti = di.next()

            # learn
            solver.zero_grad()
            xl.d, yl.d, tl.d = xi, yi, ti
            loss.forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.update()

            # monitor
            itr = epoch * di.n_batch + i
            monitor_loss.add(itr, loss.d)
            monitor_time.add(itr)

    # Save model
    nn.save_parameters(model_file)

    # Evaluate by similarity
    max_check_words = args.max_check_words
    for i in range(max_check_words):

        # prediction
        xr.d = i
        hr.forward(clear_buffer=True)
        h = hr.d

        # similarity calculation
        w = nn.get_parameters()['e1/embed/W'].d
        s = np.sqrt((w * w).sum(1))
        w /= s.reshape((s.shape[0], 1))
        similarity = w.dot(h[0]) / s[i]

        # for understanding
        output_similar_words(itow, i, similarity)
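
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): `output_similar_words(itow, i, similarity)` is
# defined elsewhere in this repository.  A plausible implementation prints the
# top-k most similar words; the helper below is an assumption for illustration
# only, not the actual function.
def _output_similar_words_sketch(itow, query_index, similarity, k=5):
    import numpy as np
    ranked = np.argsort(-similarity)  # highest similarity first
    neighbors = [itow[j] for j in ranked[:k + 1] if j != query_index][:k]
    print('{}: {}'.format(itow[query_index], ', '.join(neighbors)))
# ---------------------------------------------------------------------------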
def meta_train(args, shape_x, train_data, valid_data, test_data):

    # Build episode generators
    train_episode_generator = EpisodeGenerator(
        train_data[0], train_data[1], args.n_class_tr, args.n_shot_tr, args.n_query_tr)
    valid_episode_generator = EpisodeGenerator(
        valid_data[0], valid_data[1], args.n_class, args.n_shot, args.n_query)
    test_episode_generator = EpisodeGenerator(
        test_data[0], test_data[1], args.n_class, args.n_shot, args.n_query)

    # Build training model
    xs_t = nn.Variable((args.n_class_tr * args.n_shot_tr, ) + shape_x)
    xq_t = nn.Variable((args.n_class_tr * args.n_query_tr, ) + shape_x)
    hq_t = net(args.n_class_tr, xs_t, xq_t, args.embedding,
               args.net_type, args.metric, False)
    yq_t = nn.Variable((args.n_class_tr * args.n_query_tr, 1))
    loss_t = F.mean(F.softmax_cross_entropy(hq_t, yq_t))

    # Build evaluation model
    xs_v = nn.Variable((args.n_class * args.n_shot, ) + shape_x)
    xq_v = nn.Variable((args.n_class * args.n_query, ) + shape_x)
    hq_v = net(args.n_class, xs_v, xq_v, args.embedding,
               args.net_type, args.metric, True)
    yq_v = nn.Variable((args.n_class * args.n_query, 1))
    err_v = F.mean(F.top_n_error(hq_v, yq_v, n=1))

    # Setup solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitor outputs
    monitor = Monitor(args.work_dir)
    monitor_loss = MonitorSeries(
        "Training loss", monitor, interval=args.iter_per_epoch)
    monitor_valid_err = MonitorSeries(
        "Validation error", monitor, interval=args.iter_per_valid)
    monitor_test_err = MonitorSeries("Test error", monitor)
    monitor_test_conf = MonitorSeries("Test error confidence", monitor)

    # Output files
    param_file = args.work_dir + "/params.h5"
    tsne_file = args.work_dir + "/tsne.png"

    # Save NNP
    batch_size = 1
    contents = save_nnp({'x0': xs_v, 'x1': xq_v}, {'y': hq_v}, batch_size)
    save.save(os.path.join(args.work_dir, 'MetricMetaLearning_epoch0.nnp'),
              contents, variable_batch_size=False)

    # Training loop
    train_losses = []
    best_err = 1.0
    for i in range(args.max_iteration):

        # Decay learning rate
        if (i + 1) % args.lr_decay_interval == 0:
            solver.set_learning_rate(solver.learning_rate() * args.lr_decay)

        # Create an episode
        xs_t.d, xq_t.d, yq_t.d = train_episode_generator.next()

        # Training by the episode
        solver.zero_grad()
        loss_t.forward(clear_no_need_grad=True)
        loss_t.backward(clear_buffer=True)
        solver.update()
        train_losses.append(loss_t.d.copy())

        # Evaluation
        if (i + 1) % args.iter_per_valid == 0:
            train_loss = np.mean(train_losses)
            train_losses = []
            valid_errs = []
            for k in range(args.n_episode_for_valid):
                xs_v.d, xq_v.d, yq_v.d = valid_episode_generator.next()
                err_v.forward(clear_no_need_grad=True, clear_buffer=True)
                valid_errs.append(np.float(err_v.d.copy()))
            valid_err = np.mean(valid_errs)

            monitor_loss.add(i + 1, loss_t.d.copy())
            monitor_valid_err.add(i + 1, valid_err * 100)
            if valid_err < best_err:
                best_err = valid_err
                nn.save_parameters(param_file)

    # Final evaluation
    nn.load_parameters(param_file)
    v_errs = []
    for k in range(args.n_episode_for_test):
        xs_v.d, xq_v.d, yq_v.d = test_episode_generator.next()
        err_v.forward(clear_no_need_grad=True, clear_buffer=True)
        v_errs.append(np.float(err_v.d.copy()))
    v_err_mean = np.mean(v_errs)
    v_err_std = np.std(v_errs)
    v_err_conf = 1.96 * v_err_std / np.sqrt(args.n_episode_for_test)
    monitor_test_err.add(0, v_err_mean * 100)
    monitor_test_conf.add(0, v_err_conf * 100)

    # Visualization
    n_class = 50
    n_sample = 20
    visualize_episode_generator = EpisodeGenerator(
        train_data[0], train_data[1], n_class, 0, n_sample)
    _, samples, labels = visualize_episode_generator.next()
    u = get_embeddings(samples, conv4)
    v = get_tsne(u)
    plot_tsne(v[:, 0], v[:, 1], labels[:, 0], tsne_file)

    # Save NNP
    contents = save_nnp({'x0': xs_v, 'x1': xq_v}, {'y': hq_v}, batch_size)
    save.save(os.path.join(args.work_dir, 'MetricMetaLearning.nnp'),
              contents, variable_batch_size=False)
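
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): `get_tsne(u)` reduces the embedding matrix to 2-D
# for plotting.  A minimal version based on scikit-learn is sketched below as
# an assumption; the repository may use a different t-SNE implementation.
def _get_tsne_sketch(u, seed=0):
    from sklearn.manifold import TSNE
    # u: (n_samples, embedding_dim) -> (n_samples, 2)
    return TSNE(n_components=2, random_state=seed).fit_transform(u)
# ---------------------------------------------------------------------------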
def main(): """ Main script. Steps: * Setup calculation environment * Initialize data iterator. * Create Networks * Create Solver. * Training Loop. * Training * Test * Save """ # Set args args = get_args(monitor_path='tmp.monitor.vae', max_iter=60000, model_save_path=None, learning_rate=3e-4, batch_size=100, weight_decay=0) # Get context. from nnabla.ext_utils import get_extension_context logger.info("Running in %s" % args.context) ctx = get_extension_context(args.context, device_id=args.device_id, type_config=args.type_config) nn.set_default_context(ctx) # Initialize data provider di_l = data_iterator_mnist(args.batch_size, True) di_t = data_iterator_mnist(args.batch_size, False) # Network shape_x = (1, 28, 28) shape_z = (50, ) x = nn.Variable((args.batch_size, ) + shape_x) loss_l = vae(x, shape_z, test=False) loss_t = vae(x, shape_z, test=True) # Create solver solver = S.Adam(args.learning_rate) solver.set_parameters(nn.get_parameters()) # Monitors for training and validation monitor = M.Monitor(args.model_save_path) monitor_training_loss = M.MonitorSeries("Training loss", monitor, interval=600) monitor_test_loss = M.MonitorSeries("Test loss", monitor, interval=600) monitor_time = M.MonitorTimeElapsed("Elapsed time", monitor, interval=600) # Training Loop. for i in range(args.max_iter): # Initialize gradients solver.zero_grad() # Forward, backward and update x.d, _ = di_l.next() loss_l.forward(clear_no_need_grad=True) loss_l.backward(clear_buffer=True) solver.weight_decay(args.weight_decay) solver.update() # Forward for test x.d, _ = di_t.next() loss_t.forward(clear_no_need_grad=True) # Monitor for logging monitor_training_loss.add(i, loss_l.d.copy()) monitor_test_loss.add(i, loss_t.d.copy()) monitor_time.add(i) # Save the model nn.save_parameters( os.path.join(args.model_save_path, 'params_%06d.h5' % args.max_iter))
def _create_optimizer(ctx, o, networks, datasets):
    class Optimizer:
        pass

    optimizer = Optimizer()

    optimizer.name = o.name
    optimizer.order = o.order
    optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1
    optimizer.network = networks[o.network_name]
    optimizer.data_iterator = datasets[o.dataset_name].data_iterator

    optimizer.dataset_assign = OrderedDict()
    for d in o.data_variable:
        optimizer.dataset_assign[
            optimizer.network.variables[d.variable_name]] = d.data_name

    optimizer.generator_assign = OrderedDict()
    for g in o.generator_variable:
        optimizer.generator_assign[optimizer.network.variables[
            g.variable_name]] = _get_generator(g)

    optimizer.loss_variables = []
    for l in o.loss_variable:
        optimizer.loss_variables.append(
            optimizer.network.variables[l.variable_name])

    optimizer.parameter_learning_rate_multipliers = OrderedDict()
    for p in o.parameter_variable:
        param_variable_names = [
            v_name for v_name in optimizer.network.variables.keys()
            if v_name.find(p.variable_name) == 0]
        for v_name in param_variable_names:
            optimizer.parameter_learning_rate_multipliers[
                optimizer.network.variables[v_name]] = p.learning_rate_multiplier

    with nn.context_scope(ctx):
        if o.solver.type == 'Adagrad':
            optimizer.solver = S.Adagrad(
                o.solver.adagrad_param.lr, o.solver.adagrad_param.eps)
        elif o.solver.type == 'Adadelta':
            optimizer.solver = S.Adadelta(
                o.solver.adadelta_param.lr, o.solver.adadelta_param.decay,
                o.solver.adadelta_param.eps)
        elif o.solver.type == 'Adam':
            optimizer.solver = S.Adam(o.solver.adam_param.alpha,
                                      o.solver.adam_param.beta1,
                                      o.solver.adam_param.beta2,
                                      o.solver.adam_param.eps)
        elif o.solver.type == 'Adamax':
            optimizer.solver = S.Adamax(o.solver.adamax_param.alpha,
                                        o.solver.adamax_param.beta1,
                                        o.solver.adamax_param.beta2,
                                        o.solver.adamax_param.eps)
        elif o.solver.type == 'Eve':
            p = o.solver.eve_param
            optimizer.solver = S.Eve(
                p.alpha, p.beta1, p.beta2, p.beta3, p.k, p.k2, p.eps)
        elif o.solver.type == 'Momentum':
            optimizer.solver = S.Momentum(
                o.solver.momentum_param.lr, o.solver.momentum_param.momentum)
        elif o.solver.type == 'Nesterov':
            optimizer.solver = S.Nesterov(
                o.solver.nesterov_param.lr, o.solver.nesterov_param.momentum)
        elif o.solver.type == 'RMSprop':
            optimizer.solver = S.RMSprop(
                o.solver.rmsprop_param.lr, o.solver.rmsprop_param.decay,
                o.solver.rmsprop_param.eps)
        elif o.solver.type == 'Sgd' or o.solver.type == 'SGD':
            optimizer.solver = S.Sgd(o.solver.sgd_param.lr)
        else:
            raise ValueError('Solver "' + o.solver.type +
                             '" is not supported.')

    optimizer.solver.set_parameters(
        {v.name: v.variable_instance
         for v, local_lr in optimizer.parameter_learning_rate_multipliers.items()
         if local_lr > 0.0})
    optimizer.weight_decay = o.solver.weight_decay
    optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0
    optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1

    optimizer.forward_sequence = optimizer.network.get_forward_sequence(
        optimizer.loss_variables)
    optimizer.backward_sequence = optimizer.network.get_backward_sequence(
        optimizer.loss_variables, optimizer.parameter_learning_rate_multipliers)

    return optimizer
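
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): the parameter selection above uses
# `v_name.find(p.variable_name) == 0`, i.e. a prefix match.  An equivalent,
# more explicit form would be:
#
#     param_variable_names = [v_name for v_name in optimizer.network.variables
#                             if v_name.startswith(p.variable_name)]
#
# Parameters whose learning-rate multiplier is 0.0 are excluded from the
# solver's parameter set and therefore stay frozen during training.
# ---------------------------------------------------------------------------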
def train_and_eval():

    # Settings
    args = get_args()
    n_class = args.n_class
    n_shot = args.n_shot
    n_query = args.n_query
    n_class_tr = args.n_class_tr
    n_shot_tr = args.n_shot_tr
    if n_shot_tr == 0:
        n_shot_tr = n_shot
    n_query_tr = args.n_query_tr
    if n_query_tr == 0:
        n_query_tr = n_query
    dataset = args.dataset
    dataset_root = args.dataset_root
    init_type = args.init_type
    embedding = args.embedding
    net_type = args.net_type
    metric = args.metric
    max_iteration = args.max_iteration
    lr_decay_interval = args.lr_decay_interval
    lr_decay = args.lr_decay
    iter_per_epoch = args.iter_per_epoch
    iter_per_valid = args.iter_per_valid
    n_episode_for_valid = args.n_episode_for_valid
    n_episode_for_test = args.n_episode_for_test
    work_dir = args.work_dir

    # Set context
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(
        args.context, device_id=args.device_id, type_config=args.type_config)
    nn.set_default_context(ctx)

    # Monitor outputs
    from nnabla.monitor import Monitor, MonitorSeries
    monitor = Monitor(args.work_dir)
    monitor_loss = MonitorSeries(
        "Training loss", monitor, interval=iter_per_epoch)
    monitor_valid_err = MonitorSeries(
        "Validation error", monitor, interval=iter_per_valid)
    monitor_test_err = MonitorSeries("Test error", monitor)
    monitor_test_conf = MonitorSeries("Test error confidence", monitor)

    # Output files
    param_file = work_dir + "params.h5"
    tsne_file = work_dir + "tsne.png"

    # Load data
    shape_x = (1, 28, 28)
    train_data, valid_data, test_data = load_omniglot(
        dataset_root + "/omniglot/data/")
    train_episode_generator = EpisodeGenerator(
        n_class_tr, n_shot_tr, n_query_tr, shape_x, train_data)
    valid_episode_generator = EpisodeGenerator(
        n_class, n_shot, n_query, shape_x, valid_data)
    test_episode_generator = EpisodeGenerator(
        n_class, n_shot, n_query, shape_x, test_data)

    # Build training model
    xs_t = nn.Variable((n_class_tr * n_shot_tr, ) + shape_x)
    xq_t = nn.Variable((n_class_tr * n_query_tr, ) + shape_x)
    hq_t = net(n_class_tr, xs_t, xq_t, init_type,
               embedding, net_type, metric, False)
    yq_t = nn.Variable((n_class_tr * n_query_tr, 1))
    loss_t = F.mean(F.softmax_cross_entropy(hq_t, yq_t))

    # Build evaluation model
    xs_v = nn.Variable((n_class * n_shot, ) + shape_x)
    xq_v = nn.Variable((n_class * n_query, ) + shape_x)
    hq_v = net(n_class, xs_v, xq_v, init_type,
               embedding, net_type, metric, True)
    yq_v = nn.Variable((n_class * n_query, 1))
    err_v = F.mean(F.top_n_error(hq_v, yq_v, n=1))

    # Setup solver
    solver = S.Adam(1.0e-3)
    solver.set_parameters(nn.get_parameters())
    learning_rate_decay_activate = True

    # Training loop
    train_losses = []
    best_err = 1.0
    for i in range(max_iteration):

        # Decay learning rate
        if learning_rate_decay_activate and ((i + 1) % lr_decay_interval == 0):
            solver.set_learning_rate(solver.learning_rate() * lr_decay)

        # Create an episode
        xs_t.d, xq_t.d, yq_t.d = train_episode_generator.next()

        # Training by the episode
        solver.zero_grad()
        loss_t.forward(clear_no_need_grad=True)
        loss_t.backward(clear_buffer=True)
        solver.update()
        train_losses.append(loss_t.d.copy())

        # Evaluation
        if (i + 1) % iter_per_valid == 0:
            train_loss = np.mean(train_losses)
            train_losses = []
            valid_errs = []
            for k in range(n_episode_for_valid):
                xs_v.d, xq_v.d, yq_v.d = valid_episode_generator.next()
                err_v.forward(clear_no_need_grad=True, clear_buffer=True)
                valid_errs.append(np.float(err_v.d.copy()))
            valid_err = np.mean(valid_errs)

            #monitor_loss.add(i + 1, train_loss)
            monitor_valid_err.add(i + 1, valid_err * 100)
            if valid_err < best_err:
                best_err = valid_err
                nn.save_parameters(param_file)

    # Final evaluation
    nn.load_parameters(param_file)
    v_errs = []
    for k in range(n_episode_for_test):
        xs_v.d, xq_v.d, yq_v.d = test_episode_generator.next()
        err_v.forward(clear_no_need_grad=True, clear_buffer=True)
        v_errs.append(np.float(err_v.d.copy()))
    v_err = np.mean(v_errs)
    v_err_conf = 1.96 * np.std(v_errs) / np.sqrt(n_episode_for_test)
    monitor_test_err.add(0, v_err * 100)
    monitor_test_conf.add(0, v_err_conf)

    # Visualization
    n_class = 50
    n_sample = 20
    batch = test_data[:n_class].reshape(n_class * n_sample, 1, 28, 28)
    label = []
    for i in range(n_class):
        label.extend(np.ones(n_sample) * (i % 50))
    u = get_embeddings(batch, conv4)
    v = get_tsne(u)
    plot_tsne(v[:, 0], v[:, 1], label, tsne_file)
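
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): `EpisodeGenerator.next()` samples an N-way K-shot
# episode (support set, query set, query labels).  The function below sketches
# that sampling in plain NumPy under an assumed data layout
# (data: (n_classes, n_samples_per_class, 1, 28, 28)); it is an illustration,
# not the repository's generator.
def _sample_episode_sketch(data, n_class, n_shot, n_query, rng=None):
    import numpy as np
    rng = rng or np.random
    class_ids = rng.permutation(data.shape[0])[:n_class]
    xs, xq, yq = [], [], []
    for episode_label, c in enumerate(class_ids):
        sample_ids = rng.permutation(data.shape[1])[:n_shot + n_query]
        xs.append(data[c, sample_ids[:n_shot]])   # support images
        xq.append(data[c, sample_ids[n_shot:]])   # query images
        yq.append(np.full((n_query, 1), episode_label, dtype=np.int32))
    return np.concatenate(xs), np.concatenate(xq), np.concatenate(yq)
# ---------------------------------------------------------------------------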
def meta_train(exp_string, monitor, args):

    # Set monitors
    monitor_loss = MonitorSeries(
        'Training loss', monitor, interval=args.print_interval, verbose=False)
    monitor_valid_err = MonitorSeries(
        'Validation error', monitor, interval=args.test_print_interval, verbose=False)

    # Load data
    if args.datasource == 'omniglot':
        shape_x = (1, 28, 28)
        train_data, valid_data, _ = load_omniglot(
            os.path.join(args.dataset_root, 'omniglot/data/'), shape_x)
    else:
        raise ValueError('Unrecognized data source.')

    train_data_generator = DataGenerator(
        args.num_classes, args.train_num_shots, args.train_num_queries,
        shape_x, train_data, args.meta_batch_size)
    valid_data_generator = DataGenerator(
        args.num_classes, args.num_shots, args.num_queries,
        shape_x, valid_data, args.meta_batch_size)

    # Build training models
    # a: training data for inner gradient, b: test data for meta gradient
    inputa_t = nn.Variable((train_data_generator.num_classes
                            * train_data_generator.num_shots, )
                           + train_data_generator.shape_x)
    inputb_t = nn.Variable((train_data_generator.num_classes
                            * train_data_generator.num_queries, )
                           + train_data_generator.shape_x)
    labela_t = nn.Variable(
        (train_data_generator.num_classes * train_data_generator.num_shots, 1))
    labelb_t = nn.Variable(
        (train_data_generator.num_classes * train_data_generator.num_queries, 1))

    # Build evaluation models
    # a: training data for inner gradient, b: test data for meta gradient
    inputa_v = nn.Variable((valid_data_generator.num_classes
                            * valid_data_generator.num_shots, )
                           + valid_data_generator.shape_x)
    inputb_v = nn.Variable((valid_data_generator.num_classes
                            * valid_data_generator.num_queries, )
                           + valid_data_generator.shape_x)
    labela_v = nn.Variable(
        (valid_data_generator.num_classes * valid_data_generator.num_shots, 1))
    labelb_v = nn.Variable(
        (valid_data_generator.num_classes * valid_data_generator.num_queries, 1))

    with nn.parameter_scope('meta'):
        # Set weights
        _ = net(inputa_t, labela_t, True, args)  # only definition of weights
        weights = nn.get_parameters()

    # Setup solver
    solver = S.Adam(args.meta_lr)
    solver.set_parameters(weights)

    if args.num_updates > 1:
        print("[WARNING]: The number of inner-loop updates is changed from "
              + str(args.num_updates) + " to 1")
        args.num_updates = 1

    print('Done initializing, starting training.')

    # Training loop
    for itr in range(1, args.metatrain_iterations + 1):
        solver.zero_grad()
        lossesa, lossesb, accuraciesa, accuraciesb = inner_train_test(
            inputa_t, inputb_t, labela_t, labelb_t, train_data_generator, True, args)
        solver.update()

        # Evaluation
        if itr % args.print_interval == 0:
            preaccuracies = np.mean(accuraciesa, axis=0)
            postaccuracy = np.mean(accuraciesb, axis=0)
            print_str = 'Iteration {}: '.format(itr)
            for j in range(len(preaccuracies)):
                print_str += ' %.4f ->' % preaccuracies[j]
            print_str += '->-> %.4f (final accuracy at queries)' % postaccuracy
            print(print_str)
            monitor_loss.add(itr, np.mean(lossesb, axis=0))

        if itr % args.test_print_interval == 0:
            # Inner training & testing
            lossesa, lossesb, accuraciesa, accuraciesb = inner_train_test(
                inputa_v, inputb_v, labela_v, labelb_v, valid_data_generator, False, args)

            # Validation
            preaccuracies = np.mean(accuraciesa, axis=0)
            postaccuracy = np.mean(accuraciesb, axis=0)
            print_str = 'Validation results: '
            for j in range(len(preaccuracies)):
                print_str += ' %.4f ->' % preaccuracies[j]
            print_str += '->-> %.4f (final accuracy at queries)' % postaccuracy
            print(print_str)
            monitor_valid_err.add(itr, (1.0 - postaccuracy) * 100.0)

        if itr % args.save_interval == 0:
            nn.save_parameters(
                os.path.join(args.logdir, exp_string, 'params{}.h5'.format(itr)))

    if itr % args.save_interval != 0:
        nn.save_parameters(
            os.path.join(args.logdir, exp_string, 'params{}.h5'.format(itr)))
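
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): `inner_train_test` returns per-task lists, e.g.
# `accuraciesa` shaped roughly (meta_batch_size, num_inner_steps), so
# `np.mean(accuraciesa, axis=0)` above averages over tasks and keeps one value
# per inner-update step.  A tiny NumPy illustration with made-up numbers:
#
#     import numpy as np
#     accuraciesa = np.array([[0.20, 0.55],    # task 1: before / after update
#                             [0.25, 0.60]])   # task 2
#     print(np.mean(accuraciesa, axis=0))      # -> [0.225 0.575]
#
# The exact shapes returned by `inner_train_test` are an assumption here.
# ---------------------------------------------------------------------------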
def main(): """ Main script. Steps: * Get and set context. * Load Dataset * Initialize DataIterator. * Create Networks * Net for Labeled Data * Net for Unlabeled Data * Net for Test Data * Create Solver. * Training Loop. * Test * Training * by Labeled Data * Calculate Supervised Loss * by Unlabeled Data * Calculate Virtual Adversarial Noise * Calculate Unsupervised Loss """ args = get_args() # Get context. from nnabla.ext_utils import get_extension_context logger.info("Running in %s" % args.context) ctx = get_extension_context(args.context, device_id=args.device_id, type_config=args.type_config) nn.set_default_context(ctx) shape_x = (1, 28, 28) n_h = args.n_units n_y = args.n_class # Load MNIST Dataset from mnist_data import load_mnist, data_iterator_mnist images, labels = load_mnist(train=True) rng = np.random.RandomState(706) inds = rng.permutation(len(images)) def feed_labeled(i): j = inds[i] return images[j], labels[j] def feed_unlabeled(i): j = inds[i] return images[j], labels[j] di_l = data_iterator_simple(feed_labeled, args.n_labeled, args.batchsize_l, shuffle=True, rng=rng, with_file_cache=False) di_u = data_iterator_simple(feed_unlabeled, args.n_train, args.batchsize_u, shuffle=True, rng=rng, with_file_cache=False) di_v = data_iterator_mnist(args.batchsize_v, train=False) # Create networks # feed-forward-net building function def forward(x, test=False): return mlp_net(x, n_h, n_y, test) # Net for learning labeled data xl = nn.Variable((args.batchsize_l, ) + shape_x, need_grad=False) yl = forward(xl, test=False) tl = nn.Variable((args.batchsize_l, 1), need_grad=False) loss_l = F.mean(F.softmax_cross_entropy(yl, tl)) # Net for learning unlabeled data xu = nn.Variable((args.batchsize_u, ) + shape_x, need_grad=False) yu = forward(xu, test=False) y1 = yu.get_unlinked_variable() y1.need_grad = False noise = nn.Variable((args.batchsize_u, ) + shape_x, need_grad=True) r = noise / (F.sum(noise**2, [1, 2, 3], keepdims=True))**0.5 r.persistent = True y2 = forward(xu + args.xi_for_vat * r, test=False) y3 = forward(xu + args.eps_for_vat * r, test=False) loss_k = F.mean(distance(y1, y2)) loss_u = F.mean(distance(y1, y3)) # Net for evaluating validation data xv = nn.Variable((args.batchsize_v, ) + shape_x, need_grad=False) hv = forward(xv, test=True) tv = nn.Variable((args.batchsize_v, 1), need_grad=False) err = F.mean(F.top_n_error(hv, tv, n=1)) # Create solver solver = S.Adam(args.learning_rate) solver.set_parameters(nn.get_parameters()) # Monitor training and validation stats. import nnabla.monitor as M monitor = M.Monitor(args.model_save_path) monitor_verr = M.MonitorSeries("Test error", monitor, interval=240) monitor_time = M.MonitorTimeElapsed("Elapsed time", monitor, interval=240) # Training Loop. t0 = time.time() for i in range(args.max_iter): # Validation Test if i % args.val_interval == 0: valid_error = calc_validation_error(di_v, xv, tv, err, args.val_iter) monitor_verr.add(i, valid_error) ################################# ## Training by Labeled Data ##### ################################# # forward, backward and update xl.d, tl.d = di_l.next() xl.d = xl.d / 255 solver.zero_grad() loss_l.forward(clear_no_need_grad=True) loss_l.backward(clear_buffer=True) solver.weight_decay(args.weight_decay) solver.update() ################################# ## Training by Unlabeled Data ### ################################# # Calculate y without noise, only once. 
xu.d, _ = di_u.next() xu.d = xu.d / 255 yu.forward(clear_buffer=True) ##### Calculate Adversarial Noise ##### # Do power method iteration noise.d = np.random.normal(size=xu.shape).astype(np.float32) for k in range(args.n_iter_for_power_method): r.grad.zero() loss_k.forward(clear_no_need_grad=True) loss_k.backward(clear_buffer=True) noise.data.copy_from(r.grad) ##### Calculate loss for unlabeled data ##### # forward, backward and update solver.zero_grad() loss_u.forward(clear_no_need_grad=True) loss_u.backward(clear_buffer=True) solver.weight_decay(args.weight_decay) solver.update() ##### Learning rate update ##### if i % args.iter_per_epoch == 0: solver.set_learning_rate(solver.learning_rate() * args.learning_rate_decay) monitor_time.add(i) # Evaluate the final model by the error rate with validation dataset valid_error = calc_validation_error(di_v, xv, tv, err, args.val_iter) monitor_verr.add(i, valid_error) monitor_time.add(i) # Save the model. parameter_file = os.path.join(args.model_save_path, 'params_%06d.h5' % args.max_iter) nn.save_parameters(parameter_file)
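
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): `calc_validation_error(di_v, xv, tv, err, n_iter)`
# is defined elsewhere.  It presumably averages the error graph over several
# validation minibatches, roughly as sketched below; the scaling by 255 and the
# exact signature are assumptions, not the repository's implementation.
def _calc_validation_error_sketch(di_v, xv, tv, err, val_iter):
    import numpy as np
    errors = []
    for _ in range(val_iter):
        xv.d, tv.d = di_v.next()
        xv.d = xv.d / 255          # same input scaling as used for training
        err.forward(clear_buffer=True)
        errors.append(float(err.d.copy()))
    return np.mean(errors)
# ---------------------------------------------------------------------------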
def meta_train(args, train_data, valid_data, test_data):

    # Build episode generators
    shape_x = (1, 28, 28)
    train_episode_generator = EpisodeGenerator(
        args.n_class_tr, args.n_shot_tr, args.n_query_tr, shape_x, train_data)
    valid_episode_generator = EpisodeGenerator(
        args.n_class, args.n_shot, args.n_query, shape_x, valid_data)
    test_episode_generator = EpisodeGenerator(
        args.n_class, args.n_shot, args.n_query, shape_x, test_data)

    # Build training model
    xs_t = nn.Variable((args.n_class_tr * args.n_shot_tr, ) + shape_x)
    xq_t = nn.Variable((args.n_class_tr * args.n_query_tr, ) + shape_x)
    hq_t = net(args.n_class_tr, xs_t, xq_t, args.embedding,
               args.net_type, args.metric, False)
    yq_t = nn.Variable((args.n_class_tr * args.n_query_tr, 1))
    loss_t = F.mean(F.softmax_cross_entropy(hq_t, yq_t))

    # Build evaluation model
    xs_v = nn.Variable((args.n_class * args.n_shot, ) + shape_x)
    xq_v = nn.Variable((args.n_class * args.n_query, ) + shape_x)
    hq_v = net(args.n_class, xs_v, xq_v, args.embedding,
               args.net_type, args.metric, True)
    yq_v = nn.Variable((args.n_class * args.n_query, 1))
    err_v = F.mean(F.top_n_error(hq_v, yq_v, n=1))

    # Setup solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitor outputs
    monitor = Monitor(args.work_dir)
    monitor_loss = MonitorSeries(
        "Training loss", monitor, interval=args.iter_per_epoch)
    monitor_valid_err = MonitorSeries(
        "Validation error", monitor, interval=args.iter_per_valid)
    monitor_test_err = MonitorSeries("Test error", monitor)
    monitor_test_conf = MonitorSeries("Test error confidence", monitor)

    # Output files
    param_file = args.work_dir + "params.h5"
    tsne_file = args.work_dir + "tsne.png"

    # Training loop
    train_losses = []
    best_err = 1.0
    for i in range(args.max_iteration):

        # Decay learning rate
        if (i + 1) % args.lr_decay_interval == 0:
            solver.set_learning_rate(solver.learning_rate() * args.lr_decay)

        # Create an episode
        xs_t.d, xq_t.d, yq_t.d = train_episode_generator.next()

        # Training by the episode
        solver.zero_grad()
        loss_t.forward(clear_no_need_grad=True)
        loss_t.backward(clear_buffer=True)
        solver.update()
        train_losses.append(loss_t.d.copy())

        # Evaluation
        if (i + 1) % args.iter_per_valid == 0:
            train_loss = np.mean(train_losses)
            train_losses = []
            valid_errs = []
            for k in range(args.n_episode_for_valid):
                xs_v.d, xq_v.d, yq_v.d = valid_episode_generator.next()
                err_v.forward(clear_no_need_grad=True, clear_buffer=True)
                valid_errs.append(np.float(err_v.d.copy()))
            valid_err = np.mean(valid_errs)

            monitor_valid_err.add(i + 1, valid_err * 100)
            if valid_err < best_err:
                best_err = valid_err
                nn.save_parameters(param_file)

    # Final evaluation
    nn.load_parameters(param_file)
    v_errs = []
    for k in range(args.n_episode_for_test):
        xs_v.d, xq_v.d, yq_v.d = test_episode_generator.next()
        err_v.forward(clear_no_need_grad=True, clear_buffer=True)
        v_errs.append(np.float(err_v.d.copy()))
    v_err_mean = np.mean(v_errs)
    v_err_std = np.std(v_errs)
    v_err_conf = 1.96 * v_err_std / np.sqrt(args.n_episode_for_test)
    monitor_test_err.add(0, v_err_mean * 100)
    monitor_test_conf.add(0, v_err_conf * 100)

    # Visualization
    n_class = 50
    n_sample = 20
    batch = test_data[:n_class].reshape(n_class * n_sample, 1, 28, 28)
    label = []
    for i in range(n_class):
        label.extend(np.ones(n_sample) * (i % 50))
    u = get_embeddings(batch, conv4)
    v = get_tsne(u)
    plot_tsne(v[:, 0], v[:, 1], label, tsne_file)
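
# ---------------------------------------------------------------------------
# NOTE (editorial sketch): with a prototypical-network style `metric`, the
# `net` above scores each query embedding against per-class prototypes (mean
# support embeddings).  The NumPy sketch below illustrates the negative squared
# Euclidean distance variant; it is an assumed illustration of the metric, not
# the repository's `net`.
def _proto_similarity_sketch(support_emb, query_emb, n_class, n_shot):
    import numpy as np
    # support_emb: (n_class * n_shot, dim), ordered class by class
    prototypes = support_emb.reshape(n_class, n_shot, -1).mean(axis=1)
    diff = query_emb[:, None, :] - prototypes[None, :, :]
    # higher score = closer prototype; argmax over classes gives the prediction
    return -(diff ** 2).sum(axis=-1)
# ---------------------------------------------------------------------------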