def test_clip_by_norm_forward(seed, shape, clip_norm, axis):
    rng = np.random.RandomState(seed)
    x_data = rng.randn(*shape)
    x = nn.Variable.from_numpy_array(x_data)
    with nn.auto_forward(True):
        y = F.clip_by_norm(x, clip_norm, axis)
    y_ref = ref_clip_by_norm(x_data, clip_norm, axis=axis)
    assert np.allclose(y.d, y_ref)
def execute_clip_by_norm(x, x_data, clip_norm, clip_norm_value, axis):
    if isinstance(clip_norm, (nn.Variable, nn.NdArray)):
        if clip_norm_value <= 0:
            pytest.skip()
        else:
            with nn.auto_forward(True):
                y = F.clip_by_norm(x, clip_norm, axis)
            y_ref = ref_clip_by_norm(x_data, clip_norm_value, axis=axis)
            assert_allclose(y.d, y_ref)
    else:
        if clip_norm_value > 0:
            with nn.auto_forward(True):
                y = F.clip_by_norm(x, clip_norm, axis)
            y_ref = ref_clip_by_norm(x_data, clip_norm_value, axis=axis)
            assert_allclose(y.d, y_ref)
        else:
            with pytest.raises(ValueError):
                y = F.clip_by_norm(x, clip_norm, axis)
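
# Both tests above compare against a NumPy reference, ref_clip_by_norm, which
# is not shown in this section. A minimal sketch, assuming F.clip_by_norm
# follows its documented definition y = clip_norm * x / max(||x||_2, clip_norm)
# with the L2 norm taken over the given axes:
def ref_clip_by_norm(x, clip_norm, axis=None):
    # keepdims so the norm broadcasts back against x
    norm = np.sqrt(np.sum(x ** 2, axis=axis, keepdims=True))
    # values are rescaled only where norm > clip_norm
    return clip_norm * x / np.maximum(norm, clip_norm)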
def CNN_run(args, ops, alphas_dict):
    """
    Based on the given model architecture,
    construct CNN and execute training.
    input:
        args: arguments set by user.
        ops: operations used in the network.
        alphas_dict: a dictionary containing architecture parameters (alphas).
    """

    data_iterator = data_iterator_cifar10
    all_data = data_iterator(args.batch_size, True)
    tdata = all_data.slice(rng=None, slice_start=0, slice_end=25000)
    vdata = all_data.slice(rng=None, slice_start=25000, slice_end=50000)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Validation loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=100)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train = construct_networks(args, ops, image_train, test=False)
    loss_train = loss_function(pred_train, label_train)

    # prepare solvers for model parameters
    model_params_dict = \
        {k: v for k, v in nn.get_parameters().items() if "alpha_" not in k}
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(
        {k: v for k, v in nn.get_parameters().items()
         if k in model_params_dict.keys()},
        reset=False, retain_state=True)

    # prepare solvers for architecture parameters
    solver_archs = S.Adam(alpha=args.arch_lr, beta1=0.5, beta2=0.999)
    solver_archs.set_parameters(
        {k: v for k, v in nn.get_parameters().items()
         if k in alphas_dict.keys()},
        reset=False, retain_state=True)

    # Training-loop
    for i in range(max_iter):

        # Update Model Parameters.

        if args.second_order:
            # store the weights before update.
            original_weights = \
                {k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                 for k, v in nn.get_parameters().items() if "alpha_" not in k}

            # gradients refuge
            accumulated_gradient = \
                {k: nn.Variable(v.shape).apply(d=0)
                 for k, v in alphas_dict.items()}

        image, label = tdata.next()
        image = image / 255.0
        image = (image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_train.forward()

        e = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, e)

        if args.lr_control_model:
            new_lr = learning_rate_scheduler(i, max_iter, initial_model_lr, 0)
            solver_model.set_learning_rate(new_lr)

        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)

        if args.with_grad_clip_model:
            for k, v in model_params_dict.items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        solver_model.weight_decay(args.weight_decay_model)
        solver_model.update()  # weights update ( w -> w')

        if args.second_order:
            updated_weights = \
                {k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                 for k, v in nn.get_parameters().items() if "alpha_" not in k}

        # Update Architecture Parameters.
        ve, vloss = 0., 0.
        v_image, v_label = vdata.next()
        v_image = v_image / 255.0
        v_image = (v_image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = v_image
        input_image_train["label"].d = v_label

        # compute Loss_on_valid(w', alpha)
        loss_train.forward(clear_no_need_grad=True)

        ve = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_vloss.add(i, loss_train.d.copy())
        monitor_verr.add(i, ve)

        solver_archs.zero_grad()
        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)  # its gradient is stored

        if args.second_order:
            accumulated_gradient = store_gradient(
                accumulated_gradient, alphas_dict, coeff=1.)

            # grad_alpha_L_val(w', alpha). Note that gradient stored into .data
            delta_gradient_w = \
                {k: nn.Variable(v.shape).apply(
                    data=nn.NdArray(v.shape).copy_from(v.grad), need_grad=True)
                 for k, v in nn.get_parameters().items() if "alpha_" not in k}

            epsilon = 0.01 / np.sum(
                [np.linalg.norm(v.d) for v in delta_gradient_w.values()])

            coeff = 1.0 * epsilon
            # w -> w+ (= w + epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            input_image_train["image"].d = image  # reuse the same data
            input_image_train["label"].d = label

            # compute Loss_on_train(w+, alpha)
            loss_train.forward()

            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient
            coeff = (-1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(
                accumulated_gradient, alphas_dict, coeff)

            coeff = -1.0 * epsilon
            # w -> w- (= w - epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            # compute Loss_on_train(w-, alpha)
            loss_train.forward()

            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient again
            coeff = (+1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(
                accumulated_gradient, alphas_dict, coeff)

            # replace the weights
            for k, v in alphas_dict.items():
                nn.parameter.set_parameter(
                    k, nn.Variable(v.shape).apply(
                        data=v.data,
                        grad=accumulated_gradient[k].data,
                        need_grad=True))
            for k, v in model_params_dict.items():
                nn.parameter.set_parameter(
                    k, nn.Variable(v.shape).apply(
                        data=updated_weights[k].data, need_grad=True))

        solver_archs.weight_decay(args.weight_decay_archs)
        solver_archs.update()

        if i % 1000 == 0:
            for k, v in alphas_dict.items():
                keynames = k.split("_")
                print("\nParameters for {} cell, node {} to {};".format(
                    keynames[1], keynames[2], keynames[3]))
                show_ops_and_prob(v.d, ops)

    return alphas_dict
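
# CNN_run above calls two helpers, store_gradient and weight_modify, that are
# not defined in this section. The sketches below are assumptions inferred
# from the call sites, not the verbatim implementations.
def store_gradient(accumulated_gradient, alphas_dict, coeff):
    # accumulate coeff * (current alpha gradient) into the "gradients refuge";
    # the running sum lives in the .d buffer of each refuge Variable
    for k, v in alphas_dict.items():
        accumulated_gradient[k].d += coeff * v.g
    return accumulated_gradient


def weight_modify(original_weights, delta_gradient_w, model_params_dict, coeff):
    # reset each weight to its pre-update value, then perturb it:
    # w -> w_original + coeff * grad_Loss_on_val(w', alpha)
    for k, v in model_params_dict.items():
        v.d = original_weights[k].d + coeff * delta_gradient_w[k].d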
def projection(x: nn.NdArray, eps: float = 1e-5) -> nn.NdArray:
    norm = F.pow_scalar(F.sum(x ** 2, axis=1), val=0.5)
    return F.where(condition=F.greater_equal_scalar(norm, val=1.),
                   x_true=F.clip_by_norm(x, clip_norm=1 - eps, axis=1),
                   x_false=x)
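
# A small usage sketch for projection. NNabla functions applied to nn.NdArray
# execute eagerly, so the result can be checked directly with NumPy:
x = nn.NdArray.from_numpy_array(np.random.randn(8, 16).astype(np.float32))
y = projection(x)
# rows with L2 norm >= 1 are rescaled to 1 - eps; the rest pass through,
# so every row of y now has L2 norm < 1
print(np.sqrt((y.data ** 2).sum(axis=1)).max())  # < 1.0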
def CNN_run(args, both_archs, data_dict, with_train=False, after_search=False):
    """
    Construct a CNN from the given architecture (both_archs),
    optionally train it, and return its validation accuracy.
    """

    num_cells = args.num_cells
    num_nodes = args.num_nodes

    if after_search:
        assert with_train is True, \
            "when you train the network after architecture search, set with_train=True"

    tdata, mean_val_train, std_val_train = data_dict["train_data"]
    vdata, mean_val_valid, std_val_valid = data_dict["valid_data"]
    channels, image_height, image_width, num_class = data_dict["basic_info"]
    batch_size = args.batch_size

    output_filter = args.output_filter

    if with_train:
        if after_search:
            num_epoch = args.epoch_on_retrain
            if args.additional_filters_on_retrain > 0:
                output_filter += args.additional_filters_on_retrain
        else:
            num_epoch = args.epoch_per_search

        one_epoch = tdata.size // batch_size
        max_iter = num_epoch * one_epoch

    val_iter = args.val_iter

    monitor_path = args.monitor_path
    model_save_path = args.monitor_path
    decay_rate = args.weight_decay
    initial_lr = args.child_lr
    model_save_interval = args.model_save_interval

    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    input_image_valid = {"image": image_valid}

    vdata._reset()  # rewind data

    test = True
    pred_valid, _, _ = construct_architecture(
        image_valid, num_class, num_cells, num_nodes,
        both_archs, output_filter, test)

    if with_train:
        if after_search:
            # setting for training after architecture search
            with_grad_clip = args.with_grad_clip_on_retrain
            grad_clip = args.grad_clip_value
            lr_control = args.lr_control_on_retrain
        else:
            with_grad_clip = args.with_grad_clip_on_search
            grad_clip = args.grad_clip_value
            lr_control = args.lr_control_on_search

        # prepare variables used for training
        image_train = nn.Variable(
            (batch_size, channels, image_height, image_width))
        label_train = nn.Variable((batch_size, 1))
        input_image_train = {"image": image_train, "label": label_train}

        tdata._reset()  # rewind data

        test = False
        pred_train, aux_logits, used_weights = construct_architecture(
            image_train, num_class, num_cells, num_nodes,
            both_archs, output_filter, test)
        loss_train = loss_function(pred_train, aux_logits, label_train)

        used_weights_dict = {key_name: nn.get_parameters()[key_name]
                             for key_name in used_weights}

        # Create monitor.
        monitor = Monitor(monitor_path)
        monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
        # modified to display accuracy.
        monitor_err = MonitorSeries("Training accuracy", monitor, interval=100)
        # modified to display accuracy.
        monitor_verr = MonitorSeries("Test accuracy", monitor, interval=1)

        # Solvers
        solver = S.Momentum(initial_lr)
        solver.set_parameters(
            used_weights_dict, reset=False, retain_state=True)

        # Training-loop
        for i in range(max_iter):
            if i > 0 and i % one_epoch == 0:
                # Validation during training.
                ve = 0.
                for j in range(val_iter):
                    image, label = vdata.next()
                    image = image / 255.0
                    image = (image - mean_val_valid) / std_val_valid
                    input_image_valid["image"].d = image
                    pred_valid.forward()
                    ve += categorical_error(pred_valid.d, label)
                ve /= val_iter
                monitor_verr.add(i, 1.0 - ve)  # modified to display accuracy.

            if after_search and int(i % args.model_save_interval) == 0:
                nn.save_parameters(os.path.join(
                    args.model_save_path, 'params_%06d.h5' % i))

            # Forward/Zerograd/Backward
            image, label = tdata.next()
            image = image / 255.0
            image = (image - mean_val_train) / std_val_train
            input_image_train["image"].d = image
            input_image_train["label"].d = label
            loss_train.forward()

            if lr_control:
                new_lr = learning_rate_scheduler(i, max_iter, initial_lr, 0)
                solver.set_learning_rate(new_lr)

            solver.zero_grad()
            loss_train.backward()

            if with_grad_clip:
                for k, v in used_weights_dict.items():
                    if np.linalg.norm(v.g) > grad_clip:
                        v.grad.copy_from(F.clip_by_norm(v.grad, grad_clip))

            # Solvers update
            solver.weight_decay(decay_rate)
            solver.update()

            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(i, loss_train.d.copy())
            monitor_err.add(i, 1.0 - e)  # modified to display accuracy.

    # Validation (After training or when called for evaluation only)
    ve = 0.
    for j in range(val_iter):
        image, label = vdata.next()
        image = image / 255.0
        image = (image - mean_val_valid) / std_val_valid
        input_image_valid["image"].d = image
        pred_valid.forward()
        ve += categorical_error(pred_valid.d, label)
    ve /= val_iter

    if with_train:
        print("Validation Accuracy on Trained CNN:",
              '{:.2f}'.format(100 * (1.0 - ve)), "%\n")

    if after_search:
        nn.save_parameters(os.path.join(
            args.model_save_path, 'params_%06d.h5' % (max_iter)))

    return 1.0 - ve
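
# The training loops in this section call learning_rate_scheduler(i, max_iter,
# initial_lr, 0), whose definition is not shown. A plausible sketch, assuming
# cosine annealing from initial_lr down to end_lr over max_iter iterations:
def learning_rate_scheduler(curr_iter, max_iter, initial_lr, end_lr=0.0):
    # cosine decay: initial_lr at iteration 0, end_lr at max_iter
    return end_lr + 0.5 * (initial_lr - end_lr) * \
        (1.0 + np.cos(np.pi * curr_iter / max_iter))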
def CNN_run(args, ops, arch_dict):
    """
    Based on the given model architecture,
    construct CNN and execute training.
    input:
        args: arguments set by user.
        ops: operations used in the network.
        arch_dict: a dictionary containing architecture information.
    """

    data_iterator = data_iterator_cifar10
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch
    val_iter = 10000 // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}
    pred_valid, _ = construct_networks(
        args, ops, arch_dict, image_valid, test=True)
    loss_valid = loss_function(pred_valid, label_valid)

    # set dropout rate in advance
    nn.parameter.get_parameter_or_create(
        "drop_rate", shape=(1, 1, 1, 1), need_grad=False)
    initial_drop_rate = nn.Variable((1, 1, 1, 1)).apply(d=args.dropout_rate)
    nn.parameter.set_parameter("drop_rate", initial_drop_rate)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train, aux_logits = construct_networks(
        args, ops, arch_dict, image_train, test=False)
    loss_train = loss_function(
        pred_train, label_train, aux_logits, args.auxiliary_weight)

    # prepare solvers
    model_params_dict = nn.get_parameters()
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(
        model_params_dict, reset=False, retain_state=True)

    # Training-loop
    for curr_epoch in range(args.epoch):
        print("epoch {}".format(curr_epoch))
        curr_dropout_rate = F.add_scalar(
            F.mul_scalar(initial_drop_rate, (curr_epoch / args.epoch)), 1e-8)
        nn.parameter.set_parameter("drop_rate", curr_dropout_rate)

        for i in range(one_epoch):
            image, label = tdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            if args.cutout:
                image = cutout(image, args)
            input_image_train["image"].d = image
            input_image_train["label"].d = label

            loss_train.forward(clear_no_need_grad=True)

            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(one_epoch * curr_epoch + i, loss_train.d.copy())
            monitor_err.add(one_epoch * curr_epoch + i, e)

            if args.lr_control_model:
                new_lr = learning_rate_scheduler(
                    one_epoch * curr_epoch + i, max_iter, initial_model_lr, 0)
                solver_model.set_learning_rate(new_lr)

            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)

            if args.with_grad_clip_model:
                for k, v in model_params_dict.items():
                    v.grad.copy_from(
                        F.clip_by_norm(v.grad, args.grad_clip_value_model))

            # update parameters
            solver_model.weight_decay(args.weight_decay_model)
            solver_model.update()

            if (one_epoch * curr_epoch + i) % args.model_save_interval == 0:
                nn.save_parameters(os.path.join(
                    args.model_save_path,
                    'params_{}.h5'.format(one_epoch * curr_epoch + i)))

        # Validation during training.
        ve = 0.
        vloss = 0.
        for j in range(val_iter):
            image, label = vdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            input_image_valid["image"].d = image
            input_image_valid["label"].d = label
            loss_valid.forward(clear_no_need_grad=True)
            vloss += loss_valid.d.copy()
            ve += categorical_error(pred_valid.d.copy(), label)
        ve /= val_iter
        vloss /= val_iter
        monitor_vloss.add(one_epoch * curr_epoch + i, vloss)
        monitor_verr.add(one_epoch * curr_epoch + i, ve)

    return
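
# The retraining loop above optionally applies cutout(image, args), which is
# not defined here. A minimal sketch of standard cutout augmentation; the
# patch-size attribute args.cutout_size is a hypothetical name:
def cutout(image, args):
    # zero out one randomly placed square patch per image in the NCHW batch
    out = image.copy()
    n, _, h, w = out.shape
    size = args.cutout_size  # hypothetical attribute holding the patch size
    for idx in range(n):
        cy, cx = np.random.randint(h), np.random.randint(w)
        y0, y1 = max(0, cy - size // 2), min(h, cy + size // 2)
        x0, x1 = max(0, cx - size // 2), min(w, cx + size // 2)
        out[idx, :, y0:y1, x0:x1] = 0.0
    return out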
def CNN_run(args, model):

    data_iterator_train, data_iterator_valid, num_class = \
        get_data_iterator_and_num_class(args)

    channels, image_height, image_width = 3, args.height, args.width
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = data_iterator_train.size // batch_size
    max_iter = args.epoch * one_epoch
    val_iter = data_iterator_valid.size // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}
    pred_valid = construct_networks(
        args, image_valid, model, num_class, test=True)
    pred_valid.persistent = True
    loss_valid = loss_function(pred_valid, label_valid)
    top_1e_valid = F.mean(F.top_n_error(pred_valid, label_valid))

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train = construct_networks(
        args, image_train, model, num_class, test=False)
    loss_train = loss_function(pred_train, label_train)
    top_1e_train = F.mean(F.top_n_error(pred_train, label_train))

    # prepare solvers
    solver = S.Momentum(initial_model_lr)
    solver.set_parameters(nn.get_parameters())

    # Training-loop
    for i in range(max_iter):
        image, label = data_iterator_train.next()
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        nn.forward_all([loss_train, top_1e_train], clear_no_need_grad=True)

        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, top_1e_train.d.copy())

        if args.lr_control_model:
            new_lr = learning_rate_scheduler(i, max_iter, initial_model_lr, 0)
            solver.set_learning_rate(new_lr)

        solver.zero_grad()
        loss_train.backward(clear_buffer=True)

        if args.with_grad_clip_model:
            for k, v in nn.get_parameters().items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        # update parameters
        solver.weight_decay(args.weight_decay_model)
        solver.update()

        if i % args.model_save_interval == 0:
            # Validation during training.
            ve = 0.
            vloss = 0.
            for j in range(val_iter):
                v_image, v_label = data_iterator_valid.next()
                input_image_valid["image"].d = v_image
                input_image_valid["label"].d = v_label
                nn.forward_all([loss_valid, top_1e_valid], clear_buffer=True)
                vloss += loss_valid.d.copy()
                ve += top_1e_valid.d.copy()
            ve /= val_iter
            vloss /= val_iter
            monitor_vloss.add(i, vloss)
            monitor_verr.add(i, ve)
            nn.save_parameters(os.path.join(
                args.model_save_path, 'params_{}.h5'.format(i)))

    # Final validation after training.
    ve = 0.
    vloss = 0.
    for j in range(val_iter):
        v_image, v_label = data_iterator_valid.next()
        input_image_valid["image"].d = v_image
        input_image_valid["label"].d = v_label
        nn.forward_all([loss_valid, top_1e_valid], clear_buffer=True)
        vloss += loss_valid.d.copy()
        ve += top_1e_valid.d.copy()
    ve /= val_iter
    vloss /= val_iter
    monitor_vloss.add(i, vloss)
    monitor_verr.add(i, ve)
    nn.save_parameters(os.path.join(
        args.model_save_path, 'params_{}.h5'.format(i)))

    return
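
# categorical_error is used by several of the training loops above but is not
# defined in this section. A sketch, assuming the usual top-1 error helper
# from the NNabla examples:
def categorical_error(pred, label):
    # fraction of samples whose argmax over class scores misses the label
    pred_label = pred.argmax(1)
    return (pred_label != label.flat).mean()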