def __init__(self, args):
    """Build two Momentum solvers over the current parameter scope.

    Convolution kernels (names ending in ``conv/W``) get their own solver
    so that weight decay can be applied to them alone; every other
    parameter goes to a second, decay-free solver.
    """
    # Partition parameters in a single pass over the parameter scope:
    # only Convolution kernels require weight decay.
    param_convweights = {}
    param_others = {}
    for name, param in nn.get_parameters().items():
        if name.endswith("conv/W"):
            param_convweights[name] = param
        else:
            param_others[name] = param

    convweights = S.Momentum(args.learning_rate, args.momentum)
    others = S.Momentum(args.learning_rate, args.momentum)
    convweights.set_parameters(param_convweights)
    others.set_parameters(param_others)

    # Start from zeroed gradients so the first accumulation is clean.
    convweights.zero_grad()
    others.zero_grad()

    # Keep solvers and gradient-accumulation bookkeeping on the instance.
    self.convweights = convweights
    self.others = others
    self.args = args
    self.batch_size = args.batch_size * args.accum_times
    self.rate = args.accum_times
    self.count = 0
def sample_arch_and_train(args, data_dict, controller_weights_dict):
    """
    Execute these processes.
    1. For a certain number of times, let the controller construct
       sample architectures and test their performances.
       (By calling get_sample_and_feedback)
    2. By using the performances acquired by the previous process,
       train the controller.
    3. Select one architecture with the best validation accuracy and
       train its parameters.
    """
    solver = S.Momentum(args.control_lr)  # create solver for the controller
    # retain_state=True keeps the momentum buffers across repeated calls.
    solver.set_parameters(controller_weights_dict,
                          reset=False, retain_state=True)
    solver.zero_grad()

    val_list = list()
    arch_list = list()

    # auto_forward: the controller graph is executed eagerly while sampling.
    with nn.auto_forward():
        for c in range(args.num_candidate):
            output_line = " Architecture {} / {} ".format(
                (c + 1), args.num_candidate)
            print("{0:-^80s}".format(output_line))

            # sample one architecture and get its feedback for RL as loss
            loss, val_acc, sample_arch = get_sample_and_feedback(
                args, data_dict)
            val_list.append(val_acc)
            arch_list.append(sample_arch)
            loss.backward()  # accumulate gradient each time

    print("{0:-^80s}\n".format(" Reinforcement Learning Phase "))
    # NOTE(review): `loss` here is the last candidate's loss variable;
    # gradients for all candidates were accumulated by the backward calls.
    print("current accumulated loss:", loss.d)
    # NOTE(review): 0.025 is a hard-coded weight-decay rate for the
    # controller — presumably a tuned hyperparameter; confirm upstream.
    solver.weight_decay(0.025)
    solver.update()  # train the controller

    print("\n{0:-^80s}\n".format(" CNN Learning Phase "))
    # Pick the sampled architecture with the best validation accuracy.
    best_idx = np.argmax(val_list)
    sample_arch = arch_list[best_idx]
    print("Train the model whose architecture is:")
    show_arch(sample_arch)
    print("and its accuracy is: {:.2f} %\n".format(100 * np.max(val_list)))
    print("Learnable Parameters:", params_count(nn.get_parameters()))

    # train a child network which achieves the best validation accuracy.
    val_acc = CNN_run(args, sample_arch, data_dict, with_train=True)

    return sample_arch, val_acc
def train():
    """
    Main script.

    Trains a classifier on Tiny ImageNet with a Momentum solver,
    periodically evaluating on the validation split and saving
    parameter snapshots.
    """
    args = get_args()

    # Get context.
    from nnabla.contrib.context import extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Dataset
    # We use Tiny ImageNet from Stanford CS231N class.
    # https://tiny-imagenet.herokuapp.com/
    # Tiny ImageNet consists of 200 categories, each category has 500 images
    # in training set. The image size is 64x64. To adapt ResNet into 64x64
    # image inputs, the input image size of ResNet is set as 56x56, and
    # the stride in the first conv and the first max pooling are removed.
    data = data_iterator_tiny_imagenet(args.batch_size, 'train')
    vdata = data_iterator_tiny_imagenet(args.batch_size, 'val')
    num_classes = 200
    tiny = True  # TODO: Switch ILSVRC2012 dataset and TinyImageNet.
    t_model = get_model(
        args, num_classes, test=False, tiny=tiny)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    v_model = get_model(
        args, num_classes, test=True, tiny=tiny)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward

    # Create Solver.
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=10)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)

    # Training loop.
    for i in range(args.max_iter):
        # Save parameters
        if i % args.model_save_interval == 0:
            nn.save_parameters(os.path.join(
                args.model_save_path, 'param_%06d.h5' % i))

        # Validation
        if i % args.val_interval == 0:

            # Clear all intermediate memory to save memory.
            # t_model.loss.clear_recursive()

            l = 0.0
            e = 0.0
            for j in range(args.val_iter):
                images, labels = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                # Cast device array types explicitly to avoid implicit
                # type conversion on every forward.
                v_model.image.data.cast(np.uint8, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                l += v_model.loss.d
                e += categorical_error(v_model.pred.d, v_model.label.d)
            monitor_vloss.add(i, l / args.val_iter)
            monitor_verr.add(i, e / args.val_iter)

            # Clear all intermediate memory to save memory.
            # v_model.loss.clear_recursive()

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        # Gradient accumulation loop: backward() accumulates into the
        # gradients zeroed above, emulating a larger batch.
        for j in range(args.accum_grad):
            images, labels = data.next()
            t_model.image.d = images
            t_model.label.d = labels
            t_model.image.data.cast(np.uint8, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            l += t_model.loss.d
            e += categorical_error(t_model.pred.d, t_model.label.d)
        solver.weight_decay(args.weight_decay)
        solver.update()

        monitor_loss.add(i, l / args.accum_grad)
        monitor_err.add(i, e / args.accum_grad)
        monitor_time.add(i)

        # Learning rate decay at scheduled iter
        if i in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)
    nn.save_parameters(os.path.join(args.model_save_path,
                                    'param_%06d.h5' % args.max_iter))
def CNN_run(args, both_archs, data_dict, with_train=False, after_search=False):
    """
    Construct a child CNN from the sampled architecture pair and evaluate it.

    If ``with_train`` is True, the network is trained first (optionally with
    the longer "retrain after search" schedule when ``after_search`` is True),
    then evaluated. Returns the validation accuracy (``1.0 - error``).
    """
    num_cells = args.num_cells
    num_nodes = args.num_nodes

    if after_search:
        assert with_train is True, "when you train the network after architecture search, set with_train=True"

    tdata, mean_val_train, std_val_train = data_dict["train_data"]
    vdata, mean_val_valid, std_val_valid = data_dict["valid_data"]
    channels, image_height, image_width, num_class = data_dict["basic_info"]
    batch_size = args.batch_size

    output_filter = args.output_filter

    if with_train:
        if after_search:
            # Retraining uses its own (typically longer) epoch budget and
            # may widen the network.
            num_epoch = args.epoch_on_retrain
            if args.additional_filters_on_retrain > 0:
                output_filter += args.additional_filters_on_retrain
        else:
            num_epoch = args.epoch_per_search

        one_epoch = tdata.size // batch_size
        max_iter = num_epoch * one_epoch

    val_iter = args.val_iter

    monitor_path = args.monitor_path
    model_save_path = args.monitor_path
    decay_rate = args.weight_decay
    initial_lr = args.child_lr
    model_save_interval = args.model_save_interval

    # Validation graph (always built; used for the final evaluation too).
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    input_image_valid = {"image": image_valid}
    vdata._reset()  # rewind data

    test = True
    pred_valid, _, _ = construct_architecture(image_valid, num_class, num_cells,
                                              num_nodes, both_archs, output_filter, test)

    if with_train:
        if after_search:
            # setting for training after architecture search
            with_grad_clip = args.with_grad_clip_on_retrain
            grad_clip = args.grad_clip_value
            lr_control = args.lr_control_on_retrain
        else:
            with_grad_clip = args.with_grad_clip_on_search
            grad_clip = args.grad_clip_value
            lr_control = args.lr_control_on_search

        # prepare variables used for training
        image_train = nn.Variable(
            (batch_size, channels, image_height, image_width))
        label_train = nn.Variable((batch_size, 1))
        input_image_train = {"image": image_train, "label": label_train}
        tdata._reset()  # rewind data

        test = False
        pred_train, aux_logits, used_weights = construct_architecture(image_train,
                                                                      num_class,
                                                                      num_cells,
                                                                      num_nodes,
                                                                      both_archs,
                                                                      output_filter,
                                                                      test)
        loss_train = loss_function(pred_train, aux_logits, label_train)

        # Only the weights actually used by this sampled architecture are
        # optimized; the shared parameter scope contains more.
        used_weights_dict = {key_name: nn.get_parameters(
        )[key_name] for key_name in used_weights}

        # Create monitor.
        monitor = Monitor(monitor_path)
        monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
        # modified to display accuracy.
        monitor_err = MonitorSeries("Training accuracy", monitor, interval=100)
        # modified to display accuracy.
        monitor_verr = MonitorSeries("Test accuracy", monitor, interval=1)

        # Solvers
        solver = S.Momentum(initial_lr)
        # retain_state=True keeps momentum buffers across repeated CNN_run calls.
        solver.set_parameters(
            used_weights_dict, reset=False, retain_state=True)

        # Training-loop
        for i in range(max_iter):
            if i > 0 and i % one_epoch == 0:
                # Validation during training.
                ve = 0.
                for j in range(val_iter):
                    image, label = vdata.next()
                    image = image / 255.0
                    image = (image - mean_val_valid) / std_val_valid
                    input_image_valid["image"].d = image
                    pred_valid.forward()
                    ve += categorical_error(pred_valid.d, label)
                ve /= val_iter
                monitor_verr.add(i, 1.0 - ve)  # modified to display accuracy.

            if after_search and int(i % args.model_save_interval) == 0:
                nn.save_parameters(os.path.join(
                    args.model_save_path, 'params_%06d.h5' % i))

            # Forward/Zerograd/Backward
            image, label = tdata.next()
            image = image / 255.0
            image = (image - mean_val_train) / std_val_train
            input_image_train["image"].d = image
            input_image_train["label"].d = label
            loss_train.forward()

            if lr_control:
                new_lr = learning_rate_scheduler(i, max_iter, initial_lr, 0)
                solver.set_learning_rate(new_lr)

            solver.zero_grad()
            loss_train.backward()

            if with_grad_clip:
                # Per-parameter clipping by global norm threshold.
                for k, v in used_weights_dict.items():
                    if np.linalg.norm(v.g) > grad_clip:
                        v.grad.copy_from(F.clip_by_norm(v.grad, grad_clip))

            # Solvers update
            solver.weight_decay(decay_rate)
            solver.update()

            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(i, loss_train.d.copy())
            monitor_err.add(i, 1.0 - e)  # modified to display accuracy.

    # Validation (After training or when called for evaluation only)
    ve = 0.
    for j in range(val_iter):
        image, label = vdata.next()
        image = image / 255.0
        image = (image - mean_val_valid) / std_val_valid
        input_image_valid["image"].d = image
        pred_valid.forward()
        ve += categorical_error(pred_valid.d, label)
    ve /= val_iter

    if with_train:
        print("Validation Accuracy on Trained CNN:",
              '{:.2f}'.format(100*(1.0 - ve)), "%\n")

    if after_search:
        nn.save_parameters(os.path.join(
            args.model_save_path, 'params_%06d.h5' % (max_iter)))

    return 1.0 - ve
def train():
    """
    Main script.

    Trains an ImageNet (or Tiny ImageNet) classifier with gradient
    accumulation, checkpoint resume support, and NNP export of the
    inference graph before and after training.
    """
    args = get_args()

    # Get context.
    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = get_extension_context(
        extension_module, device_id=args.device_id, type_config=args.type_config)
    nn.set_default_context(ctx)

    if args.tiny_mode:
        # We use Tiny ImageNet from Stanford CS231N class.
        # (Tiny ImageNet, https://tiny-imagenet.herokuapp.com/)
        # Tiny ImageNet consists of 200 categories, each category has 500 images
        # in training set. The image size is 64x64. To adapt ResNet into 64x64
        # image inputs, the input image size of ResNet is set as 56x56, and
        # the stride in the first conv and the first max pooling are removed.
        # Please check README.
        data = data_iterator_tiny_imagenet(args.batch_size, 'train')
        vdata = data_iterator_tiny_imagenet(args.batch_size, 'val')
        num_classes = 200
    else:
        # We use ImageNet.
        # (ImageNet, https://imagenet.herokuapp.com/)
        # ImageNet consists of 1000 categories, each category has 1280 images
        # in training set. The image size is various. To adapt ResNet into
        # 320x320 image inputs, the input image size of ResNet is set as
        # 224x224. We need to get tar file and create cache file(320x320 images).
        # Please check README.
        data = data_iterator_imagenet(args.batch_size, args.train_cachefile_dir)
        vdata = data_iterator_imagenet(args.batch_size, args.val_cachefile_dir)
        num_classes = 1000
    t_model = get_model(args, num_classes, test=False, tiny=args.tiny_mode)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    # TODO: need_grad should be passed to get_unlinked_variable after v1.0.3 fix.
    t_pred2 = t_model.pred.get_unlinked_variable()
    t_pred2.need_grad = False
    # Top-n error over the unlinked prediction (no gradient flows back).
    t_e = F.mean(F.top_n_error(t_pred2, t_model.label))
    v_model = get_model(args, num_classes, test=True, tiny=args.tiny_mode)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    # TODO: need_grad should be passed to get_unlinked_variable after v1.0.3 fix.
    v_pred2 = v_model.pred.get_unlinked_variable()
    v_pred2.need_grad = False
    v_e = F.mean(F.top_n_error(v_pred2, v_model.label))

    # Save_nnp_Epoch0
    contents = save_nnp({'x': v_model.image},
                        {'y': v_model.pred}, args.batch_size)
    save.save(os.path.join(args.model_save_path,
                           'Imagenet_result_epoch0.nnp'), contents)

    # Create Solver.
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    start_point = 0
    if args.checkpoint is not None:
        # load weights and solver state info from specified checkpoint file.
        start_point = load_checkpoint(args.checkpoint, solver)

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=10)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed("Validation time", monitor, interval=10)

    # Training loop.
    for i in range(start_point, args.max_iter):
        # Save parameters
        if i % args.model_save_interval == 0:
            # save checkpoint file
            save_checkpoint(args.model_save_path, i, solver)

        # Validation
        if i % args.val_interval == 0 and i != 0:

            # Clear all intermediate memory to save memory.
            # t_model.loss.clear_recursive()

            l = 0.0
            e = 0.0
            for j in range(args.val_iter):
                images, labels = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                # Cast device array types explicitly to avoid implicit
                # conversion cost on every forward.
                v_model.image.data.cast(np.uint8, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                l += v_model.loss.d
                e += v_e.d
            monitor_vloss.add(i, l / args.val_iter)
            monitor_verr.add(i, e / args.val_iter)
            monitor_vtime.add(i)

            # Clear all intermediate memory to save memory.
            # v_model.loss.clear_recursive()

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        def accumulate_error(l, e, t_model, t_e):
            # Fold the current loss/error readings into the running sums.
            l += t_model.loss.d
            e += t_e.d
            return l, e

        # Gradient accumulation loop: backward() accumulates into the
        # gradients zeroed above, emulating a larger batch.
        for j in range(args.accum_grad):
            images, labels = data.next()
            t_model.image.d = images
            t_model.label.d = labels
            t_model.image.data.cast(np.uint8, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)
            l, e = accumulate_error(l, e, t_model, t_e)

        solver.weight_decay(args.weight_decay)
        solver.update()

        monitor_loss.add(i, l / args.accum_grad)
        monitor_err.add(i, e / args.accum_grad)
        monitor_time.add(i)

        # Learning rate decay at scheduled iter
        if i in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)
    nn.save_parameters(
        os.path.join(args.model_save_path, 'param_%06d.h5' % args.max_iter))

    # Save_nnp
    contents = save_nnp({'x': v_model.image},
                        {'y': v_model.pred}, args.batch_size)
    save.save(os.path.join(args.model_save_path,
                           'Imagenet_result.nnp'), contents)
def CNN_run(args, ops, arch_dict):
    """
    Based on the given model architecture,
    construct CNN and execute training.
    input:
        args: arguments set by user.
        ops: operations used in the network.
        arch_dict: a dictionary containing architecture information.
    """
    data_iterator = data_iterator_cifar10
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch
    # CIFAR-10 test set has 10000 images.
    val_iter = 10000 // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}
    pred_valid, _ = construct_networks(
        args, ops, arch_dict, image_valid, test=True)
    loss_valid = loss_function(pred_valid, label_valid)

    # set dropout rate in advance
    # The rate is stored as a (non-trainable) parameter named "drop_rate"
    # so the network graph can read it; it is overwritten each epoch below.
    nn.parameter.get_parameter_or_create(
        "drop_rate", shape=(1, 1, 1, 1), need_grad=False)
    initial_drop_rate = nn.Variable((1, 1, 1, 1)).apply(d=args.dropout_rate)
    nn.parameter.set_parameter("drop_rate", initial_drop_rate)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train, aux_logits = construct_networks(
        args, ops, arch_dict, image_train, test=False)
    loss_train = loss_function(pred_train, label_train,
                               aux_logits, args.auxiliary_weight)

    # prepare solvers
    model_params_dict = nn.get_parameters()
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(
        model_params_dict, reset=False, retain_state=True)

    # Training-loop
    for curr_epoch in range(args.epoch):
        print("epoch {}".format(curr_epoch))
        # Linearly scale the dropout rate over epochs (the 1e-8 term keeps
        # it strictly positive at epoch 0).
        curr_dropout_rate = F.add_scalar(
            F.mul_scalar(initial_drop_rate, (curr_epoch / args.epoch)), 1e-8)
        nn.parameter.set_parameter("drop_rate", curr_dropout_rate)

        for i in range(one_epoch):
            image, label = tdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            if args.cutout:
                image = cutout(image, args)
            input_image_train["image"].d = image
            input_image_train["label"].d = label
            loss_train.forward(clear_no_need_grad=True)

            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(one_epoch * curr_epoch + i, loss_train.d.copy())
            monitor_err.add(one_epoch * curr_epoch + i, e)

            if args.lr_control_model:
                new_lr = learning_rate_scheduler(one_epoch * curr_epoch + i,
                                                 max_iter, initial_model_lr, 0)
                solver_model.set_learning_rate(new_lr)

            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)

            if args.with_grad_clip_model:
                for k, v in model_params_dict.items():
                    v.grad.copy_from(
                        F.clip_by_norm(v.grad, args.grad_clip_value_model))

            # update parameters
            solver_model.weight_decay(args.weight_decay_model)
            solver_model.update()

            if (one_epoch * curr_epoch + i) % args.model_save_interval == 0:
                nn.save_parameters(
                    os.path.join(
                        args.model_save_path,
                        'params_{}.h5'.format(one_epoch * curr_epoch + i)))

        # Validation during training.
        ve = 0.
        vloss = 0.
        for j in range(val_iter):
            image, label = vdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            input_image_valid["image"].d = image
            input_image_valid["label"].d = label
            loss_valid.forward(clear_no_need_grad=True)
            vloss += loss_valid.d.copy()
            ve += categorical_error(pred_valid.d.copy(), label)
        ve /= val_iter
        vloss /= val_iter
        # NOTE(review): `i` here is the last inner-loop index (one_epoch - 1),
        # so validation stats are logged at the epoch's final iteration.
        monitor_vloss.add(one_epoch * curr_epoch + i, vloss)
        monitor_verr.add(one_epoch * curr_epoch + i, ve)

    return
def main(args):
    """Train the monocular depth-estimation CNN.

    Builds the training graph (L1 loss between predicted and ground-truth
    depth maps), runs the epoch/iteration training loop, logs losses and
    visualization tiles to nnabla monitors, and saves a parameter
    checkpoint after each epoch.

    Raises:
        ValueError: if ``args.optimizer`` is neither ``"adam"`` nor ``"sgd"``.
    """
    from numpy.random import seed
    seed(46)

    # Get context.
    from nnabla.ext_utils import get_extension_context
    ctx = get_extension_context('cudnn', device_id='0', type_config='float')
    nn.set_default_context(ctx)

    # Create CNN network
    # === TRAIN ===
    # Create input variables.
    image = nn.Variable([args.batch_size, 3, args.img_height, args.img_width])
    label = nn.Variable([args.batch_size, 1, args.img_height, args.img_width])
    # Create prediction graph.
    pred = depth_cnn_model(image, test=False)
    pred.persistent = True  # keep pred's buffer so it can be re-read for viz
    # Create loss function.
    loss = l1_loss(pred, label)

    # === VAL === (validation graph currently disabled)
    #vimage = nn.Variable([args.batch_size, 3, args.img_height, args.img_width])
    #vlabel = nn.Variable([args.batch_size, 1, args.img_height, args.img_width])
    #vpred = depth_cnn_model(vimage, test=True)
    #vloss = l1_loss(vpred, vlabel)

    # Prepare monitors.
    monitor = Monitor(os.path.join(args.log_dir, 'nnmonitor'))
    monitors = {
        'train_epoch_loss': MonitorSeries('Train epoch loss', monitor, interval=1),
        'train_itr_loss': MonitorSeries('Train itr loss', monitor, interval=100),
        # 'val_epoch_loss': MonitorSeries('Val epoch loss', monitor, interval=1),
        'train_viz': MonitorImageTile('Train images', monitor, interval=1000,
                                      num_images=4)
    }

    # Create Solver. If training from checkpoint, load the info.
    if args.optimizer == "adam":
        solver = S.Adam(alpha=args.learning_rate, beta1=0.9, beta2=0.999)
    elif args.optimizer == "sgd":
        solver = S.Momentum(lr=args.learning_rate, momentum=0.9)
    else:
        # BUG FIX: previously an unknown optimizer fell through with
        # `solver` undefined, crashing later with a confusing NameError.
        raise ValueError(
            "Unsupported optimizer: {!r} (expected 'adam' or 'sgd')".format(
                args.optimizer))
    solver.set_parameters(nn.get_parameters())

    # Initialize DataIterator
    data_dic = prepare_dataloader(args.dataset_path,
                                  datatype_list=['train', 'val'],
                                  batch_size=args.batch_size,
                                  img_size=(args.img_height, args.img_width))

    # Training loop.
    logger.info("Start training!!!")
    total_itr_index = 0
    for epoch in range(1, args.epochs + 1):
        ## === training === ##
        total_train_loss = 0
        index = 0
        while index < data_dic['train']['size']:
            # Preprocess
            image.d, label.d = data_dic['train']['itr'].next()
            loss.forward(clear_no_need_grad=True)

            # Initialize gradients
            solver.zero_grad()

            # Backward execution
            loss.backward(clear_buffer=True)

            # Update parameters by computed gradients
            if args.optimizer == 'sgd':
                # Weight decay only for SGD (Adam runs without it here).
                solver.weight_decay(1e-4)
            solver.update()

            # Update log
            index += 1
            total_itr_index += 1
            total_train_loss += loss.d

            # Pass to monitor
            monitors['train_itr_loss'].add(total_itr_index, loss.d)

            # Visualization: re-run forward to refresh pred's buffer, then
            # tile input / GT depth / predicted depth side by side.
            pred.forward(clear_buffer=True)
            train_viz = np.concatenate([
                image.d,
                convert_depth2colormap(label.d),
                convert_depth2colormap(pred.d)
            ], axis=3)
            monitors['train_viz'].add(total_itr_index, train_viz)

            # Logger
            logger.info("[{}] {}/{} Train Loss {} ({})".format(
                epoch, index, data_dic['train']['size'],
                total_train_loss / index, loss.d))

        # Pass training loss to a monitor.
        train_error = total_train_loss / data_dic['train']['size']
        monitors['train_epoch_loss'].add(epoch, train_error)

        # Save Parameter
        out_param_file = os.path.join(
            args.log_dir, 'checkpoint' + str(epoch) + '.h5')
        nn.save_parameters(out_param_file)
def __init__(self, learning_rate, momentum=0.9):
    """Create two independent Momentum solvers with identical
    hyperparameters: one held in ``solver_bn`` and one in ``solver``.
    """
    self.solver_bn = S.Momentum(learning_rate, momentum)
    self.solver = S.Momentum(learning_rate, momentum)
def train():
    """
    Main script.

    Naive Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * Inplace allreduce (THIS IS THE MAIN difference from a single device training)
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error
    """
    args = get_args()
    if args.tiny_mode:
        n_train_samples = 100000
    else:
        n_train_samples = 1282167

    # Communicator and Context
    from nnabla.ext_utils import get_extension_context
    extension_module = "cudnn"
    ctx = get_extension_context(extension_module, type_config=args.type_config)
    # NOTE: the misspelled class name below is nnabla's actual API name.
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    device_id = mpi_rank
    ctx.device_id = str(device_id)
    nn.set_default_context(ctx)

    # workaround to start with the same parameters.
    rng = np.random.RandomState(device_id)
    if args.tiny_mode:
        # We use Tiny ImageNet from Stanford CS231N class.
        # (Tiny ImageNet, https://tiny-imagenet.herokuapp.com/)
        # Tiny ImageNet consists of 200 categories, each category has 500 images
        # in training set. The image size is 64x64. To adapt ResNet into 64x64
        # image inputs, the input image size of ResNet is set as 56x56, and
        # the stride in the first conv and the first max pooling are removed.
        # Please check README.
        data = data_iterator_tiny_imagenet(args.batch_size, 'train')
        vdata = data_iterator_tiny_imagenet(args.batch_size, 'val')
        num_classes = 200
    else:
        # We use ImageNet.
        # (ImageNet, https://imagenet.herokuapp.com/)
        # ImageNet consists of 1000 categories, each category has 1280 images
        # in training set. The image size is various. To adapt ResNet into
        # 320x320 image inputs, the input image size of ResNet is set as
        # 224x224. We need to get tar file and create cache file(320x320 images).
        # Please check README.
        data = data_iterator_imagenet(args.batch_size,
                                      args.train_cachefile_dir, rng=rng)
        vdata = data_iterator_imagenet(args.batch_size,
                                       args.val_cachefile_dir)
        # Each device validates its own shard of the validation set.
        vdata = vdata.slice(rng=None, num_of_slices=n_devices,
                            slice_pos=device_id)
        num_classes = 1000

    # Workaround to start with the same initialized weights for all workers.
    np.random.seed(313)
    t_model = get_model(args, num_classes, test=False, tiny=args.tiny_mode)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    t_pred2 = t_model.pred.unlinked()
    t_e = F.mean(F.top_n_error(t_pred2, t_model.label))
    v_model = get_model(args, num_classes, test=True, tiny=args.tiny_mode)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.unlinked()
    v_e = F.mean(F.top_n_error(v_pred2, v_model.label))

    # Add parameters to communicator.
    comm.add_context_and_parameters((ctx, nn.get_parameters()))

    # Create Solver.
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Setting warmup: learning rate ramps linearly from base_lr up to
    # args.learning_rate over warmup_epoch epochs.
    base_lr = args.learning_rate / n_devices
    warmup_iter = int(1. * n_train_samples /
                      args.batch_size / args.accum_grad / n_devices) * args.warmup_epoch
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter
    solver.set_learning_rate(base_lr)

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=1)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=1)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed("Validation time", monitor, interval=1)

    # Training loop.
    # vl/ve are scalar Variables used only as allreduce buffers.
    vl = nn.Variable()
    ve = nn.Variable()
    for i in range(int(args.max_iter / n_devices)):
        # Save parameters
        if i % (args.model_save_interval // n_devices) == 0 and device_id == 0:
            nn.save_parameters(
                os.path.join(args.model_save_path, 'param_%06d.h5' % i))

        # Validation
        if i % (args.val_interval // n_devices) == 0 and i != 0:
            ve_local = 0.
            vl_local = 0.
            val_iter_local = args.val_iter // n_devices
            for j in range(val_iter_local):
                images, labels = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                v_model.image.data.cast(np.uint8, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                vl_local += v_model.loss.d.copy()
                ve_local += v_e.d.copy()
            # Average local stats, then average across devices.
            vl_local /= val_iter_local
            vl.d = vl_local
            comm.all_reduce(vl.data, division=True, inplace=True)
            ve_local /= val_iter_local
            ve.d = ve_local
            comm.all_reduce(ve.data, division=True, inplace=True)
            if device_id == 0:
                monitor_vloss.add(i * n_devices, vl.d.copy())
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        def accumulate_error(l, e, t_model, t_e):
            # Fold the current loss/error readings into the running sums.
            l += t_model.loss.d
            e += t_e.d
            return l, e

        # Gradient accumulation loop
        for j in range(args.accum_grad):
            images, labels = data.next()
            if j != 0:
                # Update e and l according to previous results of forward
                # propagation.
                # The update of last iteration is performed
                # after solver update to avoid unnecessary CUDA synchronization.
                # This is performed after data.next() in order to overlap
                # the data loading and graph execution.
                # TODO: Move this to the bottom of the loop when prefetch
                # data loader is available.
                l, e = accumulate_error(l, e, t_model, t_e)
            t_model.image.d = images
            t_model.label.d = labels
            t_model.image.data.cast(np.uint8, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)

        # AllReduce: sum gradients over all devices before the update.
        params = [x.grad for x in nn.get_parameters().values()]
        comm.all_reduce(params, division=False, inplace=False)

        # Update
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Accumulate errors after solver update
        l, e = accumulate_error(l, e, t_model, t_e)

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        # Synchronize by averaging the weights over devices using allreduce
        if (i + 1) % args.sync_weight_every_itr == 0:
            weights = [x.data for x in nn.get_parameters().values()]
            comm.all_reduce(weights, division=True, inplace=True)

        if device_id == 0:
            monitor_loss.add(i * n_devices, l / args.accum_grad)
            monitor_err.add(i * n_devices, e / args.accum_grad)
            monitor_time.add(i * n_devices)

        # Learning rate decay at scheduled iter
        if i * n_devices in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)
    if device_id == 0:
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         'param_%06d.h5' % (args.max_iter / n_devices)))
def train():
    """Train a ResNet classifier on CIFAR-10 (optionally with shuffled
    labels), validating once per epoch and checkpointing.

    Uses a YAML-configured learning-rate scheduler and a Momentum solver;
    logs per-epoch training/validation loss and error to nnabla monitors
    and exports the inference graph as NNP before and after training.
    """
    bs_train, bs_valid = args.train_batch_size, args.val_batch_size
    extension_module = args.context
    ctx = get_extension_context(
        extension_module, device_id=args.device_id, type_config=args.type_config
    )
    nn.set_default_context(ctx)

    if args.input:
        train_loader, val_loader, n_train_samples, n_val_samples = load_data(
            bs_train, bs_valid
        )
    else:
        train_data_source = data_source_cifar10(
            train=True, shuffle=True, label_shuffle=True
        )
        val_data_source = data_source_cifar10(train=False, shuffle=False)
        n_train_samples = len(train_data_source.labels)
        n_val_samples = len(val_data_source.labels)
        # Data Iterator
        train_loader = data_iterator(
            train_data_source, bs_train, None, False, False)
        val_loader = data_iterator(
            val_data_source, bs_valid, None, False, False)
        if args.shuffle_label:
            # Persist the (shuffled) dataset so the run is reproducible.
            if not os.path.exists(args.output):
                os.makedirs(args.output)
            np.save(os.path.join(args.output, "x_train.npy"),
                    train_data_source.images)
            np.save(
                os.path.join(args.output, "y_shuffle_train.npy"),
                train_data_source.labels,
            )
            np.save(os.path.join(args.output, "y_train.npy"),
                    train_data_source.raw_label)
            np.save(os.path.join(args.output, "x_val.npy"),
                    val_data_source.images)
            np.save(os.path.join(args.output, "y_val.npy"),
                    val_data_source.labels)

    if args.model == "resnet23":
        model_prediction = resnet23_prediction
    elif args.model == "resnet56":
        model_prediction = resnet56_prediction
    prediction = functools.partial(
        model_prediction, ncls=10, nmaps=64, act=F.relu, seed=args.seed)

    # Create training graphs
    test = False
    image_train = nn.Variable((bs_train, 3, 32, 32))
    label_train = nn.Variable((bs_train, 1))
    pred_train, _ = prediction(image_train, test)
    loss_train = loss_function(pred_train, label_train)

    # Create validation graph
    test = True
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    label_valid = nn.Variable((bs_valid, 1))
    pred_valid, _ = prediction(image_valid, test)
    loss_val = loss_function(pred_valid, label_valid)

    for param in nn.get_parameters().values():
        param.grad.zero()

    cfg = read_yaml("./learning_rate.yaml")
    print(cfg)
    lr_sched = create_learning_rate_scheduler(cfg.learning_rate_config)
    solver = S.Momentum(momentum=0.9, lr=lr_sched.get_lr())
    solver.set_parameters(nn.get_parameters())
    start_point = 0

    if args.checkpoint is not None:
        # load weights and solver state info from specified checkpoint file.
        start_point = load_checkpoint(args.checkpoint, solver)

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed

    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=1)
    monitor_err = MonitorSeries("Training error", monitor, interval=1)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=1)
    monitor_verr = MonitorSeries("Test error", monitor, interval=1)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=1)

    # save_nnp
    contents = save_nnp({"x": image_valid}, {"y": pred_valid}, bs_valid)
    save.save(
        os.path.join(args.model_save_path,
                     (args.model + "_epoch0_result.nnp")), contents
    )

    train_iter = math.ceil(n_train_samples / bs_train)
    val_iter = math.ceil(n_val_samples / bs_valid)

    # Training-loop
    for i in range(start_point, args.train_epochs):
        lr_sched.set_epoch(i)
        solver.set_learning_rate(lr_sched.get_lr())
        print("Learning Rate: ", lr_sched.get_lr())
        # Validation
        ve = 0.0
        vloss = 0.0
        print("## Validation")
        for j in range(val_iter):
            image, label = val_loader.next()
            image_valid.d = image
            label_valid.d = label
            loss_val.forward()
            vloss += loss_val.data.data.copy() * bs_valid
            ve += categorical_error(pred_valid.d, label)
        # BUG FIX: average over the actual number of validation iterations
        # performed (val_iter); previously this divided by args.val_iter,
        # which can differ and skews the reported error.
        ve /= val_iter
        vloss /= n_val_samples
        monitor_verr.add(i, ve)
        monitor_vloss.add(i, vloss)

        if int(i % args.model_save_interval) == 0:
            # save checkpoint file
            save_checkpoint(args.model_save_path, i, solver)

        # Forward/Zerograd/Backward
        print("## Training")
        e = 0.0
        loss = 0.0
        for k in range(train_iter):
            image, label = train_loader.next()
            image_train.d = image
            label_train.d = label
            loss_train.forward()
            solver.zero_grad()
            loss_train.backward()
            solver.update()
            e += categorical_error(pred_train.d, label_train.d)
            loss += loss_train.data.data.copy() * bs_train
        e /= train_iter
        loss /= n_train_samples
        # BUG FIX: removed a stray `e = categorical_error(...)` here that
        # overwrote the epoch-averaged training error with the last
        # batch's error just before logging.
        monitor_loss.add(i, loss)
        monitor_err.add(i, e)
        monitor_time.add(i)

    nn.save_parameters(
        os.path.join(args.model_save_path,
                     "params_%06d.h5" % (args.train_epochs))
    )

    # save_nnp_lastepoch
    contents = save_nnp({"x": image_valid}, {"y": pred_valid}, bs_valid)
    save.save(os.path.join(args.model_save_path,
                           (args.model + "_result.nnp")), contents)
def train(): ''' Run D3Net Semantic Segmentation Training ''' # Check NNabla version if get_nnabla_version_integer() < 12100: raise ValueError( 'This code does not work with nnabla version less than v1.21.0 since [ignore index less than 0](https://github.com/sony/nnabla/pull/945) is added in v1.21.0 . Please update the nnabla version.') args = get_args() # Load D3Net Hyper parameters (D3Net-L or D3Net-S) with open(args.config_file) as file: hparams = yaml.load(file, Loader=yaml.FullLoader) # Get context. ctx = get_extension_context(args.context, device_id=0) comm = CommunicatorWrapper(ctx) nn.set_default_context(comm.ctx) # Change max_iter, learning_rate and weight_decay according no. of gpu devices for multi-gpu training. default_batch_size = 8 train_scale_factor = comm.n_procs * \ (hparams['batch_size'] / default_batch_size) hparams['max_iter'] = int(hparams['max_iter'] // train_scale_factor) hparams['lr'] = hparams['lr'] * train_scale_factor hparams['min_lr'] = hparams['min_lr'] * train_scale_factor hparams['weight_decay'] = hparams['weight_decay'] * comm.n_procs # --------------------- # Create data iterators # --------------------- rng = np.random.RandomState() data = data_iterator_cityscapes( hparams['batch_size'], args.data_dir, rng=rng, train=True) if comm.n_procs > 1: data = data.slice(rng=rng, num_of_slices=comm.n_procs, slice_pos=comm.rank) if comm.rank == 0: if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir) # Create monitors monitor = M.Monitor(args.output_dir) monitor_training_loss = M.MonitorSeries( 'Training loss', monitor, interval=args.log_interval) monitor_lr = M.MonitorSeries( 'Learning rate', monitor, interval=args.log_interval) monitor_time = M.MonitorTimeElapsed( "Training time per iteration", monitor, interval=args.log_interval) # --------------------- # Create Training Graph # --------------------- # Create input variables image = nn.Variable( (hparams['batch_size'], 3, hparams['image_height'], hparams['image_width'])) 
seg_gt = nn.Variable( (hparams['batch_size'], 1, hparams['image_height'], hparams['image_width'])) # D3Net prediction/output seg_pred = d3net_segmentation(image, hparams, recompute=args.recompute) # Configure loss loss = F.mean(F.softmax_cross_entropy(seg_pred, seg_gt, axis=1)) loss.persistent = True # Create Solver solver = S.Momentum(hparams['lr'], hparams['momentum']) solver.set_parameters(nn.get_parameters()) # Initialize LR Scheduler lr_scheduler = PolynomialScheduler(hparams) if args.pretrained is not None: # Initialize the D3Net backbone weights with nn.parameter_scope('backbone'): nn.load_parameters(args.pretrained) # ------------- # Training loop # ------------- for i in range(hparams['max_iter']): image.d, seg_gt.d = data.next() solver.zero_grad() lr = lr_scheduler.get_learning_rate(i) solver.set_learning_rate(lr) loss.forward(clear_no_need_grad=True) if comm.n_procs > 1: all_reduce_callback = comm.get_all_reduce_callback() loss.backward(clear_buffer=True, communicator_callbacks=all_reduce_callback) else: loss.backward(clear_buffer=True) solver.weight_decay(hparams['weight_decay']) solver.update() if comm.rank == 0: # Log monitors monitor_training_loss.add(i, loss.d.copy()) monitor_lr.add(i, lr) monitor_time.add(i) if (i % hparams['save_interval']) == 0: # Save intermediate model parameters nn.save_parameters(os.path.join( args.output_dir, "model_param_%08d.h5" % i)) solver.save_states(os.path.join( args.output_dir, "solver_states.h5")) if comm.rank == 0: # save final model parameters nn.save_parameters(os.path.join(args.output_dir, "final.h5"))
def train():
    """
    Main script.

    Naive Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * Inplace allreduce (THIS IS THE MAIN difference from a single device training)
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error
    """
    args = get_args()
    n_train_samples = 1281167  # ImageNet-1k training-set size
    num_classes = 1000

    # Communicator and Context
    from nnabla.ext_utils import get_extension_context
    extension_module = "cudnn"
    ctx = get_extension_context(extension_module, type_config=args.type_config)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    device_id = mpi_rank
    ctx.device_id = str(device_id)
    nn.set_default_context(ctx)

    # Pipelines and Iterators for training (per-device DALI pipeline;
    # a distinct seed per device decorrelates the shuffling).
    train_pipes = [
        TrainPipeline(args.batch_size, args.num_threads, device_id,
                      args.train_cachefile_dir, args.train_list,
                      seed=device_id + 1, num_gpu=n_devices,
                      random_area=args.random_area)
    ]
    train_pipes[0].build()
    data = DALIClassificationIterator(
        train_pipes,
        train_pipes[0].epoch_size("Reader") // n_devices,
        auto_reset=True,
        stop_at_epoch=False)

    # Pipelines and Iterators for validation
    val_pipes = [
        ValPipeline(args.batch_size, args.num_threads, device_id,
                    args.val_cachefile_dir, args.val_list,
                    seed=device_id + 1, num_gpu=n_devices)
    ]
    val_pipes[0].build()
    vdata = DALIClassificationIterator(
        val_pipes,
        val_pipes[0].epoch_size("Reader") // n_devices,
        auto_reset=True,
        stop_at_epoch=False)

    # Network for training
    t_model = get_model(args, num_classes, n_devices,
                        args.accum_grad, test=False)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    t_pred2 = t_model.pred.get_unlinked_variable(need_grad=False)
    t_e = F.mean(F.top_n_error(t_pred2, t_model.label))

    # Network for validation
    v_model = get_model(args, num_classes, n_devices,
                        args.accum_grad, test=True)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.get_unlinked_variable(need_grad=False)
    v_e = F.mean(F.top_n_error(v_pred2, v_model.label))

    # Solver
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_learning_rate(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Monitors
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=1)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=1)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed("Validation time", monitor, interval=1)

    # Training loop.
    # vl / ve are scalar holder variables used only to all-reduce the
    # per-device validation stats across workers.
    vl = nn.Variable()
    ve = nn.Variable()
    for i in range(int(args.max_iter / n_devices)):
        # Save parameters (rank 0 only)
        if i % (args.model_save_interval // n_devices) == 0 and device_id == 0:
            nn.save_parameters(
                os.path.join(args.model_save_path, 'param_%06d.h5' % i))

        # Validation
        if i % (args.val_interval // n_devices) == 0 and i != 0:
            ve_local = 0.
            vl_local = 0.
            val_iter_local = args.val_iter // n_devices
            for j in range(val_iter_local):
                nextImage, nextLabel = vdata.next()
                v_model.image.data = nextImage
                v_model.label.data = nextLabel
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                vl_local += v_model.loss.d.copy()
                ve_local += v_e.d.copy()
            vl_local /= val_iter_local
            vl.d = vl_local
            comm.all_reduce(vl.data, division=True, inplace=True)
            ve_local /= val_iter_local
            ve.d = ve_local
            comm.all_reduce(ve.data, division=True, inplace=True)
            if device_id == 0:
                monitor_vloss.add(i * n_devices, vl.d.copy())
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        def accumulate_error(l, e, t_model, t_e):
            # Accumulate scalar loss/error of the current sub-batch.
            l += t_model.loss.d
            e += t_e.d
            return l, e

        # Gradient accumulation loop: backward without zero_grad in between
        # sums gradients over args.accum_grad sub-batches.
        for j in range(args.accum_grad):
            nextImage, nextLabel = data.next()
            t_model.image.data = nextImage
            t_model.label.data = nextLabel
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)
            l, e = accumulate_error(l, e, t_model, t_e)

        # AllReduce gradients across devices before the update.
        params = [x.grad for x in nn.get_parameters().values()]
        comm.all_reduce(params, division=False, inplace=False)

        # Update
        solver.weight_decay(args.weight_decay)
        solver.update()

        if device_id == 0:
            monitor_loss.add(i * n_devices, l / args.accum_grad)
            monitor_err.add(i * n_devices, e / args.accum_grad)
            monitor_time.add(i * n_devices)

        # Learning rate decay at scheduled iter (x0.1 step decay)
        if i * n_devices in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)

    if device_id == 0:
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         'param_%06d.h5' % (args.max_iter / n_devices)))
def infl_icml(model_info_dict, file_dir_dict, use_all_params, need_evaluate, alpha):
    """
    Compute per-training-sample influence scores for a trained model and
    save them to CSV, sorted by influence.

    The inverse-Hessian-vector product v = H^-1 u is approximated by running
    SGD on the quadratic objective 0.5 * v'(H + alpha*I)v - u'v, where the
    Hessian-vector products are obtained with two nested nn.grad calls
    (see the training loop below).

    Args:
        model_info_dict: dict with keys 'seed', 'net_func', 'batch_size',
            'num_epochs' describing the trained model.
        file_dir_dict: dict with keys 'save_dir', 'infl_filename',
            'model_filename', 'train_csv', 'val_csv'.
        use_all_params: forwarded to save_infl_for_analysis.
        need_evaluate: if True, additionally save results for analysis.
        alpha: damping term added to the Hessian for stability.
    """
    num_epochs = 2  # SGD epochs used to approximate H^-1 u
    # params
    lr = 0.005
    seed = model_info_dict['seed']
    net_func = model_info_dict['net_func']
    batch_size = model_info_dict['batch_size']
    test_batch_size = 1000
    target_epoch = model_info_dict['num_epochs']
    # files and dirs
    save_dir = file_dir_dict['save_dir']
    infl_filename = file_dir_dict['infl_filename']
    final_model_name = file_dir_dict['model_filename']
    final_model_path = os.path.join(
        save_dir, 'epoch%02d' % (target_epoch - 1), 'weights',
        final_model_name)
    input_dir_name = os.path.dirname(file_dir_dict['train_csv'])

    # setup
    trainset, valset, image_shape, n_classes, ntr, nval = init_dataset(
        file_dir_dict['train_csv'], file_dir_dict['val_csv'], seed)
    n_channels, _h, _w = image_shape
    resize_size = get_image_size((_h, _w))
    idx_train = get_indices(ntr, seed)
    idx_val = get_indices(nval, seed)

    # Load the final trained weights.
    nn.load_parameters(final_model_path)
    trained_params = nn.get_parameters(grad_only=False)
    test = True

    grad_model = functools.partial(
        setup_model, net_func=net_func, n_classes=n_classes,
        n_channels=n_channels, resize_size=resize_size, test=test,
        reduction='mean')

    solver = S.Momentum(lr=lr, momentum=0.9)
    solver.set_parameters(trained_params)

    # gradient: u = grad of validation loss w.r.t. trained parameters
    u = compute_gradient(grad_model, solver, valset, test_batch_size,
                         idx_val, resize_size)

    # Hinv * u with SGD
    seed_train = 0

    # v: the variables being optimized towards H^-1 u, initialized to zero.
    v = dict()
    for key, param in nn.get_parameters(grad_only=False).items():
        v[key] = nn.Variable(param.d.shape, need_grad=True)
        v[key].d = 0
        v[key].g = 0

    # The solver now updates v (not the model weights).
    solver.set_parameters(v)

    loss_train = []
    loss_fn = None
    for epoch in range(num_epochs):
        # training
        seed_train = 0
        np.random.seed(epoch)
        idx = get_batch_indices(ntr, batch_size, seed=epoch)
        for j, i in enumerate(idx):
            seeds = list(range(seed_train, seed_train + i.size))
            seed_train += i.size
            X, y = get_batch_data(trainset, idx_train, i, resize_size,
                                  test=False, seeds=seeds)
            _, loss_fn, input_image = adjust_batch_size(
                grad_model, len(X), loss_fn)
            input_image["image"].d = X
            input_image["label"].d = y
            loss_fn.forward()

            # First grad: dL/dw
            grad_params = nn.grad(loss_fn, [
                param for param in nn.get_parameters(grad_only=False).values()
            ])
            # vg = v' * dL/dw (scalar graph)
            vg = 0
            for vv, g in zip(v.values(), grad_params):
                vg += F.sum(vv * g)

            for parameters in trained_params.values():
                parameters.grad.zero()

            # Second grad of vg w.r.t. w gives the Hessian-vector product H v.
            vgrad_params = nn.grad(vg, [
                param for param in nn.get_parameters(grad_only=False).values()
            ])
            # Quadratic objective whose minimizer is (H + alpha*I)^-1 u.
            loss_i = 0
            for vgp, vv, uu in zip(vgrad_params, v.values(), u.values()):
                loss_i += 0.5 * F.sum(vgp * vv + alpha * vv * vv) - F.sum(
                    uu * vv)
            loss_i.forward()
            solver.zero_grad()
            loss_i.backward(clear_buffer=True)
            solver.update()
            loss_train.append(loss_i.d.copy())

    # influence: infl_i = -(dL_i/dw . v) / ntr for each training sample i
    infl_dict = dict()
    infl = np.zeros(ntr)
    for i in tqdm(range(ntr), desc='calc influence (3/3 steps)'):
        csv_idx = idx_train[i]
        file_name = trainset.get_filepath_to_data(csv_idx)
        file_name = os.path.join(input_dir_name, file_name)
        file_name = os.path.normpath(file_name)

        X, y = get_data(trainset, idx_train[i], resize_size, True, seed=i)
        _, loss_fn, input_image = adjust_batch_size(grad_model, len(X),
                                                    loss_fn)
        input_image["image"].d = X
        input_image["label"].d = y
        loss_fn.forward()

        for parameters in trained_params.values():
            parameters.grad.zero()
        loss_fn.backward(clear_buffer=True)

        infl_i = 0
        # NOTE(review): relies on nn.get_parameters and v (both dicts)
        # iterating in the same key order -- confirm both were built from
        # the same parameter registry.
        for j, param in enumerate(nn.get_parameters(grad_only=False).values()):
            infl_i += (param.g.copy() * list(v.values())[j].d.copy()).sum()
        infl[i] = -infl_i / ntr
        infl_dict[csv_idx] = [file_name, y, infl[i]]

    infl_list = [val + [key] for key, val in infl_dict.items()]
    infl_list = sorted(infl_list, key=lambda x: (x[-2]))  # sort by influence

    # save
    header = ['x:image', 'y:label', 'influence', 'datasource_index']
    data_type = 'object,int,float,int'
    if need_evaluate:
        save_infl_for_analysis(infl_list, use_all_params, save_dir,
                               infl_filename, epoch, header, data_type)
    save_to_csv(filename=infl_filename, header=header,
                list_to_save=infl_list, data_type=data_type)
def train():
    """
    Main script.

    Steps:

    * Parse command line arguments.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error
    """
    # Parse args
    args = get_args()
    n_valid_samples = 10000  # CIFAR-10 test-set size
    bs_valid = args.batch_size
    extension_module = args.context
    ctx = get_extension_context(
        extension_module, device_id=args.device_id,
        type_config=args.type_config)
    nn.set_default_context(ctx)

    # Dataset
    data_iterator = data_iterator_cifar10
    n_class = 10

    # Model architecture
    # NOTE(review): if args.net is neither "resnet18" nor "resnet34",
    # `prediction` is never bound and the first use below raises
    # UnboundLocalError -- confirm the argparser restricts choices.
    if args.net == "resnet18":
        prediction = functools.partial(resnet18_prediction, ncls=n_class,
                                       nmaps=64, act=F.relu)
    if args.net == "resnet34":
        prediction = functools.partial(resnet34_prediction, ncls=n_class,
                                       nmaps=64, act=F.relu)

    # Create training graphs
    test = False
    if args.mixtype == "mixup":
        mdl = MixupLearning(args.batch_size, alpha=args.alpha)
    elif args.mixtype == "cutmix":
        mdl = CutmixLearning((args.batch_size, 3, 32, 32), alpha=args.alpha,
                             cutmix_prob=1.0)
    elif args.mixtype == "vhmixup":
        mdl = VHMixupLearning((args.batch_size, 3, 32, 32), alpha=args.alpha)
    else:
        print("[ERROR] Unknown mixtype: " + args.mixtype)
        return
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    # Mix the (augmented) images and one-hot labels with the chosen scheme.
    mix_image, mix_label = mdl.mix_data(single_image_augment(image_train),
                                        F.one_hot(label_train, (n_class, )))
    pred_train = prediction(mix_image, test)
    loss_train = mdl.loss(pred_train, mix_label)
    input_train = {"image": image_train, "label": label_train}

    # Create validation graph
    test = True
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    pred_valid = prediction(image_valid, test)
    input_valid = {"image": image_valid}

    # Solvers
    # NOTE(review): any other args.solver value leaves `solver` unbound.
    if args.solver == "Adam":
        solver = S.Adam()
    elif args.solver == "Momentum":
        solver = S.Momentum(lr=args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.save_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_verr = MonitorSeries("Test error", monitor, interval=1)

    # Data Iterator
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)
    print("Size of the training data: %d " % tdata.size)

    # Training-loop
    for i in range(args.max_iter):
        # Forward/Zerograd/Backward
        image, label = tdata.next()
        input_train["image"].d = image
        input_train["label"].d = label
        mdl.set_mix_ratio()  # resample the mixing coefficient per minibatch
        loss_train.forward()
        solver.zero_grad()
        loss_train.backward()

        # Model update by solver
        if args.solver == "Momentum":
            # Step LR decay at 1/2 and 3/4 of training.
            # NOTE(review): `i == args.max_iter / 2` is an int-vs-float
            # comparison; it only fires when max_iter is exactly divisible.
            if i == args.max_iter / 2:
                solver.set_learning_rate(args.learning_rate / 10.0)
            if i == args.max_iter / 4 * 3:
                solver.set_learning_rate(args.learning_rate / 10.0**2)
        solver.update()

        # Validation
        if (i + 1) % args.val_interval == 0 or i == 0:
            ve = 0.
            vdata._reset()  # restart the validation iterator from the top
            vdata_pred = np.zeros((n_valid_samples, n_class))
            vdata_label = np.zeros((n_valid_samples, 1), dtype=np.int32)
            for j in range(0, n_valid_samples, args.batch_size):
                image, label = vdata.next()
                input_valid["image"].d = image
                pred_valid.forward()
                # Clip the last partial batch to n_valid_samples.
                vdata_pred[j:min(j + args.batch_size, n_valid_samples
                                 )] = pred_valid.d[:min(
                                     args.batch_size, n_valid_samples - j)]
                vdata_label[j:min(j + args.batch_size, n_valid_samples
                                  )] = label[:min(args.batch_size,
                                                  n_valid_samples - j)]
            ve = categorical_error(vdata_pred, vdata_label)
            monitor_verr.add(i + 1, ve)

        if int((i + 1) % args.model_save_interval) == 0:
            nn.save_parameters(
                os.path.join(args.save_path, 'params_%06d.h5' % (i + 1)))

        # Monitering
        monitor_loss.add(i + 1, loss_train.d.copy())
        monitor_time.add(i + 1)

    nn.save_parameters(
        os.path.join(args.save_path, 'params_%06d.h5' % (args.max_iter)))
init_width = args.width init_height = args.height init_epoch = seen/nsamples yolo_x_nnabla, yolo_features_nnabla, yolo_vars, yolo_tvars, loss_nnabla = create_network( batch_size, init_height, init_width, args) from nnabla.ext_utils import get_extension_context ctx = get_extension_context("cudnn") nn.set_default_context(ctx) # Load parameters print("Load", args.weight, "...") nn.load_parameters(args.weight) print(nn.get_parameters()) param_convweights = { k: v for k, v in nn.get_parameters().items() if k.endswith("conv/W")} param_others = {k: v for k, v in nn.get_parameters().items() if not k.endswith("conv/W")} solver_convweights = S.Momentum(learning_rate, args.momentum) solver_others = S.Momentum(learning_rate, args.momentum) solver_convweights.set_parameters(param_convweights) solver_others.set_parameters(param_others) print(init_epoch, max_epochs) for epoch in range(int(init_epoch), int(max_epochs)): train(epoch)
def train():
    """
    Main script: DeepLabv3+-style semantic-segmentation training.

    Loads a pretrained model (optionally dropping the logits layer for
    fine-tuning), builds masked training/validation graphs, and runs a
    gradient-accumulation training loop with optional multi-device
    all-reduce, linear warmup, periodic validation (loss / error /
    optional mean-IOU) and checkpointing.
    """
    args = get_args()

    # Load pretrained weights; when fine-tuning, drop the final logits
    # layer so it is re-initialized for the new class count.
    _ = nn.load_parameters(args.pretrained_model_path)
    if args.fine_tune:
        nnabla.parameter.pop_parameter('decoder/logits/affine/conv/W')
        nnabla.parameter.pop_parameter('decoder/logits/affine/conv/b')

    n_train_samples = args.train_samples
    n_val_samples = args.val_samples
    distributed = args.distributed
    compute_acc = args.compute_acc

    if distributed:
        # Communicator and Context
        from nnabla.ext_utils import get_extension_context
        extension_module = "cudnn"
        ctx = get_extension_context(
            extension_module, type_config=args.type_config)
        comm = C.MultiProcessDataParalellCommunicator(ctx)
        comm.init()
        n_devices = comm.size
        mpi_rank = comm.rank
        device_id = mpi_rank
        ctx.device_id = str(device_id)
        nn.set_default_context(ctx)
    else:
        # Get context.
        from nnabla.ext_utils import get_extension_context
        extension_module = args.context
        if args.context is None:
            extension_module = 'cpu'
        logger.info("Running in %s" % extension_module)
        ctx = get_extension_context(
            extension_module, device_id=args.device_id,
            type_config=args.type_config)
        nn.set_default_context(ctx)
        n_devices = 1
        device_id = 0

    # training data
    data = data_iterator_segmentation(
        args.train_samples, args.batch_size, args.train_dir,
        args.train_label_dir, target_width=args.image_width,
        target_height=args.image_height)
    # validation data
    vdata = data_iterator_segmentation(
        args.val_samples, args.batch_size, args.val_dir, args.val_label_dir,
        target_width=args.image_width, target_height=args.image_height)

    if distributed:
        # Each device consumes a disjoint slice of both datasets.
        data = data.slice(
            rng=None, num_of_slices=n_devices, slice_pos=device_id)
        vdata = vdata.slice(
            rng=None, num_of_slices=n_devices, slice_pos=device_id)
    num_classes = args.num_class

    # Workaround to start with the same initialized weights for all workers.
    np.random.seed(313)
    t_model = get_model(args, test=False)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    t_pred2 = t_model.pred.unlinked()
    # Masked top-1 error, normalized by the number of valid (masked-in) pixels.
    t_e = F.sum(F.top_n_error(t_pred2, t_model.label, axis=1)
                * t_model.mask) / F.sum(t_model.mask)

    v_model = get_model(args, test=True)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.unlinked()
    # BUG FIX: the denominator previously summed t_model.mask (the TRAINING
    # batch mask), normalizing the validation error by the wrong pixel
    # count. Use the validation mask, mirroring t_e above.
    v_e = F.sum(F.top_n_error(v_pred2, v_model.label, axis=1)
                * v_model.mask) / F.sum(v_model.mask)

    # Create Solver
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Load checkpoint
    start_point = 0
    if args.checkpoint is not None:
        # load weights and solver state info from specified checkpoint file.
        start_point = load_checkpoint(args.checkpoint, solver)

    # Setting warmup.
    base_lr = args.learning_rate / n_devices
    warmup_iter = int(1. * n_train_samples /
                      args.batch_size / args.accum_grad / n_devices) * args.warmup_epoch
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter
    solver.set_learning_rate(base_lr)

    # Create monitor
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=1)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=1)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_miou = M.MonitorSeries("mean IOU", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed(
        "Validation time", monitor, interval=1)

    # save_nnp
    contents = save_nnp({'x': v_model.image}, {
                        'y': v_model.pred}, args.batch_size)
    save.save(os.path.join(args.model_save_path,
                           'Deeplabv3plus_result_epoch0.nnp'), contents,
              variable_batch_size=False)

    # Training loop
    for i in range(start_point, int(args.max_iter / n_devices)):
        # Save parameters
        if i % (args.model_save_interval // n_devices) == 0 and device_id == 0:
            save_checkpoint(args.model_save_path, i, solver)

        # Validation
        if i % (args.val_interval // n_devices) == 0 and i != 0:
            vmiou_local = 0.
            val_iter_local = n_val_samples // args.batch_size
            vl_local = nn.NdArray()
            vl_local.zero()
            ve_local = nn.NdArray()
            ve_local.zero()
            for j in range(val_iter_local):
                images, labels, masks = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                v_model.mask.d = masks
                v_model.image.data.cast(np.float32, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                vl_local += v_model.loss.data
                ve_local += v_e.data
                # Mean IOU computation
                if compute_acc:
                    vmiou_local += compute_miou(
                        num_classes, labels,
                        np.argmax(v_model.pred.d, axis=1), masks)
            vl_local /= val_iter_local
            ve_local /= val_iter_local
            if compute_acc:
                vmiou_local /= val_iter_local
                vmiou_ndarray = nn.NdArray.from_numpy_array(
                    np.array(vmiou_local))
            if distributed:
                comm.all_reduce(vl_local, division=True, inplace=True)
                comm.all_reduce(ve_local, division=True, inplace=True)
                if compute_acc:
                    comm.all_reduce(vmiou_ndarray, division=True,
                                    inplace=True)
            if device_id == 0:
                monitor_vloss.add(i * n_devices, vl_local.data.copy())
                monitor_verr.add(i * n_devices, ve_local.data.copy())
                if compute_acc:
                    monitor_miou.add(i * n_devices, vmiou_local)
                monitor_vtime.add(i * n_devices)

        # Training
        solver.zero_grad()
        e_acc = nn.NdArray(t_e.shape)
        e_acc.zero()
        l_acc = nn.NdArray(t_model.loss.shape)
        l_acc.zero()
        # Gradient accumulation loop
        for j in range(args.accum_grad):
            images, labels, masks = data.next()
            t_model.image.d = images
            t_model.label.d = labels
            t_model.mask.d = masks
            t_model.image.data.cast(np.float32, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)
            e_acc += t_e.data
            l_acc += t_model.loss.data

        # AllReduce
        if distributed:
            params = [x.grad for x in nn.get_parameters().values()]
            comm.all_reduce(params, division=False, inplace=False)
            comm.all_reduce(l_acc, division=True, inplace=True)
            comm.all_reduce(e_acc, division=True, inplace=True)

        solver.scale_grad(1./args.accum_grad)  # average accumulated gradients
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        if distributed:
            # Synchronize by averaging the weights over devices using allreduce
            if (i+1) % args.sync_weight_every_itr == 0:
                weights = [x.data for x in nn.get_parameters().values()]
                comm.all_reduce(weights, division=True, inplace=True)

        if device_id == 0:
            monitor_loss.add(
                i * n_devices, (l_acc / args.accum_grad).data.copy())
            monitor_err.add(
                i * n_devices, (e_acc / args.accum_grad).data.copy())
            monitor_time.add(i * n_devices)

        # Poly learning rate decay policy (replaced the old step schedule).
        # NOTE(review): this unconditional assignment overwrites the warmup
        # LR set above on every iteration; confirm whether warmup is meant
        # to take precedence while i <= warmup_iter.
        solver.set_learning_rate(base_lr * ((1 - i / args.max_iter)**0.1))

    if device_id == 0:
        # save final model parameters and exportable nnp
        nn.save_parameters(os.path.join(args.model_save_path,
                                        'param_%06d.h5' % args.max_iter))
        contents = save_nnp({'x': v_model.image}, {
                            'y': v_model.pred}, args.batch_size)
        save.save(os.path.join(args.model_save_path,
                               'Deeplabv3plus_result.nnp'), contents,
                  variable_batch_size=False)
def CNN_run(args, model):
    """
    Build the CNN described by `model` and train it, with periodic
    validation, monitoring, and parameter snapshots.

    Args:
        args: parsed command-line arguments (batch size, LR, paths, ...).
        model: architecture description passed to construct_networks.
    """
    data_iterator_train, data_iterator_valid, num_class = \
        get_data_iterator_and_num_class(args)

    channels, image_height, image_width = 3, args.height, args.width
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = data_iterator_train.size // batch_size
    max_iter = args.epoch * one_epoch
    val_iter = data_iterator_valid.size // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}
    pred_valid = construct_networks(
        args, image_valid, model, num_class, test=True)
    pred_valid.persistent = True
    loss_valid = loss_function(pred_valid, label_valid)
    top_1e_valid = F.mean(F.top_n_error(pred_valid, label_valid))

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train = construct_networks(
        args, image_train, model, num_class, test=False)
    loss_train = loss_function(pred_train, label_train)
    top_1e_train = F.mean(F.top_n_error(pred_train, label_train))

    # prepare solvers
    solver = S.Momentum(initial_model_lr)
    solver.set_parameters(nn.get_parameters())

    def _validate_and_save(i):
        # One full pass over the validation iterator: log averaged loss /
        # top-1 error at iteration i and snapshot parameters. (Factored
        # out: the original duplicated this block verbatim inside and
        # after the training loop.)
        ve = 0.
        vloss = 0.
        for _ in range(val_iter):
            v_image, v_label = data_iterator_valid.next()
            input_image_valid["image"].d = v_image
            input_image_valid["label"].d = v_label
            nn.forward_all([loss_valid, top_1e_valid], clear_buffer=True)
            vloss += loss_valid.d.copy()
            ve += top_1e_valid.d.copy()
        ve /= val_iter
        vloss /= val_iter
        monitor_vloss.add(i, vloss)
        monitor_verr.add(i, ve)
        nn.save_parameters(
            os.path.join(args.model_save_path, 'params_{}.h5'.format(i)))

    # Training-loop
    for i in range(max_iter):
        image, label = data_iterator_train.next()
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        nn.forward_all([loss_train, top_1e_train], clear_no_need_grad=True)

        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, top_1e_train.d.copy())

        if args.lr_control_model:
            new_lr = learning_rate_scheduler(i, max_iter,
                                             initial_model_lr, 0)
            solver.set_learning_rate(new_lr)

        solver.zero_grad()
        loss_train.backward(clear_buffer=True)

        if args.with_grad_clip_model:
            # Per-parameter gradient-norm clipping.
            for k, v in nn.get_parameters().items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        # update parameters
        solver.weight_decay(args.weight_decay_model)
        solver.update()

        if i % args.model_save_interval == 0:
            # Validation during training.
            _validate_and_save(i)

    # Final validation after the last update (i keeps its last loop value,
    # matching the original behavior).
    _validate_and_save(i)
    return
def CNN_run(args, ops, alphas_dict):
    """
    Based on the given model architecture,
    construct CNN and execute training.

    input:
        args: arguments set by user.
        ops: operations used in the network.
        alphas_dict: a dictionary containing architecture parameters
            (the "alpha_*" variables being searched over).
    """
    data_iterator = data_iterator_cifar10
    all_data = data_iterator(args.batch_size, True)
    # Split the 50k CIFAR-10 training images into search-train / search-val.
    tdata = all_data.slice(rng=None, slice_start=0, slice_end=25000)
    vdata = all_data.slice(rng=None, slice_start=25000, slice_end=50000)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124],
                            (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768],
                           (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Validation loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=100)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train = construct_networks(args, ops, image_train, test=False)
    loss_train = loss_function(pred_train, label_train)

    # prepare solvers for model parameters (everything except "alpha_*")
    model_params_dict = \
        {k: v for k, v in nn.get_parameters().items() if "alpha_" not in k}
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(
        {
            k: v for k, v in nn.get_parameters().items()
            if k in model_params_dict.keys()
        }, reset=False, retain_state=True)

    # prepare solvers for architecture parameters
    solver_archs = S.Adam(alpha=args.arch_lr, beta1=0.5, beta2=0.999)
    solver_archs.set_parameters(
        {
            k: v for k, v in nn.get_parameters().items()
            if k in alphas_dict.keys()
        }, reset=False, retain_state=True)

    # Training-loop
    for i in range(max_iter):

        # Update Model Parameters.

        if args.second_order:
            # store the weights before update (w), needed to evaluate the
            # finite-difference Hessian approximation later.
            original_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }
            # gradients refuge: accumulates the alpha-gradient terms.
            accumulated_gradient = \
                {k: nn.Variable(v.shape).apply(d=0)
                 for k, v in alphas_dict.items()}

        image, label = tdata.next()
        image = image / 255.0
        image = (image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_train.forward()

        e = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, e)

        if args.lr_control_model:
            new_lr = learning_rate_scheduler(i, max_iter,
                                             initial_model_lr, 0)
            solver_model.set_learning_rate(new_lr)

        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)

        if args.with_grad_clip_model:
            for k, v in model_params_dict.items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        solver_model.weight_decay(args.weight_decay_model)
        solver_model.update()  # weights update ( w -> w')

        if args.second_order:
            # snapshot the updated weights (w'), restored at the end of the
            # second-order computation below.
            updated_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

        # Update Architecture Parameters.

        ve, vloss = 0., 0.
        v_image, v_label = vdata.next()
        v_image = v_image / 255.0
        v_image = (v_image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = v_image
        input_image_train["label"].d = v_label

        # compute Loss_on_valid(w', alpha)
        loss_train.forward(clear_no_need_grad=True)

        ve = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_vloss.add(i, loss_train.d.copy())
        monitor_verr.add(i, ve)

        solver_archs.zero_grad()
        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)  # its gradient is stored

        if args.second_order:
            # Second-order DARTS update: approximate the gradient of the
            # validation loss w.r.t. alpha through the weight update using
            # finite differences at w+ and w-.
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict, coeff=1.)

            # grad_alpha_L_val(w', alpha). Note that gradient stored into .data
            delta_gradient_w = {
                k: nn.Variable(v.shape).apply(data=nn.NdArray(
                    v.shape).copy_from(v.grad), need_grad=True)
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

            epsilon = 0.01 / np.sum(
                [np.linalg.norm(v.d) for v in delta_gradient_w.values()])
            coeff = 1.0 * epsilon
            # w -> w+ (= w + epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            input_image_train["image"].d = image  # reuse the same data
            input_image_train["label"].d = label

            # compute Loss_on_train(w+, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient
            coeff = (-1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict, coeff)

            coeff = -1.0 * epsilon
            # w -> w- (= w - epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            # compute Loss_on_train(w-, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient again
            coeff = (+1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict, coeff)

            # replace the weights: alphas get the accumulated gradient,
            # model weights are restored to w'.
            for k, v in alphas_dict.items():
                nn.parameter.set_parameter(
                    k, nn.Variable(v.shape).apply(
                        data=v.data, grad=accumulated_gradient[k],
                        need_grad=True))
            for k, v in model_params_dict.items():
                nn.parameter.set_parameter(
                    k, nn.Variable(v.shape).apply(
                        data=updated_weights[k].data, need_grad=True))

        solver_archs.weight_decay(args.weight_decay_archs)
        solver_archs.update()

        if i % 1000 == 0:
            # Periodically print the current architecture distribution.
            for k, v in alphas_dict.items():
                keynames = k.split("_")
                print("\nParameters for {} cell, node {} to {};".format(
                    keynames[1], keynames[2], keynames[3]))
                show_ops_and_prob(v.d, ops)

    return alphas_dict
def main(args):
    """
    Semi-supervised training of `cnn_model_003` on CIFAR-10.

    The objective combines:
      * a supervised loss on labeled data: cross-entropy + entropy
        regularization (`er_loss`) + lambda-weighted sigma regularization;
      * an unsupervised stochastic-regularization (consistency) loss with
        uncertainty between two augmented views of unlabeled data, plus
        entropy and sigma regularizers for each view.
    Evaluates on the test set once per epoch and decays the learning rate
    by 10x at epochs 100 and 200.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide `device_id`, `batch_size`, `batch_size_eval`,
        `context`, and `lambda_` (all read below).
    """
    # Settings
    device_id = args.device_id
    batch_size = args.batch_size
    batch_size_eval = args.batch_size_eval
    n_l_train_data = 4000    # number of labeled training samples
    n_train_data = 50000     # total CIFAR-10 training samples
    n_cls = 10
    learning_rate = 1. * 1e-3
    n_epoch = 300
    # FIX: floor division so iter_epoch / n_iter are ints. With the original
    # true division these are floats on Python 3 and `range(n_iter)` raises
    # TypeError; the modulo test below would also be float arithmetic.
    iter_epoch = n_train_data // batch_size
    n_iter = n_epoch * iter_epoch
    extension_module = args.context
    lambda_ = args.lambda_

    # Model
    ## supervised
    batch_size, m, h, w = batch_size, 3, 32, 32
    ctx = extension_context(extension_module, device_id=device_id)
    x_l = nn.Variable((batch_size, m, h, w))
    y_l = nn.Variable((batch_size, 1))
    pred, log_var = cnn_model_003(ctx, x_l)
    one = F.constant(1., log_var.shape)
    loss_ce = ce_loss(ctx, pred, y_l)
    reg_sigma = sigma_regularization(ctx, log_var, one)
    loss_supervised = loss_ce + er_loss(ctx, pred) + lambda_ * reg_sigma

    ## stochastic regularization: two views of the same unlabeled batch
    x_u0 = nn.Variable((batch_size, m, h, w))
    x_u1 = nn.Variable((batch_size, m, h, w))
    pred_x_u0, log_var0 = cnn_model_003(ctx, x_u0)
    pred_x_u1, log_var1 = cnn_model_003(ctx, x_u1)
    loss_sr = sr_loss_with_uncertainty(ctx,
                                       pred_x_u0, pred_x_u1,
                                       log_var0, log_var1)
    reg_sigma0 = sigma_regularization(ctx, log_var0, one)
    reg_sigma1 = sigma_regularization(ctx, log_var1, one)
    reg_sigmas = sigmas_regularization(ctx, log_var0, log_var1)
    loss_unsupervised = loss_sr + er_loss(ctx, pred_x_u0) + er_loss(ctx, pred_x_u1) \
        + lambda_ * (reg_sigma0 + reg_sigma1) + lambda_ * reg_sigmas

    ## evaluate
    # NOTE(review): this overwrites batch_size_eval with batch_size, so
    # args.batch_size_eval is effectively ignored — looks unintended but is
    # kept as-is to preserve behavior; confirm against callers.
    batch_size_eval, m, h, w = batch_size, 3, 32, 32
    x_eval = nn.Variable((batch_size_eval, m, h, w))
    pred_eval, _ = cnn_model_003(ctx, x_eval, test=True)

    # Solver
    with nn.context_scope(ctx):
        solver = S.Momentum(learning_rate)
        solver.set_parameters(nn.get_parameters())

    # Dataset
    ## separate dataset into labeled/unlabeled portions
    home = os.environ.get("HOME")
    fpath = os.path.join(home, "datasets/cifar10/cifar-10.npz")
    separator = Separator(n_l_train_data)
    separator.separate_then_save(fpath)

    l_train_path = os.path.join(home, "datasets/cifar10/l_cifar-10.npz")
    u_train_path = os.path.join(home, "datasets/cifar10/cifar-10.npz")
    test_path = os.path.join(home, "datasets/cifar10/cifar-10.npz")

    # data reader
    data_reader = Cifar10DataReader(l_train_path, u_train_path, test_path,
                                    batch_size=batch_size,
                                    n_cls=n_cls,
                                    da=True,
                                    shape=True)

    # Training loop
    print("# Training loop")
    epoch = 1
    st = time.time()
    for i in range(n_iter):
        # Get data and set it to the variables.
        # The second labeled view (x_l1_data) is discarded here.
        x_l0_data, x_l1_data, y_l_data = data_reader.get_l_train_batch()
        x_u0_data, x_u1_data, y_u_data = data_reader.get_u_train_batch()
        x_l.d, _, y_l.d = x_l0_data, x_l1_data, y_l_data
        x_u0.d, x_u1.d = x_u0_data, x_u1_data

        # Train: forward both losses, then accumulate their gradients
        # before a single solver update.
        loss_supervised.forward(clear_no_need_grad=True)
        loss_unsupervised.forward(clear_no_need_grad=True)
        solver.zero_grad()
        loss_supervised.backward(clear_buffer=True)
        loss_unsupervised.backward(clear_buffer=True)
        solver.update()

        # Evaluate once per epoch.
        if (i + 1) % iter_epoch == 0:
            # Get data and set it to the variables
            x_data, y_data = data_reader.get_test_batch()

            # Evaluation loop over the test set in eval-batch chunks
            ve = 0.
            iter_val = 0
            for k in range(0, len(x_data), batch_size_eval):
                x_eval.d = get_test_data(x_data, k, batch_size_eval)
                label = get_test_data(y_data, k, batch_size_eval)
                pred_eval.forward(clear_buffer=True)
                ve += categorical_error(pred_eval.d, label)
                iter_val += 1
            msg = "Epoch:{},ElapsedTime:{},Acc:{:02f}".format(
                epoch,
                time.time() - st,
                (1. - ve / iter_val) * 100)
            print(msg)
            st = time.time()
            epoch += 1
            # Step-wise learning-rate decay at epochs 100 and 200.
            if epoch in [100, 200]:
                learning_rate /= 10.
                solver.set_learning_rate(learning_rate)
with_file_cache=False) x = nn.Variable((batch_size, sentence_length)) t = nn.Variable((batch_size, sentence_length, 1)) h = PF.embed(x, vocab_size, embedding_size) h = LSTM(h, hidden, return_sequences=True) h = TimeDistributed(PF.affine)(h, hidden, name='hidden') y = TimeDistributed(PF.affine)(h, vocab_size, name='output') mask = F.sum(F.sign(t), axis=2) # do not predict 'pad'. entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask count = F.sum(mask, axis=1) loss = F.mean(F.div2(F.sum(entropy, axis=1), count)) # Create solver. solver = S.Momentum(1e-2, momentum=0.9) solver.set_parameters(nn.get_parameters()) # Create monitor. from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed monitor = Monitor('./tmp-lstmlm') monitor_perplexity = MonitorSeries('perplexity', monitor, interval=1) monitor_perplexity_valid = MonitorSeries('perplexity_valid', monitor, interval=1) for epoch in range(max_epoch): train_loss_set = [] for i in tqdm(range(num_train_batch)): x_batch, y_batch = train_data_iter.next() y_batch = y_batch.reshape(list(y_batch.shape) + [1])
def train():
    """
    Main script: train a ResNet classifier on ImageNet or Tiny ImageNet.

    Builds training and validation graphs via `get_model`, then runs a
    momentum-SGD loop with gradient accumulation (`args.accum_grad` forward/
    backward passes per solver update), periodic validation, parameter
    snapshots, and step-wise learning-rate decay at scheduled iterations.
    All configuration comes from `get_args()`; nothing is returned — the
    result is the parameter files written to `args.model_save_path`.
    """

    args = get_args()

    # Get context (falls back to CPU when no context is given).
    from nnabla.ext_utils import get_extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = get_extension_context(
        extension_module, device_id=args.device_id, type_config=args.type_config)
    nn.set_default_context(ctx)

    if args.tiny_mode:
        # We use Tiny ImageNet from Stanford CS231N class.
        # (Tiny ImageNet, https://tiny-imagenet.herokuapp.com/)
        # Tiny ImageNet consists of 200 categories, each category has 500 images
        # in training set. The image size is 64x64. To adapt ResNet into 64x64
        # image inputs, the input image size of ResNet is set as 56x56, and
        # the stride in the first conv and the first max pooling are removed.
        # Please check README.
        data = data_iterator_tiny_imagenet(args.batch_size, 'train')
        vdata = data_iterator_tiny_imagenet(args.batch_size, 'val')
        num_classes = 200
    else:
        # We use ImageNet.
        # (ImageNet, https://imagenet.herokuapp.com/)
        # ImageNet consists of 1000 categories, each category has 1280 images
        # in training set. The image size is various. To adapt ResNet into
        # 320x320 image inputs, the input image size of ResNet is set as
        # 224x224. We need to get tar file and create cache file(320x320 images).
        # Please check README.
        data = data_iterator_imagenet(args.batch_size, args.train_cachefile_dir)
        vdata = data_iterator_imagenet(args.batch_size, args.val_cachefile_dir)
        num_classes = 1000

    # Training graph; `t_pred2` is an unlinked copy of the prediction so the
    # top-n error can be computed without extending the loss graph.
    t_model = get_model(args, num_classes, test=False, tiny=args.tiny_mode)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    t_pred2 = t_model.pred.unlinked()
    t_e = F.mean(F.top_n_error(t_pred2, t_model.label))

    # Validation graph (test=True), mirroring the training-side setup.
    v_model = get_model(args, num_classes, test=True, tiny=args.tiny_mode)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.unlinked()
    v_e = F.mean(F.top_n_error(v_pred2, v_model.label))

    # Create Solver.
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=10)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=10)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed("Validation time", monitor, interval=10)

    # Training loop.
    for i in range(args.max_iter):
        # Save parameters
        if i % args.model_save_interval == 0:
            nn.save_parameters(
                os.path.join(args.model_save_path, 'param_%06d.h5' % i))

        # Validation: average loss and top-n error over args.val_iter batches.
        if i % args.val_interval == 0 and i != 0:
            # Clear all intermediate memory to save memory.
            # t_model.loss.clear_recursive()
            l = 0.0
            e = 0.0
            for j in range(args.val_iter):
                images, labels = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                # Cast host-side data once so the device transfer is cheap.
                v_model.image.data.cast(np.uint8, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                l += v_model.loss.d
                e += v_e.d
            monitor_vloss.add(i, l / args.val_iter)
            monitor_verr.add(i, e / args.val_iter)
            monitor_vtime.add(i)
            # Clear all intermediate memory to save memory.
            # v_model.loss.clear_recursive()

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        # Adds the loss/error of the *previous* forward pass to the running
        # sums; reading `.d` forces a host sync, so calls are deliberately
        # placed where that sync is cheapest (see comments below).
        def accumulate_error(l, e, t_model, t_e):
            l += t_model.loss.d
            e += t_e.d
            return l, e

        # Gradient accumulation loop: args.accum_grad backward passes
        # accumulate into the same gradient buffers before one update.
        for j in range(args.accum_grad):
            images, labels = data.next()
            if j != 0:
                # Update e and l according to previous results of forward
                # propagation.
                # The update of last iteration is performed
                # after solver update to avoid unnecessary CUDA synchronization.
                # This is performed after data.next() in order to overlap
                # the data loading and graph execution.
                # TODO: Move this to the bottom of the loop when prefetch
                # data loader is available.
                l, e = accumulate_error(l, e, t_model, t_e)
            t_model.image.d = images
            t_model.label.d = labels
            t_model.image.data.cast(np.uint8, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Accumulate errors after solver update
        l, e = accumulate_error(l, e, t_model, t_e)

        monitor_loss.add(i, l / args.accum_grad)
        monitor_err.add(i, e / args.accum_grad)
        monitor_time.add(i)

        # Learning rate decay at scheduled iter (multiplicative 0.1 step).
        if i in args.learning_rate_decay_at:
            solver.set_learning_rate(solver.learning_rate() * 0.1)
    nn.save_parameters(
        os.path.join(args.model_save_path, 'param_%06d.h5' % args.max_iter))