def cifar10_iterator(config, comm, train=True):
    """Create the CIFAR-10 data iterator configured by ``config``.

    Args:
        config: Nested mapping. The keys read here are
            ``config['train']['batch_size']``, ``config['model']['rng']``
            (used as the seed of a ``numpy.random.RandomState``) and
            ``config['dataset']['with_memory_cache']`` /
            ``config['dataset']['with_file_cache']``.
        comm: Object exposing ``n_procs`` and ``rank``.
        train: Forwarded unchanged to ``data_iterator_cifar10``.

    Returns:
        Element ``[1]`` of whatever ``data_iterator_cifar10`` returns,
        sliced per process when ``comm.n_procs > 1``.
    """
    batch_size = config['train']['batch_size']
    seeded_rng = np.random.RandomState(config['model']['rng'])
    dataset_cfg = config['dataset']

    created = data_iterator_cifar10(
        batch_size=batch_size,
        train=train,
        rng=seeded_rng,
        with_memory_cache=dataset_cfg['with_memory_cache'],
        with_file_cache=dataset_cfg['with_file_cache'])
    # NOTE(review): the factory result is indexed with [1]; presumably it
    # returns a sequence whose second item is the iterator -- confirm.
    iterator = created[1]

    # Single-process run: every sample belongs to this process.
    if comm.n_procs <= 1:
        return iterator

    # Multi-process run: keep only the slice addressed by this rank.
    return iterator.slice(rng=None,
                          num_of_slices=comm.n_procs,
                          slice_pos=comm.rank)
def train(args):
    """Train a WGAN-GP generator and critic, logging losses and image tiles.

    Builds the generator loss, the critic loss with a gradient penalty and
    a fixed-noise test generator graph, then alternates ``args.n_critic``
    critic updates with one generator update for ``args.max_iter``
    iterations. Parameters and image tiles are written under
    ``args.monitor_path`` every ``args.save_interval`` iterations and once
    more after the loop.

    Args:
        args: Parsed options. The attributes read here are ``context``,
            ``device_id``, ``type_config``, ``latent``, ``maps``,
            ``batch_size``, ``image_size``, ``lambda_``, ``up``, ``lrg``,
            ``lrd``, ``beta1``, ``beta2``, ``monitor_path``, ``max_iter``,
            ``n_critic`` and ``save_interval``.
    """
    # Context
    ctx = get_extension_context(args.context,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    # Args
    latent = args.latent
    maps = args.maps
    batch_size = args.batch_size
    image_size = args.image_size
    lambda_ = args.lambda_

    # Model
    # generator loss
    z = nn.Variable([batch_size, latent])
    x_fake = generator(z, maps=maps, up=args.up).apply(persistent=True)
    p_fake = discriminator(x_fake, maps=maps)
    loss_gen = gan_loss(p_fake).apply(persistent=True)
    # discriminator loss
    p_fake = discriminator(x_fake, maps=maps)
    x_real = nn.Variable([batch_size, 3, image_size, image_size])
    p_real = discriminator(x_real, maps=maps)
    loss_dis = gan_loss(p_fake, p_real).apply(persistent=True)
    # gradient penalty: push the norm of d(critic)/d(x_rmix) towards 1 on
    # random interpolations between real and generated samples
    eps = F.rand(shape=[batch_size, 1, 1, 1])
    x_rmix = eps * x_real + (1.0 - eps) * x_fake
    p_rmix = discriminator(x_rmix, maps=maps)
    x_rmix.need_grad = True  # Enabling gradient computation for double backward
    grads = nn.grad([p_rmix], [x_rmix])
    l2norms = [F.sum(g**2.0, [1, 2, 3])**0.5 for g in grads]
    gp = sum([F.mean((l - 1.0)**2.0) for l in l2norms])
    loss_dis += lambda_ * gp

    # generator with fixed value for test
    z_test = nn.Variable.from_numpy_array(np.random.randn(batch_size, latent))
    x_test = generator(z_test, maps=maps, test=True,
                       up=args.up).apply(persistent=True)

    # Solver
    solver_gen = S.Adam(args.lrg, args.beta1, args.beta2)
    solver_dis = S.Adam(args.lrd, args.beta1, args.beta2)

    with nn.parameter_scope("generator"):
        params_gen = nn.get_parameters()
        solver_gen.set_parameters(params_gen)
    with nn.parameter_scope("discriminator"):
        params_dis = nn.get_parameters()
        solver_dis.set_parameters(params_dis)

    # Monitor
    monitor = Monitor(args.monitor_path)
    monitor_loss_gen = MonitorSeries(
        "Generator Loss", monitor, interval=10)
    monitor_loss_cri = MonitorSeries(
        "Negative Critic Loss", monitor, interval=10)
    monitor_time = MonitorTimeElapsed(
        "Training Time", monitor, interval=10)
    monitor_image_tile_train = MonitorImageTile("Image Tile Train", monitor,
                                                num_images=batch_size,
                                                interval=1,
                                                normalize_method=denormalize)
    monitor_image_tile_test = MonitorImageTile("Image Tile Test", monitor,
                                               num_images=batch_size,
                                               interval=1,
                                               normalize_method=denormalize)

    # Data Iterator
    di = data_iterator_cifar10(batch_size, True)

    # Train loop
    for i in range(args.max_iter):
        # Train discriminator
        x_fake.need_grad = False  # no need backward to generator
        for _ in range(args.n_critic):
            solver_dis.zero_grad()
            # presumably maps 8-bit pixel values into [-1, 1] -- confirm
            # against the data iterator's output range
            x_real.d = di.next()[0] / 127.5 - 1.0
            z.d = np.random.randn(batch_size, latent)
            loss_dis.forward(clear_no_need_grad=True)
            loss_dis.backward(clear_buffer=True)
            solver_dis.update()

        # Train generator
        x_fake.need_grad = True  # need backward to generator
        solver_gen.zero_grad()
        z.d = np.random.randn(batch_size, latent)
        loss_gen.forward(clear_no_need_grad=True)
        loss_gen.backward(clear_buffer=True)
        solver_gen.update()

        # Monitor
        monitor_loss_gen.add(i, loss_gen.d)
        monitor_loss_cri.add(i, -loss_dis.d)
        monitor_time.add(i)

        # Save
        if i % args.save_interval == 0:
            # x_test belongs to neither training graph, so it must be
            # evaluated explicitly here; without this forward the monitor
            # would be handed a buffer that was never computed.
            x_test.forward(clear_buffer=True)
            monitor_image_tile_train.add(i, x_fake)
            monitor_image_tile_test.add(i, x_test)
            nn.save_parameters(
                os.path.join(args.monitor_path, "params_{}.h5".format(i)))

    # Last
    x_test.forward(clear_buffer=True)
    nn.save_parameters(
        os.path.join(args.monitor_path, "params_{}.h5".format(i)))
    monitor_image_tile_train.add(i, x_fake)
    monitor_image_tile_test.add(i, x_test)
def train():
    """
    Naive Multi-Device Training (one process per device)

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * Inplace allreduce (THIS IS THE MAIN difference from a single device training)
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error

    The learning rate is warmed up linearly from ``args.learning_rate`` to
    ``args.learning_rate * n_devices`` over the first ``warmup_iter``
    iterations and then held at the latter value. Validation, parameter
    saving and monitoring are performed by rank 0 only.
    """
    # Parse args
    args = get_args()
    n_train_samples = 50000
    bs_valid = args.batch_size

    # Communicator and Context
    extension_module = "cuda.cudnn"
    ctx = extension_context(extension_module)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    device_id = mpi_rank
    ctx = extension_context(extension_module, device_id=device_id)

    # Create training graphs
    test = False
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    pred_train = cifar10_resnet23_prediction(image_train, ctx, test)
    loss_train = cifar10_resnet32_loss(pred_train, label_train)
    input_image_train = {"image": image_train, "label": label_train}

    # add parameters to communicator
    comm.add_context_and_parameters((ctx, nn.get_parameters()))

    # Create validation graph
    test = True
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    pred_valid = cifar10_resnet23_prediction(image_valid, ctx, test)
    input_image_valid = {"image": image_valid}

    # Solvers
    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())
    base_lr = args.learning_rate
    target_lr = base_lr * n_devices
    warmup_iter = int(
        1. * n_train_samples / args.batch_size / n_devices) * args.warmup_epoch
    # Per-iteration increment that takes the rate from base_lr to target_lr
    # in exactly warmup_iter steps. Left at zero (and never used) when
    # warm-up is disabled, which also avoids dividing by zero.
    warmup_slope = 0.
    if warmup_iter > 0:
        warmup_slope = (target_lr - base_lr) / warmup_iter

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=10)

    # Loop-invariant intervals, expressed in per-process iterations.
    valid_interval = int(n_train_samples / args.batch_size / n_devices)
    save_interval = int(args.model_save_interval / n_devices)

    with data_iterator_cifar10(args.batch_size, True) as tdata, \
            data_iterator_cifar10(bs_valid, False) as vdata:
        # Training-loop
        for i in range(int(args.max_iter / n_devices)):
            # Validation
            if mpi_rank == 0:
                if i % valid_interval == 0:
                    ve = 0.
                    for _ in range(args.val_iter):
                        image, label = vdata.next()
                        input_image_valid["image"].d = image
                        pred_valid.forward()
                        ve += categorical_error(pred_valid.d, label)
                    ve /= args.val_iter
                    monitor_verr.add(i * n_devices, ve)
                if i % save_interval == 0:
                    nn.save_parameters(
                        os.path.join(args.model_save_path,
                                     'params_%06d.h5' % i))

            # Forward/Zerograd/Backward
            image, label = tdata.next()
            input_image_train["image"].d = image
            input_image_train["label"].d = label
            loss_train.forward()
            solver.zero_grad()
            loss_train.backward()

            # In-place Allreduce
            comm.allreduce(division=True)

            # Linear Warmup: set the rate before the update so that it is
            # the rate actually used by this iteration.
            if i < warmup_iter:
                lr = base_lr + warmup_slope * i
            else:
                lr = target_lr
            solver.set_learning_rate(lr)

            # Solvers update
            solver.update()

            if mpi_rank == 0:
                e = categorical_error(pred_train.d,
                                      input_image_train["label"].d)
                monitor_loss.add(i * n_devices, loss_train.d.copy())
                monitor_err.add(i * n_devices, e)
                monitor_time.add(i * n_devices)

    if mpi_rank == 0:
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         'params_%06d.h5' % (args.max_iter / n_devices)))
def train():
    """
    Naive Multi-Device Training (single process, one worker thread per device)

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct computation graphs for training and one for validation.
    * Initialize solvers and set parameter variables to those.
    * Instantiate a communicator and set parameter variables.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprops
      * Set parameter gradients zero
      * Execute backprop.
      * Allreduce of the gradients (THIS IS THE MAIN difference from a single device training)
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error

    Raises:
        Exception: if ``args.context`` is neither ``cuda`` nor ``cuda.cudnn``.
    """
    # Parse args
    args = get_args()
    n_train_samples = 50000
    bs_valid = args.batch_size

    # Create contexts
    extension_module = args.context
    if extension_module != "cuda" and \
            extension_module != "cuda.cudnn":
        raise Exception("Use `cuda` or `cuda.cudnn` extension_module.")
    n_devices = args.n_devices
    ctxs = []
    for i in range(n_devices):
        ctx = extension_context(extension_module, device_id=i)
        ctxs.append(ctx)
    ctx = ctxs[-1]

    # Create training graphs: one replica per device, each under its own
    # parameter scope name ("device0", "device1", ...).
    input_image_train = []
    preds_train = []
    losses_train = []
    test = False
    for i in range(n_devices):
        image = nn.Variable((args.batch_size, 3, 32, 32))
        label = nn.Variable((args.batch_size, 1))
        device_scope_name = "device{}".format(i)

        pred = cifar10_resnet23_prediction(
            image, ctxs[i], device_scope_name, test)
        loss = cifar10_resnet32_loss(pred, label)

        input_image_train.append({"image": image, "label": label})
        preds_train.append(pred)
        losses_train.append(loss)

    # Create validation graph.
    # It uses the "device0" scope, so it is built on device 0's context as
    # well (previously the leaked loop index selected the last device).
    test = True
    device_scope_name = "device{}".format(0)
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    pred_valid = cifar10_resnet23_prediction(
        image_valid, ctxs[0], device_scope_name, test)
    input_image_valid = {"image": image_valid}

    # Solvers
    solvers = []
    for i in range(n_devices):
        with nn.context_scope(ctxs[i]):
            solver = S.Adam()
            device_scope_name = "device{}".format(i)
            with nn.parameter_scope(device_scope_name):
                params = nn.get_parameters()
                solver.set_parameters(params)
            solvers.append(solver)

    # Communicator
    comm = C.DataParalellCommunicator(ctx)
    for i in range(n_devices):
        device_scope_name = "device{}".format(i)
        with nn.parameter_scope(device_scope_name):
            ctx = ctxs[i]
            params = nn.get_parameters()
            comm.add_context_and_parameters((ctx, params))
    comm.init()

    # Create threadpools with one thread
    pools = []
    for _ in range(n_devices):
        pool = ThreadPool(processes=1)
        pools.append(pool)

    # Once forward/backward to safely secure memory
    for device_id in range(n_devices):
        data, label = \
            (np.random.randn(*input_image_train[device_id]["image"].shape),
             (np.random.rand(*input_image_train[device_id]["label"].shape) * 10).astype(np.int32))

        ret = pools[device_id].apply_async(
            forward_backward,
            (input_image_train[device_id]["image"], data,
             input_image_train[device_id]["label"], label,
             losses_train[device_id], solvers[device_id]))
        ret.get()
        losses_train[device_id].d  # sync to host

    # Create monitor.
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=10)

    # Data Iterator
    # device_id still holds the last value of the warm-up loop above
    # (n_devices - 1); it is only used as the RNG seed here.
    rng = np.random.RandomState(device_id)
    tdata = data_iterator_cifar10(args.batch_size, True, rng)
    vdata = data_iterator_cifar10(args.batch_size, False)

    # Training-loop
    for i in range(int(args.max_iter / n_devices)):
        # Validation
        if i % int(n_train_samples / args.batch_size / n_devices) == 0:
            ve = 0.
            for _ in range(args.val_iter):
                image, label = vdata.next()
                input_image_valid["image"].d = image
                pred_valid.forward()
                ve += categorical_error(pred_valid.d, label)
            ve /= args.val_iter
            monitor_verr.add(i * n_devices, ve)
        if i % int(args.model_save_interval / n_devices) == 0:
            nn.save_parameters(
                os.path.join(args.model_save_path, 'params_%06d.h5' % i))

        # Forwards/Zerograd/Backwards: dispatch one minibatch per device,
        # then wait for every worker before touching the gradients.
        fb_results = []
        for device_id in range(n_devices):
            image, label = tdata.next()

            res = pools[device_id].apply_async(
                forward_backward,
                (input_image_train[device_id]["image"], image,
                 input_image_train[device_id]["label"], label,
                 losses_train[device_id], solvers[device_id]))
            fb_results.append(res)
        for device_id in range(n_devices):
            fb_results[device_id].get()

        # Allreduce of the gradients (called with inplace=False)
        comm.allreduce(division=True, inplace=False)

        # Solvers update
        for device_id in range(n_devices):
            solvers[device_id].update()

        # Training statistics are taken from the last device's replica.
        e = categorical_error(preds_train[-1].d,
                              input_image_train[-1]["label"].d)
        monitor_loss.add(i * n_devices, losses_train[-1].d.copy())
        monitor_err.add(i * n_devices, e)
        monitor_time.add(i * n_devices)

    nn.save_parameters(
        os.path.join(args.model_save_path,
                     'params_%06d.h5' % (args.max_iter / n_devices)))
def train():
    """Train a CIFAR-10 classifier selected by ``args.net`` on one device.

    Steps:

    * Parse command line arguments and set the default context
      (``cpu`` when ``args.context`` is ``None``).
    * Pick the prediction network from ``args.net``.
    * Build a training graph (softmax cross entropy loss) and a test graph.
    * Train with Adam plus weight decay for ``args.max_iter`` iterations.
    * Every ``args.val_interval`` iterations compute the validation error;
      parameters are saved whenever it improves on the best value so far.
    * After the loop, validate once more and save the final parameters.

    Raises:
        ValueError: if ``args.net`` is not one of the supported networks.
    """
    args = get_args()

    # Get context.
    from nnabla.contrib.context import extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Create CNN network for both training and testing.
    if args.net == "resnet23":
        model_prediction = cifar10_resnet23_prediction
    elif args.net == 'bincon_resnet23':
        model_prediction = cifar10_binary_connect_resnet23_prediction
    elif args.net == 'binnet_resnet23':
        model_prediction = cifar10_binary_net_resnet23_prediction
    elif args.net == 'bwn_resnet23':
        model_prediction = cifar10_binary_weight_resnet23_prediction
    elif args.net == 'fpcon_resnet23':
        model_prediction = cifar10_fp_connect_resnet23_prediction
    elif args.net == 'fpnet_resnet23':
        model_prediction = cifar10_fp_net_resnet23_prediction
    elif args.net == 'pow2con_resnet23':
        model_prediction = cifar10_pow2_connect_resnet23_prediction
    elif args.net == 'pow2net_resnet23':
        model_prediction = cifar10_pow2_net_resnet23_prediction
    else:
        # Fail early with a clear message instead of an unbound-local
        # error when the graph is built further down.
        raise ValueError("Unknown network: {!r}".format(args.net))

    # TRAIN
    maps = 64
    c = 3
    h = w = 32
    n_valid = 10000

    # Create input variables.
    image = nn.Variable([args.batch_size, c, h, w])
    label = nn.Variable([args.batch_size, 1])
    # Create model_prediction graph.
    pred = model_prediction(image, maps=maps, test=False)
    pred.persistent = True
    # Create loss function.
    loss = F.mean(F.softmax_cross_entropy(pred, label))

    # TEST
    # Create input variables.
    vimage = nn.Variable([args.batch_size, c, h, w])
    vlabel = nn.Variable([args.batch_size, 1])
    # Create prediction graph.
    vpred = model_prediction(vimage, maps=maps, test=True)

    # Create Solver.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=10)

    # Initialize DataIterator
    best_ve = 1.0
    ve = 1.0
    tdata = data_iterator_cifar10(args.batch_size, True)
    vdata = data_iterator_cifar10(args.batch_size, False)

    n_valid_batches = int(n_valid / args.batch_size)

    def _validation_error():
        # Mean categorical error over n_valid_batches validation batches.
        total = 0.0
        for _ in range(n_valid_batches):
            vimage.d, vlabel.d = vdata.next()
            vpred.forward(clear_buffer=True)
            total += categorical_error(vpred.d, vlabel.d)
        return total / n_valid_batches

    # Training loop
    for i in range(args.max_iter):
        if i % args.val_interval == 0:
            # Validation
            ve = _validation_error()
            monitor_verr.add(i, ve)
        if ve < best_ve:
            # Keep a snapshot whenever the validation error improves.
            nn.save_parameters(os.path.join(
                args.model_save_path, 'params_%06d.h5' % i))
            best_ve = ve

        # Training forward
        image.d, label.d = tdata.next()
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.weight_decay(args.weight_decay)
        solver.update()
        e = categorical_error(pred.d, label.d)
        monitor_loss.add(i, loss.d.copy())
        monitor_err.add(i, e)
        monitor_time.add(i)

    # Final validation and snapshot.
    ve = _validation_error()
    monitor_verr.add(i, ve)

    parameter_file = os.path.join(
        args.model_save_path, 'params_{:06}.h5'.format(args.max_iter))
    nn.save_parameters(parameter_file)
def train():
    """
    Naive Multi-Device Training (one process per device)

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * Inplace allreduce (THIS IS THE MAIN difference from a single device training)
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error

    The learning rate is warmed up linearly from ``args.learning_rate`` to
    ``args.learning_rate * n_devices`` over the first ``warmup_iter``
    iterations and then held at the latter value. Validation, parameter
    saving and monitoring are performed by rank 0 only.
    """
    # Parse args
    args = get_args()
    n_train_samples = 50000
    bs_valid = args.batch_size

    # Communicator and Context
    extension_module = "cuda.cudnn"
    ctx = extension_context(extension_module)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    device_id = mpi_rank
    ctx = extension_context(extension_module, device_id=device_id)

    # Create training graphs
    test = False
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    pred_train = cifar10_resnet23_prediction(
        image_train, ctx, test)
    loss_train = cifar10_resnet32_loss(pred_train, label_train)
    input_image_train = {"image": image_train, "label": label_train}

    # add parameters to communicator
    comm.add_context_and_parameters((ctx, nn.get_parameters()))

    # Create validation graph
    test = True
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    pred_valid = cifar10_resnet23_prediction(
        image_valid, ctx, test)
    input_image_valid = {"image": image_valid}

    # Solvers
    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())
    base_lr = args.learning_rate
    final_lr = base_lr * n_devices
    warmup_iter = int(1. * n_train_samples /
                      args.batch_size / n_devices) * args.warmup_epoch

    def _learning_rate(iteration):
        # Linear ramp base_lr -> final_lr over warmup_iter iterations, then
        # constant. The division is only reached while iteration is below
        # warmup_iter, so a disabled warm-up (warmup_iter == 0) is safe.
        if iteration < warmup_iter:
            return base_lr + (final_lr - base_lr) * iteration / warmup_iter
        return final_lr

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=10)

    with data_iterator_cifar10(args.batch_size, True) as tdata, \
            data_iterator_cifar10(bs_valid, False) as vdata:
        # Training-loop
        for i in range(int(args.max_iter / n_devices)):
            # Validation
            if mpi_rank == 0:
                if i % int(n_train_samples / args.batch_size / n_devices) == 0:
                    ve = 0.
                    for _ in range(args.val_iter):
                        image, label = vdata.next()
                        input_image_valid["image"].d = image
                        pred_valid.forward()
                        ve += categorical_error(pred_valid.d, label)
                    ve /= args.val_iter
                    monitor_verr.add(i * n_devices, ve)
                if i % int(args.model_save_interval / n_devices) == 0:
                    nn.save_parameters(os.path.join(
                        args.model_save_path, 'params_%06d.h5' % i))

            # Forward/Zerograd/Backward
            image, label = tdata.next()
            input_image_train["image"].d = image
            input_image_train["label"].d = label
            loss_train.forward()
            solver.zero_grad()
            loss_train.backward()

            # In-place Allreduce
            comm.allreduce(division=True)

            # Linear Warmup: applied before the update so the scheduled
            # rate is the one this iteration actually uses.
            solver.set_learning_rate(_learning_rate(i))

            # Solvers update
            solver.update()

            if mpi_rank == 0:
                e = categorical_error(
                    pred_train.d, input_image_train["label"].d)
                monitor_loss.add(i * n_devices, loss_train.d.copy())
                monitor_err.add(i * n_devices, e)
                monitor_time.add(i * n_devices)

    if mpi_rank == 0:
        nn.save_parameters(os.path.join(
            args.model_save_path,
            'params_%06d.h5' % (args.max_iter / n_devices)))
def train():
    """
    Naive Multi-Device Training (single process, one worker thread per device)

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct computation graphs for training and one for validation.
    * Initialize solvers and set parameter variables to those.
    * Instantiate a communicator and set parameter variables.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprops
      * Set parameter gradients zero
      * Execute backprop.
      * Inplace allreduce (THIS IS THE MAIN difference from a single device training)
      * Solver updates parameters by using gradients computed by backprop.
      * Compute training error

    Raises:
        Exception: if ``args.context`` is neither ``cuda`` nor ``cuda.cudnn``.
    """
    # Parse args
    args = get_args()
    n_train_samples = 50000
    bs_valid = args.batch_size

    # Create contexts
    extension_module = args.context
    if extension_module != "cuda" and \
            extension_module != "cuda.cudnn":
        raise Exception("Use `cuda` or `cuda.cudnn` extension_module.")
    n_devices = args.n_devices
    ctxs = []
    for i in range(n_devices):
        ctx = extension_context(extension_module, device_id=i)
        ctxs.append(ctx)
    ctx = ctxs[-1]

    # Create training graphs: one replica per device, each registered
    # under its own parameter scope name ("device0", "device1", ...).
    input_image_train = []
    preds_train = []
    losses_train = []
    test = False
    for i in range(n_devices):
        image = nn.Variable((args.batch_size, 3, 32, 32))
        label = nn.Variable((args.batch_size, 1))
        device_scope_name = "device{}".format(i)

        pred = cifar10_resnet23_prediction(
            image, ctxs[i], device_scope_name, test)
        loss = cifar10_resnet32_loss(pred, label)

        input_image_train.append({"image": image, "label": label})
        preds_train.append(pred)
        losses_train.append(loss)

    # Create validation graph.
    # The graph reuses the "device0" scope, so it is placed on device 0's
    # context too (the stale loop index used to select the last device).
    test = True
    valid_device = 0
    device_scope_name = "device{}".format(valid_device)
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    pred_valid = cifar10_resnet23_prediction(
        image_valid, ctxs[valid_device], device_scope_name, test)
    input_image_valid = {"image": image_valid}

    # Solvers
    solvers = []
    for i in range(n_devices):
        with nn.context_scope(ctxs[i]):
            solver = S.Adam()
            device_scope_name = "device{}".format(i)
            with nn.parameter_scope(device_scope_name):
                params = nn.get_parameters()
                solver.set_parameters(params)
            solvers.append(solver)

    # Communicator
    comm = C.DataParalellCommunicator(ctx)
    for i in range(n_devices):
        device_scope_name = "device{}".format(i)
        with nn.parameter_scope(device_scope_name):
            ctx = ctxs[i]
            params = nn.get_parameters()
            comm.add_context_and_parameters((ctx, params))
    comm.init()

    # Create threadpools with one thread
    pools = []
    for _ in range(n_devices):
        pool = ThreadPool(processes=1)
        pools.append(pool)

    # Once forward/backward to safely secure memory
    for device_id in range(n_devices):
        data, label = \
            (np.random.randn(*input_image_train[device_id]["image"].shape),
             (np.random.rand(*input_image_train[device_id]["label"].shape) * 10).astype(np.int32))

        ret = pools[device_id].apply_async(
            forward_backward,
            (input_image_train[device_id]["image"], data,
             input_image_train[device_id]["label"], label,
             losses_train[device_id], solvers[device_id]))
        ret.get()
        losses_train[device_id].d  # sync to host

    # Create monitor.
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=10)

    with data_iterator_cifar10(args.batch_size, True) as tdata, \
            data_iterator_cifar10(bs_valid, False) as vdata:
        # Training-loop
        for i in range(int(args.max_iter / n_devices)):
            # Validation
            if i % int(n_train_samples / args.batch_size / n_devices) == 0:
                ve = 0.
                for _ in range(args.val_iter):
                    image, label = vdata.next()
                    input_image_valid["image"].d = image
                    pred_valid.forward()
                    ve += categorical_error(pred_valid.d, label)
                ve /= args.val_iter
                monitor_verr.add(i * n_devices, ve)
            if i % int(args.model_save_interval / n_devices) == 0:
                nn.save_parameters(os.path.join(
                    args.model_save_path, 'params_%06d.h5' % i))

            # Forwards/Zerograd/Backwards: dispatch one minibatch per
            # device, then wait for all workers before the allreduce.
            fb_results = []
            for device_id in range(n_devices):
                image, label = tdata.next()

                res = pools[device_id].apply_async(
                    forward_backward,
                    (input_image_train[device_id]["image"], image,
                     input_image_train[device_id]["label"], label,
                     losses_train[device_id], solvers[device_id]))
                fb_results.append(res)
            for device_id in range(n_devices):
                fb_results[device_id].get()

            # In-place Allreduce
            comm.allreduce(division=True)

            # Solvers update
            for device_id in range(n_devices):
                solvers[device_id].update()

            # Training statistics are taken from the last device's replica.
            e = categorical_error(
                preds_train[-1].d, input_image_train[-1]["label"].d)
            monitor_loss.add(i * n_devices, losses_train[-1].d.copy())
            monitor_err.add(i * n_devices, e)
            monitor_time.add(i * n_devices)

    nn.save_parameters(os.path.join(
        args.model_save_path,
        'params_%06d.h5' % (args.max_iter / n_devices)))