def test_update_with_gpu(self):
    """Gradients must be all-reduced across ranks and each (mocked)
    update rule invoked exactly once per optimization step."""
    self.setup_gpu()
    self.optimizer = chainermn.create_multi_node_optimizer(
        self.actual_optimizer, self.comm)
    self.optimizer.setup(self.target)
    # No gradients set yet: update() must not advance the step counter.
    self.optimizer.update()
    self.assertEqual(self.actual_optimizer.t, 0)
    # Fill each parameter's gradient with a rank-dependent constant.
    self.optimizer.target.a.W.grad[:] = self.comm.rank
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 1
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 2
    self.optimizer.update()
    self.assertEqual(self.actual_optimizer.t, 1)
    # Each parameter's (mocked) update rule is called exactly once.
    self.optimizer.target.a.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.a.W)
    self.optimizer.target.b.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.b.W)
    self.optimizer.target.c.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.c.W)
    # After the all-reduce the gradient equals the mean over ranks
    # 0..size-1, i.e. (size - 1) / 2, plus the per-parameter offset.
    base = (self.comm.size - 1.0) / 2
    chainer.testing.assert_allclose(self.optimizer.target.a.W.grad,
                                    (base + 0) * np.ones((3, 2)))
    chainer.testing.assert_allclose(self.optimizer.target.b.W.grad,
                                    (base + 1) * np.ones((4, 3)))
    chainer.testing.assert_allclose(self.optimizer.target.c.W.grad,
                                    (base + 2) * np.ones((5, 4)))
def __init__(
        self,
        model_parameters,
        # Learning rate at training step s with annealing
        initial_lr=1e-4,
        final_lr=1e-5,
        annealing_steps=1600000,
        # Learning rate as used by the Adam algorithm
        beta_1=0.9,
        beta_2=0.99,
        # Adam regularisation parameter
        eps=1e-8,
        initial_training_step=0,
        communicator=None):
    """Adam optimizer with an annealed learning rate, optionally wrapped
    for multi-node training with ChainerMN.

    Args:
        model_parameters: chainer.Link whose parameters are optimized.
        initial_lr: learning rate at step 0.
        final_lr: learning rate after `annealing_steps` steps.
        annealing_steps: number of steps over which the rate anneals.
        beta_1, beta_2, eps: Adam hyperparameters.
        initial_training_step: step used to seed the annealed rate
            (useful when resuming training).
        communicator: optional ChainerMN communicator; when given, the
            optimizer is wrapped by a multi-node optimizer.
    """
    self.initial_lr = initial_lr
    self.final_lr = final_lr
    self.annealing_steps = annealing_steps
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.eps = eps
    # compute_lr_at_step is defined elsewhere on this class; presumably it
    # interpolates initial_lr -> final_lr over annealing_steps — verify.
    lr = self.compute_lr_at_step(initial_training_step)
    self.optimizer = optimizers.Adam(lr, beta1=beta_1, beta2=beta_2, eps=eps)
    self.optimizer.setup(model_parameters)
    # Always defined, so callers can test `multi_node_optimizer is None`.
    self.multi_node_optimizer = None
    if communicator:
        self.multi_node_optimizer = chainermn.create_multi_node_optimizer(
            self.optimizer, communicator)
def __init__(
        self,
        model_parameters,
        # Learning rate at training step s with annealing
        mu_i=5.0 * 1e-4,
        mu_f=5.0 * 1e-5,
        n=1.6 * 1e6,
        # Learning rate as used by the Adam algorithm
        beta_1=0.9,
        beta_2=0.99,
        # Adam regularisation parameter
        eps=1e-8,
        communicator=None):
    """Adam optimizer with an annealed learning rate (paper-style naming:
    mu_i initial rate, mu_f final rate, n annealing steps), optionally
    wrapped for multi-node training with ChainerMN.

    Args:
        model_parameters: chainer.Link whose parameters are optimized.
        mu_i: learning rate at step 0.
        mu_f: learning rate after `n` steps.
        n: number of annealing steps.
        beta_1, beta_2, eps: Adam hyperparameters.
        communicator: optional ChainerMN communicator.
    """
    self.mu_i = mu_i
    self.mu_f = mu_f
    self.n = n
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.eps = eps
    # mu_s is defined elsewhere on this class; presumably the annealed
    # learning-rate schedule evaluated at step s — verify.
    lr = self.mu_s(0)
    self.optimizer = optimizers.Adam(lr, beta1=beta_1, beta2=beta_2, eps=eps)
    self.optimizer.setup(model_parameters)
    # Always defined, so callers can test `multi_node_optimizer is None`.
    self.multi_node_optimizer = None
    if communicator:
        self.multi_node_optimizer = chainermn.create_multi_node_optimizer(
            self.optimizer, communicator)
def __init__(
        self,
        model_parameters,
        # Learning rate at training step s with annealing
        lr_i=1.0 * 1e-4,
        lr_f=1.0 * 1e-5,
        n=10000,
        # Learning rate as used by the Adam algorithm
        beta_1=0.9,
        beta_2=0.99,
        # Adam regularisation parameter
        eps=1e-8,
        communicator=None):
    """Eve optimizer with an annealed learning rate, optionally wrapped
    for multi-node training with ChainerMN.

    Args:
        model_parameters: chainer.Link whose parameters are optimized.
        lr_i: learning rate at step 0 (passed to the annealing base class).
        lr_f: learning rate after `n` steps.
        n: number of annealing steps.
        beta_1, beta_2, eps: Adam-style hyperparameters forwarded to Eve.
        communicator: optional ChainerMN communicator.
    """
    super().__init__(lr_i, lr_f, n)
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.eps = eps
    # mu_s comes from the annealing base class: schedule at step 0.
    lr = self.mu_s(0)
    self.optimizer = Eve(lr, beta1=beta_1, beta2=beta_2, eps=eps)
    self.optimizer.setup(model_parameters)
    # BUGFIX: always define the attribute (the sibling wrappers in this
    # file do), so callers can safely test `multi_node_optimizer is None`
    # when no communicator is supplied instead of hitting AttributeError.
    self.multi_node_optimizer = None
    if communicator:
        self.multi_node_optimizer = chainermn.create_multi_node_optimizer(
            self.optimizer, communicator)
def setup_optimizer(self, alpha=0.0005):
    """Create an Adam optimizer for this object and attach it.

    When a ChainerMN communicator is configured (`self.comm`), the Adam
    optimizer is wrapped by a multi-node optimizer before setup.
    """
    adam = optimizers.Adam(alpha)
    if self.comm is not None:
        adam = chainermn.create_multi_node_optimizer(adam, self.comm)
    self.optimizer = adam
    self.optimizer.setup(self)
def setup_mnist_trainer(self, display_log=False):
    """Build a ChainerMN MNIST training pipeline sharded over
    self.communicator and return (updater, optimizer, train_iter,
    test_iter, model).

    NOTE: `display_log` is not used inside this method.
    """
    batchsize = 100
    n_units = 100
    comm = self.communicator
    model = L.Classifier(MLP(n_units, 10))
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)
    # Only rank 0 loads MNIST; the shards are scattered to all ranks below.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)
    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False, shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer
    )
    return updater, optimizer, train_iter, test_iter, model
def main(args, model, x, t, valid_rate=0.2):
    """Run distributed training of `model` on dataset (x, t) with ChainerMN.

    Args:
        args: parsed CLI options (communicator, batchsize, epoch, out).
        model: chainer.Link to train; moved to the local GPU here.
        x, t: input and target arrays; split into train/valid on rank 0.
        valid_rate: fraction of the data held out for validation.
    """
    print('Start a training script using multiple nodes.')
    comm = chainermn.create_communicator(args.communicator)
    # One GPU per process, selected by the intra-node rank.
    device = comm.intra_rank
    assert device >= 0, 'invalid device ID: {}'.format(device)
    # Print the run configuration once, from the MPI root only.
    if comm.mpi_comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')
    # Rank 0 builds the datasets; other ranks receive their shard below.
    if comm.rank == 0:
        threshold = int(len(t) * (1 - valid_rate))
        train = datasets.tuple_dataset.TupleDataset(x[0:threshold],
                                                    t[0:threshold])
        valid = datasets.tuple_dataset.TupleDataset(x[threshold:],
                                                    t[threshold:])
        datasize = len(train) * args.epoch
    else:
        train, valid = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    valid = chainermn.scatter_dataset(valid, comm, shuffle=True)
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    valid_iter = chainer.iterators.SerialIterator(valid, args.batchsize,
                                                  repeat=False, shuffle=False)
    if device >= 0:
        cuda.get_device_from_id(device).use()
        model.to_gpu()
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.SGD(lr=2e-4), comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-2))
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    evaluator = extensions.Evaluator(valid_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    prepare_extensions(trainer, evaluator, args, comm)
    trainer.run()
    if comm.rank == 0:
        throughput = datasize / trainer.elapsed_time
        # BUGFIX: the original string literal contained a raw line break
        # (a SyntaxError in a single-quoted string); use an explicit \n.
        print('Throughput: {} [images/sec.] \n({} / {})'.format(
            throughput, datasize, trainer.elapsed_time))
        model_filepath = os.path.join(args.out, 'trained.model')
        chainer.serializers.save_npz(model_filepath, model)
def make_optimizer(model, comm, alpha=0.0002, beta1=0.5):
    """Build a multi-node Adam optimizer for `model` with weight decay.

    Wraps a standard Chainer Adam in a ChainerMN multi-node optimizer.
    """
    adam = chainer.optimizers.Adam(alpha=alpha, beta1=beta1)
    mn_optimizer = chainermn.create_multi_node_optimizer(adam, comm)
    mn_optimizer.setup(model)
    mn_optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001), 'hook_dec')
    return mn_optimizer
def setup_mnist_trainer(self, display_log=False, use_chx=False):
    """Build a ChainerMN MNIST training pipeline (optionally on a ChainerX
    device) and return (updater, optimizer, train_iter, test_iter, model).

    NOTE: `display_log` is not used inside this method.
    """
    batchsize = 100
    n_units = 100
    comm = self.communicator
    model = L.Classifier(MLP(n_units, 10))
    model.to_device(get_device(None, use_chx))
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)
    # Only rank 0 loads MNIST; the shards are scattered to all ranks below.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)
    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False, shuffle=False)
    updater = training.StandardUpdater(train_iter, optimizer)
    return updater, optimizer, train_iter, test_iter, model
def _prepare_multinode_snapshot(n, result):
    """Run `n` updater iterations under a multi-node snapshot extension.

    Returns (updater, mn_snapshot, trainer) so snapshot/restore behavior
    can be inspected by the caller. `result` is the trainer output dir.
    """
    n_units = 100
    batchsize = 10
    comm = create_communicator('naive')
    model = L.Classifier(MLP(n_units, 10))
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)
    # Only rank 0 loads MNIST; the shards are scattered to all ranks below.
    if comm.rank == 0:
        train, _ = chainer.datasets.get_mnist()
    else:
        train, _ = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    updater = StandardUpdater(train_iter, optimizer)
    trainer = Trainer(updater, out=result)
    # autoload=True restores the latest snapshot from `result` if present.
    snapshot = extensions.snapshot(target=updater, autoload=True)
    replica_sets = []
    mn_snapshot = multi_node_snapshot(comm, snapshot, replica_sets)
    mn_snapshot.initialize(trainer)
    for _ in range(n):
        updater.update()
    return updater, mn_snapshot, trainer
def objective(trial, comm):
    """Optuna objective for distributed MNIST with pruning support.

    Trains a trial-sampled architecture under ChainerMN and returns its
    validation accuracy (to be maximized).
    """
    # Sample an architecture.
    model = L.Classifier(create_model(trial))
    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, valid = chainer.datasets.get_mnist()
        # Fixed seed so every trial sees the same subsample.
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(
            train, 0, N_TRAIN_EXAMPLES, order=rng.permutation(len(train))
        )
        valid = chainer.datasets.SubDataset(
            valid, 0, N_VALID_EXAMPLES, order=rng.permutation(len(valid))
        )
    else:
        train, valid = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    valid = chainermn.scatter_dataset(valid, comm)
    train_iter = chainer.iterators.SerialIterator(train, BATCHSIZE,
                                                  shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid, BATCHSIZE,
                                                  repeat=False, shuffle=False)
    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, "epoch"))
    # Add Chainer extension for pruners.
    trainer.extend(
        optuna.integration.ChainerPruningExtension(
            trial, "validation/main/accuracy", (PRUNER_INTERVAL, "epoch")
        )
    )
    evaluator = chainer.training.extensions.Evaluator(valid_iter, model)
    trainer.extend(chainermn.create_multi_node_evaluator(evaluator, comm))
    log_report_extension = chainer.training.extensions.LogReport(log_name=None)
    trainer.extend(log_report_extension)
    # Progress bar only on rank 0 to avoid duplicated output.
    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())
    # Run training.
    # Please set show_loop_exception_msg False to inhibit messages about TrialPruned exception.
    # ChainerPruningExtension raises TrialPruned exception to stop training, and
    # trainer shows some messages every time it receive TrialPruned.
    trainer.run(show_loop_exception_msg=False)
    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(valid_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()
    return report["main/accuracy"]
def run_test_observation_aggregator(comm, xp, use_chainer_variable,
                                    communicate_interval, use_gpu):
    """Exercise ObservationAggregator: each rank reports its own rank id
    and the aggregated observation must equal the mean (size - 1) / 2."""
    model = DummyChain()
    if use_gpu:
        # Use CuPy's Device class to force call cudaSetDevice()
        chainer.cuda.get_device_from_id(comm.intra_rank).use()
    device = get_device(comm.intra_rank if use_gpu else None, xp == chainerx)
    if xp == chainerx:
        train = xp.array(np.random.rand(10, 1).astype(np.float32))
    else:
        train = xp.random.rand(10, 1).astype(np.float32)
    model.to_device(device)
    train_iter = chainer.iterators.SerialIterator(train,
                                                  batch_size=1,
                                                  repeat=True,
                                                  shuffle=True)
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)
    updater = chainer.training.StandardUpdater(train_iter, optimizer,
                                               device=device)
    trainer = chainer.training.Trainer(updater, (1, 'epoch'))

    # Writer extension: publish this rank's id as an observation.
    @extension.make_extension(trigger=(1, 'iteration'),
                              priority=extension.PRIORITY_WRITER)
    def rank_reporter(trainer_):
        tmp = xp.asarray(comm.rank, dtype=np.float32)
        if use_chainer_variable:
            tmp = chainer.Variable(tmp)
        trainer_.observation['rank'] = tmp

    # Reader extension: verify the aggregated value after each interval.
    @extension.make_extension(trigger=(communicate_interval, 'iteration'),
                              priority=extension.PRIORITY_READER)
    def aggregated_rank_checker(trainer_):
        actual = trainer_.observation['rank-aggregated']
        if use_chainer_variable:
            actual = actual.data
        # Mean of ranks 0..size-1.
        expected = (comm.size - 1) / 2
        chainer.testing.assert_allclose(actual, expected)

    trainer.extend(rank_reporter)
    trainer.extend(
        ObservationAggregator(comm, 'rank', 'rank-aggregated',
                              comm_trigger=(communicate_interval,
                                            'iteration')))
    trainer.extend(aggregated_rank_checker)
    trainer.run()
def make_optimizer(model, comm, config):
    """Instantiate the optimizer described by `config` and wrap it for
    multi-node training with ChainerMN."""
    # Select from https://docs.chainer.org/en/stable/reference/optimizers.html.
    # NOTE: The order of the arguments for optimizers follows their definitions.
    algorithm = yaml_utils.load_optimizer(config.optimizer['algorithm'],
                                          args=config.optimizer['args'])
    mn_optimizer = chainermn.create_multi_node_optimizer(algorithm, comm)
    mn_optimizer.setup(model)
    return mn_optimizer
def make_adam(model, lr=0.0002, beta1=0.9, beta2=0.999):
    """Create an Adam optimizer for `model`.

    When chainer.config.using_chainermn is set, the optimizer is wrapped
    by a multi-node optimizer using chainer.config.communicator.
    """
    opt = chainer.optimizers.Adam(alpha=lr, beta1=beta1, beta2=beta2)
    if chainer.config.using_chainermn:
        opt = chainermn.create_multi_node_optimizer(
            opt, chainer.config.communicator)
    opt.setup(model)
    return opt
def objective(trial, comm):
    """Optuna objective for distributed MNIST.

    Trains a trial-sampled architecture under ChainerMN and returns the
    validation *error* (1 - accuracy), to be minimized.
    """
    # Sample an architecture.
    model = L.Classifier(create_model(trial))
    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
        # Fixed seed so every trial sees the same subsample.
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(train, 0, N_TRAIN_EXAMPLES,
                                            order=rng.permutation(len(train)))
        test = chainer.datasets.SubDataset(test, 0, N_TEST_EXAMPLES,
                                           order=rng.permutation(len(test)))
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)
    train_iter = chainer.iterators.SerialIterator(train, BATCHSIZE,
                                                  shuffle=True)
    test_iter = chainer.iterators.SerialIterator(test, BATCHSIZE,
                                                 repeat=False, shuffle=False)
    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))
    # Progress bar only on rank 0 to avoid duplicated output.
    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())
    # Run training.
    trainer.run()
    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(test_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()
    # The following line mitigates the memory problem in CircleCI
    # (see https://github.com/pfnet/optuna/pull/325 for more details).
    gc.collect()
    return 1.0 - report['main/accuracy']
def test_can_create_valid_wrapper_for_chainermn(self):
    """A ChainerMN optimizer wrapped by the profiling helper keeps the
    underlying hyperparameters reachable via attribute delegation."""
    optimizer = create_marked_profile_optimizer(
        chainermn.create_multi_node_optimizer(optimizers.SGD(lr=1.0), None),
        sync=True)
    self.assertIsNotNone(optimizer)
    # Delegated access: `lr` resolves through the wrapper chain.
    np.testing.assert_allclose([optimizer.lr], [1.0])
    self.assertIsInstance(optimizer, _MarkedProfileOptimizerForMN)
    # First wrapper level is the multi-node optimizer (not a plain
    # chainer.Optimizer); the real SGD sits one level deeper.
    self.assertNotIsInstance(optimizer.actual_optimizer, chainer.Optimizer)
    self.assertIsInstance(optimizer.actual_optimizer.actual_optimizer,
                          chainer.Optimizer)
def make_optimizer(model, comm, alpha=0.001, beta1=0.9, beta2=0.999,
                   chmn=False, add_decay=False):
    """Build an Adam optimizer for `model`.

    Args:
        chmn: when True, wrap the optimizer with ChainerMN for
            multi-node training using `comm`.
        add_decay: when True, attach a small WeightDecay hook.
    """
    # 12/2018: problem in minoas, probably related with openmpi.
    adam = chainer.optimizers.Adam(alpha=alpha, beta1=beta1, beta2=beta2)
    optimizer = chainermn.create_multi_node_optimizer(adam, comm) if chmn \
        else adam
    optimizer.setup(model)
    if add_decay:
        optimizer.add_hook(chainer.optimizer.WeightDecay(0.00001), 'hook_dec')
    return optimizer
def get_optimizer(args, comm, model):
    """Select the optimizer named by args.optimizer (falling back to
    RMSprop), wrap it for multi-node training, and attach it to `model`."""
    name = args.optimizer
    if name == 'momentum_sgd':
        base = chainer.optimizers.MomentumSGD()
    elif name == 'adam':
        base = chainer.optimizers.Adam()
    elif name == 'rmsprop_warmup':
        base = dlframeworks.chainer.optimizers.RMSpropWarmup()
    else:
        # Default for any other value.
        base = chainer.optimizers.RMSprop()
    mn_optimizer = chainermn.create_multi_node_optimizer(base, comm)
    mn_optimizer.setup(model)
    return mn_optimizer
def objective(trial, comm):
    """Optuna objective for distributed MNIST.

    Trains a trial-sampled architecture under ChainerMN and returns the
    validation accuracy, to be maximized.
    """
    # Sample an architecture.
    model = L.Classifier(create_model(trial))
    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, valid = chainer.datasets.get_mnist()
        # Fixed seed so every trial sees the same subsample.
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(train, 0, N_TRAIN_EXAMPLES,
                                            order=rng.permutation(len(train)))
        valid = chainer.datasets.SubDataset(valid, 0, N_VALID_EXAMPLES,
                                            order=rng.permutation(len(valid)))
    else:
        train, valid = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    valid = chainermn.scatter_dataset(valid, comm)
    train_iter = chainer.iterators.SerialIterator(train, BATCHSIZE,
                                                  shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid, BATCHSIZE,
                                                  repeat=False, shuffle=False)
    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, "epoch"))
    # Progress bar only on rank 0 to avoid duplicated output.
    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())
    # Run training.
    trainer.run()
    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(valid_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()
    return report["main/accuracy"]
def make_optimizer(self, model, alpha, beta1, beta2):
    """Create an Adam optimizer attached to `model`.

    When self.use_mpi is set, the optimizer is wrapped by a ChainerMN
    multi-node optimizer using self.comm.
    """
    # BUGFIX: log message typo 'alpah' -> 'alpha'.
    self.print_log(
        'Use Adam Optimizer with alpha = {}, beta1 = {}, beta2 = {}'.
        format(alpha, beta1, beta2))
    optimizer = chainer.optimizers.Adam(alpha=alpha, beta1=beta1, beta2=beta2)
    if self.use_mpi:
        self.print_log('Use Optimizer with MPI')
        optimizer = chainermn.create_multi_node_optimizer(
            optimizer, self.comm)
    optimizer.setup(model)
    return optimizer
def objective(trial, comm):
    """Optuna objective for distributed MNIST on GPU.

    Each process uses the GPU matching its intra-node rank; returns the
    validation accuracy, to be maximized.
    """
    # One GPU per process, selected by the intra-node rank.
    device = comm.intra_rank
    chainer.cuda.get_device_from_id(device).use()
    # Sample an architecture.
    model = L.Classifier(create_model(trial))
    model.to_gpu()
    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
        # Fixed seed so every trial sees the same subsample.
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(
            train, 0, N_TRAIN_EXAMPLES, order=rng.permutation(len(train)))
        test = chainer.datasets.SubDataset(
            test, 0, N_TEST_EXAMPLES, order=rng.permutation(len(test)))
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)
    train_iter = chainer.iterators.SerialIterator(
        train, BATCHSIZE, shuffle=True)
    test_iter = chainer.iterators.SerialIterator(
        test, BATCHSIZE, repeat=False, shuffle=False)
    # Setup trainer.
    updater = chainer.training.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))
    # Progress bar only on rank 0 to avoid duplicated output.
    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())
    # Run training.
    trainer.run()
    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(
        test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()
    return report['main/accuracy']
def test_mnist(self, display_log=True):
    """End-to-end multi-node MNIST training smoke test; requires the
    final validation accuracy to reach at least 95%."""
    # This test file is intended to be run on Travis-CI and
    # GPU is not used for now.
    epoch = 5
    batchsize = 100
    n_units = 100
    comm = chainermn.create_communicator('naive')
    model = L.Classifier(MLP(n_units, 10))
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)
    # Only rank 0 loads MNIST; shards are scattered to all ranks below.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm)
    test = chainermn.scatter_dataset(test, comm)
    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False, shuffle=False)
    updater = training.StandardUpdater(train_iter, optimizer)
    trainer = training.Trainer(updater, (epoch, 'epoch'))
    # Wrap standard Chainer evaluators by MultiNodeEvaluator.
    evaluator = extensions.Evaluator(test_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)
    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0 and display_log:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ], out=sys.stderr), trigger=(1, 'epoch'))
    trainer.run()
    # NOTE: despite its name, `err` holds the validation *accuracy*.
    err = evaluator()['validation/main/accuracy']
    self.assertGreaterEqual(err, 0.95)
def test_update(self):
    """Double-buffered multi-node optimizer: gradients are all-reduced
    into `communicated_target` while the actual parameter update lags one
    step behind (t stays 0 after the first gradient update)."""
    self.setup_gpu()
    self.optimizer = chainermn.create_multi_node_optimizer(
        self.actual_optimizer, self.comm, double_buffering=True)
    opt = self.optimizer.setup(self.target)
    assert opt is self.optimizer
    # No gradients yet: step counter must not advance.
    self.optimizer.update()
    self.assertEqual(self.actual_optimizer.t, 0)
    # Rank-dependent gradients, round 1.
    self.optimizer.target.a.W.grad[:] = self.comm.rank
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 1
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 2
    self.optimizer.update()
    self.optimizer.wait()
    # Double buffering: the first real update only communicates, so the
    # optimizer step counter is still 0.
    self.assertEqual(self.actual_optimizer.t, 0)
    # Mean of ranks 0..size-1.
    base = (self.comm.size - 1.0) / 2
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad,
        (base + 0) * np.ones((3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad,
        (base + 1) * np.ones((4, 3)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.c.W.grad,
        (base + 2) * np.ones((5, 4)))
    # Rank-dependent gradients, round 2: now the deferred step happens.
    self.optimizer.target.a.W.grad[:] = self.comm.rank + 3
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 4
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 5
    self.optimizer.update()
    self.optimizer.wait()
    self.assertEqual(self.actual_optimizer.t, 1)
    self.optimizer.target.a.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.a.W)
    self.optimizer.target.b.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.b.W)
    self.optimizer.target.c.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.c.W)
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad,
        (base + 3) * np.ones((3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad,
        (base + 4) * np.ones((4, 3)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.c.W.grad,
        (base + 5) * np.ones((5, 4)))
    # barrier() requires before destructor of PureNcclCommunicator
    # because communication may not be finished.
    self.comm.mpi_comm.barrier()
def test_update(self):
    """Double-buffered multi-node optimizer (see also the sibling test):
    all-reduced gradients land in `communicated_target` and the actual
    parameter update is deferred by one step."""
    self.setup_gpu()
    self.optimizer = chainermn.create_multi_node_optimizer(
        self.actual_optimizer, self.comm, double_buffering=True)
    opt = self.optimizer.setup(self.target)
    assert opt is self.optimizer
    # No gradients yet: step counter must not advance.
    self.optimizer.update()
    self.assertEqual(self.actual_optimizer.t, 0)
    # Rank-dependent gradients, round 1.
    self.optimizer.target.a.W.grad[:] = self.comm.rank
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 1
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 2
    self.optimizer.update()
    self.optimizer.wait()
    # Double buffering defers the actual step; counter is still 0.
    self.assertEqual(self.actual_optimizer.t, 0)
    # Mean of ranks 0..size-1.
    base = (self.comm.size - 1.0) / 2
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad, (base + 0) * np.ones(
            (3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad, (base + 1) * np.ones(
            (4, 3)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.c.W.grad, (base + 2) * np.ones(
            (5, 4)))
    # Rank-dependent gradients, round 2: the deferred step happens now.
    self.optimizer.target.a.W.grad[:] = self.comm.rank + 3
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 4
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 5
    self.optimizer.update()
    self.optimizer.wait()
    self.assertEqual(self.actual_optimizer.t, 1)
    self.optimizer.target.a.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.a.W)
    self.optimizer.target.b.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.b.W)
    self.optimizer.target.c.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.c.W)
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad, (base + 3) * np.ones(
            (3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad, (base + 4) * np.ones(
            (4, 3)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.c.W.grad, (base + 5) * np.ones(
            (5, 4)))
    # barrier() requires before destructor of PureNcclCommunicator
    # because communication may not be finished.
    self.comm.mpi_comm.barrier()
def make_optimizer(model):
    """Construct the optimizer selected by the global `args.optimizer`
    from the global `optim` registry and attach it to `model`.

    Optionally wraps it with ChainerMN (global `comm`) when args.mpi.

    Raises:
        ValueError: if args.optimizer is not a recognized name.
    """
    if args.optimizer in [
            'SGD', 'Momentum', 'CMomentum', 'AdaGrad', 'RMSprop',
            'NesterovAG', 'LBFGS'
    ]:
        optimizer = optim[args.optimizer](lr=args.learning_rate)
    elif args.optimizer in ['AdaDelta']:
        optimizer = optim[args.optimizer]()
    elif args.optimizer in ['Adam', 'AdaBound', 'Eve']:
        optimizer = optim[args.optimizer](
            alpha=args.learning_rate, weight_decay_rate=args.weight_decay)
    else:
        # BUGFIX: an unknown name previously fell through and crashed with
        # an opaque NameError below; fail fast with a clear message.
        raise ValueError('unknown optimizer: {}'.format(args.optimizer))
    if args.mpi:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    return optimizer
def check_update(self, batched_copy):
    """Same double-buffering scenario as the `test_update` variants, but
    parameterized by `batched_copy` (forwarded to self.setup) and ending
    with an explicit communicator finalize."""
    self.setup(batched_copy)
    self.optimizer = chainermn.create_multi_node_optimizer(
        self.actual_optimizer, self.comm, double_buffering=True)
    opt = self.optimizer.setup(self.target)
    assert opt is self.optimizer
    # No gradients yet: step counter must not advance.
    self.optimizer.update()
    self.assertEqual(self.actual_optimizer.t, 0)
    # Rank-dependent gradients, round 1.
    self.optimizer.target.a.W.grad[:] = self.comm.rank
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 1
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 2
    self.optimizer.update()
    self.optimizer.wait()
    # Double buffering defers the actual step; counter is still 0.
    self.assertEqual(self.actual_optimizer.t, 0)
    # Mean of ranks 0..size-1.
    base = (self.comm.size - 1.0) / 2
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad,
        (base + 0) * np.ones((3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad,
        (base + 1) * np.ones((4, 3)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.c.W.grad,
        (base + 2) * np.ones((5, 4)))
    # Rank-dependent gradients, round 2: the deferred step happens now.
    self.optimizer.target.a.W.grad[:] = self.comm.rank + 3
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 4
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 5
    self.optimizer.update()
    self.optimizer.wait()
    self.assertEqual(self.actual_optimizer.t, 1)
    self.optimizer.target.a.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.a.W)
    self.optimizer.target.b.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.b.W)
    self.optimizer.target.c.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.c.W)
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad,
        (base + 3) * np.ones((3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad,
        (base + 4) * np.ones((4, 3)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.c.W.grad,
        (base + 5) * np.ones((5, 4)))
    self.comm.finalize()
def objective(trial, comm):
    """Optuna objective for distributed MNIST; returns validation error
    (1 - accuracy), to be minimized."""
    # Sample an architecture.
    model = L.Classifier(create_model(trial))
    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    # Setup dataset and iterator.
    # NOTE(review): unlike the sibling objectives, every rank loads MNIST
    # here (no rank-0 guard). Presumably scatter_dataset only uses the
    # root's copy, making this redundant but harmless — confirm before
    # changing.
    train, test = chainer.datasets.get_mnist()
    rng = np.random.RandomState(0)
    train = chainer.datasets.SubDataset(train, 0, N_TRAIN_EXAMPLES,
                                        order=rng.permutation(len(train)))
    test = chainer.datasets.SubDataset(test, 0, N_TEST_EXAMPLES,
                                       order=rng.permutation(len(test)))
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)
    train_iter = chainer.iterators.SerialIterator(train, BATCHSIZE,
                                                  shuffle=True)
    test_iter = chainer.iterators.SerialIterator(test, BATCHSIZE,
                                                 repeat=False, shuffle=False)
    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))
    # Progress bar only on rank 0 to avoid duplicated output.
    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())
    # Run training.
    trainer.run()
    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(test_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()
    return 1.0 - report['main/accuracy']
def make_optimizer(self, model, alpha, beta1, beta2):
    """Create an Adam optimizer with gradient clipping attached to `model`.

    When self.use_mpi is set, the optimizer is wrapped by a ChainerMN
    multi-node optimizer using self.comm.
    """
    # BUGFIX: log message typo 'alpah' -> 'alpha'.
    self.print_log(
        'Use Adam Optimizer with alpha = {}, beta1 = {}, beta2 = {}'.
        format(alpha, beta1, beta2))
    optimizer = chainer.optimizers.Adam(alpha=alpha, beta1=beta1, beta2=beta2)
    if self.use_mpi:
        self.print_log('Use Optimizer with MPI')
        optimizer = chainermn.create_multi_node_optimizer(
            optimizer, self.comm)
    optimizer.setup(model)
    # Clip the gradient norm at 5 to stabilize training.
    optimizer.add_hook(chainer.optimizer.GradientClipping(5))
    return optimizer
def run_test_observation_aggregator(comm, xp, use_chainer_variable,
                                    communicate_interval, use_cupy):
    """Exercise ObservationAggregator (CuPy variant): each rank reports
    its rank id and the aggregate must equal the mean (size - 1) / 2.

    NOTE: the inner extensions' `trainer` parameter shadows the outer
    `trainer` variable; they receive the running trainer at call time.
    """
    model = DummyChain()
    if use_cupy:
        model.to_gpu()
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)
    train = xp.random.rand(10, 1).astype(np.float32)
    train_iter = chainer.iterators.SerialIterator(train,
                                                  batch_size=1,
                                                  repeat=True,
                                                  shuffle=True)
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (1, 'epoch'))

    # Writer extension: publish this rank's id as an observation.
    @extension.make_extension(trigger=(1, 'iteration'),
                              priority=extension.PRIORITY_WRITER)
    def rank_reporter(trainer):
        tmp = xp.asarray(comm.rank, dtype=np.float32)
        if use_chainer_variable:
            tmp = chainer.Variable(tmp)
        trainer.observation['rank'] = tmp

    # Reader extension: verify the aggregated value after each interval.
    @extension.make_extension(trigger=(communicate_interval, 'iteration'),
                              priority=extension.PRIORITY_READER)
    def aggregated_rank_checker(trainer):
        actual = trainer.observation['rank-aggregated']
        if use_chainer_variable:
            actual = actual.data
        # Mean of ranks 0..size-1.
        expected = (comm.size - 1) / 2
        chainer.testing.assert_allclose(actual, expected)

    trainer.extend(rank_reporter)
    trainer.extend(
        ObservationAggregator(comm, 'rank', 'rank-aggregated',
                              comm_trigger=(communicate_interval,
                                            'iteration')))
    trainer.extend(aggregated_rank_checker)
    trainer.run()
def train(x_data, t_data, batchsize=128, layer=1, in_units=1, hidden_units=5, out_units=1): comm = chainermn.create_communicator('naive') # Iterator batchsize = batchsize x_data = chainermn.scatter_dataset(x_data, comm) t_data = chainermn.scatter_dataset(t_data, comm) train_iter = iterators.SerialIterator(x_data, batchsize) test_iter = iterators.SerialIterator(t_data, batchsize, repeat=False, shuffle=False) # setup model model = LSTM(in_units, hidden_units, out_units) # setup optimizer optimizer = chainermn.create_multi_node_optimizer(optimizers.Adam(), comm) optimizer.setup(model) updater = training.StandardUpdater(train_iter, optimizer, MyConverter) trainer = training.Trainer(updater, (20, 'epoch'), out='result') if comm.rank == 0: trainer.extend(extensions.LogReport()) trainer.extend(extensions.dump_graph('main/loss')) trainer.extend(extensions.observe_lr()) trainer.extend(extensions.Evaluator(test_iter, model, MyConverter), name='val') trainer.extend( extensions.PrintReport( ['epoch', 'main/loss', 'val/main/loss', 'elapsed_time', 'lr'])) trainer.extend( extensions.PlotReport(['main/loss', 'val/main/loss'], x_key='epoch', file_name='loss.png')) # trainer.extend(extensions.ProgressBar()) trainer.run()
def _setup_optimizer(config, model, comm):
    """Create and attach the optimizer described by `config`.

    Args:
        config: mapping with 'optimizer' (name), 'init_lr', 'weight_decay'.
        model: chainer.Link the optimizer is set up on.
        comm: optional ChainerMN communicator; when given, the optimizer
            is wrapped by a multi-node optimizer.

    Raises:
        ValueError: for an unrecognized optimizer name.
    """
    optimizer_name = config['optimizer']
    lr = float(config['init_lr'])
    weight_decay = float(config['weight_decay'])
    if optimizer_name == 'Adam':
        # Adam takes weight decay as its own hyperparameter (no hook).
        optimizer = Adam(alpha=lr, weight_decay_rate=weight_decay)
    elif optimizer_name in \
            ('SGD', 'MomentumSGD', 'CorrectedMomentumSGD', 'RMSprop'):
        # IDIOM/SECURITY: resolve the class through an explicit mapping
        # instead of eval() on a config-supplied string.
        optimizer_cls = {
            'SGD': SGD,
            'MomentumSGD': MomentumSGD,
            'CorrectedMomentumSGD': CorrectedMomentumSGD,
            'RMSprop': RMSprop,
        }[optimizer_name]
        optimizer = optimizer_cls(lr=lr)
        if weight_decay > 0.:
            optimizer.add_hook(WeightDecay(weight_decay))
    else:
        raise ValueError('Invalid optimizer: {}'.format(optimizer_name))
    if comm is not None:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    return optimizer
def __init__(
        self,
        model_parameters,
        # Learning rate at training step s with annealing
        lr_i=1.0 * 1e-4,
        lr_f=1.0 * 1e-5,
        n=10000,
        communicator=None):
    """Momentum-SGD optimizer with an annealed learning rate, optionally
    wrapped for multi-node training with ChainerMN.

    Args:
        model_parameters: chainer.Link whose parameters are optimized.
        lr_i: learning rate at step 0 (passed to the annealing base class).
        lr_f: learning rate after `n` steps.
        n: number of annealing steps.
        communicator: optional ChainerMN communicator.
    """
    super().__init__(lr_i, lr_f, n)
    # mu_s comes from the annealing base class: schedule at step 0.
    lr = self.mu_s(0)
    self.optimizer = optimizers.MomentumSGD(lr)
    self.optimizer.setup(model_parameters)
    # Always defined, so callers can test `multi_node_optimizer is None`.
    self.multi_node_optimizer = None
    if communicator:
        self.multi_node_optimizer = chainermn.create_multi_node_optimizer(
            self.optimizer, communicator)
def test_update_with_gpu(self):
    """Multi-node optimizer on GPU: gradients are averaged across ranks
    and a parameter added after setup is broadcast on re-setup."""
    self.setup_gpu()
    self.optimizer = chainermn.create_multi_node_optimizer(
        self.actual_optimizer, self.comm)
    # setup() must return the wrapper itself (fluent API).
    opt = self.optimizer.setup(self.target)
    assert opt is self.optimizer
    # No gradients set yet: the step counter must not advance.
    self.optimizer.update()
    self.assertEqual(self.actual_optimizer.t, 0)
    # Add a new link after the first setup; only rank 0 fills its
    # weights, so we can check that re-setup broadcasts them.
    with self.target.init_scope():
        c = chainer.links.Linear(4, 4)
        c.to_gpu()
        self.target.c = c
    if self.comm.rank == 0:
        self.target.c.W.data[:] = self.comm.rank + 2
    self.optimizer.setup(self.target)
    self.optimizer.update()
    self.assertEqual(self.actual_optimizer.t, 0)
    # Every rank must now hold identical data for the new parameter.
    send_buf = chainer.cuda.to_cpu(self.optimizer.target.c.W.data)
    recv_buf = self.comm.mpi_comm.allgather(send_buf)
    for i in range(1, self.comm.size):
        chainer.testing.assert_allclose(recv_buf[0], recv_buf[i])
    # Rank-dependent gradients; after update() they should equal the
    # all-reduce mean over ranks.
    self.optimizer.target.a.W.grad[:] = self.comm.rank
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 1
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 2
    self.optimizer.update()
    self.assertEqual(self.actual_optimizer.t, 1)
    # Each parameter's (mocked) update rule fires exactly once.
    self.optimizer.target.a.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.a.W)
    self.optimizer.target.b.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.b.W)
    self.optimizer.target.c.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.c.W)
    # mean(0, 1, ..., size-1) == (size - 1) / 2
    base = (self.comm.size - 1.0) / 2
    chainer.testing.assert_allclose(self.optimizer.target.a.W.grad,
                                    (base + 0) * np.ones((3, 2)))
    chainer.testing.assert_allclose(self.optimizer.target.b.W.grad,
                                    (base + 1) * np.ones((4, 3)))
    chainer.testing.assert_allclose(self.optimizer.target.c.W.grad,
                                    (base + 2) * np.ones((4, 4)))
def test_update(self):
    """Double-buffered multi-node optimizer on GPU.

    With double_buffering=True, update() launches the gradient
    all-reduce asynchronously; the averaged result shows up in
    ``communicated_target`` after wait(), one update() behind the
    values written into ``target`` -- hence the staggered ``t`` checks.
    """
    self.setup_gpu()
    self.optimizer = chainermn.create_multi_node_optimizer(
        self.actual_optimizer, self.comm, double_buffering=True)
    # setup() must return the wrapper itself (fluent API).
    opt = self.optimizer.setup(self.target)
    assert opt is self.optimizer
    # No gradients yet: step counter must not advance.
    self.optimizer.update()
    self.assertEqual(self.actual_optimizer.t, 0)
    # First buffered round: gradients are communicated but the actual
    # optimizer step has not run yet (t stays 0).
    self.optimizer.target.a.W.grad[:] = self.comm.rank
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 1
    self.optimizer.update()
    self.optimizer.wait()
    self.assertEqual(self.actual_optimizer.t, 0)
    # mean(0, 1, ..., size-1) == (size - 1) / 2
    base = (self.comm.size - 1.0) / 2
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad,
        (base + 0) * np.ones((3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad,
        (base + 1) * np.ones((4, 3)))
    # Second round: now t advances and the (mocked) update rules fire
    # exactly once per parameter.
    self.optimizer.target.a.W.grad[:] = self.comm.rank + 3
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 4
    self.optimizer.update()
    self.optimizer.wait()
    self.assertEqual(self.actual_optimizer.t, 1)
    self.optimizer.target.a.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.a.W)
    self.optimizer.target.b.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.b.W)
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad,
        (base + 3) * np.ones((3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad,
        (base + 4) * np.ones((4, 3)))
    # Add a new link after setup; only rank 0 fills its weights, so
    # re-setup must broadcast them to every rank.
    with self.target.init_scope():
        c = chainer.links.Linear(4, 4)
        c.to_gpu()
        self.target.c = c
    if self.comm.rank == 0:
        self.target.c.W.data[:] = self.comm.rank + 2
    self.optimizer.setup(self.target)
    self.optimizer.update()
    # Step count is back to 0 after the re-setup.
    self.assertEqual(self.actual_optimizer.t, 0)
    # Every rank must now hold identical data for the new parameter.
    send_buf = chainer.cuda.to_cpu(self.optimizer.target.c.W.data)
    recv_buf = self.comm.mpi_comm.allgather(send_buf)
    for i in range(1, self.comm.size):
        chainer.testing.assert_allclose(recv_buf[0], recv_buf[i])
    # Repeat the two-round pattern with the third parameter included.
    self.optimizer.target.a.W.grad[:] = self.comm.rank + 6
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 7
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 8
    self.optimizer.update()
    self.optimizer.wait()
    self.assertEqual(self.actual_optimizer.t, 0)
    base = (self.comm.size - 1.0) / 2
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad,
        (base + 6) * np.ones((3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad,
        (base + 7) * np.ones((4, 3)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.c.W.grad,
        (base + 8) * np.ones((4, 4)))
    self.optimizer.target.a.W.grad[:] = self.comm.rank + 9
    self.optimizer.target.b.W.grad[:] = self.comm.rank + 10
    self.optimizer.target.c.W.grad[:] = self.comm.rank + 11
    self.optimizer.update()
    self.optimizer.wait()
    self.assertEqual(self.actual_optimizer.t, 1)
    self.optimizer.target.a.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.a.W)
    self.optimizer.target.b.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.b.W)
    self.optimizer.target.c.W.update_rule.update.assert_called_once_with(
        self.optimizer.target.c.W)
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.a.W.grad,
        (base + 9) * np.ones((3, 2)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.b.W.grad,
        (base + 10) * np.ones((4, 3)))
    chainer.testing.assert_allclose(
        self.optimizer.communicated_target.c.W.grad,
        (base + 11) * np.ones((4, 4)))
    # barrier() requires before destructor of PureNcclCommunicator
    # because communication may not be finished.
    self.comm.mpi_comm.barrier()
def main():
    """ChainerMN MNIST example: data-parallel training of an MLP."""
    parser = argparse.ArgumentParser(description='ChainerMN example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator', type=str,
                        default='hierarchical', help='Type of communicator')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        # One GPU per process: the intra-node rank doubles as device id.
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    # Configuration banner, printed once by rank 0 only.
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """ChainerMN ImageNet example: data-parallel convnet training."""
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    # Supported architectures, selected via --arch.
    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    # One GPU per process: the intra-node rank doubles as device id.
    device = comm.intra_rank

    # Configuration banner, printed once by rank 0 only.
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(
            args.val, args.root, mean, model.insize, False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # We need to change the start method of multiprocessing module if we are
    # using InfiniBand and MultiprocessIterator. This is because processes
    # often crash when calling fork if they are using Infiniband.
    # (c.f., https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    multiprocessing.set_start_method('forkserver')
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # With --test, trigger everything every 10 iterations for a quick run.
    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    # Multi-node checkpointing; maybe_load resumes from a prior snapshot.
    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
# NOTE(review): fragment of a larger training script -- `communicator`,
# `args`, `MLP`, and `_preprocess_mnist` are defined outside the visible
# region; confirm against the full file.
comm = chainermn.create_communicator(communicator)
# One GPU per process when GPUs are requested, otherwise CPU (-1).
device = comm.intra_rank if args.num_gpus > 0 else -1
print('==========================================')
print('Using {} communicator'.format(comm))
print('Num unit: {}'.format(args.units))
print('Num Minibatch-size: {}'.format(args.batch_size))
print('Num epoch: {}'.format(args.epochs))
print('==========================================')
model = L.Classifier(MLP(args.units, 10))
if device >= 0:
    chainer.cuda.get_device(device).use()
# Create a multi node optimizer from a standard Chainer optimizer.
optimizer = chainermn.create_multi_node_optimizer(
    chainer.optimizers.Adam(), comm)
optimizer.setup(model)
# Load pre-packaged MNIST arrays from the input channels.
train_file = np.load(os.path.join(args.train, 'train.npz'))
test_file = np.load(os.path.join(args.test, 'test.npz'))
# Options forwarded to the dataset constructor: labelled 1-D float32
# images, integer labels, no RGB expansion.
preprocess_mnist_options = {
    'withlabel': True,
    'ndim': 1,
    'scale': 1.,
    'image_dtype': np.float32,
    'label_dtype': np.int32,
    'rgb_format': False
}
train_dataset = _preprocess_mnist(train_file, **preprocess_mnist_options)
def main():
    """ChainerMN seq2seq example: train an EN->FR translation model."""
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--bleu', action='store_true', default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--stop', '-s', type=str, default='15e',
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default='adam()',
                        help='Optimizer and its argument')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        dev = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        dev = -1

    # Configuration banner, printed once by rank 0 only.
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print('RD source done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print('RD target done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        print('Original training data size: %d' % len(source_data))
        # Keep only sentence pairs where both sides have 1..49 tokens.
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        # Dev set (newstest2013) becomes the test data.
        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert(len(source_data) == len(target_data))
        test_data = [(s, t) for s, t
                     in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index
                      for index, word in enumerate(source_vocab)}
        target_ids = {word: index
                      for index, word in enumerate(target_vocab)}
    else:
        # Non-root ranks receive their shares via scatter/bcast below.
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id
    # Serialized via a barrier so output does not interleave across ranks.
    for i in range(0, comm.size):
        if comm.rank == i:
            print('Rank {} GPU: {}'.format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # broadcast id- > word dictionary
    source_ids = comm.bcast_obj(source_ids, root=0)
    target_ids = comm.bcast_obj(target_ids, root=0)

    # Inverse mappings for decoding model output back to words.
    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print('target_words : {}'.format(len(target_words)))
        print('source_words : {}'.format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        chainer.cuda.get_device_from_id(dev).use()
        model.to_gpu(dev)

    # determine the stop trigger
    # "<N>e" -> N epochs, "<N>i" -> N iterations; anything else aborts.
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write('Error: unknown stop trigger: {}'.format(
                    args.stop))
            exit(-1)

    if comm.rank == 0:
        print('Trigger: {}'.format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Broadcast dataset
    # Sanity check of train_data
    train_data = chainermn.scatter_dataset(train_data, comm)
    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater, trigger, out=args.out)
    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm), comm))

    def translate_one(source, target):
        # Translate a single raw sentence and print source/result/expect.
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        # Unknown words map to id 1 -- presumably the <unk> token; verify
        # against the vocabulary builder.
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], numpy.int32)
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('# result : ' + ' '.join(words))
        print('# expect : ' + target)

    # NOTE(review): the decorator below is commented out, so `translate`
    # is defined but never registered as a trainer extension.
    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)

    # Reporting extensions only on rank 0 to avoid duplicated output.
    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()
# comm.inter_rank gives the rank of the node. This should only print on one node. if comm.inter_rank == 0: print('# Minibatch-size: {}'.format(args.batch_size)) print('# epoch: {}'.format(args.epochs)) print('# communicator: {}'.format(args.communicator)) # Set up a neural network to train. # Classifier reports softmax cross entropy loss and accuracy at every # iteration, which will be used by the PrintReport extension below. # comm.intra_rank gives the rank of the process on a given node. device = comm.intra_rank if num_gpus > 0 else -1 if device >= 0: chainer.cuda.get_device_from_id(device).use() optimizer = chainermn.create_multi_node_optimizer(chainer.optimizers.MomentumSGD(args.learning_rate), comm) optimizer.setup(model) optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4)) num_loaders = 2 train_iter = chainer.iterators.MultiprocessIterator(train, args.batch_size, n_processes=num_loaders) test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False, n_processes=num_loaders) # Set up a trainer updater = training.StandardUpdater(train_iter, optimizer, device=device) trainer = training.Trainer(updater, (args.epochs, 'epoch'), out=args.output_data_dir) # Evaluate the model with the test dataset for each epoch evaluator = extensions.Evaluator(test_iter, model, device=device) evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
def main():
    """ChainerMN pipelined example: model split across process pairs.

    Processes are arranged on two axes: `data_axis` (which half of the
    pipelined model this process runs) and `model_axis` (which data
    shard it belongs to); `comm.split` builds a sub-communicator per axis.
    """
    parser = argparse.ArgumentParser(
        description='ChainerMN example: pipelined neural network')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        # Even ranks and odd ranks form the two pipeline stages.
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = -1

    # Each model pipeline needs exactly two processes.
    if model_comm.size != 2:
        raise ValueError(
            'This example can only be executed on the even number'
            'of processes.')

    if comm.rank == 0:
        print('==========================================')
        if args.gpu:
            print('Using GPUs')
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    # First stage holds the classifier head, second stage the tail MLP.
    if data_axis == 0:
        model = L.Classifier(MLP0(model_comm, args.unit))
    elif data_axis == 1:
        model = MLP1(model_comm, args.unit, 10)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), data_comm)
    optimizer.setup(model)

    # Original dataset on worker 0 and 1.
    # Datasets of worker 0 and 1 are split and distributed to all workers.
    if model_axis == 0:
        train, test = chainer.datasets.get_mnist()
        if data_axis == 1:
            # Second-stage processes need only dataset length, not content.
            train = chainermn.datasets.create_empty_dataset(train)
            test = chainermn.datasets.create_empty_dataset(test)
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
    test = chainermn.scatter_dataset(test, data_comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(
        train, args.batchsize, shuffle=False)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, data_comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for worker 0.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
def check_mnist(gpu, display_log=True):
    """End-to-end smoke test: train an MLP on MNIST across MPI workers,
    check validation accuracy > 0.95, and verify checkpoint cleanup.

    Args:
        gpu: whether to run on GPUs (one per process via intra_rank).
        display_log: when True, rank 0 prints per-epoch reports to stderr.
    """
    epoch = 5
    batchsize = 100
    n_units = 100

    comm = chainermn.create_communicator('naive')
    if gpu:
        device = comm.intra_rank
        chainer.cuda.get_device_from_id(device).use()
    else:
        device = -1

    model = L.Classifier(MLP(n_units, 10))
    if gpu:
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Only rank 0 loads the dataset; scatter distributes even shards.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False, shuffle=False)

    updater = training.StandardUpdater(
        train_iter,
        optimizer,
        device=device
    )

    trainer = training.Trainer(updater, (epoch, 'epoch'))

    # Wrap standard Chainer evaluators by MultiNodeEvaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Add checkpointer. This is just to check checkpointing runs
    # without errors
    path = tempfile.mkdtemp(dir='/tmp', prefix=__name__ + "-tmp-")
    checkpointer = create_multi_node_checkpointer(name=__name__, comm=comm,
                                                  path=path)
    trainer.extend(checkpointer, trigger=(1, 'epoch'))

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0 and display_log:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.PrintReport(['epoch',
                                               'main/loss',
                                               'validation/main/loss',
                                               'main/accuracy',
                                               'validation/main/accuracy',
                                               'elapsed_time'],
                                              out=sys.stderr),
                       trigger=(1, 'epoch'))
    trainer.run()

    # NOTE(review): despite its name, `err` holds the validation
    # *accuracy* (the assert requires it to exceed 0.95).
    err = evaluator()['validation/main/accuracy']
    assert err > 0.95

    # Check checkpointer successfully finalized snapshot directory
    assert [] == os.listdir(path)
    os.removedirs(path)