def train(batch_size, num_epoch, learning_rate, optimizer, training_channel, testing_channel,
          hosts, current_host, model_dir):
    (train_labels, train_images) = load_data(training_channel)
    (test_labels, test_images) = load_data(testing_channel)

    # As an alternative to splitting in memory, the data could be pre-split in S3 and read with
    # ShardedByS3Key to do parallel training.
    shard_size = len(train_images) // len(hosts)
    for i, host in enumerate(hosts):
        if host == current_host:
            start = shard_size * i
            end = start + shard_size
            break

    train_iter = mx.io.NDArrayIter(train_images[start:end], train_labels[start:end],
                                   batch_size, shuffle=True)
    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)

    logging.getLogger().setLevel(logging.DEBUG)

    kvstore = 'local' if len(hosts) == 1 else 'dist_sync'

    mlp_model = mx.mod.Module(symbol=build_graph(), context=get_train_context())
    mlp_model.fit(train_iter,
                  eval_data=val_iter,
                  kvstore=kvstore,
                  optimizer=optimizer,
                  optimizer_params={'learning_rate': learning_rate},
                  eval_metric='acc',
                  batch_end_callback=mx.callback.Speedometer(batch_size, 100),
                  num_epoch=num_epoch)

    # Save before returning so the artifact is written exactly once, by the scheduler host.
    if current_host == scheduler_host(hosts):
        save(model_dir, mlp_model)

    return mlp_model
def train(
    batch_size,
    epochs,
    learning_rate,
    num_gpus,
    training_channel,
    testing_channel,
    hosts,
    current_host,
    model_dir,
):
    (train_labels, train_images) = load_data(training_channel)
    (test_labels, test_images) = load_data(testing_channel)

    CHECKPOINTS_DIR = "/opt/ml/checkpoints"
    checkpoints_enabled = os.path.exists(CHECKPOINTS_DIR)

    # Data parallel training - shard the data so each host
    # only trains on a subset of the total data.
    shard_size = len(train_images) // len(hosts)
    for i, host in enumerate(hosts):
        if host == current_host:
            start = shard_size * i
            end = start + shard_size
            break

    train_iter = mx.io.NDArrayIter(
        train_images[start:end], train_labels[start:end], batch_size, shuffle=True
    )
    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)

    logging.getLogger().setLevel(logging.DEBUG)

    kvstore = "local" if len(hosts) == 1 else "dist_sync"

    mlp_model = mx.mod.Module(symbol=build_graph(), context=get_training_context(num_gpus))

    checkpoint_callback = None
    if checkpoints_enabled:
        # Checkpoint the model params and the optimizer state after every epoch at the given path.
        checkpoint_callback = mx.callback.module_checkpoint(
            mlp_model, CHECKPOINTS_DIR + "/mnist", period=1, save_optimizer_states=True
        )

    mlp_model.fit(
        train_iter,
        eval_data=val_iter,
        kvstore=kvstore,
        optimizer="sgd",
        optimizer_params={"learning_rate": learning_rate},
        eval_metric="acc",
        epoch_end_callback=checkpoint_callback,
        batch_end_callback=mx.callback.Speedometer(batch_size, 100),
        num_epoch=epochs,
    )

    if current_host == scheduler_host(hosts):
        save(model_dir, mlp_model)
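# Hypothetical companion sketch (not part of the original script): with SageMaker managed spot
# training, /opt/ml/checkpoints is restored on restart, so training can resume from the newest
# "mnist-NNNN.params" file written by the checkpoint callback above. The helper names
# (find_latest_checkpoint, resume_fit) and the resume flow are illustrative assumptions; only
# mx.model.load_checkpoint and the Module.fit keyword arguments are standard MXNet APIs.
import glob
import os

import mxnet as mx


def find_latest_checkpoint(checkpoints_dir, prefix="mnist"):
    """Return the highest epoch number with a saved params file, or None if there is none."""
    params_files = glob.glob(os.path.join(checkpoints_dir, prefix + "-*.params"))
    if not params_files:
        return None
    # Files are named "<prefix>-<epoch:04d>.params", e.g. "mnist-0003.params".
    return max(int(os.path.basename(f).split("-")[-1].split(".")[0]) for f in params_files)


def resume_fit(mlp_model, train_iter, val_iter, kvstore, learning_rate, epochs,
               checkpoints_dir="/opt/ml/checkpoints"):
    last_epoch = find_latest_checkpoint(checkpoints_dir)
    arg_params, aux_params, begin_epoch = None, None, 0
    if last_epoch is not None:
        # load_checkpoint returns (symbol, arg_params, aux_params) for the saved epoch.
        _, arg_params, aux_params = mx.model.load_checkpoint(
            os.path.join(checkpoints_dir, "mnist"), last_epoch)
        begin_epoch = last_epoch
    mlp_model.fit(train_iter,
                  eval_data=val_iter,
                  kvstore=kvstore,
                  optimizer="sgd",
                  optimizer_params={"learning_rate": learning_rate},
                  eval_metric="acc",
                  arg_params=arg_params,
                  aux_params=aux_params,
                  begin_epoch=begin_epoch,
                  num_epoch=epochs)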
def train(env):
    logger.info('MXNet training environment: {}'.format(env.to_env_vars()))

    if env.additional_framework_parameters.get(LAUNCH_PS_ENV_NAME, False):
        _verify_hosts(env.hosts)

        ps_port = env.hyperparameters.get('_ps_port', '8000')
        ps_verbose = env.hyperparameters.get('_ps_verbose', '0')

        logger.info('Starting distributed training task')
        if scheduler_host(env.hosts) == env.current_host:
            _run_mxnet_process('scheduler', env.hosts, ps_port, ps_verbose)
        _run_mxnet_process('server', env.hosts, ps_port, ps_verbose)
        os.environ.update(_env_vars_for_role('worker', env.hosts, ps_port, ps_verbose))

    mpi_enabled = env.additional_framework_parameters.get(LAUNCH_MPI_ENV_NAME)

    if mpi_enabled:
        runner_type = runner.MPIRunnerType
    else:
        runner_type = runner.ProcessRunnerType

    entry_point.run(uri=env.module_dir,
                    user_entry_point=env.user_entry_point,
                    args=env.to_cmd_args(),
                    env_vars=env.to_env_vars(),
                    runner_type=runner_type)
def train(batch_size, epochs, learning_rate, num_gpus, training_channel, testing_channel,
          hosts, current_host, model_dir):
    (train_labels, train_images) = load_data(training_channel)
    (test_labels, test_images) = load_data(testing_channel)

    # Data parallel training - shard the data so each host
    # only trains on a subset of the total data.
    shard_size = len(train_images) // len(hosts)
    for i, host in enumerate(hosts):
        if host == current_host:
            start = shard_size * i
            end = start + shard_size
            break

    train_iter = mx.io.NDArrayIter(train_images[start:end], train_labels[start:end],
                                   batch_size, shuffle=True)
    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)

    logging.getLogger().setLevel(logging.DEBUG)

    kvstore = 'local' if len(hosts) == 1 else 'dist_sync'

    mlp_model = mx.mod.Module(symbol=build_graph(), context=get_training_context(num_gpus))
    mlp_model.fit(train_iter,
                  eval_data=val_iter,
                  kvstore=kvstore,
                  optimizer='sgd',
                  optimizer_params={'learning_rate': learning_rate},
                  eval_metric='acc',
                  batch_end_callback=mx.callback.Speedometer(batch_size, 100),
                  num_epoch=epochs)

    if current_host == scheduler_host(hosts):
        save(model_dir, mlp_model)
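# The train() functions above rely on a get_training_context(num_gpus) helper that is not shown
# in this excerpt. A minimal sketch, assuming it simply maps the SageMaker-provided GPU count to
# MXNet contexts (one context per GPU, CPU otherwise); the exact helper body is an assumption.
import mxnet as mx


def get_training_context(num_gpus):
    if num_gpus:
        return [mx.gpu(i) for i in range(num_gpus)]
    return mx.cpu()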
def _env_vars_for_role(role, hosts, ps_port, ps_verbose):
    if role in ROLES:
        return {
            'DMLC_NUM_WORKER': str(len(hosts)),
            'DMLC_NUM_SERVER': str(len(hosts)),
            'DMLC_ROLE': role,
            'DMLC_PS_ROOT_URI': _host_lookup(scheduler_host(hosts)),
            'DMLC_PS_ROOT_PORT': ps_port,
            'PS_VERBOSE': ps_verbose,
        }

    raise ValueError('Unexpected role: {}'.format(role))
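# _run_mxnet_process, referenced by the train(env) launchers above, is not shown in this excerpt.
# A minimal sketch, assuming each scheduler/server role is started as a background process whose
# only job is to import mxnet with the DMLC_* variables from _env_vars_for_role set; the exact
# command and process handling here are assumptions, not a verbatim implementation.
import os
import subprocess


def _run_mxnet_process(role, hosts, ps_port, ps_verbose):
    role_env = os.environ.copy()
    role_env.update(_env_vars_for_role(role, hosts, ps_port, ps_verbose))
    # With a distributed-kvstore build of MXNet, importing mxnet while DMLC_ROLE is 'scheduler'
    # or 'server' makes the process act as that parameter-server role.
    subprocess.Popen(["python", "-c", "import mxnet"], env=role_env)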
def train(
    batch_size,
    epochs,
    learning_rate,
    num_gpus,
    training_channel,
    testing_channel,
    hosts,
    current_host,
    model_dir,
):
    (train_labels, train_images) = load_data(training_channel)
    (test_labels, test_images) = load_data(testing_channel)

    # Data parallel training - shard the data so each host
    # only trains on a subset of the total data.
    shard_size = len(train_images) // len(hosts)
    for i, host in enumerate(hosts):
        if host == current_host:
            start = shard_size * i
            end = start + shard_size
            break

    train_iter = mx.io.NDArrayIter(
        train_images[start:end], train_labels[start:end], batch_size, shuffle=True
    )
    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)

    logging.getLogger().setLevel(logging.DEBUG)

    kvstore = "local" if len(hosts) == 1 else "dist_sync"

    mlp_model = mx.mod.Module(symbol=build_graph(), context=get_train_context(num_gpus))
    mlp_model.fit(
        train_iter,
        eval_data=val_iter,
        kvstore=kvstore,
        optimizer="sgd",
        optimizer_params={"learning_rate": learning_rate},
        eval_metric="acc",
        batch_end_callback=mx.callback.Speedometer(batch_size, 100),
        num_epoch=epochs,
    )

    if len(hosts) == 1 or current_host == scheduler_host(hosts):
        save(model_dir, mlp_model)
def train(env):
    logger.info('MXNet training environment: {}'.format(env.to_env_vars()))

    if env.additional_framework_parameters.get(LAUNCH_PS_ENV_NAME, False):
        _verify_hosts(env.hosts)

        ps_port = env.hyperparameters.get('_ps_port', '8000')
        ps_verbose = env.hyperparameters.get('_ps_verbose', '0')

        logger.info('Starting distributed training task')
        if scheduler_host(env.hosts) == env.current_host:
            _run_mxnet_process('scheduler', env.hosts, ps_port, ps_verbose)
        _run_mxnet_process('server', env.hosts, ps_port, ps_verbose)
        os.environ.update(
            _env_vars_for_role('worker', env.hosts, ps_port, ps_verbose))

    framework.modules.run_module(env.module_dir, env.to_cmd_args(),
                                 env.to_env_vars(), env.module_name)
    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
    parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    num_cpus = int(os.environ['SM_NUM_CPUS'])
    num_gpus = int(os.environ['SM_NUM_GPUS'])

    model = train(args.current_host, args.hosts, num_cpus, num_gpus, args.training_channel,
                  args.model_dir, args.batch_size, args.epochs, args.learning_rate,
                  args.log_interval, args.embedding_size)

    if args.current_host == scheduler_host(args.hosts):
        save(model, args.model_dir)


# ------------------------------------------------------------ #
# Hosting methods                                               #
# ------------------------------------------------------------ #


def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.

    :param: model_dir The directory where model files are stored.
    :return: a model (in this case a Gluon network)
    """
    symbol = mx.sym.load('%s/model.json' % model_dir)
def test_distributed_scheduler_host():
    assert training_utils.scheduler_host([SCHEDULER_HOST, WORKER_HOST]) == SCHEDULER_HOST
def test_single_machine_scheduler_host():
    assert training_utils.scheduler_host([SCHEDULER_HOST]) == SCHEDULER_HOST
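# The scheduler_host helper exercised by these tests is not shown in this excerpt. A minimal
# implementation consistent with both tests (an assumption, not necessarily the library's own
# code) simply treats the first entry of the host list as the scheduler:
def scheduler_host(hosts):
    return hosts[0]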