Code Example #1
def train(batch_size, num_epoch, learning_rate, optimizer, training_channel,
          testing_channel, hosts, current_host, model_dir):
    (train_labels, train_images) = load_data(training_channel)
    (test_labels, test_images) = load_data(testing_channel)

    # As an alternative to splitting in memory, the data could be pre-split in S3
    # and sharded per host with ShardedByS3Key (see the sketch after this example).
    shard_size = len(train_images) // len(hosts)
    for i, host in enumerate(hosts):
        if host == current_host:
            start = shard_size * i
            end = start + shard_size
            break

    train_iter = mx.io.NDArrayIter(train_images[start:end],
                                   train_labels[start:end],
                                   batch_size,
                                   shuffle=True)
    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)
    logging.getLogger().setLevel(logging.DEBUG)
    kvstore = 'local' if len(hosts) == 1 else 'dist_sync'
    mlp_model = mx.mod.Module(symbol=build_graph(),
                              context=get_train_context())
    mlp_model.fit(train_iter,
                  eval_data=val_iter,
                  kvstore=kvstore,
                  optimizer=optimizer,
                  optimizer_params={'learning_rate': learning_rate},
                  eval_metric='acc',
                  batch_end_callback=mx.callback.Speedometer(batch_size, 100),
                  num_epoch=num_epoch)
    if current_host == scheduler_host(hosts):
        save(model_dir, mlp_model)

    return mlp_model
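
As a hedged illustration of the alternative mentioned in the comment above: with the SageMaker Python SDK (v2 assumed), pre-split data in S3 can be sharded per host by setting distribution='ShardedByS3Key' on the input channel. The bucket, prefix and channel names below are placeholders, not taken from the original example.

from sagemaker.inputs import TrainingInput

# Each training host receives a disjoint subset of the S3 objects under the
# prefix, so no in-memory slicing of train_images/train_labels is needed.
train_input = TrainingInput(s3_data='s3://my-bucket/mnist/train',  # placeholder URI
                            distribution='ShardedByS3Key')
# estimator.fit({'training': train_input, 'testing': test_input})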
Code Example #2
def train(
    batch_size,
    epochs,
    learning_rate,
    num_gpus,
    training_channel,
    testing_channel,
    hosts,
    current_host,
    model_dir,
):
    (train_labels, train_images) = load_data(training_channel)
    (test_labels, test_images) = load_data(testing_channel)
    CHECKPOINTS_DIR = "/opt/ml/checkpoints"
    checkpoints_enabled = os.path.exists(CHECKPOINTS_DIR)

    # Data parallel training - shard the data so each host
    # only trains on a subset of the total data.
    shard_size = len(train_images) // len(hosts)
    for i, host in enumerate(hosts):
        if host == current_host:
            start = shard_size * i
            end = start + shard_size
            break

    train_iter = mx.io.NDArrayIter(train_images[start:end],
                                   train_labels[start:end],
                                   batch_size,
                                   shuffle=True)
    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)

    logging.getLogger().setLevel(logging.DEBUG)

    kvstore = "local" if len(hosts) == 1 else "dist_sync"

    mlp_model = mx.mod.Module(symbol=build_graph(),
                              context=get_training_context(num_gpus))

    checkpoint_callback = None
    if checkpoints_enabled:
        # Create a checkpoint callback that checkpoints the model params and the
        # optimizer state after every epoch at the given path.
        checkpoint_callback = mx.callback.module_checkpoint(
            mlp_model,
            CHECKPOINTS_DIR + "/mnist",
            period=1,
            save_optimizer_states=True)
    mlp_model.fit(
        train_iter,
        eval_data=val_iter,
        kvstore=kvstore,
        optimizer="sgd",
        optimizer_params={"learning_rate": learning_rate},
        eval_metric="acc",
        epoch_end_callback=checkpoint_callback,
        batch_end_callback=mx.callback.Speedometer(batch_size, 100),
        num_epoch=epochs,
    )

    if current_host == scheduler_host(hosts):
        save(model_dir, mlp_model)
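
A minimal sketch, not part of the original example, of how a restarted job could resume from the newest checkpoint written by the callback above. Module.load is a standard MXNet Module API; the directory-scanning helper and the "mnist" prefix/file layout are assumptions based on the code above.

import os
import re

import mxnet as mx

CHECKPOINTS_DIR = "/opt/ml/checkpoints"

def latest_checkpoint_epoch(checkpoints_dir, prefix="mnist"):
    # e.g. "mnist-0003.params" -> 3; returns None when no checkpoint exists yet
    pattern = re.compile(r"^%s-(\d+)\.params$" % prefix)
    epochs = [int(m.group(1)) for f in os.listdir(checkpoints_dir)
              for m in [pattern.match(f)] if m]
    return max(epochs) if epochs else None

epoch = latest_checkpoint_epoch(CHECKPOINTS_DIR)
if epoch is not None:
    # Restores the symbol, parameters and optimizer state saved by module_checkpoint
    mlp_model = mx.mod.Module.load(CHECKPOINTS_DIR + "/mnist", epoch,
                                   load_optimizer_states=True)
    # ...then continue training with mlp_model.fit(..., begin_epoch=epoch).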
Code Example #3
def train(env):
    logger.info('MXNet training environment: {}'.format(env.to_env_vars()))

    if env.additional_framework_parameters.get(LAUNCH_PS_ENV_NAME, False):
        _verify_hosts(env.hosts)

        ps_port = env.hyperparameters.get('_ps_port', '8000')
        ps_verbose = env.hyperparameters.get('_ps_verbose', '0')

        logger.info('Starting distributed training task')
        if scheduler_host(env.hosts) == env.current_host:
            _run_mxnet_process('scheduler', env.hosts, ps_port, ps_verbose)
        _run_mxnet_process('server', env.hosts, ps_port, ps_verbose)
        os.environ.update(_env_vars_for_role('worker', env.hosts, ps_port, ps_verbose))

    mpi_enabled = env.additional_framework_parameters.get(LAUNCH_MPI_ENV_NAME)

    if mpi_enabled:
        runner_type = runner.MPIRunnerType
    else:
        runner_type = runner.ProcessRunnerType

    entry_point.run(uri=env.module_dir,
                    user_entry_point=env.user_entry_point,
                    args=env.to_cmd_args(),
                    env_vars=env.to_env_vars(),
                    runner_type=runner_type)
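
For context, a hedged sketch of how the parameter-server code path above is typically enabled from the SageMaker Python SDK (v2). The entry point, role and instance settings are placeholders.

from sagemaker.mxnet import MXNet

estimator = MXNet(entry_point='mnist.py',          # placeholder training script
                  role='SageMakerRole',            # placeholder IAM role
                  instance_count=2,
                  instance_type='ml.m5.xlarge',
                  framework_version='1.8.0',
                  py_version='py37',
                  # turns on the launch-parameter-server flag checked above
                  distribution={'parameter_server': {'enabled': True}})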
Code Example #4
def train(batch_size, epochs, learning_rate, num_gpus, training_channel, testing_channel,
          hosts, current_host, model_dir):
    (train_labels, train_images) = load_data(training_channel)
    (test_labels, test_images) = load_data(testing_channel)

    # Data parallel training - shard the data so each host
    # only trains on a subset of the total data.
    shard_size = len(train_images) // len(hosts)
    for i, host in enumerate(hosts):
        if host == current_host:
            start = shard_size * i
            end = start + shard_size
            break

    train_iter = mx.io.NDArrayIter(train_images[start:end], train_labels[start:end], batch_size,
                                   shuffle=True)
    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)

    logging.getLogger().setLevel(logging.DEBUG)

    kvstore = 'local' if len(hosts) == 1 else 'dist_sync'

    mlp_model = mx.mod.Module(symbol=build_graph(),
                              context=get_training_context(num_gpus))
    mlp_model.fit(train_iter,
                  eval_data=val_iter,
                  kvstore=kvstore,
                  optimizer='sgd',
                  optimizer_params={'learning_rate': learning_rate},
                  eval_metric='acc',
                  batch_end_callback=mx.callback.Speedometer(batch_size, 100),
                  num_epoch=epochs)

    if current_host == scheduler_host(hosts):
        save(model_dir, mlp_model)
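
The examples above rely on a get_training_context helper that is defined elsewhere; a plausible minimal definition (an assumption, not the original code) is:

import mxnet as mx

def get_training_context(num_gpus):
    # Use every GPU reported for the instance, otherwise fall back to CPU.
    if num_gpus:
        return [mx.gpu(i) for i in range(num_gpus)]
    return mx.cpu()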
Code Example #5
def _env_vars_for_role(role, hosts, ps_port, ps_verbose):
    if role in ROLES:
        return {
            'DMLC_NUM_WORKER': str(len(hosts)),
            'DMLC_NUM_SERVER': str(len(hosts)),
            'DMLC_ROLE': role,
            'DMLC_PS_ROOT_URI': _host_lookup(scheduler_host(hosts)),
            'DMLC_PS_ROOT_PORT': ps_port,
            'PS_VERBOSE': ps_verbose,
        }

    raise ValueError('Unexpected role: {}'.format(role))
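
One plausible shape for the _run_mxnet_process helper used in Code Examples #3 and #7, sketched under the assumption that the scheduler and server roles only need the DMLC_* variables set before MXNet is imported:

import os
import subprocess

def _run_mxnet_process(role, hosts, ps_port, ps_verbose):
    # The scheduler/server do no work of their own in the training script;
    # importing mxnet with the DMLC_* environment variables set starts the role.
    role_env = os.environ.copy()
    role_env.update(_env_vars_for_role(role, hosts, ps_port, ps_verbose))
    subprocess.Popen("python -c 'import mxnet'", shell=True, env=role_env)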
Code Example #6
def train(
    batch_size,
    epochs,
    learning_rate,
    num_gpus,
    training_channel,
    testing_channel,
    hosts,
    current_host,
    model_dir,
):
    (train_labels, train_images) = load_data(training_channel)
    (test_labels, test_images) = load_data(testing_channel)

    # Data parallel training - shard the data so each host
    # only trains on a subset of the total data.
    shard_size = len(train_images) // len(hosts)
    for i, host in enumerate(hosts):
        if host == current_host:
            start = shard_size * i
            end = start + shard_size
            break

    train_iter = mx.io.NDArrayIter(train_images[start:end],
                                   train_labels[start:end],
                                   batch_size,
                                   shuffle=True)
    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)

    logging.getLogger().setLevel(logging.DEBUG)

    kvstore = "local" if len(hosts) == 1 else "dist_sync"

    mlp_model = mx.mod.Module(symbol=build_graph(),
                              context=get_train_context(num_gpus))
    mlp_model.fit(
        train_iter,
        eval_data=val_iter,
        kvstore=kvstore,
        optimizer="sgd",
        optimizer_params={"learning_rate": learning_rate},
        eval_metric="acc",
        batch_end_callback=mx.callback.Speedometer(batch_size, 100),
        num_epoch=epochs,
    )

    if len(hosts) == 1 or current_host == scheduler_host(hosts):
        save(model_dir, mlp_model)
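
The save() call used by these training functions is defined outside the excerpts; a hedged sketch that writes the trained Module in the layout the SageMaker MXNet serving container expects (file names are assumptions):

import json
import os

def save(model_dir, model):
    # Persist the network definition and the trained parameters.
    model.symbol.save(os.path.join(model_dir, 'model-symbol.json'))
    model.save_params(os.path.join(model_dir, 'model-0000.params'))

    # Record the input signature so serving code can re-bind the module later.
    signature = [{'name': data_desc.name, 'shape': [dim for dim in data_desc.shape]}
                 for data_desc in model.data_shapes]
    with open(os.path.join(model_dir, 'model-shapes.json'), 'w') as f:
        json.dump(signature, f)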
Code Example #7
def train(env):
    logger.info('MXNet training environment: {}'.format(env.to_env_vars()))

    if env.additional_framework_parameters.get(LAUNCH_PS_ENV_NAME, False):
        _verify_hosts(env.hosts)

        ps_port = env.hyperparameters.get('_ps_port', '8000')
        ps_verbose = env.hyperparameters.get('_ps_verbose', '0')

        logger.info('Starting distributed training task')
        if scheduler_host(env.hosts) == env.current_host:
            _run_mxnet_process('scheduler', env.hosts, ps_port, ps_verbose)
        _run_mxnet_process('server', env.hosts, ps_port, ps_verbose)
        os.environ.update(
            _env_vars_for_role('worker', env.hosts, ps_port, ps_verbose))

    framework.modules.run_module(env.module_dir, env.to_cmd_args(),
                                 env.to_env_vars(), env.module_name)
Code Example #8
    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
    parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    num_cpus = int(os.environ['SM_NUM_CPUS'])
    num_gpus = int(os.environ['SM_NUM_GPUS'])

    model = train(args.current_host, args.hosts, num_cpus, num_gpus, args.training_channel, args.model_dir,
                  args.batch_size, args.epochs, args.learning_rate, args.log_interval, args.embedding_size)

    if args.current_host == scheduler_host(args.hosts):
        save(model, args.model_dir)


# ------------------------------------------------------------ #
# Hosting methods                                              #
# ------------------------------------------------------------ #

def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.

    :param: model_dir The directory where model files are stored.
    :return: a model (in this case a Gluon network)
    """
    symbol = mx.sym.load('%s/model.json' % model_dir)
Code Example #9
                        default=json.loads(os.environ['SM_HOSTS']))

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    num_cpus = int(os.environ['SM_NUM_CPUS'])
    num_gpus = int(os.environ['SM_NUM_GPUS'])

    model = train(args.current_host, args.hosts, num_cpus, num_gpus,
                  args.training_channel, args.model_dir, args.batch_size,
                  args.epochs, args.learning_rate, args.log_interval,
                  args.embedding_size)

    if args.current_host == scheduler_host(args.hosts):
        save(model, args.model_dir)

# ------------------------------------------------------------ #
# Hosting methods                                              #
# ------------------------------------------------------------ #


def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.

    :param: model_dir The directory where model files are stored.
    :return: a model (in this case a Gluon network)
    """
    symbol = mx.sym.load('%s/model.json' % model_dir)
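
The model_fn listings above are truncated after loading the symbol; a hedged sketch of how such a Gluon model_fn commonly continues (the output layer, input name and parameter file name are assumptions):

import mxnet as mx
from mxnet import gluon

def model_fn(model_dir):
    # Rebuild the network from the exported symbol and load its parameters.
    symbol = mx.sym.load('%s/model.json' % model_dir)
    outputs = mx.sym.softmax(data=symbol, name='softmax_label')
    inputs = mx.sym.var('data')
    net = gluon.SymbolBlock(outputs, inputs)
    net.load_parameters('%s/model.params' % model_dir, ctx=mx.cpu())
    return net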
Code Example #10
def test_distributed_scheduler_host():
    assert training_utils.scheduler_host([SCHEDULER_HOST,
                                          WORKER_HOST]) == SCHEDULER_HOST
Code Example #11
def test_single_machine_scheduler_host():
    assert training_utils.scheduler_host([SCHEDULER_HOST]) == SCHEDULER_HOST
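
Both tests pass with the simplest possible helper, shown below as an assumption about training_utils rather than its actual source: the parameter-server scheduler runs on the first host in the host list.

def scheduler_host(hosts):
    return hosts[0]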