Example #1
def train(hosts, current_host, num_gpus, custom_mpi_cmds):
    hyperparameters = framework.env.read_hyperparameters()
    env = framework.training_env(hyperparameters=hyperparameters)
    process_slots_per_host = num_gpus

    _start_ssh_daemon()
    # Comment out the conflicting MPI setting
    subprocess.check_call("sed -ie \"s/btl_tcp_if_exclude/#btl_tcp_if_exclude/g\" /usr/local/etc/openmpi-mca-params.conf", shell=True)

    if current_host == hosts[0]:
        host_list = hosts if process_slots_per_host == 1 else \
            [host + ':{}'.format(process_slots_per_host) for host in hosts]

        num_processes = process_slots_per_host * len(hosts)
        credential_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN']
        # Build mpirun file
        mpi_command = [
            '#!/usr/bin/env bash \n',
            '/usr/local/bin/mpirun --allow-run-as-root --display-map --tag-output --host {} \\\n'.format(",".join(host_list)),
            ' --mca plm_rsh_no_tree_spawn 1 \\\n',
            ' -mca pml ob1 \\\n',
            ' -mca btl ^openib \\\n',
            ' -bind-to None \\\n',
            ' -map-by slot \\\n',
            ' -mca btl_vader_single_copy_mechanism none \\\n',
            ' -mca btl_tcp_if_include {} \\\n'.format(env.network_interface_name),
            ' -mca oob_tcp_if_include {} \\\n'.format(env.network_interface_name),
            ' -x NCCL_SOCKET_IFNAME={} \\\n'.format(env.network_interface_name),
            ' -x NCCL_MIN_NRINGS=8 \\\n',
            ' -x HOROVOD_CYCLE_TIME=0.5 \\\n',
            ' -x TF_CUDNN_USE_AUTOTUNE=0 \\\n',
            ' -x HOROVOD_FUSION_THRESHOLD=67108864 \\\n',
            ' -x TENSORPACK_FP16=1 \\\n',
            ' -x PATH \\\n',
            ' -x LD_LIBRARY_PATH \\\n',
            ' -x NCCL_DEBUG=INFO \\\n',
            ' -mca orte_abort_on_non_zero_status 1 \\\n',
            ' -np {} \\\n'.format(num_processes),
        ]
        for v in credential_vars:
            if v in os.environ:
                mpi_command.append(" -x {} \\\n".format(v))
        for cmd in custom_mpi_cmds:
            mpi_command.append("{} \\\n".format(cmd))
        mpi_command.append("/opt/ml/code/run.sh")
        # Write the script to disk and launch MPI
        with open('mpi_cmd.sh', 'w') as the_file:
            for item in mpi_command:
                the_file.write(item)
        with open('mpi_cmd.sh', 'r') as the_file:
            logger.info('MPI script:\n\n%s', the_file.read())
        subprocess.check_call("chmod +x mpi_cmd.sh", shell=True)
        _wait_for_worker_nodes_to_start_sshd(hosts)
        subprocess.check_call("./mpi_cmd.sh", shell=True)
    else:
        _wait_master_to_start(hosts[0])
        _wait_master_to_finish(hosts[0])
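The example above relies on several SSH helpers whose definitions are not shown. Below is a minimal sketch of what they might look like, assuming OpenSSH at its usual container path and that hosts are reachable by name; every body here is a hypothetical reconstruction, not the original implementation.

import socket
import subprocess
import time


def _start_ssh_daemon():
    # Hypothetical: launch sshd so mpirun's rsh-based launcher can reach
    # this host. Assumes the OpenSSH server at its usual container path.
    subprocess.Popen(['/usr/sbin/sshd', '-D'])


def _can_connect(host, port=22):
    # True if a TCP connection to the host's SSH port succeeds.
    try:
        with socket.create_connection((host, port), timeout=5):
            return True
    except OSError:
        return False


def _wait_for_worker_nodes_to_start_sshd(hosts, interval=1, timeout=300):
    # Poll each worker until sshd accepts connections or the timeout hits.
    deadline = time.time() + timeout
    pending = list(hosts)
    while pending and time.time() < deadline:
        pending = [h for h in pending if not _can_connect(h)]
        time.sleep(interval)
    if pending:
        raise RuntimeError('sshd never became reachable on: {}'.format(pending))


def _wait_master_to_start(master):
    # Workers block until the master's sshd is reachable.
    while not _can_connect(master):
        time.sleep(1)


def _wait_master_to_finish(master):
    # Workers treat the master's SSH port going away as the end of training.
    while _can_connect(master):
        time.sleep(30)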
Example #2
def main():
    """Training entry point
    """
    hyperparameters = framework.env.read_hyperparameters()
    env = framework.training_env(hyperparameters=hyperparameters)

    user_hyperparameters = env.hyperparameters

    # If this training job is one of several launched by a hyperparameter tuning job,
    # append the training job name to model_dir so the jobs do not read from/write to
    # the same object
    if '_tuning_objective_metric' in hyperparameters:
        model_dir = _model_dir_with_training_job(
            hyperparameters.get('model_dir'), env.job_name)
        logger.info('Appending the training job name to model_dir: {}'.format(
            model_dir))
        user_hyperparameters['model_dir'] = model_dir

    s3_utils.configure(user_hyperparameters.get('model_dir'),
                       os.environ.get('SAGEMAKER_REGION'))
    train(env, framework.mapping.to_cmd_args(user_hyperparameters))
    _log_model_missing_warning(MODEL_DIR)
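The _model_dir_with_training_job helper used above is not shown. A plausible minimal sketch, assuming model_dir is an S3 URI; the suffixing logic is a hypothetical reconstruction:

def _model_dir_with_training_job(model_dir, job_name):
    # Hypothetical reconstruction: suffix the S3 model_dir with the training
    # job name so tuning jobs sharing a base model_dir write to distinct
    # prefixes. Non-S3 paths are returned unchanged.
    if model_dir and model_dir.startswith('s3://'):
        return '{}/{}'.format(model_dir, job_name)
    return model_dir


# _model_dir_with_training_job('s3://bucket/model', 'job-1')
#   -> 's3://bucket/model/job-1'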
Example #3
def main():
    train(framework.training_env())
    sys.exit(0)
Example #4
def main():
    train(framework.training_env())
Example #5
def main():
    hyperparameters = framework.env.read_hyperparameters()
    env = framework.training_env(hyperparameters=hyperparameters)

    logger.setLevel(env.log_level)
    train(env, hyperparameters)
Example #6
def test_env_vars_round_trip():
    hyperparameters = {
        "loss": "SGD",
        "sagemaker_program": "user_script.py",
        "epochs": 10,
        "batch_size": 64,
        "precision": 5.434322,
        "sagemaker_region": "us-west-2",
        "sagemaker_job_name": "horovod-training-job",
        "sagemaker_submit_directory": "s3/something",
    }

    resource_config = {
        "current_host": "algo-1",
        "hosts": ["algo-1", "algo-2", "algo-3"]
    }

    input_data_config = {
        "train": {
            "ContentType": "trainingContentType",
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None",
        },
        "validation": {
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None",
        },
    }

    os.environ[framework.params.FRAMEWORK_TRAINING_MODULE_ENV] = (
        "test.functional.simple_framework:train")

    training_env = framework.training_env(
        resource_config=resource_config,
        input_data_config=input_data_config,
        hyperparameters=hyperparameters,
    )

    os.environ[framework.params.FRAMEWORK_TRAINING_MODULE_ENV] = ""

    args = framework.mapping.to_cmd_args(training_env.hyperparameters)

    env_vars = training_env.to_env_vars()
    env_vars["SM_USER_ARGS"] = " ".join(args)

    assert env_vars["SM_OUTPUT_DATA_DIR"] == training_env.output_data_dir
    assert (
        env_vars["SM_INPUT_DATA_CONFIG"] ==
        '{"train":{"ContentType":"trainingContentType",'
        '"RecordWrapperType":"None","S3DistributionType":"FullyReplicated",'
        '"TrainingInputMode":"File"},"validation":{"RecordWrapperType":"None",'
        '"S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}')
    assert env_vars["SM_NETWORK_INTERFACE_NAME"] == "eth0"
    assert env_vars["SM_LOG_LEVEL"] == "20"
    assert env_vars["SM_INPUT_DIR"].endswith("/opt/ml/input")
    assert env_vars["SM_NUM_CPUS"] == str(training_env.num_cpus)
    assert env_vars["SM_HP_BATCH_SIZE"] == "64"
    assert env_vars["SM_CHANNEL_TRAIN"].endswith("/opt/ml/input/data/train")
    assert env_vars["SM_CHANNEL_VALIDATION"].endswith(
        "/opt/ml/input/data/validation")
    assert env_vars["SM_HP_EPOCHS"] == "10"
    assert env_vars[
        "SM_HPS"] == '{"batch_size":64,"epochs":10,"loss":"SGD","precision":5.434322}'
    assert env_vars["SM_HP_PRECISION"] == "5.434322"
    assert (env_vars["SM_RESOURCE_CONFIG"] ==
            '{"current_host":"algo-1","hosts":["algo-1","algo-2","algo-3"]}')
    assert env_vars["SM_MODULE_NAME"] == "user_script"
    assert env_vars["SM_INPUT_CONFIG_DIR"].endswith("/opt/ml/input/config")
    assert env_vars[
        "SM_USER_ARGS"] == "--batch_size 64 --epochs 10 --loss SGD --precision 5.434322"
    assert env_vars["SM_OUTPUT_DIR"].endswith("/opt/ml/output")
    assert env_vars["SM_MODEL_DIR"].endswith("/opt/ml/model")
    assert env_vars["SM_HOSTS"] == '["algo-1","algo-2","algo-3"]'
    assert env_vars["SM_NUM_GPUS"] == str(training_env.num_gpus)
    assert env_vars["SM_MODULE_DIR"] == "s3/something"
    assert env_vars["SM_CURRENT_HOST"] == "algo-1"
    assert env_vars["SM_CHANNELS"] == '["train","validation"]'
    assert env_vars["SM_HP_LOSS"] == "SGD"
    assert env_vars[
        "SM_FRAMEWORK_MODULE"] == "test.functional.simple_framework:train"

    assert all(x in env_vars["SM_TRAINING_ENV"]
               for x in training_env.properties())
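The SM_USER_ARGS assertion pins down the observable behavior of framework.mapping.to_cmd_args: hyperparameters come out as sorted --key value pairs. The sketch below reproduces that behavior; it is reconstructed from the assertion alone, not from the framework's actual source.

def to_cmd_args_sketch(hyperparameters):
    # Keys are sorted and each becomes a "--key value" pair, matching the
    # SM_USER_ARGS assertion above.
    args = []
    for key in sorted(hyperparameters):
        args.extend(["--{}".format(key), str(hyperparameters[key])])
    return args


# " ".join(...) reproduces the asserted string:
# "--batch_size 64 --epochs 10 --loss SGD --precision 5.434322"
print(" ".join(to_cmd_args_sketch(
    {"loss": "SGD", "epochs": 10, "batch_size": 64, "precision": 5.434322})))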
Example #7
def train(hosts, current_host, num_gpus, custom_mpi_cmds):
    hyperparameters = framework.env.read_hyperparameters()
    env = framework.training_env(hyperparameters=hyperparameters)
    process_slots_per_host = num_gpus

    # Data Preprocessing
    print("Download pre-trained model....")
    subprocess.check_call("mkdir -p /opt/ml/code/data/pretrained-models",
                          shell=True)
    subprocess.check_call(
        "wget http://models.tensorpack.com/FasterRCNN/ImageNet-R50-AlignPadding.npz",
        shell=True)
    subprocess.check_call(
        "cp ImageNet-R50-AlignPadding.npz data/pretrained-models", shell=True)
    print("Loading data from s3......")
    subprocess.check_call(
        "aws s3 cp s3://armand-ajay-workshop/mask-rcnn/sagemaker/input/train /opt/ml/code/data --recursive --quiet",
        shell=True)
    print("Loading data finsihed...Install tensorpack....")
    subprocess.check_call(
        "git clone https://github.com/armandmcqueen/tensorpack-mask-rcnn /opt/ml/code/tensorpack-mask-rcnn",
        shell=True)
    subprocess.check_call("chmod -R +w /opt/ml/code/tensorpack-mask-rcnn",
                          shell=True)
    subprocess.check_call(
        "pip install --ignore-installed -e /opt/ml/code/tensorpack-mask-rcnn/",
        shell=True)
    subprocess.check_call("chmod +x /opt/ml/code/run.sh", shell=True)
    print("Tensorpack install finished...")

    _start_ssh_daemon()
    # Comment out the conflicting MPI setting
    subprocess.check_call(
        "sed -ie \"s/btl_tcp_if_exclude/#btl_tcp_if_exclude/g\" /usr/local/etc/openmpi-mca-params.conf",
        shell=True)

    if current_host == hosts[0]:
        host_list = hosts if process_slots_per_host == 1 else \
            [host + ':{}'.format(process_slots_per_host) for host in hosts]

        num_processes = process_slots_per_host * len(hosts)
        credential_vars = [
            'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN'
        ]
        # Build mpirun file
        mpi_command = [
            '#!/usr/bin/env bash \n',
            '/usr/local/bin/mpirun --allow-run-as-root --display-map --tag-output --host {} \\\n'.format(",".join(host_list)),
            ' --mca plm_rsh_no_tree_spawn 1 \\\n',
            ' -mca pml ob1 \\\n',
            ' -mca btl ^openib \\\n',
            ' -bind-to None \\\n',
            ' -map-by slot \\\n',
            ' -mca btl_vader_single_copy_mechanism none \\\n',
            ' -mca btl_tcp_if_include {} \\\n'.format(env.network_interface_name),
            ' -mca oob_tcp_if_include {} \\\n'.format(env.network_interface_name),
            ' -x NCCL_SOCKET_IFNAME={} \\\n'.format(env.network_interface_name),
            ' -x NCCL_MIN_NRINGS=8 \\\n',
            ' -x HOROVOD_CYCLE_TIME=0.5 \\\n',
            ' -x HOROVOD_FUSION_THRESHOLD=67108864 \\\n',
            ' -x TENSORPACK_FP16=1 \\\n',
            ' -x PATH \\\n',
            ' -x LD_LIBRARY_PATH \\\n',
            ' -x NCCL_DEBUG=INFO \\\n',
            ' -mca orte_abort_on_non_zero_status 1 \\\n',
            ' -np {} \\\n'.format(num_processes),
        ]
        for v in credential_vars:
            if v in os.environ:
                mpi_command.append(" -x {} \\\n".format(v))
        for cmd in custom_mpi_cmds:
            mpi_command.append("{} \\\n".format(cmd))
        mpi_command.append("/opt/ml/code/run.sh")
        # Write the script to disk and launch MPI
        with open('mpi_cmd.sh', 'w') as the_file:
            for item in mpi_command:
                the_file.write(item)
        with open('mpi_cmd.sh', 'r') as the_file:
            logger.info('MPI script:\n\n%s', the_file.read())
        subprocess.check_call("chmod +x mpi_cmd.sh", shell=True)
        _wait_for_worker_nodes_to_start_sshd(hosts)
        subprocess.check_call("./mpi_cmd.sh", shell=True)
    else:
        _wait_master_to_start(hosts[0])
        _wait_master_to_finish(hosts[0])
Example #8
def test_env_vars_round_trip():
    hyperparameters = {
        'loss': 'SGD',
        'sagemaker_program': 'user_script.py',
        'epochs': 10,
        'batch_size': 64,
        'precision': 5.434322,
        'sagemaker_region': 'us-west-2',
        'sagemaker_job_name': 'horovod-training-job',
        'sagemaker_submit_directory': 's3/something'
    }

    resource_config = {
        'current_host': 'algo-1',
        'hosts': ['algo-1', 'algo-2', 'algo-3']
    }

    input_data_config = {
        'train': {
            'ContentType': 'trainingContentType',
            'TrainingInputMode': 'File',
            'S3DistributionType': 'FullyReplicated',
            'RecordWrapperType': 'None'
        },
        'validation': {
            'TrainingInputMode': 'File',
            'S3DistributionType': 'FullyReplicated',
            'RecordWrapperType': 'None'
        }
    }

    os.environ[framework.params.FRAMEWORK_TRAINING_MODULE_ENV] = (
        'test.functional.simple_framework:train')

    training_env = framework.training_env(resource_config=resource_config,
                                          input_data_config=input_data_config,
                                          hyperparameters=hyperparameters)

    os.environ[framework.params.FRAMEWORK_TRAINING_MODULE_ENV] = ''

    args = framework.mapping.to_cmd_args(training_env.hyperparameters)

    env_vars = training_env.to_env_vars()
    env_vars['SM_USER_ARGS'] = ' '.join(args)

    assert env_vars['SM_OUTPUT_DATA_DIR'] == training_env.output_data_dir
    assert env_vars['SM_INPUT_DATA_CONFIG'] == '{"train":{"ContentType":"trainingContentType",' \
                                               '"RecordWrapperType":"None","S3DistributionType":"FullyReplicated",' \
                                               '"TrainingInputMode":"File"},"validation":{"RecordWrapperType":"None",' \
                                               '"S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}'
    assert env_vars['SM_NETWORK_INTERFACE_NAME'] == 'ethwe'
    assert env_vars['SM_LOG_LEVEL'] == '20'
    assert env_vars['SM_INPUT_DIR'].endswith('/opt/ml/input')
    assert env_vars['SM_NUM_CPUS'] == str(training_env.num_cpus)
    assert env_vars['SM_HP_BATCH_SIZE'] == '64'
    assert env_vars['SM_CHANNEL_TRAIN'].endswith('/opt/ml/input/data/train')
    assert env_vars['SM_CHANNEL_VALIDATION'].endswith(
        '/opt/ml/input/data/validation')
    assert env_vars['SM_HP_EPOCHS'] == '10'
    assert env_vars[
        'SM_HPS'] == '{"batch_size":64,"epochs":10,"loss":"SGD","precision":5.434322}'
    assert env_vars['SM_HP_PRECISION'] == '5.434322'
    assert env_vars[
        'SM_RESOURCE_CONFIG'] == '{"current_host":"algo-1","hosts":["algo-1","algo-2","algo-3"]}'
    assert env_vars['SM_MODULE_NAME'] == 'user_script'
    assert env_vars['SM_INPUT_CONFIG_DIR'].endswith('/opt/ml/input/config')
    assert env_vars[
        'SM_USER_ARGS'] == '--batch_size 64 --epochs 10 --loss SGD --precision 5.434322'
    assert env_vars['SM_OUTPUT_DIR'].endswith('/opt/ml/output')
    assert env_vars['SM_MODEL_DIR'].endswith('/opt/ml/model')
    assert env_vars['SM_HOSTS'] == '["algo-1","algo-2","algo-3"]'
    assert env_vars['SM_NUM_GPUS'] == str(training_env.num_gpus)
    assert env_vars['SM_MODULE_DIR'] == 's3/something'
    assert env_vars['SM_CURRENT_HOST'] == 'algo-1'
    assert env_vars['SM_CHANNELS'] == '["train","validation"]'
    assert env_vars['SM_HP_LOSS'] == 'SGD'
    assert env_vars[
        'SM_FRAMEWORK_MODULE'] == 'test.functional.simple_framework:train'

    assert all(x in env_vars['SM_TRAINING_ENV']
               for x in training_env.properties())
Example #9
def cli(program, args):
    hyperparameters = framework.env.read_hyperparameters()
    env = framework.training_env(hyperparameters=hyperparameters)

    logger.setLevel(env.log_level)
    train(env, hyperparameters, program, args)
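How cli receives program and args is not shown in this example. One hypothetical entry point, splitting them off sys.argv:

import sys


def main():
    # Hypothetical wiring: the first CLI argument names the program to run;
    # everything after it is forwarded verbatim to cli() above.
    program, args = sys.argv[1], list(sys.argv[2:])
    cli(program, args)


if __name__ == '__main__':
    main()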