Ejemplo n.º 1
0
def test_mnist_cpu(docker_image, opt_ml, use_gpu):
    """Train the MNIST script in local mode and check its output files.

    The run passes when 'model/model.pth' and 'output/success' exist under
    ``opt_ml`` and 'output/failure' does not.
    """
    local_mode.train(
        mnist_script, data_dir, docker_image, opt_ml, use_gpu=use_gpu)

    model_created = local_mode.file_exists(opt_ml, 'model/model.pth')
    assert model_created, 'Model file was not created'

    success_written = local_mode.file_exists(opt_ml, 'output/success')
    assert success_written, 'Success file was not created'

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert not failure_written, 'Failure happened'
Ejemplo n.º 2
0
def test_mnist(docker_image, opt_ml, processor):
    """Train the MNIST script in local mode for the given processor.

    ``processor`` is forwarded to the script as the 'processor'
    hyperparameter. The run must produce the model and success files and
    must not produce a failure file.
    """
    hyperparameters = {'processor': processor}
    local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
                     hyperparameters=hyperparameters)

    required = (
        ('model/model.pth', 'Model file was not created'),
        ('output/success', 'Success file was not created'),
    )
    for relative_path, message in required:
        assert local_mode.file_exists(opt_ml, relative_path), message

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert not failure_written, 'Failure happened'
def test_dist_operations_path_cpu(docker_image, opt_ml, dist_cpu_backend):
    """Run the distributed-operations script in local mode with cluster_size=3.

    The backend under test is passed to the script as the 'backend'
    hyperparameter. Success requires both 'model/success' and
    'output/success' to exist and 'output/failure' to be absent.
    """
    backend_config = {'backend': dist_cpu_backend}
    local_mode.train(
        dist_operations_path,
        data_dir,
        docker_image,
        opt_ml,
        cluster_size=3,
        hyperparameters=backend_config,
    )

    script_succeeded = local_mode.file_exists(opt_ml, 'model/success')
    assert script_succeeded, 'Script success file was not created'

    job_succeeded = local_mode.file_exists(opt_ml, 'output/success')
    assert job_succeeded, 'Success file was not created'

    job_failed = local_mode.file_exists(opt_ml, 'output/failure')
    assert not job_failed, 'Failure happened'
def test_cpu_nccl(docker_image, opt_ml):
    """Check that training with the 'nccl' backend fails in this setup.

    Runs the MNIST script with ``cluster_size=2`` and the 'backend'
    hyperparameter set to 'nccl', then verifies that neither success file
    was written and that the failure file was.
    """
    # NOTE(review): presumably the job fails because NCCL needs GPUs and the
    # test targets CPU hosts -- confirm against the training script.
    local_mode.train(mnist_script,
                     data_dir,
                     docker_image,
                     opt_ml,
                     cluster_size=2,
                     hyperparameters={'backend': 'nccl'})

    # Each message describes the condition that makes its assertion fail:
    # here the success files must be absent, so a failure means one exists.
    assert not local_mode.file_exists(
        opt_ml, 'model/success'), 'Script success file was unexpectedly created'
    assert not local_mode.file_exists(
        opt_ml, 'output/success'), 'Success file was unexpectedly created'
    assert local_mode.file_exists(opt_ml,
                                  'output/failure'), 'Failure did not happen'
def test_mnist_cpu(docker_image, opt_ml, dist_cpu_backend):
    """Train the MNIST script in local mode with cluster_size=2.

    The backend under test is passed as the 'backend' hyperparameter. The
    run must produce 'model/model.pth' and 'output/success' and must not
    produce 'output/failure'.
    """
    hyperparameters = {'backend': dist_cpu_backend}
    local_mode.train(
        mnist_script,
        data_dir,
        docker_image,
        opt_ml,
        cluster_size=2,
        hyperparameters=hyperparameters,
    )

    required = (
        ('model/model.pth', 'Model file was not created'),
        ('output/success', 'Success file was not created'),
    )
    for relative_path, message in required:
        assert local_mode.file_exists(opt_ml, relative_path), message

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert not failure_written, 'Failure happened'
Ejemplo n.º 6
0
def test_single_machine_failure(docker_image, opt_ml, use_gpu):
    """Run 'failure_script.py' in local mode and expect a failure file.

    The script is taken from ``resource_path``, which is also passed as the
    data location argument to ``local_mode.train``.
    """
    customer_script = 'failure_script.py'

    local_mode.train(
        customer_script,
        resource_path,
        docker_image,
        opt_ml,
        source_dir=resource_path,
        use_gpu=use_gpu,
    )

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert failure_written, 'Failure did not happen'
def test_xgboost_training_multiple_machines_without_early_stopping(
        docker_image, opt_ml):
    """Verify no host saves a model when 'save_model_on_termination' is 'false'.

    Trains with ``cluster_size=2`` and ``early_stopping=True``, then checks
    that "model/xgboost-model" is absent on both 'algo-1' and 'algo-2'.
    """
    hyperparameters = get_default_hyperparameters(100000)
    hyperparameters['save_model_on_termination'] = 'false'

    local_mode.train(
        False,
        data_dir,
        docker_image,
        opt_ml,
        hyperparameters=hyperparameters,
        cluster_size=2,
        early_stopping=True,
    )

    model_path = "model/xgboost-model"
    saved_on_algo_1 = local_mode.file_exists(opt_ml, model_path, 'algo-1')
    saved_on_algo_2 = local_mode.file_exists(opt_ml, model_path, 'algo-2')
    assert not saved_on_algo_1 and not saved_on_algo_2, \
        "Model saved on some host"
def test_xgboost_boston_single_machine(docker_image, opt_ml):
    """Train the Boston customer script in local mode and check its outputs.

    The run must not write 'output/failure' and must produce the model
    file, the cross-validation results CSV and the feature-importance plot.
    """
    customer_script = 'single_machine_customer_script.py'
    hyperparameters = {
        'objective': 'reg:linear',
        'colsample-bytree': 0.3,
        'learning-rate': 0.1,
        'max-depth': 5,
        'reg-alpha': 10,
        'n-estimators': 10
    }
    # Artifacts checked once training has finished.
    expected_files = [
        'model/xgb-boston.model',
        'output/data/cv_results.csv',
        'output/data/feature-importance-plot.png',
    ]

    local_mode.train(
        customer_script,
        data_dir,
        docker_image,
        opt_ml,
        hyperparameters=hyperparameters,
        source_dir=boston_path,
    )

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert not failure_written, 'Failure happened'

    test_utils.files_exist(opt_ml, expected_files)
Ejemplo n.º 9
0
def test_fastai_mnist(docker_image, opt_ml, use_gpu, py_version):
    """Train the fastai MNIST script in local mode on the 'mnist_tiny' data.

    Returns early, after printing a notice, when ``py_version`` is not
    ``PYTHON3``. Otherwise the run must produce the model and success files
    and must not produce a failure file.
    """
    if py_version != PYTHON3:
        print('Skipping the test because fastai supports >= Python 3.6.')
        return

    training_data = os.path.join(fastai_path, 'mnist_tiny')
    local_mode.train(
        fastai_mnist_script,
        training_data,
        docker_image,
        opt_ml,
        use_gpu=use_gpu,
    )

    model_created = local_mode.file_exists(opt_ml, 'model/model.pth')
    assert model_created, 'Model file was not created'

    success_written = local_mode.file_exists(opt_ml, 'output/success')
    assert success_written, 'Success file was not created'

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert not failure_written, 'Failure happened'
Ejemplo n.º 10
0
def test_distributed_failure(docker_image, opt_ml, use_gpu):
    """Run 'failure_script.py' with cluster_size=2 and expect a failure file.

    The hyperparameters request one process slot per host, as many
    processes as hosts, and set 'node_to_fail' to 1.
    """
    customer_script = 'failure_script.py'
    cluster_size = 2
    hyperparameters = {
        'sagemaker_process_slots_per_host': 1,
        'sagemaker_num_processes': cluster_size,
        'node_to_fail': 1,
    }

    local_mode.train(
        customer_script,
        resource_path,
        docker_image,
        opt_ml,
        hyperparameters=hyperparameters,
        source_dir=resource_path,
        use_gpu=use_gpu,
        cluster_size=cluster_size,
    )

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert failure_written, 'Failure did not happen'
Ejemplo n.º 11
0
def test_xgboost_abalone_kfold(dataset, extra_hps, model_file_count,
                               docker_image, opt_ml):
    """Train on an abalone dataset and check the expected model files exist.

    Args:
        dataset: Name of the directory under ``data_root`` whose "data"
            subdirectory is used as the training input.
        extra_hps: Hyperparameters merged over the abalone defaults; on a
            key clash the value from ``extra_hps`` wins.
        model_file_count: Number of "model/xgboost-model-<i>" files
            expected, for ``i`` from 0 to ``model_file_count - 1``.
        docker_image: Image passed through to ``local_mode.train``.
        opt_ml: Location passed to ``local_mode`` for the run and for the
            file checks.
    """
    hyperparameters = get_abalone_default_hyperparameters()
    data_path = os.path.join(data_root, dataset, "data")

    local_mode.train(
        False,
        data_path,
        docker_image,
        opt_ml,
        hyperparameters={
            **hyperparameters,
            **extra_hps
        },
    )

    files = [f"model/xgboost-model-{i}" for i in range(model_file_count)]
    assert not local_mode.file_exists(opt_ml,
                                      "output/failure"), "Failure happened"
    test_utils.files_exist(opt_ml, files)
    # The result of this check used to be discarded, so a missing
    # predictions file could never fail the test.
    assert local_mode.file_exists(
        opt_ml,
        "output/data/predictions.csv"), "Predictions file was not created"
Ejemplo n.º 12
0
def test_chainer_mnist_single_machine(docker_image, opt_ml, use_gpu):
    """Train the single-machine MNIST script, then serve it and predict.

    Training must produce the model, the success file and the files under
    'output/data/algo-1', and must not write 'output/failure'. The served
    model is then queried with CSV, JSON and NPY payloads.
    """
    customer_script = 'single_machine_customer_script.py'
    hyperparameters = {'batch-size': 10000, 'epochs': 1}

    local_mode.train(customer_script,
                     data_dir,
                     docker_image,
                     opt_ml,
                     hyperparameters=hyperparameters,
                     source_dir=mnist_path,
                     use_gpu=use_gpu)

    files = [
        'model/model.npz', 'output/success', 'output/data/algo-1/accuracy.png',
        'output/data/algo-1/cg.dot', 'output/data/algo-1/log',
        'output/data/algo-1/loss.png'
    ]

    test_utils.files_exist(opt_ml, files)

    assert not local_mode.file_exists(opt_ml,
                                      'output/failure'), 'Failure happened'

    script_path = os.path.join(mnist_path, customer_script)

    with local_mode.serve(script_path,
                          model_dir=None,
                          image_name=docker_image,
                          opt_ml=opt_ml,
                          use_gpu=use_gpu,
                          source_dir=mnist_path):

        test_arrays = [
            np.zeros((100, 784), dtype='float32'),
            np.zeros((100, 1, 28, 28), dtype='float32'),
            np.zeros((100, 28, 28), dtype='float32')
        ]

        # Only the flat (100, 784) batch is sent as CSV.
        request_data = np.zeros((100, 784), dtype='float32')
        test_utils.predict_and_assert_response_length(request_data.tolist(),
                                                      'text/csv')

        for array in test_arrays:
            # JSON and NPY can take multidimensional (n > 2) arrays
            test_utils.predict_and_assert_response_length(
                array.tolist(), 'application/json')
            # Send the current array; the loop used to resend the 2-D
            # request_data, so NPY was never tested with n > 2 input.
            test_utils.predict_and_assert_response_length(
                array, 'application/x-npy')
def test_package_version(docker_image, opt_ml):
    """Run 'train.py' from ``script_path`` in local mode without failure.

    The test passes when no "output/failure" file is written.
    """
    version_check_script = "train.py"

    local_mode.train(version_check_script,
                     data_dir,
                     docker_image,
                     opt_ml,
                     source_dir=script_path)

    failure_written = local_mode.file_exists(opt_ml, "output/failure")
    assert not failure_written, "Failure happened"
def test_xgboost_training_single_machine_without_early_stopping(
        docker_image, opt_ml):
    """Verify no model is saved when 'save_model_on_termination' is 'false'.

    Trains with ``early_stopping=True`` and ``train_time=5``, then checks
    that "model/xgboost-model" does not exist.
    """
    hyperparameters = get_default_hyperparameters(100000)
    hyperparameters['save_model_on_termination'] = 'false'

    local_mode.train(
        False,
        data_dir,
        docker_image,
        opt_ml,
        hyperparameters=hyperparameters,
        early_stopping=True,
        train_time=5,
    )

    model_saved = local_mode.file_exists(opt_ml, "model/xgboost-model")
    assert not model_saved, "Model saved"
Ejemplo n.º 15
0
def test_training_jobs_do_not_stall(docker_image, opt_ml, use_gpu):
    """
    Validate that training does not stall.
    https://github.com/chainer/chainermn/issues/236

    The job runs with MPI enabled and cluster_size=2 and is expected to
    end by writing 'output/failure'.
    """
    customer_script = 'training_jobs_do_not_stall_customer_script.py'
    cluster_size = 2
    hyperparameters = {
        'sagemaker_use_mpi': True,
        'sagemaker_process_slots_per_host': 1,
        'sagemaker_num_processes': 2,
    }

    local_mode.train(
        customer_script,
        resource_path,
        docker_image,
        opt_ml,
        hyperparameters=hyperparameters,
        source_dir=resource_path,
        use_gpu=use_gpu,
        cluster_size=cluster_size,
    )

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert failure_written, 'Failure did not happen'
Ejemplo n.º 16
0
def test_xgboost_abalone_inference(docker_image, opt_ml):
    """Serve the abalone script and check a single libsvm prediction.

    The request must return HTTP 200, no "output/failure" file may exist,
    and the response body must hold exactly one comma-separated value.
    """
    customer_script = "abalone_distributed.py"
    request_body = get_libsvm_request_body()

    serving = local_mode.serve(
        customer_script,
        libsvm_model_dir,
        docker_image,
        opt_ml,
        source_dir=abalone_path,
    )
    with serving:
        status, body = local_mode.request(request_body,
                                          content_type="text/libsvm")

    assert status == 200

    failure_written = local_mode.file_exists(opt_ml, "output/failure")
    assert not failure_written, "Failure happened"

    predictions = body.split(",")
    assert len(predictions) == 1
def test_chainer_mnist_distributed(docker_image, opt_ml, use_gpu,
                                   customer_script):
    """Train an MNIST script with cluster_size=2, then serve and predict.

    Training must produce the model, the success file and the files under
    'output/data/algo-1', and must not write 'output/failure'. The served
    model is then queried with JSON, CSV and NPY payloads.
    """
    cluster_size = 2
    # pure_nccl communicator hangs when only one gpu is available.
    hyperparameters = {
        'sagemaker_process_slots_per_host': 1,
        'sagemaker_num_processes': cluster_size,
        'batch-size': 10000,
        'epochs': 1,
        'communicator': 'hierarchical'
    }

    local_mode.train(
        customer_script,
        data_dir,
        docker_image,
        opt_ml,
        hyperparameters=hyperparameters,
        cluster_size=cluster_size,
        source_dir=mnist_path,
        use_gpu=use_gpu,
    )

    expected_files = [
        'model/model.npz',
        'output/success',
        'output/data/algo-1/accuracy.png',
        'output/data/algo-1/cg.dot',
        'output/data/algo-1/log',
        'output/data/algo-1/loss.png',
    ]
    test_utils.files_exist(opt_ml, expected_files)

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert not failure_written, 'Failure happened'

    serving_script = os.path.join(mnist_path, customer_script)
    with local_mode.serve(serving_script,
                          model_dir=None,
                          image_name=docker_image,
                          opt_ml=opt_ml):

        request_data = np.zeros((100, 784), dtype='float32')
        data_as_list = request_data.tolist()

        # Same batch in three encodings; NPY sends the array itself.
        payloads = (
            (data_as_list, 'application/json'),
            (data_as_list, 'text/csv'),
            (request_data, 'application/x-npy'),
        )
        for payload, content_type in payloads:
            test_utils.predict_and_assert_response_length(payload,
                                                          content_type)
Ejemplo n.º 18
0
def test_all_processes_finish_with_mpi(docker_image, opt_ml, use_gpu):
    """
    This test validates that all training processes finish before containers are shut down.

    The job runs with MPI enabled, cluster_size=2 and two process slots per
    host (four processes in total). It passes when the file
    'output/data/process_could_complete' exists on host 'algo-2'.
    """
    customer_script = 'all_processes_finish_customer_script.py'

    cluster_size = 2
    hyperparameters = {'sagemaker_use_mpi': True, 'sagemaker_process_slots_per_host': 2,
                       'sagemaker_num_processes': 4}

    local_mode.train(customer_script, resource_path, docker_image, opt_ml,
                     hyperparameters=hyperparameters, source_dir=resource_path, use_gpu=use_gpu,
                     cluster_size=cluster_size)

    file_name = 'output/data/process_could_complete'

    # The message names the file actually checked; it previously read
    # 'Model was not saved', which is unrelated to this check.
    assert local_mode.file_exists(opt_ml, file_name, host='algo-2'), \
        'Completion file was not created on algo-2'
Ejemplo n.º 19
0
def test_xgboost_abalone_training_single_machine(docker_image, opt_ml):
    """Train the abalone script in local mode and check the model exists.

    The run must not write "output/failure" and must produce
    "model/xgboost-model".
    """
    customer_script = "abalone_distributed.py"
    hyperparameters = get_abalone_default_hyperparameters()

    local_mode.train(customer_script,
                     data_dir,
                     docker_image,
                     opt_ml,
                     hyperparameters=hyperparameters,
                     source_dir=abalone_path)

    expected_files = ["model/xgboost-model"]

    failure_written = local_mode.file_exists(opt_ml, "output/failure")
    assert not failure_written, "Failure happened"

    test_utils.files_exist(opt_ml, expected_files)
Ejemplo n.º 20
0
def test_xgboost_abalone_mme_with_transform_fn(docker_image, opt_ml):
    """Load and invoke a model through the multi-model endpoint URLs.

    Serves the abalone script with a transform function appended and
    multi-model mode enabled through environment variables. It loads
    "libsvm_pickled" and invokes it with a libsvm request. Both requests
    must return HTTP 200, the response must hold one more comma-separated
    value than the request has whitespace-separated tokens, and no
    "output/failure" file may exist.
    """
    customer_script = "abalone_distributed.py"
    request_body = get_libsvm_request_body()
    additional_env_vars = [
        "SAGEMAKER_BIND_TO_PORT=8080",
        "SAGEMAKER_SAFE_PORT_RANGE=9000-9999",
        "SAGEMAKER_MULTI_MODEL=true",
    ]
    model_name = "libsvm_pickled"
    model_url = "/opt/ml/model/{}".format(model_name)
    model_data = json.dumps({"model_name": model_name, "url": model_url})

    with append_transform_fn_to_abalone_script(
            abalone_path, customer_script) as custom_script_path:
        with local_mode.serve(customer_script,
                              models_dir,
                              docker_image,
                              opt_ml,
                              source_dir=custom_script_path,
                              additional_env_vars=additional_env_vars):
            # Register the model with the endpoint before invoking it.
            load_status_code, _ = local_mode.request(
                model_data,
                content_type="application/json",
                request_url=MME_MODELS_URL.format(model_name))
            assert load_status_code == 200

            invoke_status_code, invoke_response_body = local_mode.request(
                request_body,
                content_type="text/libsvm",
                request_url=MME_INVOKE_URL.format(model_name))

    assert invoke_status_code == 200

    response_columns = len(invoke_response_body.split(","))
    request_tokens = len(request_body.split())
    # final column is the bias term
    assert response_columns == request_tokens + 1

    failure_written = local_mode.file_exists(opt_ml, "output/failure")
    assert not failure_written, "Failure happened"
Ejemplo n.º 21
0
def test_xgboost_abalone_custom_inference_with_transform_fn(
        docker_image, opt_ml):
    """Serve the abalone script with a transform function and predict once.

    The libsvm request must return HTTP 200, no "output/failure" file may
    exist, and the response must hold one more comma-separated value than
    the request has whitespace-separated tokens.
    """
    customer_script = "abalone_distributed.py"
    request_body = get_libsvm_request_body()

    with append_transform_fn_to_abalone_script(
            abalone_path, customer_script) as custom_script_path:
        with local_mode.serve(customer_script,
                              libsvm_model_dir,
                              docker_image,
                              opt_ml,
                              source_dir=custom_script_path):
            status, body = local_mode.request(request_body,
                                              content_type="text/libsvm")

    assert status == 200

    failure_written = local_mode.file_exists(opt_ml, "output/failure")
    assert not failure_written, "Failure happened"

    response_columns = len(body.split(","))
    request_tokens = len(request_body.split())
    # final column is the bias term
    assert response_columns == request_tokens + 1
def test_chainer_mnist_custom_loop(docker_image, opt_ml, use_gpu):
    """Train the custom-loop MNIST script, then serve it and predict.

    Training must produce 'model/model.npz' and 'output/success' and must
    not write 'output/failure'. The served model is then queried with
    JSON, CSV and NPY payloads.
    """
    customer_script = 'single_machine_custom_loop.py'
    hyperparameters = {'batch-size': 10000, 'epochs': 1}

    local_mode.train(
        customer_script,
        data_dir,
        docker_image,
        opt_ml,
        hyperparameters=hyperparameters,
        source_dir=mnist_path,
        use_gpu=use_gpu,
    )

    expected_files = ['model/model.npz', 'output/success']
    test_utils.files_exist(opt_ml, expected_files)

    failure_written = local_mode.file_exists(opt_ml, 'output/failure')
    assert not failure_written, 'Failure happened'

    script_path = os.path.join(mnist_path, customer_script)

    with local_mode.serve(script_path,
                          model_dir=None,
                          image_name=docker_image,
                          opt_ml=opt_ml):

        request_data = np.zeros((100, 784), dtype='float32')
        data_as_list = request_data.tolist()

        # Same batch in three encodings; NPY sends the array itself.
        payloads = (
            (data_as_list, 'application/json'),
            (data_as_list, 'text/csv'),
            (request_data, 'application/x-npy'),
        )
        for payload, content_type in payloads:
            test_utils.predict_and_assert_response_length(payload,
                                                          content_type)
def files_exist(opt_ml, files):
    """Assert that every path in ``files`` exists under ``opt_ml``.

    Stops at the first missing path with an AssertionError naming it.
    """
    for relative_path in files:
        created = localmode.file_exists(opt_ml, relative_path)
        assert created, 'file {} was not created'.format(relative_path)