def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_version, py_version):
    """Fit a Chainer estimator on the local MNIST sample data and return it.

    Args:
        sagemaker_session: Session passed to the estimator; its default
            bucket is used to build ``output_path``.
        instance_type: Instance type forwarded to the estimator.
        instance_count: Number of instances. A single instance runs
            ``mnist.py``; any other count runs ``distributed_mnist.py``.
        chainer_version: Value used for ``framework_version``.
        py_version: Value used for ``py_version``.

    Returns:
        The ``Chainer`` estimator after ``fit`` has been called.
    """
    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    # Select the script from the instance *count*. The earlier check compared
    # ``instance_type`` with the integer 1; instance types are normally
    # strings (e.g. "ml.c4.xlarge"), so that comparison never matched and the
    # single-node script was never chosen.
    script_name = "mnist.py" if instance_count == 1 else "distributed_mnist.py"
    script_path = os.path.join(data_path, script_name)

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=chainer_version,
        py_version=py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
        # test output_path without trailing slash
        output_path="s3://{}".format(sagemaker_session.default_bucket()),
    )

    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    job_name = unique_name_from_base("test-chainer-training")
    chainer.fit({"train": train_input, "test": test_input}, job_name=job_name)
    return chainer
def test_training_with_additional_hyperparameters(sagemaker_session):
    """Train ``mnist.py`` with the MPI-related estimator options set.

    The train and test folders are uploaded through the estimator's session
    and used as the two input channels. Returns the name of the latest
    training job.
    """
    with timeout(minutes=15):
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator_kwargs = dict(
            entry_point=os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py'),
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
            hyperparameters={'epochs': 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )
        estimator = Chainer(**estimator_kwargs)

        # Upload order (train first, then test) matches the channel order.
        uploads = (
            ('train', 'integ-test-data/chainer_mnist/train'),
            ('test', 'integ-test-data/chainer_mnist/test'),
        )
        channels = {}
        for channel, prefix in uploads:
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix=prefix)

        estimator.fit(channels)
        return estimator.latest_training_job.name
def _run_mnist_training_job(sagemaker_session,
                            instance_type,
                            instance_count,
                            chainer_full_version,
                            wait=True):
    """Upload the MNIST sample data, start a Chainer training job, return its name.

    Args:
        sagemaker_session: Session passed to the estimator.
        instance_type: Value for ``train_instance_type``.
        instance_count: Value for ``train_instance_count``. A single instance
            runs ``mnist.py``; any other count runs ``distributed_mnist.py``.
        chainer_full_version: Value used for ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        # Select the script from the instance *count*. The earlier check
        # compared ``instance_type`` with the integer 1; instance types are
        # normally strings (e.g. "ml.c4.xlarge"), so it never matched and the
        # single-node script was never chosen.
        script_name = 'mnist.py' if instance_count == 1 else 'distributed_mnist.py'
        script_path = os.path.join(data_path, script_name)

        chainer = Chainer(entry_point=script_path,
                          role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'epochs': 1})

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        chainer.fit({'train': train_input, 'test': test_input}, wait=wait)
        return chainer.latest_training_job.name
def test_training_with_additional_hyperparameters(sagemaker_local_session,
                                                  chainer_latest_version,
                                                  chainer_latest_py_version):
    """Fit ``mnist.py`` on a ``local`` instance with the MPI options set.

    Both input channels point at ``file://`` paths inside the MNIST sample
    data directory.
    """
    mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")

    estimator_kwargs = dict(
        entry_point=os.path.join(DATA_DIR, "chainer_mnist", "mnist.py"),
        role="SageMakerRole",
        instance_count=1,
        instance_type="local",
        framework_version=chainer_latest_version,
        py_version=chainer_latest_py_version,
        sagemaker_session=sagemaker_local_session,
        hyperparameters={"epochs": 1},
        use_mpi=True,
        num_processes=2,
        process_slots_per_host=2,
        additional_mpi_options="-x NCCL_DEBUG=INFO",
    )
    estimator = Chainer(**estimator_kwargs)

    channels = {
        channel: "file://" + os.path.join(mnist_dir, channel)
        for channel in ("train", "test")
    }
    estimator.fit(channels)
Esempio n. 5
0
def test_training_with_additional_hyperparameters(sagemaker_session,
                                                  chainer_full_version):
    """Train ``mnist.py`` with MPI options under a uniquely named job.

    Uploads the train and test folders through the estimator's session, fits
    on both channels and returns the name of the latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator_kwargs = dict(
            entry_point=os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py'),
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            sagemaker_session=sagemaker_session,
            hyperparameters={'epochs': 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )
        estimator = Chainer(**estimator_kwargs)

        # Uploads happen train first, then test, before the job name is made.
        uploads = (
            ('train', 'integ-test-data/chainer_mnist/train'),
            ('test', 'integ-test-data/chainer_mnist/test'),
        )
        channels = {}
        for channel, prefix in uploads:
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix=prefix)

        estimator.fit(channels,
                      job_name=unique_name_from_base('test-chainer-training'))
        return estimator.latest_training_job.name
def _run_mnist_training_job(sagemaker_session,
                            instance_type,
                            instance_count,
                            chainer_full_version,
                            wait=True):
    """Fit a Chainer estimator on the local MNIST sample data and return it.

    Args:
        sagemaker_session: Session passed to the estimator.
        instance_type: Value for ``train_instance_type``.
        instance_count: Value for ``train_instance_count``. A single instance
            runs ``mnist.py``; any other count runs ``distributed_mnist.py``.
        chainer_full_version: Value used for ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The ``Chainer`` estimator after ``fit`` has been called.
    """
    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    # Select the script from the instance *count*. The earlier check compared
    # ``instance_type`` with the integer 1; instance types are normally
    # strings (e.g. "ml.c4.xlarge"), so it never matched and the single-node
    # script was never chosen.
    script_name = "mnist.py" if instance_count == 1 else "distributed_mnist.py"
    script_path = os.path.join(data_path, script_name)

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=chainer_full_version,
        py_version=PYTHON_VERSION,
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
    )

    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    job_name = unique_name_from_base("test-chainer-training")
    chainer.fit({"train": train_input, "test": test_input},
                wait=wait,
                job_name=job_name)
    return chainer
def main(parser=None):
    """Parse command-line options and start the TRPO roboschool training job.

    Args:
        parser: Optional ``argparse.ArgumentParser`` to which the options are
            added. A fresh parser is created when omitted.

    The job runs with a ``LocalSession`` on a ``local`` instance when
    ``--local-mode`` is given, otherwise on ``--instance-type``. ``fit`` only
    waits for completion in local mode.
    """
    import logging
    logging.basicConfig(level=logging.WARN)

    # A default of ``argparse.ArgumentParser()`` is evaluated once at
    # definition time, so every call shared one parser and a second call
    # failed when the same options were added again. Create it per call.
    if parser is None:
        parser = argparse.ArgumentParser()

    parser.add_argument('--profile', type=str)
    parser.add_argument('--local-mode', action='store_true')
    parser.add_argument('--instance-type',
                        type=str,
                        default=DEFAULT_INSTANCE_TYPE)
    parser.add_argument('--region', type=str, default=DEFAULT_REGION)
    parser.add_argument('--arn', type=str, default=None)
    parser.add_argument('--max-runtime',
                        type=int,
                        default=DEFAULT_RUNTIME,
                        help='seconds')
    args = parser.parse_args()

    boto_session = Session(profile_name=args.profile, region_name=args.region)
    sagemaker_session = sagemaker.Session(boto_session=boto_session)
    # An explicit --arn wins; otherwise the role is looked up from the session.
    role = args.arn if args.arn is not None else sagemaker.get_execution_role(
        sagemaker_session)

    hyperparameters = {
        # GPU id 0 for "ml.p*" instance types, -1 otherwise.
        'gpu': 0 if args.instance_type.startswith('ml.p') else -1,
        'mjcf': 'env/ant_simple.xml',
        'action-dim': 8,
        'obs-dim': 28,
        'skip-step': 10,
        'algorithm': 'TRPO',
        'foot-list':
        'right_back_foot left_back_foot front_right_foot front_left_foot'
    }
    chainer_estimator = Chainer(
        entry_point='train.py',
        source_dir='../',
        role=role,
        image_name=IMAGE,
        framework_version='5.0.0',
        sagemaker_session=LocalSession(boto_session)
        if args.local_mode else sagemaker_session,
        train_instance_count=1,
        train_instance_type='local' if args.local_mode else args.instance_type,
        hyperparameters=hyperparameters,
        base_job_name='roboschool-TRPO-skipstep10-continue-1',
        train_max_run=args.max_runtime)
    chainer_estimator.fit(wait=args.local_mode)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Expect ``fit`` to raise ``ValueError`` when running ``failure_script.py``."""
    with timeout(minutes=15):
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')
        failing_script = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')

        estimator = Chainer(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=chainer_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        # Only the train channel is uploaded for this test.
        uploaded_train = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train',
        )

        with pytest.raises(ValueError):
            estimator.fit(uploaded_train)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Check that training ``failure_script.py`` makes ``fit`` raise ``ValueError``."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        failing_script = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')
        train_dir = os.path.join(os.path.join(DATA_DIR, 'chainer_mnist'), 'train')

        estimator_kwargs = {
            'entry_point': failing_script,
            'role': 'SageMakerRole',
            'framework_version': chainer_full_version,
            'train_instance_count': 1,
            'train_instance_type': 'ml.c4.xlarge',
            'sagemaker_session': sagemaker_session,
        }
        estimator = Chainer(**estimator_kwargs)

        # Only the train channel is uploaded for this test.
        uploaded_train = estimator.sagemaker_session.upload_data(
            path=train_dir,
            key_prefix='integ-test-data/chainer_mnist/train')

        with pytest.raises(ValueError):
            estimator.fit(uploaded_train)
def test_attach_deploy(chainer_training_job, sagemaker_session):
    """Attach to an existing training job, deploy it and check predictions.

    Runs inside the ``timeout_and_delete_endpoint_by_name`` context for a
    timestamped endpoint name.
    """
    suffix = sagemaker_timestamp()
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(suffix)

    endpoint_guard = timeout_and_delete_endpoint_by_name(endpoint_name,
                                                         sagemaker_session)
    with endpoint_guard:
        attached = Chainer.attach(chainer_training_job,
                                  sagemaker_session=sagemaker_session)
        _predict_and_assert(
            attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name))
def test_attach_deploy(chainer_training_job, sagemaker_session):
    """Re-attach to ``chainer_training_job``, deploy it and verify predictions."""
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(
        sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session):
        # Everything below runs inside the timeout/delete-endpoint context.
        attached_estimator = Chainer.attach(
            chainer_training_job,
            sagemaker_session=sagemaker_session,
        )
        deployed = attached_estimator.deploy(
            1,
            'ml.m4.xlarge',
            endpoint_name=endpoint_name,
        )
        _predict_and_assert(deployed)
Esempio n. 12
0
def test_async_fit(sagemaker_session):
    """Start training without waiting, then re-attach, deploy and predict.

    The job is launched with ``wait=False``; after a 20 second pause the test
    attaches to it by name and deploys the attached estimator.
    """
    timestamp = sagemaker_timestamp()
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(timestamp)

    with timeout(minutes=5):
        training_job_name = _run_mnist_training_job(
            sagemaker_session,
            "ml.c4.xlarge",
            1,
            chainer_full_version=CHAINER_VERSION,
            wait=False,
        )

        waiting_message = ("Waiting to re-attach to the training job: %s" %
                           training_job_name)
        print(waiting_message)
        time.sleep(20)

    endpoint_guard = timeout_and_delete_endpoint_by_name(endpoint_name,
                                                         sagemaker_session,
                                                         minutes=35)
    with endpoint_guard:
        print("Re-attaching now to: %s" % training_job_name)
        attached = Chainer.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        _predict_and_assert(
            attached.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name))
Esempio n. 13
0
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Expect ``fit`` to raise a ``ValueError`` mentioning the expected failure."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator_kwargs = dict(
            entry_point=os.path.join(DATA_DIR, 'chainer_mnist',
                                     'failure_script.py'),
            role='SageMakerRole',
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )
        estimator = Chainer(**estimator_kwargs)

        with pytest.raises(ValueError) as excinfo:
            estimator.fit()

        # The raised error must carry the expected-failure marker text.
        failure_text = str(excinfo.value)
        assert 'This failure is expected' in failure_text
Esempio n. 14
0
def test_attach_deploy(chainer_training_job, sagemaker_session):
    """Attach to ``chainer_training_job``, deploy it and check predictions.

    The endpoint name is derived with ``unique_name_from_base``.
    """
    endpoint_name = unique_name_from_base('test-chainer-attach-deploy')
    endpoint_guard = timeout_and_delete_endpoint_by_name(endpoint_name,
                                                         sagemaker_session)

    with endpoint_guard:
        attached = Chainer.attach(chainer_training_job,
                                  sagemaker_session=sagemaker_session)
        _predict_and_assert(
            attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name))
def test_attach_deploy(sagemaker_session, chainer_latest_version,
                       chainer_latest_py_version, cpu_instance_type):
    """Start an MNIST training job, then attach to it, deploy and predict.

    The job is started with ``wait=False`` under a unique job name. The
    attach and deploy steps run afterwards inside the
    ``timeout_and_delete_endpoint_by_name`` context.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")

        estimator_kwargs = dict(
            entry_point=os.path.join(DATA_DIR, "chainer_mnist", "mnist.py"),
            role="SageMakerRole",
            framework_version=chainer_latest_version,
            py_version=chainer_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )
        chainer = Chainer(**estimator_kwargs)

        # Upload train first, then test, before the job name is generated.
        uploads = (
            ("train", "integ-test-data/chainer_mnist/train"),
            ("test", "integ-test-data/chainer_mnist/test"),
        )
        channels = {}
        for channel, prefix in uploads:
            channels[channel] = sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix=prefix)

        chainer.fit(channels,
                    wait=False,
                    job_name=unique_name_from_base("test-chainer-training"))

    endpoint_name = unique_name_from_base("test-chainer-attach-deploy")
    endpoint_guard = timeout_and_delete_endpoint_by_name(endpoint_name,
                                                         sagemaker_session)

    with endpoint_guard:
        attached = Chainer.attach(chainer.latest_training_job.name,
                                  sagemaker_session=sagemaker_session)
        _predict_and_assert(
            attached.deploy(1, cpu_instance_type, endpoint_name=endpoint_name))
Esempio n. 16
0
def main(parser=None):
    """Parse command-line options and start the Chainer sample training job.

    Args:
        parser: Optional ``argparse.ArgumentParser`` to which the options are
            added. A fresh parser is created when omitted.

    With ``--local-mode`` a ``LocalSession`` and a ``local`` instance are
    used and the GPU hyperparameter is forced to -1. ``fit`` only waits for
    completion in local mode.
    """
    import logging
    logging.basicConfig(level=logging.WARN)

    # A default of ``argparse.ArgumentParser()`` is evaluated once at
    # definition time, so every call shared one parser and a second call
    # failed when the same options were added again. Create it per call.
    if parser is None:
        parser = argparse.ArgumentParser()

    parser.add_argument('--profile', type=str, default='default')
    parser.add_argument('--local-mode', action='store_true')
    parser.add_argument('--instance-type',
                        type=str,
                        default=DEFAULT_INSTANCE_TYPE)
    parser.add_argument('--region', type=str, default=DEFAULT_REGION)
    parser.add_argument('--arn', type=str, default=None)
    parser.add_argument('--max-runtime',
                        type=int,
                        default=DEFAULT_RUNTIME,
                        help='seconds')
    args = parser.parse_args()

    boto_session = Session(profile_name=args.profile, region_name=args.region)
    sagemaker_session = LocalSession(
        boto_session) if args.local_mode else sagemaker.Session(boto_session)
    # An explicit --arn wins; otherwise the role is looked up from the session.
    role = args.arn if args.arn is not None else sagemaker.get_execution_role(
        sagemaker_session)

    # GPU id 0 only for "ml.p*" instance types outside local mode.
    gpu = 0 if args.instance_type.startswith(
        'ml.p') and not args.local_mode else -1
    hyperparameters = {
        'gpu': gpu,
    }
    chainer_estimator = Chainer(
        entry_point='train.py',
        source_dir='./',
        role=role,
        image_name=IMAGE,
        framework_version='5.0.0',
        sagemaker_session=sagemaker_session,
        train_instance_count=1,
        train_instance_type='local' if args.local_mode else args.instance_type,
        hyperparameters=hyperparameters,
        base_job_name='chainer-sagemaker-sample',
        train_max_run=args.max_runtime)
    chainer_estimator.fit(wait=args.local_mode)
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    """Upload the MNIST sample data, start a Chainer training job, return its name.

    Args:
        sagemaker_session: Session passed to the estimator.
        instance_type: Value for ``train_instance_type``.
        instance_count: Value for ``train_instance_count``. A single instance
            runs ``mnist.py``; any other count runs ``distributed_mnist.py``.
        chainer_full_version: Value used for ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        # Select the script from the instance *count*. The earlier check
        # compared ``instance_type`` with the integer 1; instance types are
        # normally strings (e.g. "ml.c4.xlarge"), so it never matched and the
        # single-node script was never chosen.
        script_name = 'mnist.py' if instance_count == 1 else 'distributed_mnist.py'
        script_path = os.path.join(data_path, script_name)

        chainer = Chainer(entry_point=script_path, role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=instance_count, train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session, hyperparameters={'epochs': 1})

        train_input = chainer.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                           key_prefix='integ-test-data/chainer_mnist/test')

        chainer.fit({'train': train_input, 'test': test_input}, wait=wait)
        return chainer.latest_training_job.name
def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version):
    """Train ``mnist.py`` with the MPI-related estimator options set.

    Uploads the train and test folders through the estimator's session, fits
    on both channels and returns the name of the latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')
        entry_script = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')

        estimator = Chainer(
            entry_point=entry_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            framework_version=chainer_full_version,
            sagemaker_session=sagemaker_session,
            hyperparameters={'epochs': 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )

        # Upload order (train first, then test) matches the channel order.
        uploads = (
            ('train', 'integ-test-data/chainer_mnist/train'),
            ('test', 'integ-test-data/chainer_mnist/test'),
        )
        channels = {}
        for channel, prefix in uploads:
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix=prefix)

        estimator.fit(channels)
        return estimator.latest_training_job.name
def _run_mnist_training_job(sagemaker_session,
                            instance_type,
                            instance_count,
                            chainer_full_version,
                            wait=True):
    """Upload the MNIST sample data, start a Chainer training job, return its name.

    Args:
        sagemaker_session: Session passed to the estimator.
        instance_type: Value for ``train_instance_type``.
        instance_count: Value for ``train_instance_count``. A single instance
            runs ``mnist.py``; any other count runs ``distributed_mnist.py``.
        chainer_full_version: Value used for ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "chainer_mnist")

        # Select the script from the instance *count*. The earlier check
        # compared ``instance_type`` with the integer 1; instance types are
        # normally strings (e.g. "ml.c4.xlarge"), so it never matched and the
        # single-node script was never chosen.
        script_name = ("mnist.py"
                       if instance_count == 1 else "distributed_mnist.py")
        script_path = os.path.join(data_path, script_name)

        chainer = Chainer(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=instance_count,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/chainer_mnist/train")
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/chainer_mnist/test")

        job_name = unique_name_from_base("test-chainer-training")
        chainer.fit({"train": train_input, "test": test_input},
                    wait=wait,
                    job_name=job_name)
        return chainer.latest_training_job.name
Esempio n. 20
0
def _run_mnist_training_job(sagemaker_session,
                            instance_type,
                            instance_count,
                            chainer_full_version,
                            wait=True):
    """Upload the MNIST sample data, start a Chainer training job, return its name.

    Args:
        sagemaker_session: Session passed to the estimator.
        instance_type: Value for ``train_instance_type``.
        instance_count: Value for ``train_instance_count``. A single instance
            runs ``mnist.py``; any other count runs ``distributed_mnist.py``.
        chainer_full_version: Value used for ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        # Select the script from the instance *count*. The earlier check
        # compared ``instance_type`` with the integer 1; instance types are
        # normally strings (e.g. "ml.c4.xlarge"), so it never matched and the
        # single-node script was never chosen.
        script_name = 'mnist.py' if instance_count == 1 else 'distributed_mnist.py'
        script_path = os.path.join(data_path, script_name)

        chainer = Chainer(entry_point=script_path,
                          role='SageMakerRole',
                          framework_version=chainer_full_version,
                          py_version=PYTHON_VERSION,
                          train_instance_count=instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'epochs': 1})

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        job_name = unique_name_from_base('test-chainer-training')
        chainer.fit({'train': train_input, 'test': test_input},
                    wait=wait,
                    job_name=job_name)
        return chainer.latest_training_job.name
def test_async_fit(sagemaker_session):
    """Launch training with ``wait=False``, then re-attach, deploy and predict.

    After starting the job the test sleeps 20 seconds, attaches to the job by
    name and deploys the attached estimator.
    """
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(
        sagemaker_timestamp())

    with timeout(minutes=5):
        training_job_name = _run_mnist_training_job(
            sagemaker_session,
            "ml.c4.xlarge",
            1,
            chainer_full_version=CHAINER_VERSION,
            wait=False,
        )

        waiting_message = ("Waiting to re-attach to the training job: %s"
                           % training_job_name)
        print(waiting_message)
        time.sleep(20)

    endpoint_guard = timeout_and_delete_endpoint_by_name(endpoint_name,
                                                         sagemaker_session)
    with endpoint_guard:
        print("Re-attaching now to: %s" % training_job_name)
        attached = Chainer.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        _predict_and_assert(
            attached.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name))
Esempio n. 22
0
def exec_training(session, client, job_name, setting, pytorch,
                  max_parallel_jobs):
    """Run a training job, or a tuning job, described by a YAML settings file.

    The settings file must provide ``inputs`` and ``estimator`` (including
    ``hyperparameters``); ``upload_data`` and ``tuner`` are optional.
    Hyperparameters whose value is a dict are tuning targets, all others are
    passed to the estimator as fixed values. Without tuning targets the
    estimator is fitted directly; otherwise a ``HyperparameterTuner`` is
    built and fitted.

    Args:
        session: Boto session wrapped in a ``sagemaker.Session``.
        client: Client passed as ``sagemaker_client``.
        job_name: Job name; also the base tuning job name and the prefix of
            uploaded data keys.
        setting: Path to the YAML settings file.
        pytorch: Build a ``PyTorch`` estimator when truthy, else ``Chainer``.
        max_parallel_jobs: Forwarded to the tuner.

    Raises:
        ValueError: If no ``tuner`` section is given and a tuning target is
            not of type ``categorical``.
    """
    # S3 key prefixes always use "/" separators, whatever the host OS is.
    import posixpath

    sagemaker_session = sagemaker.Session(boto_session=session,
                                          sagemaker_client=client)

    # Close the file deterministically. ``safe_load`` replaces a bare
    # ``yaml.load``, which needs an explicit Loader on current PyYAML and can
    # construct arbitrary objects; the settings are expected to be plain
    # mappings, lists and scalars.
    with open(setting) as settings_file:
        conf = yaml.safe_load(settings_file)

    # input data
    inputs = conf['inputs']

    if 'upload_data' in conf and isinstance(conf['upload_data'], list):
        for d in conf['upload_data']:
            s3_dir = sagemaker_session.upload_data(
                path=d['path'],
                key_prefix=posixpath.join(job_name, d['key_prefix']))
            inputs[d['name']] = s3_dir

    estimator_args = conf['estimator']
    estimator_args['sagemaker_session'] = sagemaker_session

    # Split hyperparameters into fixed values and tuning targets (dicts).
    hyperparameters = estimator_args.pop('hyperparameters')
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args['hyperparameters'] = fixed

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if not targets:
        estimator.fit(inputs, job_name=job_name)
        return

    if 'tuner' in conf:
        tuner_args = conf['tuner']
        hyperparameter_ranges = {}
        for k, v in targets.items():
            hyperparameter_ranges[k] = hp_type[v['type'].lower()](v['range'])
    else:  # use default values
        tuner_args = {
            'objective_metric_name': 'metric_name',
            'metric_definitions': [{
                'Name': 'metric_name',
                'Regex': 'ignore'
            }],
            'strategy': 'Random',
            'objective_type': 'Maximize',
            'early_stopping_type': 'Off'
        }
        # The default tuner gets one job per combination of categorical
        # values.
        max_jobs = 1
        hyperparameter_ranges = {}
        for k, v in targets.items():
            if v['type'].lower() != 'categorical':
                raise ValueError(
                    'the default tuner only supports Categorigal params.')
            max_jobs *= len(v['range'])
            hyperparameter_ranges[k] = hp_type[v['type'].lower()](v['range'])
        tuner_args['max_jobs'] = max_jobs

    tuner_args['estimator'] = estimator
    tuner_args['hyperparameter_ranges'] = hyperparameter_ranges
    tuner_args['max_parallel_jobs'] = max_parallel_jobs
    tuner_args['base_tuning_job_name'] = job_name
    tuner_args['warm_start_config'] = None  # not supported yet.

    tuner = HyperparameterTuner(**tuner_args)
    tuner.fit(inputs, job_name=job_name)
Esempio n. 23
0
            entry_point="entry_point.py",
        ),
        RLEstimator(
            entry_point="cartpole.py",
            toolkit=RLToolkit.RAY,
            framework=RLFramework.TENSORFLOW,
            toolkit_version="0.8.5",
            role=sagemaker.get_execution_role(),
            instance_type=INSTANCE_TYPE,
            instance_count=1,
        ),
        Chainer(
            role=sagemaker.get_execution_role(),
            entry_point="entry_point.py",
            use_mpi=True,
            num_processes=4,
            framework_version="5.0.0",
            instance_type=INSTANCE_TYPE,
            instance_count=1,
            py_version="py3",
        ),
    ],
)
def test_training_step_with_framework_estimator(estimator, pipeline_session,
                                                training_input,
                                                hyperparameters):
    """Configure the given estimator for a training-step test.

    The visible body only sets attributes on ``estimator``; no assertions
    are present in this excerpt.
    """
    # Point the estimator at the shared dummy S3 source directory.
    estimator.source_dir = DUMMY_S3_SOURCE_DIR
    estimator.set_hyperparameters(**hyperparameters)
    # Fixed placeholder values for the KMS keys and the dependency list.
    estimator.volume_kms_key = "volume-kms-key"
    estimator.output_kms_key = "output-kms-key"
    estimator.dependencies = ["dep-1", "dep-2"]
Esempio n. 24
0
def exec_training(
    session, client, job_name, setting, pytorch, max_parallel_jobs, is_spot
):
    """Run a training job, or a tuning job, described by a YAML settings file.

    The settings file must provide ``inputs`` and ``estimator`` (including
    ``hyperparameters``); ``upload_data`` and ``tuner`` are optional.
    Hyperparameters whose value is a dict are tuning targets, all others are
    passed to the estimator as fixed values. Without tuning targets the
    estimator is fitted directly with ``wait=False``; otherwise a
    ``HyperparameterTuner`` is built and fitted.

    Args:
        session: Boto session wrapped in a ``sagemaker.Session``.
        client: Client passed as ``sagemaker_client``.
        job_name: Job name; also the base tuning job name and the prefix of
            uploaded data keys and of the default checkpoint URI.
        setting: Path to the YAML settings file.
        pytorch: Build a ``PyTorch`` estimator when truthy, else ``Chainer``.
        max_parallel_jobs: Forwarded to the tuner.
        is_spot: When truthy, enable ``train_use_spot_instances`` and supply
            a default ``checkpoint_s3_uri`` if the settings give none.

    Raises:
        ValueError: If no ``tuner`` section is given and a tuning target is
            not of type ``categorical``.
    """
    # S3 keys and URIs always use "/" separators, whatever the host OS is.
    import posixpath

    sagemaker_session = sagemaker.Session(
        boto_session=session, sagemaker_client=client
    )

    # Close the file deterministically. ``safe_load`` replaces a bare
    # ``yaml.load``, which needs an explicit Loader on current PyYAML and can
    # construct arbitrary objects; the settings are expected to be plain
    # mappings, lists and scalars.
    with open(setting) as settings_file:
        conf = yaml.safe_load(settings_file)

    # input data
    inputs = conf["inputs"]

    if "upload_data" in conf and isinstance(conf["upload_data"], list):
        for d in conf["upload_data"]:
            s3_dir = sagemaker_session.upload_data(
                path=d["path"],
                key_prefix=posixpath.join(job_name, d["key_prefix"]),
            )
            inputs[d["name"]] = s3_dir

    estimator_args = conf["estimator"]
    estimator_args["sagemaker_session"] = sagemaker_session

    # Split hyperparameters into fixed values and tuning targets (dicts).
    hyperparameters = estimator_args.pop("hyperparameters")
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args["hyperparameters"] = fixed

    if is_spot:
        estimator_args["train_use_spot_instances"] = True
        if "checkpoint_s3_uri" not in estimator_args:
            bucket_name = sagemaker_session.default_bucket()
            uri = posixpath.join("s3://", bucket_name, job_name, "checkpoints")
            estimator_args["checkpoint_s3_uri"] = uri

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if not targets:
        estimator.fit(inputs, wait=False, job_name=job_name)
        return

    if "tuner" in conf:
        tuner_args = conf["tuner"]
        hyperparameter_ranges = {}
        for k, v in targets.items():
            hyperparameter_ranges[k] = hp_type[v["type"].lower()](v["range"])
    else:  # use default values
        tuner_args = {
            "objective_metric_name": "metric_name",
            "metric_definitions": [
                {"Name": "metric_name", "Regex": "ignore"}
            ],
            "strategy": "Random",
            "objective_type": "Maximize",
            "early_stopping_type": "Off",
        }
        # The default tuner gets one job per combination of categorical
        # values.
        max_jobs = 1
        hyperparameter_ranges = {}
        for k, v in targets.items():
            if v["type"].lower() != "categorical":
                raise ValueError(
                    "the default tuner only supports Categorigal params."
                )
            max_jobs *= len(v["range"])
            hyperparameter_ranges[k] = hp_type[v["type"].lower()](v["range"])
        tuner_args["max_jobs"] = max_jobs

    tuner_args["estimator"] = estimator
    tuner_args["hyperparameter_ranges"] = hyperparameter_ranges
    tuner_args["max_parallel_jobs"] = max_parallel_jobs
    tuner_args["base_tuning_job_name"] = job_name
    tuner_args["warm_start_config"] = None  # not supported yet.

    tuner = HyperparameterTuner(**tuner_args)
    tuner.fit(inputs, job_name=job_name)