Esempio n. 1
0
def test_training_with_additional_hyperparameters(sagemaker_session,
                                                  chainer_full_version):
    """Run a one-epoch Chainer MNIST training job with MPI options set.

    The estimator is built with ``use_mpi=True``, two processes, two process
    slots per host and the extra MPI option ``-x NCCL_DEBUG=INFO`` on a single
    ``ml.c4.xlarge`` instance. The ``train`` and ``test`` directories under
    ``DATA_DIR/chainer_mnist`` are uploaded through the estimator's session
    and handed to ``fit`` as channels.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        chainer_full_version: Value passed as ``framework_version``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator_kwargs = dict(
            entry_point=entry_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            sagemaker_session=sagemaker_session,
            hyperparameters={'epochs': 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )
        estimator = Chainer(**estimator_kwargs)

        # Upload both data channels through the session owned by the estimator.
        session = estimator.sagemaker_session
        train_s3 = session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_s3 = session.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        channels = {'train': train_s3, 'test': test_s3}
        training_job_name = unique_name_from_base('test-chainer-training')
        estimator.fit(channels, job_name=training_job_name)
        return estimator.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_version, py_version):
    """Fit a Chainer MNIST estimator on local ``file://`` data and return it.

    Args:
        sagemaker_session: Session passed to the estimator; its
            ``default_bucket()`` is also used to build ``output_path``.
        instance_type: Value passed as the estimator's ``instance_type``.
        instance_count: Number of instances; also selects the entry script.
        chainer_version: Value passed as ``framework_version``.
        py_version: Value passed as ``py_version``.

    Returns:
        The fitted ``Chainer`` estimator.
    """
    # The entry script depends on how many instances run the job. The check
    # must use ``instance_count``: ``instance_type`` is the value forwarded as
    # the estimator's instance type, so comparing it with the integer 1 would
    # always pick the distributed script.
    script_name = "mnist.py" if instance_count == 1 else "distributed_mnist.py"
    script_path = os.path.join(DATA_DIR, "chainer_mnist", script_name)

    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=chainer_version,
        py_version=py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
        # test output_path without trailing slash
        output_path="s3://{}".format(sagemaker_session.default_bucket()),
    )

    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    job_name = unique_name_from_base("test-chainer-training")
    chainer.fit({"train": train_input, "test": test_input}, job_name=job_name)
    return chainer
def test_training_with_additional_hyperparameters(sagemaker_local_session,
                                                  chainer_latest_version,
                                                  chainer_latest_py_version):
    """Fit the Chainer MNIST script in ``local`` mode with MPI options set.

    The estimator uses one ``local`` instance, ``use_mpi=True``, two
    processes, two process slots per host and the extra MPI option
    ``-x NCCL_DEBUG=INFO``. Training data is referenced with ``file://``
    URIs pointing at ``DATA_DIR/chainer_mnist``.

    Args:
        sagemaker_local_session: Session passed to the estimator.
        chainer_latest_version: Value passed as ``framework_version``.
        chainer_latest_py_version: Value passed as ``py_version``.
    """
    mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")
    entry_script = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")

    estimator_kwargs = dict(
        entry_point=entry_script,
        role="SageMakerRole",
        instance_count=1,
        instance_type="local",
        framework_version=chainer_latest_version,
        py_version=chainer_latest_py_version,
        sagemaker_session=sagemaker_local_session,
        hyperparameters={"epochs": 1},
        use_mpi=True,
        num_processes=2,
        process_slots_per_host=2,
        additional_mpi_options="-x NCCL_DEBUG=INFO",
    )
    estimator = Chainer(**estimator_kwargs)

    channels = {
        "train": "file://" + os.path.join(mnist_dir, "train"),
        "test": "file://" + os.path.join(mnist_dir, "test"),
    }
    estimator.fit(channels)
def test_training_with_additional_hyperparameters(sagemaker_session):
    """Run a one-epoch Chainer MNIST training job with MPI options set.

    The estimator runs on a single ``ml.c4.xlarge`` instance with
    ``use_mpi=True``, two processes, two process slots per host and the
    extra MPI option ``-x NCCL_DEBUG=INFO``. The whole body runs inside
    ``timeout(minutes=15)``.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=15):
        entry_script = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator_kwargs = dict(
            entry_point=entry_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
            hyperparameters={'epochs': 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )
        estimator = Chainer(**estimator_kwargs)

        # Upload both data channels through the session owned by the estimator.
        session = estimator.sagemaker_session
        train_s3 = session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_s3 = session.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        channels = {'train': train_s3, 'test': test_s3}
        estimator.fit(channels)
        return estimator.latest_training_job.name
def _run_mnist_training_job(sagemaker_session,
                            instance_type,
                            instance_count,
                            chainer_full_version,
                            wait=True):
    """Launch a one-epoch Chainer MNIST training job and return its name.

    The body runs inside ``timeout(minutes=15)``. The ``train`` and ``test``
    directories under ``DATA_DIR/chainer_mnist`` are uploaded through the
    estimator's session and passed to ``fit`` as channels.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        instance_type: Value passed as ``train_instance_type``.
        instance_count: Number of instances; also selects the entry script.
        chainer_full_version: Value passed as ``framework_version``.
        wait: Forwarded to ``fit`` as ``wait``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=15):
        # The entry script depends on how many instances run the job. The
        # check must use ``instance_count``: ``instance_type`` is the value
        # forwarded as the estimator's instance type, so comparing it with
        # the integer 1 would always pick the distributed script.
        script_name = 'mnist.py' if instance_count == 1 else 'distributed_mnist.py'
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', script_name)

        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path,
                          role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'epochs': 1})

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        chainer.fit({'train': train_input, 'test': test_input}, wait=wait)
        return chainer.latest_training_job.name
def _run_mnist_training_job(sagemaker_session,
                            instance_type,
                            instance_count,
                            chainer_full_version,
                            wait=True):
    """Fit a Chainer MNIST estimator on local ``file://`` data and return it.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        instance_type: Value passed as ``train_instance_type``.
        instance_count: Number of instances; also selects the entry script.
        chainer_full_version: Value passed as ``framework_version``.
        wait: Forwarded to ``fit`` as ``wait``.

    Returns:
        The ``Chainer`` estimator on which ``fit`` was called.
    """
    # The entry script depends on how many instances run the job. The check
    # must use ``instance_count``: ``instance_type`` is the value forwarded as
    # the estimator's instance type, so comparing it with the integer 1 would
    # always pick the distributed script.
    script_name = "mnist.py" if instance_count == 1 else "distributed_mnist.py"
    script_path = os.path.join(DATA_DIR, "chainer_mnist", script_name)

    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=chainer_full_version,
        py_version=PYTHON_VERSION,
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
    )

    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    job_name = unique_name_from_base("test-chainer-training")
    chainer.fit(
        {"train": train_input, "test": test_input},
        wait=wait,
        job_name=job_name,
    )
    return chainer
def main(parser=None):
    """Parse command-line options and start a Chainer training job.

    Builds a boto ``Session`` from ``--profile`` / ``--region``, resolves the
    execution role (``--arn`` if given, otherwise
    ``sagemaker.get_execution_role``), and fits a ``Chainer`` estimator that
    runs ``train.py`` from ``../`` with a fixed set of hyperparameters. With
    ``--local-mode`` the estimator uses a ``LocalSession`` and the ``local``
    instance type, and ``fit`` waits for completion.

    Args:
        parser: Optional ``argparse.ArgumentParser`` to which the options are
            added. A new parser is created when omitted.
    """
    import logging
    logging.basicConfig(level=logging.WARN)

    # Create the parser per call. A parser instance used as the default
    # argument would be built once at definition time and shared by every
    # call, so a second call would try to register the same options again.
    if parser is None:
        parser = argparse.ArgumentParser()

    parser.add_argument('--profile', type=str)
    parser.add_argument('--local-mode', action='store_true')
    parser.add_argument('--instance-type',
                        type=str,
                        default=DEFAULT_INSTANCE_TYPE)
    parser.add_argument('--region', type=str, default=DEFAULT_REGION)
    parser.add_argument('--arn', type=str, default=None)
    parser.add_argument('--max-runtime',
                        type=int,
                        default=DEFAULT_RUNTIME,
                        help='seconds')
    args = parser.parse_args()

    boto_session = Session(profile_name=args.profile, region_name=args.region)
    sagemaker_session = sagemaker.Session(boto_session=boto_session)
    role = args.arn if args.arn is not None else sagemaker.get_execution_role(
        sagemaker_session)

    hyperparameters = {
        # GPU id 0 for instance types starting with 'ml.p', otherwise -1.
        'gpu': 0 if args.instance_type.startswith('ml.p') else -1,
        'mjcf': 'env/ant_simple.xml',
        'action-dim': 8,
        'obs-dim': 28,
        'skip-step': 10,
        'algorithm': 'TRPO',
        'foot-list':
        'right_back_foot left_back_foot front_right_foot front_left_foot',
    }
    chainer_estimator = Chainer(
        entry_point='train.py',
        source_dir='../',
        role=role,
        image_name=IMAGE,
        framework_version='5.0.0',
        sagemaker_session=LocalSession(boto_session)
        if args.local_mode else sagemaker_session,
        train_instance_count=1,
        train_instance_type='local' if args.local_mode else args.instance_type,
        hyperparameters=hyperparameters,
        base_job_name='roboschool-TRPO-skipstep10-continue-1',
        train_max_run=args.max_runtime)
    # Only block on the job when it runs locally.
    chainer_estimator.fit(wait=args.local_mode)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Check that fitting ``failure_script.py`` raises ``ValueError``.

    The estimator runs on a single ``ml.c4.xlarge`` instance and is fitted
    on the uploaded ``train`` directory. The body runs inside
    ``timeout(minutes=15)``.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        chainer_full_version: Value passed as ``framework_version``.
    """
    with timeout(minutes=15):
        failing_script = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator = Chainer(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=chainer_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        train_s3 = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train',
        )

        with pytest.raises(ValueError):
            estimator.fit(train_s3)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Check that fitting ``failure_script.py`` raises ``ValueError``.

    The estimator runs on a single ``ml.c4.xlarge`` instance and is fitted
    on the uploaded ``train`` directory. The body runs inside
    ``timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES)``.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        chainer_full_version: Value passed as ``framework_version``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        failing_script = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator = Chainer(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=chainer_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        train_s3 = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train',
        )

        with pytest.raises(ValueError):
            estimator.fit(train_s3)
Esempio n. 10
0
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Check that fitting ``failure_script.py`` raises the expected error.

    ``fit`` is called without inputs on a single ``ml.c4.xlarge`` instance
    and must raise ``ValueError`` whose text contains
    ``'This failure is expected'``.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        chainer_full_version: Value passed as ``framework_version``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        failing_script = os.path.join(
            DATA_DIR, 'chainer_mnist', 'failure_script.py')

        estimator_kwargs = dict(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )
        estimator = Chainer(**estimator_kwargs)

        with pytest.raises(ValueError) as raised:
            estimator.fit()
        # The message is checked after the ``raises`` block has captured it.
        error_text = str(raised.value)
        assert 'This failure is expected' in error_text
def test_attach_deploy(sagemaker_session, chainer_latest_version,
                       chainer_latest_py_version, cpu_instance_type):
    """Start a training job, attach to it by name, deploy it and predict.

    The training job is launched with ``wait=False``. A second estimator is
    then obtained with ``Chainer.attach`` and deployed to one instance of
    ``cpu_instance_type`` inside ``timeout_and_delete_endpoint_by_name``.
    The resulting predictor is checked with ``_predict_and_assert``.

    Args:
        sagemaker_session: Session used for uploads, training and attach.
        chainer_latest_version: Value passed as ``framework_version``.
        chainer_latest_py_version: Value passed as ``py_version``.
        cpu_instance_type: Instance type for both training and deployment.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
        mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")

        estimator_kwargs = dict(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=chainer_latest_version,
            py_version=chainer_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )
        trainer = Chainer(**estimator_kwargs)

        train_s3 = sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/chainer_mnist/train")

        test_s3 = sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "test"),
            key_prefix="integ-test-data/chainer_mnist/test")

        channels = {"train": train_s3, "test": test_s3}
        training_job_name = unique_name_from_base("test-chainer-training")
        trainer.fit(channels, wait=False, job_name=training_job_name)

    endpoint_name = unique_name_from_base("test-chainer-attach-deploy")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        # Re-create the estimator from the job name instead of reusing it.
        attached = Chainer.attach(trainer.latest_training_job.name,
                                  sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1,
                                    cpu_instance_type,
                                    endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
Esempio n. 12
0
def main(parser=None):
    """Parse command-line options and start a Chainer training job.

    Builds a boto ``Session`` from ``--profile`` / ``--region``, picks a
    ``LocalSession`` or a ``sagemaker.Session`` depending on ``--local-mode``,
    resolves the execution role (``--arn`` if given, otherwise
    ``sagemaker.get_execution_role``), and fits a ``Chainer`` estimator that
    runs ``train.py`` from ``./``. ``fit`` waits for completion only in
    local mode.

    Args:
        parser: Optional ``argparse.ArgumentParser`` to which the options are
            added. A new parser is created when omitted.
    """
    import logging
    logging.basicConfig(level=logging.WARN)

    # Create the parser per call. A parser instance used as the default
    # argument would be built once at definition time and shared by every
    # call, so a second call would try to register the same options again.
    if parser is None:
        parser = argparse.ArgumentParser()

    parser.add_argument('--profile', type=str, default='default')
    parser.add_argument('--local-mode', action='store_true')
    parser.add_argument('--instance-type',
                        type=str,
                        default=DEFAULT_INSTANCE_TYPE)
    parser.add_argument('--region', type=str, default=DEFAULT_REGION)
    parser.add_argument('--arn', type=str, default=None)
    parser.add_argument('--max-runtime',
                        type=int,
                        default=DEFAULT_RUNTIME,
                        help='seconds')
    args = parser.parse_args()

    boto_session = Session(profile_name=args.profile, region_name=args.region)
    sagemaker_session = LocalSession(
        boto_session) if args.local_mode else sagemaker.Session(boto_session)
    role = args.arn if args.arn is not None else sagemaker.get_execution_role(
        sagemaker_session)

    # GPU id 0 only for 'ml.p' instance types outside local mode, otherwise -1.
    gpu = 0 if args.instance_type.startswith(
        'ml.p') and not args.local_mode else -1
    hyperparameters = {
        'gpu': gpu,
    }
    chainer_estimator = Chainer(
        entry_point='train.py',
        source_dir='./',
        role=role,
        image_name=IMAGE,
        framework_version='5.0.0',
        sagemaker_session=sagemaker_session,
        train_instance_count=1,
        train_instance_type='local' if args.local_mode else args.instance_type,
        hyperparameters=hyperparameters,
        base_job_name='chainer-sagemaker-sample',
        train_max_run=args.max_runtime)
    # Only block on the job when it runs locally.
    chainer_estimator.fit(wait=args.local_mode)
def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version):
    """Run a one-epoch Chainer MNIST training job with MPI options set.

    The estimator runs on a single ``ml.c4.xlarge`` instance with
    ``use_mpi=True``, two processes, two process slots per host and the
    extra MPI option ``-x NCCL_DEBUG=INFO``.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        chainer_full_version: Value passed as ``framework_version``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator_kwargs = dict(
            entry_point=entry_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            framework_version=chainer_full_version,
            sagemaker_session=sagemaker_session,
            hyperparameters={'epochs': 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )
        estimator = Chainer(**estimator_kwargs)

        # Upload both data channels through the session owned by the estimator.
        session = estimator.sagemaker_session
        train_s3 = session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_s3 = session.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        channels = {'train': train_s3, 'test': test_s3}
        estimator.fit(channels)
        return estimator.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    """Launch a one-epoch Chainer MNIST training job and return its name.

    The ``train`` and ``test`` directories under ``DATA_DIR/chainer_mnist``
    are uploaded through the estimator's session and passed to ``fit`` as
    channels.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        instance_type: Value passed as ``train_instance_type``.
        instance_count: Number of instances; also selects the entry script.
        chainer_full_version: Value passed as ``framework_version``.
        wait: Forwarded to ``fit`` as ``wait``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # The entry script depends on how many instances run the job. The
        # check must use ``instance_count``: ``instance_type`` is the value
        # forwarded as the estimator's instance type, so comparing it with
        # the integer 1 would always pick the distributed script.
        script_name = 'mnist.py' if instance_count == 1 else 'distributed_mnist.py'
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', script_name)

        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path, role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=instance_count, train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session, hyperparameters={'epochs': 1})

        train_input = chainer.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                           key_prefix='integ-test-data/chainer_mnist/test')

        chainer.fit({'train': train_input, 'test': test_input}, wait=wait)
        return chainer.latest_training_job.name
def _run_mnist_training_job(sagemaker_session,
                            instance_type,
                            instance_count,
                            chainer_full_version,
                            wait=True):
    """Launch a one-epoch Chainer MNIST training job and return its name.

    The ``train`` and ``test`` directories under ``DATA_DIR/chainer_mnist``
    are uploaded through the estimator's session and passed to ``fit`` as
    channels. The job name comes from
    ``unique_name_from_base("test-chainer-training")``.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        instance_type: Value passed as ``train_instance_type``.
        instance_count: Number of instances; also selects the entry script.
        chainer_full_version: Value passed as ``framework_version``.
        wait: Forwarded to ``fit`` as ``wait``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # The entry script depends on how many instances run the job. The
        # check must use ``instance_count``: ``instance_type`` is the value
        # forwarded as the estimator's instance type, so comparing it with
        # the integer 1 would always pick the distributed script.
        script_name = "mnist.py" if instance_count == 1 else "distributed_mnist.py"
        script_path = os.path.join(DATA_DIR, "chainer_mnist", script_name)

        data_path = os.path.join(DATA_DIR, "chainer_mnist")

        chainer = Chainer(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=instance_count,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/chainer_mnist/train")
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/chainer_mnist/test")

        job_name = unique_name_from_base("test-chainer-training")
        chainer.fit(
            {"train": train_input, "test": test_input},
            wait=wait,
            job_name=job_name,
        )
        return chainer.latest_training_job.name
Esempio n. 16
0
def _run_mnist_training_job(sagemaker_session,
                            instance_type,
                            instance_count,
                            chainer_full_version,
                            wait=True):
    """Launch a one-epoch Chainer MNIST training job and return its name.

    The ``train`` and ``test`` directories under ``DATA_DIR/chainer_mnist``
    are uploaded through the estimator's session and passed to ``fit`` as
    channels. The job name comes from
    ``unique_name_from_base('test-chainer-training')``.

    Args:
        sagemaker_session: Session passed to the ``Chainer`` estimator.
        instance_type: Value passed as ``train_instance_type``.
        instance_count: Number of instances; also selects the entry script.
        chainer_full_version: Value passed as ``framework_version``.
        wait: Forwarded to ``fit`` as ``wait``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # The entry script depends on how many instances run the job. The
        # check must use ``instance_count``: ``instance_type`` is the value
        # forwarded as the estimator's instance type, so comparing it with
        # the integer 1 would always pick the distributed script.
        script_name = 'mnist.py' if instance_count == 1 else 'distributed_mnist.py'
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', script_name)

        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path,
                          role='SageMakerRole',
                          framework_version=chainer_full_version,
                          py_version=PYTHON_VERSION,
                          train_instance_count=instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'epochs': 1})

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        job_name = unique_name_from_base('test-chainer-training')
        chainer.fit(
            {'train': train_input, 'test': test_input},
            wait=wait,
            job_name=job_name,
        )
        return chainer.latest_training_job.name