Code Example #1
def test_github(sagemaker_local_session, pytorch_inference_latest_version,
                pytorch_inference_latest_py_version):
    script_path = "mnist.py"
    git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT}

    pytorch = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir="pytorch",
        framework_version=pytorch_inference_latest_version,
        py_version=pytorch_inference_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    data_path = os.path.join(DATA_DIR, "pytorch_mnist")
    pytorch.fit({"training": "file://" + os.path.join(data_path, "training")})

    with lock.lock(LOCK_PATH):
        # deploy outside the try block so the finally clause never hits an
        # unbound `predictor` if deployment itself fails
        predictor = pytorch.deploy(initial_instance_count=1,
                                   instance_type="local")
        try:
            data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
            result = predictor.predict(data)
            # check that there is a probability for each label
            assert len(result[0]) == 10
        finally:
            predictor.delete_endpoint()
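This snippet leans on module-level names defined elsewhere in its test suite. A minimal sketch of the missing imports and constants, with placeholder values (the real ones are elided from the snippet):

import os

import numpy
from sagemaker.pytorch import PyTorch

from tests.integ import lock  # assumed: a small file-lock helper in the test suite

GIT_REPO = "https://github.com/example/repo.git"  # placeholder repo URL
BRANCH = "main"  # placeholder branch
COMMIT = "0123abc"  # placeholder commit SHA
DATA_DIR = "tests/data"  # placeholder test-data directory
LOCK_PATH = "/tmp/pytorch_local_mode.lock"  # placeholder lock-file path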
Code Example #2
def test_source_dirs(tmpdir, sagemaker_local_session):
    source_dir = os.path.join(DATA_DIR, "pytorch_source_dirs")
    lib = os.path.join(str(tmpdir), "alexa.py")

    with open(lib, "w") as f:
        f.write("def question(to_anything): return 42")

    estimator = PyTorch(
        entry_point="train.py",
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=[lib],
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )
    estimator.fit()

    # endpoint tests all use the same port, so we use this lock to prevent concurrent execution
    with lock.lock():
        try:
            predictor = estimator.deploy(initial_instance_count=1, instance_type="local")
            predict_response = predictor.predict([7])
            assert predict_response == [49]
        finally:
            estimator.delete_endpoint()
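Note the parameter-name drift across these examples: several (#2 through #4, #6, #7) use the SageMaker Python SDK v1 names, while #1 and #5 use the v2 names. SDK v2 dropped the train_ prefix and moved endpoint deletion onto the predictor:

# SageMaker Python SDK v1            ->  SDK v2
#   train_instance_count=1           ->  instance_count=1
#   train_instance_type="local"      ->  instance_type="local"
#   train_use_spot_instances=True    ->  use_spot_instances=True
#   estimator.delete_endpoint()      ->  predictor.delete_endpoint()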
Code Example #3
def test_github(sagemaker_local_session):
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "pytorch_mnist")
    git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT}
    pytorch = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir="pytorch",
        framework_version=PYTORCH_VERSION,
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    pytorch.fit(
        {"training": "file://" + os.path.join(data_path, "training", MNIST_FOLDER_NAME)}
    )

    with lock.lock(LOCK_PATH):
        # deploy before the try so `predictor` is always bound in the finally clause
        predictor = pytorch.deploy(initial_instance_count=1,
                                   instance_type="local")
        try:
            data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
            result = predictor.predict(data)
            assert result is not None
        finally:
            predictor.delete_endpoint()
Code Example #4
def test_source_dirs(tmpdir, sagemaker_local_session):
    source_dir = os.path.join(DATA_DIR, 'pytorch_source_dirs')
    lib = os.path.join(str(tmpdir), 'alexa.py')

    with open(lib, 'w') as f:
        f.write('def question(to_anything): return 42')

    estimator = PyTorch(entry_point='train.py',
                        role='SageMakerRole',
                        source_dir=source_dir,
                        dependencies=[lib],
                        py_version=PYTHON_VERSION,
                        train_instance_count=1,
                        train_instance_type='local',
                        sagemaker_session=sagemaker_local_session)
    try:
        estimator.fit()

        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type='local')

        predict_response = predictor.predict([7])

        assert predict_response == [49]
    finally:
        estimator.delete_endpoint()
Code Example #5
def test_source_dirs(tmpdir, sagemaker_local_session):
    source_dir = os.path.join(DATA_DIR, "pytorch_source_dirs")
    lib = os.path.join(str(tmpdir), "alexa.py")

    with open(lib, "w") as f:
        f.write("def question(to_anything): return 42")

    # TODO: fails on newer versions of pytorch in call to np.load(BytesIO(stream.read()))
    # "ValueError: Cannot load file containing pickled data when allow_pickle=False"
    estimator = PyTorch(
        entry_point="train.py",
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=[lib],
        framework_version="0.4",  # hard-code to last known good PyTorch for now (see TODO above)
        py_version="py3",
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )
    estimator.fit()

    # endpoint tests all use the same port, so we use this lock to prevent concurrent execution
    with lock.lock():
        # deploy before the try so `predictor` is always bound in the finally clause
        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type="local")
        try:
            predict_response = predictor.predict([7])
            assert predict_response == [49]
        finally:
            predictor.delete_endpoint()
Code Example #6
def test_fit_deploy(sagemaker_local_session, pytorch_full_version):
    pytorch = PyTorch(
        entry_point=MNIST_SCRIPT,
        role="SageMakerRole",
        framework_version=pytorch_full_version,
        py_version="py3",
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )

    pytorch.fit({"training": "file://" + os.path.join(MNIST_DIR, "training")})

    predictor = pytorch.deploy(1, "local")
    try:
        batch_size = 100
        data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
    finally:
        predictor.delete_endpoint()
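The (batch_size, 10) output shape is produced by the model in the MNIST entry-point script, which is not shown. The SageMaker PyTorch serving container looks up inference hooks by name in that script; a hypothetical minimal sketch of what they might look like (Net is a stand-in for the unshown network class):

import os

import torch

def model_fn(model_dir):
    # restore the trained network from the artifacts saved by the training run
    model = Net()  # hypothetical MNIST network class
    model.load_state_dict(torch.load(os.path.join(model_dir, "model.pth")))
    return model.eval()

def predict_fn(input_data, model):
    # forward pass without gradient tracking; output has shape (batch, 10)
    with torch.no_grad():
        return model(torch.from_numpy(input_data))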
Code Example #7
testpath = sess.upload_data(path='boston_test.csv',
                            bucket=bucket,
                            key_prefix='sagemaker/sklearncontainer')

pytorch_estimator = PyTorch(
    entry_point='train.py',
    source_dir=os.path.abspath(os.path.dirname(__file__)),
    role=sm_role,
    train_instance_count=1,
    train_instance_type='ml.c5.xlarge',
    framework_version='1.0.0',
    base_job_name='dense-pytorch',
    metric_definitions=[{
        'Name': 'median-AE',
        'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"
    }],
    hyperparameters={
        'n-epochs': 1500,
        'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
        'target': 'target'
    },
    tags=[{
        "Key": "CostCentre",
        "Value": "SageMaker"
    }])

# launch training job, with asynchronous call
pytorch_estimator.fit({'train': trainpath, 'test': testpath}, wait=False)
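For the median-AE metric to reach SageMaker, the training script has to write lines matching the Regex to its logs; presumably train.py emits something like:

# Hypothetical logging line in train.py matched by "AE-at-50th-percentile: ([0-9.]+).*$":
print("AE-at-50th-percentile: {}".format(median_abs_error))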
Code Example #8
def exec_training(
    session, client, job_name, setting, pytorch, max_parallel_jobs, is_spot
):
    sagemaker_session = sagemaker.Session(
        boto_session=session, sagemaker_client=client
    )

    # safe_load avoids executing arbitrary YAML tags (yaml.load without a
    # Loader is deprecated), and the context manager closes the file handle
    with open(setting) as f:
        conf = yaml.safe_load(f)

    # input data
    inputs = conf["inputs"]

    if "upload_data" in conf and isinstance(conf["upload_data"], list):
        for d in conf["upload_data"]:
            s3_dir = sagemaker_session.upload_data(
                path=d["path"],
                key_prefix=os.path.join(job_name, d["key_prefix"]),
            )
            inputs[d["name"]] = s3_dir

    estimator_args = conf["estimator"]
    estimator_args["sagemaker_session"] = sagemaker_session

    hyperparameters = estimator_args.pop("hyperparameters")
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args["hyperparameters"] = fixed

    if is_spot:
        estimator_args["train_use_spot_instances"] = True
        if "checkpoint_s3_uri" not in estimator_args:
            bucket_name = sagemaker_session.default_bucket()
            uri = os.path.join("s3://", bucket_name, job_name, "checkpoints")
            estimator_args["checkpoint_s3_uri"] = uri

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if len(targets) == 0:
        estimator.fit(inputs, wait=False, job_name=job_name)
    else:
        if "tuner" in conf:
            tuner_args = conf["tuner"]
            hyperparameter_ranges = {}
            for k, v in targets.items():
                hyperparameter_ranges[k] = hp_type[v["type"].lower()](
                    v["range"]
                )
        else:  # use default values
            tuner_args = {
                "objective_metric_name": "metric_name",
                "metric_definitions": [
                    {"Name": "metric_name", "Regex": "ignore"}
                ],
                "strategy": "Random",
                "objective_type": "Maximize",
                "early_stopping_type": "Off",
            }
            max_jobs = 1
            hyperparameter_ranges = {}
            for k, v in targets.items():
                if v["type"].lower() != "categorical":
                    raise ValueError(
                        "the default tuner only supports Categorigal params."
                    )
                max_jobs *= len(v["range"])
                hyperparameter_ranges[k] = hp_type[v["type"].lower()](
                    v["range"]
                )
            tuner_args["max_jobs"] = max_jobs

        tuner_args["estimator"] = estimator
        tuner_args["hyperparameter_ranges"] = hyperparameter_ranges
        tuner_args["max_parallel_jobs"] = max_parallel_jobs
        tuner_args["base_tuning_job_name"] = job_name
        tuner_args["warm_start_config"] = None  # not supported yet.

        tuner = HyperparameterTuner(**tuner_args)
        tuner.fit(inputs, job_name=job_name)
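hp_type is defined outside this snippet; judging from how it is called, it presumably maps the YAML type string to the SageMaker tuner parameter classes, roughly:

from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    IntegerParameter,
)

# Assumed definition of the hp_type lookup used above. Each entry is called
# with v["range"] from the YAML config as its single argument.
hp_type = {
    "continuous": lambda r: ContinuousParameter(*r),  # r = [min, max]
    "integer": lambda r: IntegerParameter(*r),        # r = [min, max]
    "categorical": CategoricalParameter,              # r = list of values
}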
Code Example #9
File: run_aws.py (project: jkhouja/experimenter)
    def my_aws_app(cfg: DictConfig) -> None:

        # hydra overrides the working directory, so recover the original cwd
        script_folder = hydra.utils.get_original_cwd()

        as_dict = OmegaConf.to_container(cfg, resolve=False)

        # Override s3 datapath
        aws_bucket = cfg.aws.bucket_prefix
        try:
            aws_root_path = aws_bucket + cfg.aws.root_path

        except errors.ConfigAttributeError:
            aws_root_path = aws_bucket + cfg.root_path

        # Get the s3 location to load /save to
        aws_out_path = aws_root_path + "/" + as_dict["output_subdir"]
        aws_data_path = aws_root_path + "/" + as_dict["data_subdir"]

        # Override the job json file with sagemaker local dirs
        as_dict["root_path"] = "/opt/ml/"
        as_dict["data_subdir"] = "input/data/train"
        as_dict["output_subdir"] = "output/data"

        # Set the local dir for tensorboard
        tb_log_dir = "/opt/ml/output/tensorboard/"
        as_dict["tb_log_dir"] = tb_log_dir
        tensorboard_output_config = TensorBoardOutputConfig(
            s3_output_path=aws_out_path,
            container_local_output_path=tb_log_dir,
        )

        print(OmegaConf.to_yaml(cfg))
        print("Overriden Root Path: " + aws_root_path)

        # Save json file to tmp location to be uploaded with script
        tmp_relative_path = "tmp/tmp_job.json"
        tmp_path = script_folder + "/" + tmp_relative_path

        with open(tmp_path, "w") as json_file:
            json.dump(as_dict, json_file)

        wait = cfg.aws.wait
        role = cfg.aws.role
        instance_count = cfg.aws.instance_count
        instance_type = cfg.aws.instance_type
        env = {
            # path relative to `source_dir` below
            "SAGEMAKER_REQUIREMENTS": "requirements.txt",
        }

        # Using Sagemaker prebuilt Pytorch container
        pytorch_estimator = PyTorch(
            entry_point="run.py",
            source_dir=script_folder,
            hyperparameters={"config_file": tmp_relative_path},
            role=role,
            env=env,
            instance_count=instance_count,
            py_version="py3",
            framework_version="1.5.0",
            output_path=aws_out_path,
            base_job_name=cfg.experiment_name,
            instance_type=instance_type,
            tensorboard_output_config=tensorboard_output_config,
        )

        pytorch_estimator.fit({"train": aws_data_path}, wait=wait)
        os.remove(tmp_path)
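The names used here (TensorBoardOutputConfig, OmegaConf, and so on) come from imports the snippet does not show; presumably the file starts with something like:

import json
import os

import hydra
from omegaconf import DictConfig, OmegaConf, errors
from sagemaker.debugger import TensorBoardOutputConfig
from sagemaker.pytorch import PyTorch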
Code Example #10
def exec_training(session, client, job_name, setting, pytorch,
                  max_parallel_jobs):
    sagemaker_session = sagemaker.Session(boto_session=session,
                                          sagemaker_client=client)

    # use safe_load and a context manager (yaml.load without a Loader is deprecated)
    with open(setting) as f:
        conf = yaml.safe_load(f)

    # input data
    inputs = conf['inputs']

    if 'upload_data' in conf and isinstance(conf['upload_data'], list):
        for d in conf['upload_data']:
            s3_dir = sagemaker_session.upload_data(
                path=d['path'],
                key_prefix=os.path.join(job_name, d['key_prefix']))
            inputs[d['name']] = s3_dir

    estimator_args = conf['estimator']
    estimator_args['sagemaker_session'] = sagemaker_session

    hyperparameters = estimator_args.pop('hyperparameters')
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args['hyperparameters'] = fixed

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if len(targets) == 0:
        estimator.fit(inputs, job_name=job_name)
    else:
        if 'tuner' in conf:
            tuner_args = conf['tuner']
            hyperparameter_ranges = {}
            for k, v in targets.items():
                hyperparameter_ranges[k] = hp_type[v['type'].lower()](
                    v['range'])
        else:  # use default values
            tuner_args = {
                'objective_metric_name': 'metric_name',
                'metric_definitions': [{
                    'Name': 'metric_name',
                    'Regex': 'ignore'
                }],
                'strategy': 'Random',
                'objective_type': 'Maximize',
                'early_stopping_type': 'Off'
            }
            max_jobs = 1
            hyperparameter_ranges = {}
            for k, v in targets.items():
                if v['type'].lower() != 'categorical':
                    raise ValueError(
                        'the default tuner only supports Categorical params.')
                max_jobs *= len(v['range'])
                hyperparameter_ranges[k] = hp_type[v['type'].lower()](
                    v['range'])
            tuner_args['max_jobs'] = max_jobs

        tuner_args['estimator'] = estimator
        tuner_args['hyperparameter_ranges'] = hyperparameter_ranges
        tuner_args['max_parallel_jobs'] = max_parallel_jobs
        tuner_args['base_tuning_job_name'] = job_name
        tuner_args['warm_start_config'] = None  # not supported yet.

        tuner = HyperparameterTuner(**tuner_args)
        tuner.fit(inputs, job_name=job_name)
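The structure of the setting YAML file can be read off this code; an illustrative (hypothetical) example of what conf looks like after loading:

# Hypothetical shape of `conf` as consumed by exec_training above.
conf = {
    'inputs': {'train': 's3://my-bucket/train'},  # placeholder S3 URI
    'upload_data': [  # optional: local data to upload to S3 before training
        {'name': 'validation', 'path': 'data/val.csv', 'key_prefix': 'validation'},
    ],
    'estimator': {
        'entry_point': 'train.py',
        'role': 'SageMakerRole',
        'train_instance_count': 1,
        'train_instance_type': 'ml.m5.xlarge',
        'hyperparameters': {
            'lr': 0.01,  # plain value -> passed through as a fixed hyperparameter
            'batch-size': {'type': 'categorical', 'range': [32, 64, 128]},  # dict -> tuning target
        },
    },
    # an optional 'tuner' section overrides the default Random-strategy tuner
}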