def main():
    print('Starting model training.')
    print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')

    sklearn = SKLearn(
        entry_point="scikit_boston_housing.py",
        source_dir='code',
        framework_version="0.23-1",
        instance_type="local",
        role=DUMMY_IAM_ROLE,
    )

    delta_lake_profile_file = "file://./profile/open-datasets.share"
    sklearn.fit({"train": delta_lake_profile_file})
    print('Completed model training')

    print('Deploying endpoint in local mode')
    predictor = sklearn.deploy(initial_instance_count=1, instance_type='local')

    test_sample = [[0.00632, 18.0, 2.31, 0, 0.538, 6.575, 65.2, 4.0900, 1, 296, 15.3, 396.90, 4.98]]
    prediction = predictor.predict(test_sample)
    print(f'Prediction: {prediction}')

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    # In SageMaker SDK v2, delete_endpoint() takes no endpoint name; it
    # tears down the predictor's own endpoint.
    predictor.delete_endpoint()
def test_training_with_network_isolation(sagemaker_session, sklearn_full_version, cpu_instance_type):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "sklearn_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "sklearn_mnist")

        sklearn = SKLearn(
            entry_point=script_path,
            role="SageMakerRole",
            train_instance_type=cpu_instance_type,
            framework_version=sklearn_full_version,
            py_version=PYTHON_VERSION,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
            enable_network_isolation=True,
        )

        train_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/sklearn_mnist/train")
        test_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/sklearn_mnist/test")

        job_name = unique_name_from_base("test-sklearn-hp")
        sklearn.fit({"train": train_input, "test": test_input}, job_name=job_name)
        assert sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=job_name)["EnableNetworkIsolation"]
        return sklearn.latest_training_job.name
def main(args):
    print("args.local=", args.local)

    # Initialise SDK
    sklearn_estimator = SKLearn(
        entry_point='src/train_and_deploy.py',
        role=CLOUD_CONFIG['sagemaker_role_id']['value'],
        train_instance_type='local' if args.local else 'ml.m4.xlarge',
        hyperparameters={
            'sagemaker_submit_directory': f"s3://{CLOUD_CONFIG['s3bucket']['value']}",
        },
        framework_version='0.23-1',
        # Raw string so that \S is a regex escape, not a Python string escape.
        metric_definitions=[{'Name': 'train:score', 'Regex': r'train:score=(\S+)'}],
    )

    # Run model training job
    sklearn_estimator.fit({
        'train': "file://./data/data.csv" if args.local
        else f"s3://{CLOUD_CONFIG['s3bucket']['value']}/data.csv"
    })

    # Deploy trained model to an endpoint
    sklearn_estimator.deploy(
        instance_type='local' if args.local else 'ml.t2.medium',
        initial_instance_count=1,
        endpoint_name='demo-endpoint',
    )
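# A minimal sketch of the CLI wiring the function above assumes (an
# `args.local` flag). The original entry point is not shown in this
# snippet, so the flag name and parser setup here are an assumption.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--local', action='store_true',
                        help='run training and deployment in SageMaker local mode')
    main(parser.parse_args())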
def test_failed_training_job(
    sagemaker_session,
    sklearn_latest_version,
    sklearn_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "sklearn_mnist", "failure_script.py")
        data_path = os.path.join(DATA_DIR, "sklearn_mnist")

        sklearn = SKLearn(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=sklearn_latest_version,
            py_version=sklearn_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        train_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/sklearn_mnist/train")

        job_name = unique_name_from_base("test-sklearn-failed")
        with pytest.raises(ValueError):
            sklearn.fit(train_input, job_name=job_name)
def test_training_with_additional_hyperparameters(
    sagemaker_session,
    sklearn_latest_version,
    sklearn_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "sklearn_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "sklearn_mnist")

        sklearn = SKLearn(
            entry_point=script_path,
            role="SageMakerRole",
            instance_type=cpu_instance_type,
            framework_version=sklearn_latest_version,
            py_version=sklearn_latest_py_version,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        train_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/sklearn_mnist/train")
        test_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/sklearn_mnist/test")

        job_name = unique_name_from_base("test-sklearn-hp")
        sklearn.fit({"train": train_input, "test": test_input}, job_name=job_name)
def main():
    download_training_and_eval_data()

    print('Starting model training.')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.'
    )

    sklearn = SKLearn(
        entry_point="scikit_learn_iris.py",
        source_dir='code',
        framework_version="0.23-1",
        instance_type="local",
        role=DUMMY_IAM_ROLE,
        hyperparameters={"max_leaf_nodes": 30},
    )

    train_input = "file://./data/iris.csv"
    sklearn.fit({"train": train_input})
    print('Completed model training')

    print('Deploying endpoint in local mode')
    predictor = sklearn.deploy(initial_instance_count=1, instance_type='local')

    do_inference_on_local_endpoint(predictor)

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    # delete_endpoint() takes no endpoint name in SDK v2.
    predictor.delete_endpoint()
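# Hypothetical sketch of the do_inference_on_local_endpoint helper called
# above; the real implementation is defined elsewhere in the example.
# Assumes the iris model accepts rows of four numeric features.
def do_inference_on_local_endpoint(predictor):
    test_sample = [[5.1, 3.5, 1.4, 0.2], [6.2, 3.4, 5.4, 2.3]]
    predictions = predictor.predict(test_sample)
    print(f'Predictions: {predictions}')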
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    enable_cloudwatch_metrics = "true"
    sklearn = SKLearn(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=source_dir,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
    )
    sklearn.fit(inputs="s3://mybucket/train", job_name="new_name")

    new_role = "role"
    model_server_workers = 2
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model = sklearn.create_model(
        role=new_role,
        model_server_workers=model_server_workers,
        vpc_config_override=vpc_config,
    )

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
def test_create_model_from_estimator(sagemaker_session, sklearn_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    sklearn = SKLearn(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_type=INSTANCE_TYPE,
                      framework_version=sklearn_version,
                      container_log_level=container_log_level,
                      py_version=PYTHON_VERSION,
                      base_job_name='job',
                      source_dir=source_dir)

    job_name = 'new_name'
    sklearn.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = sklearn.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == sklearn_version
    assert model.py_version == sklearn.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
from sagemaker.serializers import CSVSerializer  # SDK v2 replacement for the removed v1 csv_serializer


def main():
    download_training_and_eval_data()

    print('Starting model training.')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.'
    )

    sklearn = SKLearn(
        entry_point="catboost_train_deploy.py",
        source_dir='code',
        framework_version="0.23-1",
        instance_type="local",
        role=DUMMY_IAM_ROLE,
    )

    train_location = 'file://' + local_train
    validation_location = 'file://' + local_validation
    sklearn.fit({'train': train_location, 'validation': validation_location})
    print('Completed model training')

    print('Deploying endpoint in local mode')
    predictor = sklearn.deploy(1, 'local', serializer=CSVSerializer())

    with open(local_test, 'r') as f:
        payload = f.read().strip()

    predictions = predictor.predict(payload)
    print('predictions: {}'.format(predictions))

    # delete_endpoint() takes no arguments in SDK v2.
    predictor.delete_endpoint()
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    sklearn = SKLearn(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level,
                      py_version=PYTHON_VERSION,
                      base_job_name='job',
                      source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)
    sklearn.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = sklearn.create_model(role=new_role,
                                 model_server_workers=model_server_workers,
                                 vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
def _run_mnist_training_job(sagemaker_session, instance_type, sklearn_full_version, wait=True):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "sklearn_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "sklearn_mnist")

        sklearn = SKLearn(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=sklearn_full_version,
            py_version=PYTHON_VERSION,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        train_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/sklearn_mnist/train")
        test_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/sklearn_mnist/test")

        job_name = unique_name_from_base("test-sklearn-mnist")
        sklearn.fit({"train": train_input, "test": test_input}, wait=wait, job_name=job_name)
        return sklearn.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, sklearn_full_version, wait=True):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'sklearn_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'sklearn_mnist')

        sklearn = SKLearn(entry_point=script_path,
                          role='SageMakerRole',
                          framework_version=sklearn_full_version,
                          py_version=PYTHON_VERSION,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'epochs': 1})

        train_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/sklearn_mnist/train')
        test_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/sklearn_mnist/test')

        sklearn.fit({'train': train_input, 'test': test_input}, wait=wait)
        return sklearn.latest_training_job.name
def test_sklearn(strftime, sagemaker_session, sklearn_version):
    sklearn = SKLearn(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_type=INSTANCE_TYPE,
        py_version=PYTHON_VERSION,
        framework_version=sklearn_version,
    )

    inputs = "s3://mybucket/train"
    sklearn.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]

    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(sklearn_version)
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = sklearn.create_model()
    expected_image_base = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-{}"
    )
    assert {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY":
                "s3://mybucket/sagemaker-scikit-learn-{}/source/sourcedir.tar.gz".format(TIMESTAMP),
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": expected_image_base.format(sklearn_version, PYTHON_VERSION),
        "ModelDataUrl": "s3://m/m.tar.gz",
    } == model.prepare_container_def(CPU)

    assert "cpu" in model.prepare_container_def(CPU)["Image"]

    predictor = sklearn.deploy(1, CPU)
    assert isinstance(predictor, SKLearnPredictor)
def test_sklearn(strftime, sagemaker_session, sklearn_version):
    sklearn = SKLearn(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_type=INSTANCE_TYPE,
                      py_version=PYTHON_VERSION,
                      framework_version=sklearn_version)

    inputs = 's3://mybucket/train'
    sklearn.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']

    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(sklearn_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = sklearn.create_model()
    expected_image_base = '246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-{}'
    assert {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY':
                's3://mybucket/sagemaker-scikit-learn-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': expected_image_base.format(sklearn_version, PYTHON_VERSION),
        'ModelDataUrl': 's3://m/m.tar.gz'
    } == model.prepare_container_def(CPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']

    predictor = sklearn.deploy(1, CPU)
    assert isinstance(predictor, SKLearnPredictor)
def build_and_train_estimator(data_desc: str, classifier: str, count: int = 1,
                              wait: bool = False,
                              **hyperparams: object) -> Tuple[SKLearn, str]:
    """
    Creates a SageMaker training job, or attaches to an existing one.

    :param data_desc: name of data to use (unique)
    :param classifier: name of sklearn classifier
    :param count: cache buster
    :param wait: waits on job, useful for debugging
    :param hyperparams: hyperparameters for the model
    :return: (estimator, model_name) tuple
    """
    model_name = build_model_name(data_desc, classifier, hyperparams, count)
    print('model_name', model_name)

    # Check whether a model has already been built on this data; if the job
    # has finished (or we are willing to wait on it), attach to it instead
    # of launching a new one.
    try:
        import boto3
        client = boto3.client('sagemaker')
        response = client.describe_training_job(TrainingJobName=model_name)
        if wait or response['TrainingJobStatus'] in ['Completed', 'Failed']:
            return SKLearn.attach(model_name), model_name
        else:
            raise Warning(f"{model_name} isn't finished training yet")
    except ClientError:
        pass

    output_location = f's3://{bucket}/{S3_MODEL_DIR / data_desc}'
    estimator = SKLearn('train_and_deploy.py',
                        source_dir='sagemaker_container',
                        code_location=output_location,
                        output_path=output_location,
                        train_instance_type=TRAIN_INSTANCE,
                        framework_version='0.23-1',
                        role=role,
                        hyperparameters={'classifier': classifier, **hyperparams})
    estimator.fit(f's3://{bucket}/{S3_FEATURE_DIR / data_desc}',
                  wait=wait, job_name=model_name)
    return estimator, model_name
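# Example invocation of the helper above; the data name, classifier, and
# hyperparameter values are hypothetical placeholders.
estimator, job_name = build_and_train_estimator(
    'client-interactions', 'RandomForestClassifier', wait=True, n_estimators=100)
print('training job:', job_name)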
def test_failed_training_job(sagemaker_session, sklearn_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'sklearn_mnist', 'failure_script.py')
        data_path = os.path.join(DATA_DIR, 'sklearn_mnist')

        sklearn = SKLearn(entry_point=script_path,
                          role='SageMakerRole',
                          framework_version=sklearn_full_version,
                          py_version=PYTHON_VERSION,
                          train_instance_count=1,
                          train_instance_type='ml.c4.xlarge',
                          sagemaker_session=sagemaker_session)

        train_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/sklearn_mnist/train')

        job_name = unique_name_from_base('test-sklearn-failed')
        with pytest.raises(ValueError):
            sklearn.fit(train_input, job_name=job_name)
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    custom_image = 'ubuntu:latest'
    sklearn = SKLearn(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_type=INSTANCE_TYPE,
                      image_name=custom_image,
                      container_log_level=container_log_level,
                      py_version=PYTHON_VERSION,
                      base_job_name='job',
                      source_dir=source_dir)

    sklearn.fit(inputs='s3://mybucket/train', job_name='new_name')
    model = sklearn.create_model()

    assert model.image == custom_image
def test_transform_multiple_values_for_entry_point_issue(sagemaker_session, sklearn_version):
    # https://github.com/aws/sagemaker-python-sdk/issues/974
    sklearn = SKLearn(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_type=INSTANCE_TYPE,
        py_version=PYTHON_VERSION,
        framework_version=sklearn_version,
    )

    inputs = "s3://mybucket/train"
    sklearn.fit(inputs=inputs)

    transformer = sklearn.transformer(instance_count=1, instance_type="ml.m4.xlarge")
    # if we got here, we didn't get a "multiple values" error
    assert transformer is not None
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    enable_cloudwatch_metrics = "true"
    sklearn = SKLearn(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=source_dir,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
    )
    sklearn.fit(inputs="s3://mybucket/train", job_name="new_name")

    custom_image = "ubuntu:latest"
    new_role = "role"
    model_server_workers = 2
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    new_source_dir = "s3://myotherbucket/source"
    dependencies = ["/directory/a", "/directory/b"]
    model_name = "model-name"
    model = sklearn.create_model(
        image=custom_image,
        role=new_role,
        model_server_workers=model_server_workers,
        vpc_config_override=vpc_config,
        entry_point=SERVING_SCRIPT_FILE,
        source_dir=new_source_dir,
        dependencies=dependencies,
        name=model_name,
    )

    assert model.image == custom_image
    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
    assert model.entry_point == SERVING_SCRIPT_FILE
    assert model.source_dir == new_source_dir
    assert model.dependencies == dependencies
    assert model.name == model_name
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "ubuntu:latest"
    sklearn = SKLearn(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_type=INSTANCE_TYPE,
        image_uri=custom_image,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=source_dir,
    )

    sklearn.fit(inputs="s3://mybucket/train", job_name="new_name")
    model = sklearn.create_model()

    assert model.image_uri == custom_image
def test_training_with_additional_hyperparameters(sagemaker_session, sklearn_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'sklearn_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'sklearn_mnist')

        sklearn = SKLearn(entry_point=script_path,
                          role='SageMakerRole',
                          train_instance_type='ml.c4.xlarge',
                          framework_version=sklearn_full_version,
                          py_version=PYTHON_VERSION,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'epochs': 1})

        train_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/sklearn_mnist/train')
        test_input = sklearn.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/sklearn_mnist/test')

        job_name = unique_name_from_base('test-sklearn-hp')
        sklearn.fit({'train': train_input, 'test': test_input}, job_name=job_name)
        return sklearn.latest_training_job.name
def test_create_model_from_estimator(name_from_base, sagemaker_session, sklearn_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    base_job_name = "job"
    sklearn = SKLearn(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_type=INSTANCE_TYPE,
        framework_version=sklearn_version,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name=base_job_name,
        source_dir=source_dir,
        enable_network_isolation=True,
    )

    sklearn.fit(inputs="s3://mybucket/train", job_name="new_name")

    model_name = "model_name"
    name_from_base.return_value = model_name
    model = sklearn.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == sklearn_version
    assert model.py_version == sklearn.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == model_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
    assert model.enable_network_isolation()
    name_from_base.assert_called_with(base_job_name)
inters_df.consultant.portfolio = sub_port(
    consultant_processing(list(inters_df.consultant)).portfolio)
inters_df.consultant = cons_predictor(
    consultant_processing(list(inters_df.consultant)))
inters_df = pd.concat([
    inters_df.drop(["client", "duration", "ongoing", "n_transactions"], axis=1),
    client_processing(list(inters_df.client))
], axis=1)

inters_df.to_csv(key + "interactions.csv")
upload_file(key + "interactions.csv")

models = {}
for name, df in inters_df.groupby("consultant"):
    # fit() expects channel values to be S3 (or file://) URIs rather than
    # in-memory DataFrames, so write each consultant's split out and upload
    # it first (assumes upload_file returns the uploaded object's S3 URI).
    train_csv = key + "train_{}.csv".format(name)
    df.to_csv(train_csv)
    train_uri = upload_file(train_csv)

    model = SKLearn(entry_point="training_scripts.py",
                    train_instance_type="ml.c4.xlarge",
                    role=role,
                    sagemaker_session=sagemaker_session,
                    hyperparameters={"normalize": True})
    # fit() returns None, so deploy from the estimator itself.
    model.fit({"train": train_uri})
    models[name] = model.deploy(initial_instance_count=1,
                                instance_type="ml.m4.xlarge")
from sagemaker.sklearn import SKLearn

# Initialise SDK
sklearn_estimator = SKLearn(
    entry_point='train_and_deploy.py',
    role='arn:aws:iam::<your-sagemaker-role>',
    # train_instance_type='ml.m4.xlarge',
    train_instance_type='local',
    output_path='s3://<path-to-output-dir>/',
    hyperparameters={
        'sagemaker_submit_directory': 's3://<path-to-sagemaker_submit_directory>'
    },
    code_location='s3://<path-to-code_location>',
    framework_version='0.20.0')

# Run model training job
sklearn_estimator.fit({'train': 's3://<path-to-training-data-dir>'})

# Deploy trained model to an endpoint
predictor = sklearn_estimator.deploy(
    # instance_type='ml.t2.medium',
    instance_type='local',
    initial_instance_count=1,
    endpoint_name='<your-end-point-name>',
)
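# A minimal sketch of calling the deployed endpoint; the payload below is a
# placeholder and must match the feature layout your train_and_deploy.py
# serving code expects.
sample = [[0.1, 0.2, 0.3]]
print(predictor.predict(sample))

# Tear down the endpoint when finished to avoid ongoing charges.
predictor.delete_endpoint()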