def test_deploy_model_with_update_non_existing_endpoint(
    mxnet_training_job,
    sagemaker_session,
    mxnet_full_version,
    cpu_instance_type,
    alternative_cpu_instance_type,
):
    """Deploying with update_endpoint=True against a non-existing endpoint must fail.

    A real endpoint is created first so the model is valid; the update is then
    attempted against a name that does not exist and must raise ValueError.
    """
    endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())
    expected_error_message = (
        'Endpoint with name "{}" does not exist; '
        "please use an existing endpoint name".format(endpoint_name)
    )

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=mxnet_training_job
        )
        model_data = desc["ModelArtifacts"]["S3ModelArtifacts"]
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
        model = MXNetModel(
            model_data,
            "SageMakerRole",
            entry_point=script_path,
            py_version=PYTHON_VERSION,
            sagemaker_session=sagemaker_session,
            framework_version=mxnet_full_version,
        )
        model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
        sagemaker_session.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)

        # BUG FIX: pytest.raises(message=...) only customized the *failure* message
        # (and was removed in pytest 4.0); `match` actually checks the exception text.
        with pytest.raises(ValueError, match=expected_error_message):
            model.deploy(
                1, cpu_instance_type, update_endpoint=True, endpoint_name="non-existing-endpoint"
            )
def test_attach_deploy(chainer_training_job, sagemaker_session):
    """Re-attach to a finished Chainer training job and smoke-test its endpoint."""
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = Chainer.attach(chainer_training_job, sagemaker_session=sagemaker_session)
        _predict_and_assert(attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name))
def test_async_fit(sagemaker_session):
    """Kick off a Chainer training job without waiting, then re-attach and deploy it."""
    # The endpoint name previously said "attach-deploy" — a copy/paste from
    # test_attach_deploy that made endpoint listings misleading for this test.
    endpoint_name = 'test-chainer-async-fit-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        training_job_name = _run_mnist_training_job(
            sagemaker_session, "ml.c4.xlarge", 1,
            chainer_full_version=CHAINER_VERSION, wait=False)

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = Chainer.attach(training_job_name=training_job_name,
                                   sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
def test_train_py_version(docker_image, sagemaker_session, py_version, opt_ml, processor):
    """Train with a per-py-version hyperparameter dict and expect the success marker."""
    resource_path = 'test/resources/py_version/code'
    script_name = 'usermodule.py'

    uploaded = fw_utils.tar_and_upload_dir(session=sagemaker_session.boto_session,
                                           bucket=sagemaker_session.default_bucket(),
                                           s3_key_prefix=sagemaker_timestamp(),
                                           script=script_name,
                                           directory=resource_path)

    utils.create_config_files(script_name, uploaded.s3_prefix, opt_ml,
                              additional_hp=_py_version_dict(py_version))
    os.makedirs(os.path.join(opt_ml, 'model'))

    docker_utils.train(docker_image, opt_ml, processor)

    # The usermodule.py train_fn will assert on the expected python versions passed in
    # through hyperparameters, and training will fail if they are incorrect.
    success_file = 'output/success'
    assert os.path.exists(os.path.join(opt_ml, success_file)), \
        'expected file not found: {}'.format(success_file)
def record_set(self, train, labels=None, channel="train"):
    """Build a :class:`~RecordSet` from a numpy :class:`~ndarray` matrix and label vector.

    For the 2D ``ndarray`` ``train``, each row is converted to a :class:`~Record` object.
    The vector is stored in the "values" entry of the ``features`` property of each Record.
    If ``labels`` is not None, each corresponding label is assigned to the "values" entry
    of the ``labels`` property of each Record.

    The collection of ``Record`` objects are protobuf serialized and uploaded to new
    S3 locations. A manifest file is generated containing the list of objects created and
    also stored in S3.

    The number of S3 objects created is controlled by the ``train_instance_count`` property
    on this Estimator. One S3 object is created per training instance.

    Args:
        train (numpy.ndarray): A 2D numpy array of training data.
        labels (numpy.ndarray): A 1D numpy array of labels. Its length must be equal to the
           number of rows in ``train``.
        channel (str): The SageMaker TrainingJob channel this RecordSet should be assigned to.
    Returns:
        RecordSet: A RecordSet referencing the encoded, uploaded training and label data.
    """
    s3 = self.sagemaker_session.boto_session.resource('s3')
    parsed_s3_url = urlparse(self.data_location)
    bucket, key_prefix = parsed_s3_url.netloc, parsed_s3_url.path
    key_prefix = key_prefix + '{}-{}/'.format(type(self).__name__, sagemaker_timestamp())
    # S3 keys must not start with '/'.
    key_prefix = key_prefix.lstrip('/')
    # Lazy %-style args: the string is only built if DEBUG logging is enabled.
    logger.debug('Uploading to bucket %s and key_prefix %s', bucket, key_prefix)
    manifest_s3_file = upload_numpy_to_s3_shards(self.train_instance_count, s3, bucket,
                                                 key_prefix, train, labels)
    logger.debug('Created manifest file %s', manifest_s3_file)
    return RecordSet(manifest_s3_file, num_records=train.shape[0],
                     feature_dim=train.shape[1], channel=channel)
def test_marketplace_estimator(sagemaker_session):
    """Train the marketplace algorithm on iris data, deploy it, and run predictions."""
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'marketplace', 'training')

        algo = AlgorithmEstimator(
            algorithm_arn=(ALGORITHM_ARN % sagemaker_session.boto_region_name),
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session)

        train_input = algo.sagemaker_session.upload_data(
            path=data_path, key_prefix='integ-test-data/marketplace/train')
        algo.fit({'training': train_input})

    endpoint_name = 'test-marketplace-estimator{}'.format(sagemaker_timestamp())
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        predictor = algo.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)

        frame = pandas.read_csv(os.path.join(data_path, 'iris.csv'), header=None)
        # Ten rows (offsets 40..49) out of each of the three 50-row class blocks.
        row_indices = [block + offset
                       for block, offset in itertools.product(
                           [50 * i for i in range(3)], [40 + i for i in range(10)])]
        sample = frame.iloc[row_indices[:-1]]
        print(predictor.predict(sample.iloc[:, 1:].values).decode('utf-8'))
def test_deploy_model_with_accelerator(sagemaker_session, tf_training_job, ei_tf_full_version):
    """Deploy a trained TF model with an EI accelerator; dict and list inputs must agree."""
    endpoint_name = 'test-tf-deploy-model-ei-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        job_desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=tf_training_job)
        artifacts = job_desc['ModelArtifacts']['S3ModelArtifacts']

        model = TensorFlowModel(
            artifacts,
            'SageMakerRole',
            entry_point=os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py'),
            framework_version=ei_tf_full_version,
            sagemaker_session=sagemaker_session)
        json_predictor = model.deploy(initial_instance_count=1,
                                      instance_type='ml.c4.xlarge',
                                      endpoint_name=endpoint_name,
                                      accelerator_type='ml.eia1.medium')

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        # Both request shapes hit the same model and must produce the same answer.
        assert dict_result == list_result
def test_marketplace_model(sagemaker_session):
    """Deploy a marketplace ModelPackage with a CSV predictor and run inference."""
    def predict_wrapper(endpoint, session):
        return sagemaker.RealTimePredictor(
            endpoint, session, serializer=sagemaker.predictor.csv_serializer
        )

    model = ModelPackage(
        role='SageMakerRole',
        model_package_arn=(MODEL_PACKAGE_ARN % sagemaker_session.boto_region_name),
        sagemaker_session=sagemaker_session,
        predictor_cls=predict_wrapper)

    endpoint_name = 'test-marketplace-model-endpoint{}'.format(sagemaker_timestamp())
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)

        data_path = os.path.join(DATA_DIR, 'marketplace', 'training')
        frame = pandas.read_csv(os.path.join(data_path, 'iris.csv'), header=None)
        # Ten rows (offsets 40..49) from each of the three 50-row iris class blocks.
        row_ids = [base + offset
                   for base, offset in itertools.product(
                       [50 * i for i in range(3)], [40 + i for i in range(10)])]
        sample = frame.iloc[row_ids[:-1]]
        print(predictor.predict(sample.iloc[:, 1:].values).decode('utf-8'))
def test_tuning(sagemaker_session, ecr_image, instance_type):
    """Run a two-job hyperparameter tuning round over the MNIST MXNet script."""
    estimator = MXNet(entry_point=SCRIPT_PATH,
                      role='SageMakerRole',
                      train_instance_count=1,
                      train_instance_type=instance_type,
                      sagemaker_session=sagemaker_session,
                      image_name=ecr_image,
                      hyperparameters={'epochs': 1})

    tuner = HyperparameterTuner(
        estimator,
        'Validation-accuracy',
        {'learning-rate': ContinuousParameter(0.01, 0.2)},
        [{'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'}],
        max_jobs=2,
        max_parallel_jobs=2)

    with timeout(minutes=20):
        prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        channels = {
            'train': estimator.sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'train'), key_prefix=prefix + '/train'),
            'test': estimator.sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'test'), key_prefix=prefix + '/test'),
        }

        job_name = utils.unique_name_from_base('test-mxnet-image', max_length=32)
        tuner.fit(channels, job_name=job_name)
        tuner.wait()
def test_async_fit(
    sagemaker_session,
    cpu_instance_type,
    sklearn_latest_version,
    sklearn_latest_py_version,
):
    """Start an SKLearn job without blocking, then re-attach and deploy it."""
    endpoint_name = "test-sklearn-attach-deploy-{}".format(sagemaker_timestamp())

    with timeout(minutes=5):
        training_job_name = _run_mnist_training_job(
            sagemaker_session,
            cpu_instance_type,
            sklearn_version=sklearn_latest_version,
            wait=False,
        )
        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = SKLearn.attach(
            training_job_name=training_job_name, sagemaker_session=sagemaker_session
        )
        _predict_and_assert(attached.deploy(1, cpu_instance_type, endpoint_name=endpoint_name))
def test_async_fit(sagemaker_session):
    """Fire-and-forget an MXNet MNIST job, then re-attach, deploy, and predict."""
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        mx = MXNet(entry_point=script_path, role='SageMakerRole',
                   train_instance_count=1, train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session)

        inputs = {
            'train': mx.sagemaker_session.upload_data(
                path=os.path.join(data_path, 'train'),
                key_prefix='integ-test-data/mxnet_mnist/train'),
            'test': mx.sagemaker_session.upload_data(
                path=os.path.join(data_path, 'test'),
                key_prefix='integ-test-data/mxnet_mnist/test'),
        }
        mx.fit(inputs, wait=False)
        training_job_name = mx.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = MXNet.attach(training_job_name=training_job_name,
                                 sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.predict(numpy.zeros(shape=(1, 1, 28, 28)))
def test_attach_deploy(chainer_training_job, sagemaker_session):
    """Attach to a completed Chainer job and exercise the deployed endpoint."""
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached_estimator = Chainer.attach(
            chainer_training_job, sagemaker_session=sagemaker_session)
        deployed = attached_estimator.deploy(
            1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        _predict_and_assert(deployed)
def test_deploy_model_with_accelerator(
    mxnet_training_job,
    sagemaker_session,
    mxnet_eia_latest_version,
    mxnet_eia_latest_py_version,
    cpu_instance_type,
):
    """Deploy an MXNet model backed by an EI accelerator and run one inference."""
    endpoint_name = "test-mxnet-deploy-model-ei-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        job_desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=mxnet_training_job)

        model = MXNetModel(
            job_desc["ModelArtifacts"]["S3ModelArtifacts"],
            "SageMakerRole",
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist_ei.py"),
            framework_version=mxnet_eia_latest_version,
            py_version=mxnet_eia_latest_py_version,
            sagemaker_session=sagemaker_session,
        )

        predictor = model.deploy(
            1,
            cpu_instance_type,
            endpoint_name=endpoint_name,
            accelerator_type="ml.eia1.medium",
        )

        assert predictor.predict(numpy.zeros(shape=(1, 1, 28, 28))) is not None
def _test_training(ecr_image, sagemaker_session, instance_type, instance_count, framework_version):
    """Train the MNIST script on the given image, enabling the PS for multi-node runs."""
    hyperparameters = {'epochs': 1}
    if instance_count > 1:
        # Multi-node training needs the parameter server switched on.
        hyperparameters['sagemaker_parameter_server_enabled'] = True

    estimator = MXNet(entry_point=SCRIPT_PATH,
                      role='SageMakerRole',
                      instance_count=instance_count,
                      instance_type=instance_type,
                      sagemaker_session=sagemaker_session,
                      image_uri=ecr_image,
                      framework_version=framework_version,
                      hyperparameters=hyperparameters)
    estimator = _disable_sm_profiler(sagemaker_session.boto_region_name, estimator)

    with timeout(minutes=15):
        prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        train_input = sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, 'train'), key_prefix=prefix + '/train')
        test_input = sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, 'test'), key_prefix=prefix + '/test')

        job_name = utils.unique_name_from_base('test-mxnet-image')
        estimator.fit({'train': train_input, 'test': test_input}, job_name=job_name)
def test_coach_mxnet(sagemaker_session, coach_mxnet_latest_version, cpu_instance_type):
    """Train a Coach/MXNet RL estimator asynchronously, then attach, deploy, predict."""
    estimator = _test_coach(sagemaker_session, RLFramework.MXNET, coach_mxnet_latest_version,
                            cpu_instance_type)
    job_name = unique_name_from_base("test-coach-mxnet")

    with timeout(minutes=15):
        # BUG FIX: the original passed wait="False" — any non-empty string is truthy,
        # so fit() blocked until the job finished instead of returning immediately.
        estimator.fit(wait=False, job_name=job_name)

        estimator = RLEstimator.attach(estimator.latest_training_job.name,
                                       sagemaker_session=sagemaker_session)

    endpoint_name = "test-mxnet-coach-deploy-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(1, cpu_instance_type,
                                     entry_point="mxnet_deploy.py",
                                     endpoint_name=endpoint_name)
        observation = numpy.asarray([0, 0, 0, 0])
        action = predictor.predict(observation)

        # The policy emits two action probabilities, each strictly inside (0, 1).
        assert 0 < action[0][0] < 1
        assert 0 < action[0][1] < 1
def test_deploy_model(
    mxnet_training_job,
    sagemaker_session,
    mxnet_inference_latest_version,
    mxnet_inference_latest_py_version,
    cpu_instance_type,
):
    """Deploy a trained MXNet model, run inference, then verify model deletion."""
    endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=mxnet_training_job)
        model_data = desc["ModelArtifacts"]["S3ModelArtifacts"]
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
        model = MXNetModel(
            model_data,
            "SageMakerRole",
            entry_point=script_path,
            py_version=mxnet_inference_latest_py_version,
            sagemaker_session=sagemaker_session,
            framework_version=mxnet_inference_latest_version,
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

        data = numpy.zeros(shape=(1, 1, 28, 28))
        result = predictor.predict(data)
        assert result is not None

        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
        # Keep the assertion OUTSIDE the raises block: statements after the raising
        # call inside `pytest.raises` never execute, silently skipping the check.
        assert "Could not find model" in str(exception.value)
def test_linear_regression(docker_image, sagemaker_session, opt_ml, processor):
    """End-to-end linear-regression container training; checks model artifacts exist."""
    resource_path = 'test/resources/linear_regression'

    # create training data
    train_data = np.random.uniform(0, 1, [1000, 2])
    train_label = np.array([train_data[i][0] + 2 * train_data[i][1] for i in range(1000)])

    # eval data... repeat so there's enough to cover multicpu/gpu contexts
    eval_data = np.array([[7, 2], [6, 10], [12, 2]]).repeat(32, 0)
    eval_label = np.array([11, 26, 16]).repeat(32, 0)

    # save training data
    for path in ['training', 'evaluation']:
        os.makedirs(os.path.join(opt_ml, 'input', 'data', path))
    for rel_path, payload in (('input/data/training/train_data.txt', train_data),
                              ('input/data/training/train_label.txt', train_label),
                              ('input/data/evaluation/eval_data.txt', eval_data),
                              ('input/data/evaluation/eval_label.txt', eval_label)):
        np.savetxt(os.path.join(opt_ml, rel_path), payload)

    s3_source_archive = fw_utils.tar_and_upload_dir(
        session=sagemaker_session.boto_session,
        bucket=sagemaker_session.default_bucket(),
        s3_key_prefix=sagemaker_timestamp(),
        script='linear_regression.py',
        directory=resource_path)

    utils.create_config_files('linear_regression.py', s3_source_archive.s3_prefix, opt_ml)
    os.makedirs(os.path.join(opt_ml, 'model'))

    docker_utils.train(docker_image, opt_ml, processor)

    for f in ['output/success', 'model/model-symbol.json', 'model/model-0000.params',
              'model/model-shapes.json']:
        assert os.path.exists(os.path.join(opt_ml, f)), 'expected file not found: {}'.format(f)
def create_docker_services(command, tmpdir, hosts, image, additional_volumes,
                           additional_env_vars, customer_script, source_dir, entrypoint):
    """Build per-host docker service definitions for a 'train' or 'serve' run.

    For 'serve' with a customer script, the script is tarred and uploaded to S3 and
    the serving environment variables point the container at it.
    """
    environment = []
    session = boto3.Session()
    optml_dirs = set()

    if command == 'train':
        optml_dirs = {'output', 'input'}
    elif command == 'serve':
        environment.extend(DEFAULT_HOSTING_ENV)
        if customer_script:
            timestamp = utils.sagemaker_timestamp()
            s3_script_path = fw_utils.tar_and_upload_dir(
                session=session,
                bucket=default_bucket(session),
                s3_key_prefix='test-{}'.format(timestamp),
                script=customer_script,
                directory=source_dir)[0]
            environment.extend([
                'SAGEMAKER_PROGRAM={}'.format(os.path.basename(customer_script)),
                'SAGEMAKER_SUBMIT_DIRECTORY={}'.format(s3_script_path)
            ])
    else:
        raise ValueError('Unexpected command: {}'.format(command))

    environment.extend(credentials_to_env(session))
    environment.extend(additional_env_vars)

    return {host: create_docker_host(tmpdir, host, image, environment, optml_dirs,
                                     command, additional_volumes, entrypoint)
            for host in hosts}
def test_deploy_model(mxnet_training_job, sagemaker_session, mxnet_full_version):
    """Deploy a trained MXNet model, run inference, then verify model deletion."""
    endpoint_name = 'test-mxnet-deploy-model-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=mxnet_training_job)
        model_data = desc['ModelArtifacts']['S3ModelArtifacts']
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        model = MXNetModel(model_data, 'SageMakerRole', entry_point=script_path,
                           py_version=PYTHON_VERSION, sagemaker_session=sagemaker_session,
                           framework_version=mxnet_full_version)
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)

        data = numpy.zeros(shape=(1, 1, 28, 28))
        result = predictor.predict(data)
        assert result is not None

        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
        # Keep the assertion OUTSIDE the raises block: statements after the raising
        # call inside `pytest.raises` never execute, silently skipping the check.
        assert 'Could not find model' in str(exception.value)
def test_deploy_model_with_update_non_existing_endpoint(
        mxnet_training_job, sagemaker_session, mxnet_full_version):
    """Updating a non-existing endpoint via update_endpoint=True must raise ValueError."""
    endpoint_name = 'test-mxnet-deploy-model-{}'.format(sagemaker_timestamp())
    expected_error_message = 'Endpoint with name "{}" does not exist; ' \
                             'please use an existing endpoint name'.format(endpoint_name)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=mxnet_training_job)
        model_data = desc['ModelArtifacts']['S3ModelArtifacts']
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        model = MXNetModel(model_data, 'SageMakerRole', entry_point=script_path,
                           py_version=PYTHON_VERSION, sagemaker_session=sagemaker_session,
                           framework_version=mxnet_full_version)
        model.deploy(1, 'ml.t2.medium', endpoint_name=endpoint_name)
        sagemaker_session.describe_endpoint(EndpointName=endpoint_name)

        # BUG FIX: pytest.raises(message=...) only customized the *failure* message
        # (and was removed in pytest 4.0); `match` actually checks the exception text.
        with pytest.raises(ValueError, match=expected_error_message):
            model.deploy(1, 'ml.m4.xlarge', update_endpoint=True,
                         endpoint_name='non-existing-endpoint')
def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
    """Async-fit a PyTorch estimator on GPU, then re-attach and deploy it."""
    training_job_name = ""
    # TODO: add tests against local mode when it's ready to be used
    instance_type = 'ml.p2.xlarge'

    with timeout(minutes=10):
        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)
        pytorch.fit({'training': _upload_training_data(pytorch)}, wait=False)
        training_job_name = pytorch.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    if not _is_local_mode(instance_type):
        endpoint_name = 'test-pytorch-async-fit-attach-deploy-{}'.format(sagemaker_timestamp())
        with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
            print("Re-attaching now to: %s" % training_job_name)
            estimator = PyTorch.attach(training_job_name=training_job_name,
                                       sagemaker_session=sagemaker_session)
            predictor = estimator.deploy(1, instance_type, endpoint_name=endpoint_name)

            batch_size = 100
            batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
            assert predictor.predict(batch).shape == (batch_size, 10)
def test_inference_pipeline_batch_transform(sagemaker_session):
    """Chain SparkML + XGBoost models into a pipeline and batch-transform CSV input."""
    sparkml_model_data = sagemaker_session.upload_data(
        path=os.path.join(SPARKML_DATA_PATH, 'mleap_model.tar.gz'),
        key_prefix='integ-test-data/sparkml/model')
    xgb_model_data = sagemaker_session.upload_data(
        path=os.path.join(XGBOOST_DATA_PATH, 'xgb_model.tar.gz'),
        key_prefix='integ-test-data/xgboost/model')

    batch_job_name = 'test-inference-pipeline-batch-{}'.format(sagemaker_timestamp())

    stages = [
        SparkMLModel(model_data=sparkml_model_data,
                     env={'SAGEMAKER_SPARKML_SCHEMA': SCHEMA},
                     sagemaker_session=sagemaker_session),
        Model(model_data=xgb_model_data,
              image=get_image_uri(sagemaker_session.boto_region_name, 'xgboost'),
              sagemaker_session=sagemaker_session),
    ]
    pipeline = PipelineModel(models=stages,
                             role='SageMakerRole',
                             sagemaker_session=sagemaker_session,
                             name=batch_job_name)
    transformer = pipeline.transformer(1, 'ml.m4.xlarge')

    transform_input = transformer.sagemaker_session.upload_data(
        path=VALID_DATA_PATH, key_prefix='integ-test-data/sparkml_xgboost/transform')

    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.transform(transform_input, content_type=CONTENT_TYPE_CSV,
                              job_name=batch_job_name)
        transformer.wait()
def test_deploy_model_with_update_endpoint(mxnet_training_job, sagemaker_session):
    """Redeploy with update_endpoint=True and verify the endpoint config changed."""
    endpoint_name = 'test-mxnet-deploy-model-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=mxnet_training_job)
        model_data = desc['ModelArtifacts']['S3ModelArtifacts']
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        model = MXNetModel(model_data, 'SageMakerRole', entry_point=script_path,
                           py_version=PYTHON_VERSION, sagemaker_session=sagemaker_session)
        model.deploy(1, 'ml.t2.medium', endpoint_name=endpoint_name)
        old_endpoint = sagemaker_session.describe_endpoint(EndpointName=endpoint_name)
        old_config_name = old_endpoint['EndpointConfigName']

        model.deploy(1, 'ml.m4.xlarge', update_endpoint=True, endpoint_name=endpoint_name)

        # BUG FIX: the original subscripted the describe_endpoint() response with
        # ['ProductionVariants'] and then subscripted the resulting list with
        # ['ProductionVariants'] and ['EndpointConfigName'] again, which raises.
        new_endpoint = sagemaker_session.describe_endpoint(EndpointName=endpoint_name)
        new_production_variants = new_endpoint['ProductionVariants']
        new_config_name = new_endpoint['EndpointConfigName']

        assert old_config_name != new_config_name
        # ProductionVariants is a list; this endpoint has a single variant at index 0.
        assert new_production_variants[0]['InstanceType'] == 'ml.m4.xlarge'
        assert new_production_variants[0]['InitialInstanceCount'] == 1
        assert new_production_variants[0].get('AcceleratorType') is None
def test_deploy_model(mxnet_training_job, sagemaker_session):
    """Compile a trained MXNet model with Neo for ml_m4, deploy it, and predict."""
    endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        job_desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=mxnet_training_job
        )
        model_data = job_desc["ModelArtifacts"]["S3ModelArtifacts"]
        role = "SageMakerRole"

        model = MXNetModel(
            model_data,
            role,
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist_neo.py"),
            py_version=PYTHON_VERSION,
            sagemaker_session=sagemaker_session,
        )

        # Compilation output is written next to the original model artifacts.
        model.compile(
            target_instance_family="ml_m4",
            input_shape={"data": [1, 1, 28, 28]},
            role=role,
            job_name="test-deploy-model-compilation-job-{}".format(int(time.time())),
            output_path="/".join(model_data.split("/")[:-1]),
        )

        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
        predictor.content_type = "application/vnd+python.numpy+binary"
        predictor.predict(numpy.zeros(shape=(1, 1, 28, 28)))
def test_byo_airflow_config_uploads_data_source_to_s3_when_inputs_provided(
        sagemaker_session, cpu_instance_type):
    """Building an airflow training config should upload the provided inputs to S3."""
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
        data_source_location = "test-airflow-config-{}".format(sagemaker_timestamp())
        inputs = sagemaker_session.upload_data(
            path=training_data_path,
            key_prefix=os.path.join(data_source_location, "train"))

        estimator = Estimator(
            image_name=get_image_uri(
                sagemaker_session.boto_session.region_name, "factorization-machines"),
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        training_config = _build_airflow_workflow(
            estimator=estimator, instance_type=cpu_instance_type, inputs=inputs)

        s3_uri = training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"]
        _assert_that_s3_url_contains_data(sagemaker_session, s3_uri)
def test_async_fit(sagemaker_session):
    """Launch MNIST training without waiting, then attach to it and deploy."""
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
        estimator = MXNet(entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py'),
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.c4.xlarge',
                          sagemaker_session=sagemaker_session)

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/mxnet_mnist/test')

        estimator.fit({'train': train_input, 'test': test_input}, wait=False)
        training_job_name = estimator.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = MXNet.attach(training_job_name=training_job_name,
                                sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.predict(numpy.zeros(shape=(1, 1, 28, 28)))
def test_deploy_model_with_accelerator(
    sagemaker_session,
    cpu_instance_type,
    pytorch_eia_latest_version,
    pytorch_eia_latest_py_version,
):
    """Deploy a pre-trained PyTorch EIA model with an accelerator; check output shape."""
    endpoint_name = "test-pytorch-deploy-eia-{}".format(sagemaker_timestamp())

    model = PyTorchModel(
        sagemaker_session.upload_data(path=EIA_MODEL),
        "SageMakerRole",
        entry_point=EIA_SCRIPT,
        framework_version=pytorch_eia_latest_version,
        py_version=pytorch_eia_latest_py_version,
        sagemaker_session=sagemaker_session,
    )

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = model.deploy(
            initial_instance_count=1,
            instance_type=cpu_instance_type,
            accelerator_type="ml.eia1.medium",
            endpoint_name=endpoint_name,
        )

        batch_size = 100
        batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        assert predictor.predict(batch).shape == (batch_size, 10)
def test_training(sagemaker_session, ecr_image, instance_type, instance_count, framework_version):
    """Run the Gluon debugger-hook MNIST script on the given image and instances."""
    estimator = MXNet(entry_point=SCRIPT_PATH,
                      role='SageMakerRole',
                      instance_count=instance_count,
                      instance_type=instance_type,
                      sagemaker_session=sagemaker_session,
                      image_uri=ecr_image,
                      framework_version=framework_version,
                      hyperparameters={
                          'random_seed': True,
                          'num_steps': 50,
                          'smdebug_path': '/tmp/ml/output/tensors',
                          'epochs': 1,
                      })

    with timeout(minutes=15):
        prefix = 'mxnet_mnist_gluon_basic_hook_demo/{}'.format(utils.sagemaker_timestamp())
        channels = {
            'train': estimator.sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'train'), key_prefix=prefix + '/train'),
            'test': estimator.sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'test'), key_prefix=prefix + '/test'),
        }
        estimator.fit(channels, job_name=utils.unique_name_from_base('test-mxnet-image'))
def test_deploy_model(
    pytorch_training_job,
    sagemaker_session,
    cpu_instance_type,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
):
    """Deploy the artifacts of a finished PyTorch job and validate the output shape."""
    endpoint_name = "test-pytorch-deploy-model-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        job_desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=pytorch_training_job)

        model = PyTorchModel(
            job_desc["ModelArtifacts"]["S3ModelArtifacts"],
            "SageMakerRole",
            entry_point=MNIST_SCRIPT,
            framework_version=pytorch_inference_latest_version,
            py_version=pytorch_inference_latest_py_version,
            sagemaker_session=sagemaker_session,
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

        batch_size = 100
        batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        assert predictor.predict(batch).shape == (batch_size, 10)
def test_deploy_packed_model_with_entry_point_name(
    sagemaker_session,
    cpu_instance_type,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
):
    """Deploy a pre-packed model archive, referencing the entry point by bare name."""
    endpoint_name = "test-pytorch-deploy-model-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = PyTorchModel(
            sagemaker_session.upload_data(path=PACKED_MODEL),
            "SageMakerRole",
            entry_point="mnist.py",  # bare file name, not a local path
            framework_version=pytorch_inference_latest_version,
            py_version=pytorch_inference_latest_py_version,
            sagemaker_session=sagemaker_session,
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

        batch_size = 100
        batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        assert predictor.predict(batch).shape == (batch_size, 10)
def _upload_data_to_s3(
    self,
    data,
    input_path=None,
):
    """Upload request data to Amazon S3 for users"""
    if input_path:
        bucket, key = parse_s3_url(input_path)
    else:
        # No explicit destination: build a collision-resistant key under the
        # session's default bucket from the endpoint name, a timestamp, and a UUID.
        bucket = self.sagemaker_session.default_bucket()
        key = "async-endpoint-inputs/{}/{}-{}".format(
            name_from_base(self.name, short=True),
            sagemaker_timestamp(),
            str(uuid.uuid4()),
        )

    payload = self.serializer.serialize(data)
    self.s3_client.put_object(
        Body=payload, Bucket=bucket, Key=key, ContentType=self.serializer.CONTENT_TYPE)

    return input_path or "s3://{}/{}".format(self.sagemaker_session.default_bucket(), key)
def test_marketplace_model(sagemaker_session, cpu_instance_type):
    """Deploy a marketplace ModelPackage and predict on sampled iris rows."""
    region = sagemaker_session.boto_region_name
    model_package_arn = MODEL_PACKAGE_ARN % (region, REGION_ACCOUNT_MAP[region])

    def predict_wrapper(endpoint, session):
        return sagemaker.RealTimePredictor(
            endpoint, session, serializer=sagemaker.predictor.csv_serializer)

    model = ModelPackage(
        role="SageMakerRole",
        model_package_arn=model_package_arn,
        sagemaker_session=sagemaker_session,
        predictor_cls=predict_wrapper,
    )

    endpoint_name = "test-marketplace-model-endpoint{}".format(sagemaker_timestamp())
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

        data_path = os.path.join(DATA_DIR, "marketplace", "training")
        frame = pandas.read_csv(os.path.join(data_path, "iris.csv"), header=None)
        # Ten rows (offsets 40..49) out of each of the three 50-row class blocks.
        indices = [base + offset
                   for base, offset in itertools.product(
                       [50 * i for i in range(3)], [40 + i for i in range(10)])]
        sample = frame.iloc[indices[:-1]]
        print(predictor.predict(sample.iloc[:, 1:].values).decode("utf-8"))
def test_sparkml_model_deploy(sagemaker_session, cpu_instance_type):
    """Upload an MLeap-serialized model to S3 and serve it through a SparkML endpoint.

    Valid CSV input must come back transformed; input with columns in the wrong
    order must be rejected (predict returns None).
    """
    data_path = os.path.join(DATA_DIR, "sparkml_model")
    endpoint_name = "test-sparkml-deploy-{}".format(sagemaker_timestamp())
    model_data = sagemaker_session.upload_data(
        path=os.path.join(data_path, "mleap_model.tar.gz"),
        key_prefix="integ-test-data/sparkml/model",
    )
    # Input/output schema the SparkML serving container validates requests against.
    schema = json.dumps(
        {
            "input": [
                {"name": "Pclass", "type": "float"},
                {"name": "Embarked", "type": "string"},
                {"name": "Age", "type": "float"},
                {"name": "Fare", "type": "float"},
                {"name": "SibSp", "type": "float"},
                {"name": "Sex", "type": "string"},
            ],
            "output": {"name": "features", "struct": "vector", "type": "double"},
        }
    )

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        sparkml_model = SparkMLModel(
            model_data=model_data,
            role="SageMakerRole",
            sagemaker_session=sagemaker_session,
            env={"SAGEMAKER_SPARKML_SCHEMA": schema},
        )
        predictor = sparkml_model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

        valid_data = "1.0,C,38.0,71.5,1.0,female"
        assert predictor.predict(valid_data) == "1.0,0.0,38.0,1.0,71.5,0.0,1.0"

        invalid_data = "1.0,28.0,C,38.0,71.5,1.0"
        assert predictor.predict(invalid_data) is None
def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
    """Start PyTorch training without waiting, then re-attach, deploy, and predict."""
    training_job_name = ""
    # TODO: add tests against local mode when it's ready to be used
    instance_type = 'ml.p2.xlarge'

    with timeout(minutes=10):
        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)
        pytorch.fit({'training': _upload_training_data(pytorch)}, wait=False)
        training_job_name = pytorch.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    if not _is_local_mode(instance_type):
        endpoint_name = 'test-pytorch-async-fit-attach-deploy-{}'.format(sagemaker_timestamp())
        with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
            print("Re-attaching now to: %s" % training_job_name)
            attached = PyTorch.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
            predictor = attached.deploy(1, instance_type, endpoint_name=endpoint_name)

            batch_size = 100
            sample_batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
            predictions = predictor.predict(sample_batch)
            assert predictions.shape == (batch_size, 10)
def test_attach_deploy(mxnet_training_job, sagemaker_session):
    """Re-attach to a finished MXNet training job, deploy it, and run one prediction."""
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        # Single blank MNIST-shaped image is enough to smoke-test the endpoint.
        blank_image = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(blank_image)
def test_deploy_model(chainer_training_job, sagemaker_session):
    """Build a ChainerModel from a completed job's artifacts and deploy it."""
    endpoint_name = 'test-chainer-deploy-model-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        job_desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=chainer_training_job)
        artifacts = job_desc['ModelArtifacts']['S3ModelArtifacts']
        entry_point = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        model = ChainerModel(artifacts, 'SageMakerRole', entry_point=entry_point,
                             sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
def test_deploy_model(mxnet_training_job, sagemaker_session):
    """Build an MXNetModel from a completed job's artifacts, deploy, and predict."""
    endpoint_name = 'test-mxnet-deploy-model-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        job_desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=mxnet_training_job)
        artifacts = job_desc['ModelArtifacts']['S3ModelArtifacts']
        entry_point = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        model = MXNetModel(artifacts, 'SageMakerRole', entry_point=entry_point,
                           sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        # Single blank MNIST-shaped image is enough to smoke-test the endpoint.
        blank_image = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(blank_image)
def test_sync_fit_deploy(pytorch_training_job, sagemaker_session):
    """Attach to a completed PyTorch training job, deploy it, and validate predictions.

    Fix: the original wrapped deployment in a bare ``timeout(minutes=20)``
    context, so the endpoint created under ``endpoint_name`` was never deleted
    when the test finished or failed. Use ``timeout_and_delete_endpoint_by_name``
    (as every sibling test in this file does) so the endpoint is always cleaned
    up, preserving the same 20-minute budget.
    """
    # TODO: add tests against local mode when it's ready to be used
    endpoint_name = 'test-pytorch-sync-fit-attach-deploy{}'.format(sagemaker_timestamp())
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        estimator = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        # Smoke-check a single zero image first...
        data = numpy.zeros(shape=(1, 1, 28, 28), dtype=numpy.float32)
        predictor.predict(data)

        # ...then verify the output shape on a full random batch.
        batch_size = 100
        data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(data)
        assert output.shape == (batch_size, 10)
def test_deploy_model(pytorch_training_job, sagemaker_session):
    """Build a PyTorchModel from a completed job's artifacts, deploy, and predict."""
    endpoint_name = 'test-pytorch-deploy-model-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        job_desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=pytorch_training_job)
        artifacts = job_desc['ModelArtifacts']['S3ModelArtifacts']
        model = PyTorchModel(artifacts, 'SageMakerRole', entry_point=MNIST_SCRIPT,
                             sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)

        batch_size = 100
        sample_batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        predictions = predictor.predict(sample_batch)
        assert predictions.shape == (batch_size, 10)
def test_async_fit(sagemaker_session):
    """Start Chainer MNIST training without waiting, then re-attach, deploy, and predict."""
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        training_job_name = _run_mnist_training_job(
            sagemaker_session, "ml.c4.xlarge", 1,
            chainer_full_version=CHAINER_VERSION, wait=False)

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = Chainer.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
def prepare_record_set_from_local_files(dir_path, destination, num_records, feature_dim, sagemaker_session):
    """Build a :class:`~RecordSet` by pointing to local files.

    Fix: corrected the duplicated word ("to to be used") in the Returns
    section of the original docstring; behavior is unchanged.

    Args:
        dir_path (string): Path to local directory from where the files shall be uploaded.
        destination (string): S3 path to upload the file to.
        num_records (int): Number of records in all the files.
        feature_dim (int): Number of features in the data set.
        sagemaker_session (sagemaker.session.Session): Session object to manage
            interactions with Amazon SageMaker APIs.

    Returns:
        RecordSet: A RecordSet specified by S3Prefix to be used in training.
    """
    # Derive a unique S3 key prefix under the destination path; lstrip('/')
    # because upload_data expects a prefix without a leading slash.
    key_prefix = urlparse(destination).path
    key_prefix = key_prefix + '{}-{}'.format("testfiles", sagemaker_timestamp())
    key_prefix = key_prefix.lstrip('/')
    uploaded_location = sagemaker_session.upload_data(path=dir_path, key_prefix=key_prefix)
    return RecordSet(uploaded_location, num_records, feature_dim, s3_data_type='S3Prefix')
def test_deploy_elastic_inference_with_pretrained_model(pretrained_model_data, docker_image_uri,
                                                        sagemaker_session, instance_type,
                                                        accelerator_type):
    """Deploy a pretrained TensorFlow model with an Elastic Inference accelerator attached."""
    resource_path = os.path.join(SCRIPT_PATH, '../resources')
    endpoint_name = 'test-tf-ei-deploy-model-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name=endpoint_name,
                                             sagemaker_session=sagemaker_session,
                                             minutes=20):
        model = TensorFlowModel(model_data=pretrained_model_data,
                                entry_point='default_entry_point.py',
                                source_dir=resource_path,
                                role='SageMakerRole',
                                image=docker_image_uri,
                                sagemaker_session=sagemaker_session)

        logger.info('deploying model to endpoint: {}'.format(endpoint_name))
        predictor = model.deploy(initial_instance_count=1,
                                 instance_type=instance_type,
                                 accelerator_type=accelerator_type,
                                 endpoint_name=endpoint_name)

        sample_input = np.random.rand(1, 1, 3, 3)
        response = predictor.predict({'input': sample_input.tolist()})
        assert response['outputs']['probabilities']
def test_async_linear_learner(sagemaker_session):
    """Train LinearLearner asynchronously, re-attach, deploy a model, and predict."""
    training_job_name = ""
    endpoint_name = 'test-linear-learner-async-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # Relabel the first 200 examples as a binary classification problem.
        train_set[1][:100] = 1
        train_set[1][100:200] = 0
        train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

        learner = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge',
                                base_job_name='test-linear-learner',
                                predictor_type='binary_classifier',
                                sagemaker_session=sagemaker_session)

        # Exercise the full set of tunable hyperparameters; values match the
        # original test exactly, assigned via setattr for brevity.
        hyperparameters = {
            'binary_classifier_model_selection_criteria': 'accuracy',
            'target_recall': 0.5,
            'target_precision': 0.5,
            'positive_example_weight_mult': 0.1,
            'epochs': 1,
            'use_bias': True,
            'num_models': 1,
            'num_calibration_samples': 1,
            'init_method': 'uniform',
            'init_scale': 0.5,
            'init_sigma': 0.2,
            'init_bias': 5,
            'optimizer': 'adam',
            'loss': 'logistic',
            'wd': 0.5,
            'l1': 0.5,
            'momentum': 0.5,
            'learning_rate': 0.1,
            'beta_1': 0.1,
            'beta_2': 0.1,
            'use_lr_scheduler': True,
            'lr_scheduler_step': 2,
            'lr_scheduler_factor': 0.5,
            'lr_scheduler_minimum_lr': 0.1,
            'normalize_data': False,
            'normalize_label': False,
            'unbias_data': True,
            'unbias_label': False,
            'num_point_for_scaler': 10000,
            'margin': 1.0,
            'quantile': 0.5,
            'loss_insensitivity': 0.1,
            'huber_delta': 0.1,
            'early_stopping_tolerance': 0.0001,
            'early_stopping_patience': 3,
        }
        for attribute, value in hyperparameters.items():
            setattr(learner, attribute, value)

        learner.fit(learner.record_set(train_set[0][:200], train_set[1][:200]), wait=False)
        training_job_name = learner.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = LinearLearner.attach(training_job_name=training_job_name,
                                         sagemaker_session=sagemaker_session)
        model = LinearLearnerModel(estimator.model_data, role='SageMakerRole',
                                   sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        result = predictor.predict(train_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None