def mxnet_training_job(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Run an MXNet MNIST training job whose source directory lives in S3.

    The packaged ``sourcedir.tar.gz`` is uploaded first and the returned
    location is passed to the estimator as ``source_dir``. The train and
    test folders are then uploaded and used as the two ``fit`` channels.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        key_root = "integ-test-data/mxnet_mnist"
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        # The source tarball has to exist in S3 before the estimator is built.
        code_location = sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "sourcedir.tar.gz"),
            key_prefix=key_root + "/src",
        )

        estimator = MXNet(
            entry_point="mxnet_mnist/mnist.py",
            source_dir=code_location,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        # Upload "train" first, then "test", keyed by channel name.
        channels = {}
        for channel in ("train", "test"):
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix=key_root + "/" + channel,
            )

        estimator.fit(channels)
        return estimator.latest_training_job.name
def test_mxnet_distributed(sagemaker_session, ecr_image, instance_type, framework_version):
    """Train MNIST on two instances with the parameter server enabled, then deploy.

    After training, the estimator is deployed to a single-instance endpoint
    and a zero-filled ``(1, 1, 28, 28)`` array is sent for prediction.
    """
    mnist_dir = os.path.join(RESOURCE_PATH, 'mnist')

    estimator = MXNet(
        entry_point=os.path.join(mnist_dir, 'mnist.py'),
        role='SageMakerRole',
        train_instance_count=2,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        framework_version=framework_version,
        hyperparameters={'sagemaker_parameter_server_enabled': True},
    )

    # Timestamped prefix for this run's uploads.
    key_root = 'mxnet_mnist/{}'.format(sagemaker_timestamp())

    with timeout(minutes=15):
        channels = {}
        for channel in ('train', 'test'):
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix=key_root + '/' + channel,
            )
        estimator.fit(channels)

    with timeout_and_delete_endpoint(estimator=estimator, minutes=30):
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type=instance_type,
        )
        payload = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(payload)
def mxnet_training_job(
    sagemaker_session,
    cpu_instance_type,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
):
    """Train MNIST with the ``mnist_neo.py`` entry point and return the job name.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_neo.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        session = estimator.sagemaker_session
        train_location = session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_location = session.upload_data(
            path=os.path.join(mnist_dir, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        estimator.fit({"train": train_location, "test": test_location})
        return estimator.latest_training_job.name
def test_async_fit(sagemaker_session):
    """Start training without waiting, re-attach to the job, then deploy and predict."""
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        entry_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        trainer = MXNet(
            entry_point=entry_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        channels = {}
        for channel in ('train', 'test'):
            channels[channel] = trainer.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix='integ-test-data/mxnet_mnist/' + channel,
            )

        # wait=False: fit is called without waiting, and the job is re-attached below.
        trainer.fit(channels, wait=False)
        training_job_name = trainer.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = MXNet.attach(
            training_job_name=training_job_name,
            sagemaker_session=sagemaker_session,
        )
        predictor = attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.predict(numpy.zeros(shape=(1, 1, 28, 28)))
def test_mxnet_with_debugger_hook_config(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Train with an explicit ``DebuggerHookConfig`` and verify it is reported back.

    The hook writes tensors under a unique prefix in the session's default
    bucket. After training, the described job's ``DebugHookConfig`` must
    equal the request dict of the config that was passed in, and
    ``_wait_and_assert_that_no_rule_jobs_errored`` is run on the job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Build the S3 URI with explicit "/" separators. os.path.join uses the
        # local OS separator, which is wrong for URIs on non-POSIX platforms.
        tensors_uri = "s3://{}/{}/tensors".format(
            sagemaker_session.default_bucket(), str(uuid.uuid4())
        )
        debugger_hook_config = DebuggerHookConfig(s3_output_path=tensors_uri)

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
def test_mxnet_with_rules_and_debugger_hook_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train with three built-in rules plus an explicit debugger hook config.

    After training, each entry of ``DebugRuleConfigurations`` is compared to
    the rule at the same index (name, evaluator image, volume size of 0 and
    ``rule_to_invoke``). The hook config and rule evaluation statuses are
    compared with what the estimator reports, and
    ``_wait_and_assert_that_no_rule_jobs_errored`` is run on the job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(),
                rule_parameters={"tensor_regex": ".*"},
            ),
            Rule.sagemaker(rule_configs.loss_not_decreasing()),
        ]

        # Build the S3 URI with explicit "/" separators. os.path.join uses the
        # local OS separator, which is wrong for URIs on non-POSIX platforms.
        tensors_uri = "s3://{}/{}/tensors".format(
            sagemaker_session.default_bucket(), str(uuid.uuid4())
        )
        debugger_hook_config = DebuggerHookConfig(s3_output_path=tensors_uri)

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()

        # Index-based lookup compares each described entry to the rule at the same index.
        for index, rule in enumerate(rules):
            described_rule = job_description["DebugRuleConfigurations"][index]
            assert described_rule["RuleConfigurationName"] == rule.name
            assert described_rule["RuleEvaluatorImage"] == rule.image_uri
            assert described_rule["VolumeSizeInGB"] == 0
            assert (
                described_rule["RuleParameters"]["rule_to_invoke"]
                == rule.rule_parameters["rule_to_invoke"]
            )

        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert (
            job_description["DebugRuleEvaluationStatuses"]
            == mx.latest_training_job.rule_job_summary()
        )

        _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
def _test_training(ecr_image, sagemaker_session, instance_type, instance_count,
                   framework_version):
    """Fit an MXNet estimator on uploaded MNIST data with fixed hyperparameters.

    The hyperparameters are ``random_seed``, ``num_steps``, ``smdebug_path``
    and ``epochs``. Data is uploaded under a timestamped prefix and the
    training job gets a unique name derived from ``test-mxnet-image``.
    """
    hyperparameters = dict(
        random_seed=True,
        num_steps=50,
        smdebug_path='/tmp/ml/output/tensors',
        epochs=1,
    )

    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters=hyperparameters,
    )

    with timeout(minutes=15):
        key_root = 'mxnet_mnist_gluon_basic_hook_demo/{}'.format(
            utils.sagemaker_timestamp())

        channels = {}
        for channel in ('train', 'test'):
            channels[channel] = sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, channel),
                key_prefix=key_root + '/' + channel,
            )

        training_job_name = utils.unique_name_from_base('test-mxnet-image')
        estimator.fit(channels, job_name=training_job_name)
def _test_training(ecr_image, sagemaker_session, instance_type, instance_count,
                   framework_version):
    """Fit an MXNet estimator on uploaded MNIST data for one epoch.

    ``sagemaker_parameter_server_enabled`` is set only when more than one
    instance is requested. The estimator is passed through
    ``_disable_sm_profiler`` before fitting.
    """
    hyperparameters = {}
    if instance_count > 1:
        hyperparameters['sagemaker_parameter_server_enabled'] = True
    hyperparameters['epochs'] = 1

    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters=hyperparameters,
    )
    estimator = _disable_sm_profiler(sagemaker_session.boto_region_name, estimator)

    with timeout(minutes=15):
        key_root = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())

        train_location = sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, 'train'),
            key_prefix=key_root + '/train',
        )
        test_location = sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, 'test'),
            key_prefix=key_root + '/test',
        )

        training_job_name = utils.unique_name_from_base('test-mxnet-image')
        estimator.fit(
            {'train': train_location, 'test': test_location},
            job_name=training_job_name,
        )
def test_async_fit(sagemaker_session, mxnet_full_version):
    """Start parameter-server training without waiting, re-attach, deploy and predict."""
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        entry_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        trainer = MXNet(
            entry_point=entry_script,
            role='SageMakerRole',
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            framework_version=mxnet_full_version,
            distributions={'parameter_server': {'enabled': True}},
        )

        session = trainer.sagemaker_session
        train_location = session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train',
        )
        test_location = session.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/mxnet_mnist/test',
        )

        # wait=False: fit is called without waiting, and the job is re-attached below.
        trainer.fit({'train': train_location, 'test': test_location}, wait=False)
        training_job_name = trainer.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = MXNet.attach(
            training_job_name=training_job_name,
            sagemaker_session=sagemaker_session,
        )
        predictor = attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.predict(numpy.zeros(shape=(1, 1, 28, 28)))
def test_mxnet_with_debugger_hook_config_disabled(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train with ``debugger_hook_config=False`` and expect no hook config on the job."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=False,
        )

        channels = {}
        for channel in ("train", "test"):
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix="integ-test-data/mxnet_mnist/" + channel,
            )

        estimator.fit(channels)

        described_job = estimator.latest_training_job.describe()
        assert described_job.get("DebugHookConfig") is None
def test_git_support_codecommit_with_mxnet(sagemaker_local_session):
    """Train and host an MXNet model in local mode from a CodeCommit repository.

    After ``fit`` the checked-out source directory must contain ``some_file``
    and ``mnist.py``, and the first dependency path must exist. A separate
    ``MXNetModel`` is then built from the job's model artifacts, deployed
    locally under ``LOCK_PATH`` and asked for one prediction.
    """
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {
        "repo": CODECOMMIT_REPO,
        "branch": CODECOMMIT_BRANCH,
        "username": "******",
        "password": "******",
    }
    source_dir = "mxnet"
    dependencies = ["foo/bar.py"]

    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=dependencies,
        framework_version=MXNet.LATEST_VERSION,
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    # os.listdir already returns a list; no comprehension needed.
    files = os.listdir(mx.source_dir)
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    with lock.lock(LOCK_PATH):
        # Pre-bind the name so that a failure before or during deploy is not
        # masked by an unbound-name error raised from the finally clause.
        predictor = None
        try:
            client = sagemaker_local_session.sagemaker_client
            desc = client.describe_training_job(TrainingJobName=mx.latest_training_job.name)
            model_data = desc["ModelArtifacts"]["S3ModelArtifacts"]
            model = MXNetModel(
                model_data,
                "SageMakerRole",
                entry_point=script_path,
                source_dir=source_dir,
                dependencies=dependencies,
                py_version=PYTHON_VERSION,
                sagemaker_session=sagemaker_local_session,
                framework_version=MXNet.LATEST_VERSION,
                git_config=git_config,
            )
            predictor = model.deploy(1, "local")

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            # Only tear down an endpoint that was actually created.
            if predictor is not None:
                predictor.delete_endpoint()
def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Check that a customised ``ProfilerReport`` rule is reflected in the job description.

    Training is started with ``wait=False`` and the job is described right
    away. Profiling status, profiler config and the single profiler rule
    configuration are compared against the expected values.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        report_rule = ProfilerRule.sagemaker(
            rule_configs.ProfilerReport(CPUBottleneck_threshold=90),
            name="CustomProfilerReportRule",
        )
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=[report_rule],
        )

        channels = {}
        for channel in ("train", "test"):
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix="integ-test-data/mxnet_mnist/" + channel,
            )

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        estimator.fit(inputs=channels, job_name=training_job_name, wait=False)

        described_job = estimator.latest_training_job.describe()
        assert described_job.get("ProfilingStatus") == "Enabled"

        expected_profiler_config = ProfilerConfig(
            s3_output_path=estimator.output_path,
            system_monitor_interval_millis=500,
        )._to_request_dict()
        assert described_job.get("ProfilerConfig") == expected_profiler_config

        described_rule = described_job.get("ProfilerRuleConfigurations")[0]
        assert described_rule["RuleConfigurationName"] == "CustomProfilerReportRule"
        assert described_rule["RuleEvaluatorImage"] == estimator.rules[0].image_uri
        assert described_rule["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
            "CPUBottleneck_threshold": "90",
        }
def test_failed_training_job(sagemaker_session, mxnet_full_version):
    """Expect ``fit`` on the failure script to raise with ``ExecuteUserScriptError``."""
    with timeout():
        failing_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')

        estimator = MXNet(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        with pytest.raises(ValueError) as raised:
            estimator.fit()

        # Checked after the raises block so the assertion actually executes.
        assert 'ExecuteUserScriptError' in str(raised.value)
def test_nlp_training(sagemaker_session, ecr_image, instance_type):
    """Fit the NLP script on one instance with a ``train_max_run`` of five minutes."""
    estimator_kwargs = dict(
        entry_point=NLP_SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        train_max_run=5 * 60,
    )
    estimator = MXNet(**estimator_kwargs)

    training_job_name = utils.unique_name_from_base('test-nlp-image')
    estimator.fit(job_name=training_job_name)
def test_dgl_training(sagemaker_session, ecr_image, instance_type):
    """Fit the DGL script on one instance inside a 15-minute timeout."""
    estimator = MXNet(
        entry_point=DGL_SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
    )

    with timeout(minutes=15):
        estimator.fit(job_name=utils.unique_name_from_base('test-dgl-image'))
def test_mxnet_with_custom_rule_and_actions(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
    actions,
):
    """Train with one custom rule (built with ``actions``) and verify its description.

    Each described rule configuration is compared to the rule at the same
    index: name, evaluator image and a volume size of 30. The evaluation
    statuses must match the job's rule summary, and
    ``_wait_and_assert_that_no_rule_jobs_errored`` is run on the job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [_get_custom_rule(sagemaker_session, actions)]
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
        )

        session = estimator.sagemaker_session
        train_location = session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_location = session.upload_data(
            path=os.path.join(mnist_dir, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        estimator.fit({"train": train_location, "test": test_location})

        job_description = estimator.latest_training_job.describe()

        for position, rule in enumerate(rules):
            described_rule = job_description["DebugRuleConfigurations"][position]
            assert described_rule["RuleConfigurationName"] == rule.name
            assert described_rule["RuleEvaluatorImage"] == rule.image_uri
            assert described_rule["VolumeSizeInGB"] == 30

        reported_statuses = _get_rule_evaluation_statuses(job_description)
        assert reported_statuses == estimator.latest_training_job.rule_job_summary()

        _wait_and_assert_that_no_rule_jobs_errored(
            training_job=estimator.latest_training_job
        )
def test_private_github(
    sagemaker_local_session, mxnet_training_latest_version, mxnet_training_latest_py_version
):
    """Train and host an MXNet model in local mode from a private Git repository.

    After ``fit`` the checked-out source directory must contain ``some_file``
    and ``mnist.py``, and the first dependency path must exist. The estimator
    is then deployed locally under ``LOCK_PATH`` with a custom serving entry
    point and asked for one prediction.
    """
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {
        "repo": PRIVATE_GIT_REPO,
        "branch": PRIVATE_BRANCH,
        "commit": PRIVATE_COMMIT,
        "2FA_enabled": False,
        "username": "******",
        "password": "",  # TODO: find a secure approach
    }
    source_dir = "mxnet"
    dependencies = ["foo/bar.py"]

    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=dependencies,
        framework_version=mxnet_training_latest_version,
        py_version=mxnet_training_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    # os.listdir already returns a list; no comprehension needed.
    files = os.listdir(mx.source_dir)
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    with lock.lock(LOCK_PATH):
        # Pre-bind the name so that a deploy failure is not masked by an
        # unbound-name error raised from the finally clause.
        predictor = None
        try:
            serving_script_path = "mnist_hosting_with_custom_handlers.py"
            predictor = mx.deploy(1, "local", entry_point=serving_script_path)

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            # Only tear down an endpoint that was actually created.
            if predictor is not None:
                predictor.delete_endpoint()
def test_training(sagemaker_session, ecr_image, instance_type, instance_count):
    """Run the MNIST training job, then a single-instance DGL training job.

    ``sagemaker_parameter_server_enabled`` is set for the MNIST job only when
    more than one instance is requested.
    """
    hyperparameters = {}
    if instance_count > 1:
        hyperparameters['sagemaker_parameter_server_enabled'] = True
    hyperparameters['epochs'] = 1

    mnist_estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        hyperparameters=hyperparameters,
    )

    with timeout(minutes=15):
        key_root = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())

        channels = {}
        for channel in ('train', 'test'):
            channels[channel] = mnist_estimator.sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, channel),
                key_prefix=key_root + '/' + channel,
            )

        mnist_job_name = utils.unique_name_from_base('test-mxnet-image')
        mnist_estimator.fit(channels, job_name=mnist_job_name)

    dgl_estimator = MXNet(
        entry_point=DGL_SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
    )

    with timeout(minutes=15):
        dgl_job_name = utils.unique_name_from_base('test-mxnet-dgl-image')
        dgl_estimator.fit(job_name=dgl_job_name)
def test_async_fit(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_inference_latest_py_version,
    cpu_instance_type,
):
    """Start parameter-server training without waiting, re-attach, deploy and predict.

    The prediction on a zero-filled ``(1, 1, 28, 28)`` array must not be ``None``.
    """
    endpoint_name = "test-mxnet-attach-deploy-{}".format(sagemaker_timestamp())

    with timeout(minutes=5):
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        trainer = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            py_version=mxnet_inference_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            framework_version=mxnet_training_latest_version,
            distribution={"parameter_server": {"enabled": True}},
        )

        channels = {}
        for channel in ("train", "test"):
            channels[channel] = trainer.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix="integ-test-data/mxnet_mnist/" + channel,
            )

        # wait=False: fit is called without waiting, and the job is re-attached below.
        trainer.fit(channels, wait=False)
        training_job_name = trainer.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = MXNet.attach(
            training_job_name=training_job_name,
            sagemaker_session=sagemaker_session,
        )
        predictor = attached.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

        payload = numpy.zeros(shape=(1, 1, 28, 28))
        prediction = predictor.predict(payload)
        assert prediction is not None
def test_failed_training_job(sagemaker_session, mxnet_full_version):
    """Expect ``fit`` on the failure script to raise with the expected message."""
    with timeout(minutes=15):
        failing_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        train_location = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train-failure',
        )

        with pytest.raises(ValueError) as raised:
            estimator.fit(train_location)

        # Checked after the raises block so the assertion actually executes.
        assert 'This failure is expected' in str(raised.value)
def test_tuning_mxnet(sagemaker_session):
    """Tune ``learning_rate`` over four jobs, then deploy the tuner and predict.

    The endpoint is cleaned up by name, using the best training job's name.
    """
    with timeout(minutes=15):
        entry_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'tuning.py')
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(
            entry_point=entry_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.m4.xlarge',
            sagemaker_session=sagemaker_session,
            base_job_name='tune-mxnet',
        )

        objective_metric_name = 'Validation-accuracy'
        hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.01, 0.2)}
        metric_definitions = [
            {'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'},
        ]

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=4,
            max_parallel_jobs=2,
        )

        channels = {}
        for channel in ('train', 'test'):
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix='integ-test-data/mxnet_mnist/' + channel,
            )

        tuner.fit(channels)
        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        predictor.predict(np.zeros(shape=(1, 1, 28, 28)))
def test_tuning(sagemaker_session, ecr_image, instance_type):
    """Run a two-job hyperparameter tuning over ``learning-rate`` and wait for it."""
    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        hyperparameters={'epochs': 1},
    )

    objective_metric_name = 'Validation-accuracy'
    hyperparameter_ranges = {'learning-rate': ContinuousParameter(0.01, 0.2)}
    metric_definitions = [
        {'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'},
    ]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=20):
        key_root = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())

        session = estimator.sagemaker_session
        train_location = session.upload_data(
            path=os.path.join(DATA_PATH, 'train'),
            key_prefix=key_root + '/train',
        )
        test_location = session.upload_data(
            path=os.path.join(DATA_PATH, 'test'),
            key_prefix=key_root + '/test',
        )

        tuning_job_name = utils.unique_name_from_base('test-mxnet-image', max_length=32)
        tuner.fit(
            {'train': train_location, 'test': test_location},
            job_name=tuning_job_name,
        )
        tuner.wait()
def mxnet_training_job(sagemaker_session, mxnet_full_version):
    """Train MNIST on one ``ml.c4.xlarge`` instance and return the job name.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=15):
        entry_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(
            entry_point=entry_script,
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        channels = {}
        for channel in ('train', 'test'):
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix='integ-test-data/mxnet_mnist/' + channel,
            )

        estimator.fit(channels)
        return estimator.latest_training_job.name
def mxnet_training_job(sagemaker_session, mxnet_full_version):
    """Train MNIST with ``PYTHON_VERSION`` on ``ml.c4.xlarge`` and return the job name.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(
            entry_point=entry_script,
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        session = estimator.sagemaker_session
        train_location = session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train',
        )
        test_location = session.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/mxnet_mnist/test',
        )

        estimator.fit({'train': train_location, 'test': test_location})
        return estimator.latest_training_job.name
def mxnet_training_job(sagemaker_session):
    """Train MNIST without an explicit framework version and return the job name.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=15):
        entry_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(
            entry_point=entry_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        uploaded = [
            estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix='integ-test-data/mxnet_mnist/' + channel,
            )
            for channel in ('train', 'test')
        ]
        train_location, test_location = uploaded

        estimator.fit({'train': train_location, 'test': test_location})
        return estimator.latest_training_job.name
def test_attach_deploy(
    mxnet_training_job, sagemaker_session, cpu_instance_type, cpu_instance_family
):
    """Attach to a finished job, compile its model, deploy the compiled model and predict."""
    endpoint_name = unique_name_from_base("test-neo-attach-deploy")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)

        attached.compile_model(
            target_instance_family=cpu_instance_family,
            input_shape={"data": [1, 1, 28, 28]},
            output_path=attached.output_path,
        )

        numpy_binary_serializer = JSONSerializer(
            content_type="application/vnd+python.numpy+binary"
        )
        predictor = attached.deploy(
            1,
            cpu_instance_type,
            serializer=numpy_binary_serializer,
            use_compiled_model=True,
            endpoint_name=endpoint_name,
        )

        payload = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(payload)
def test_codecommit(
    sagemaker_local_session, mxnet_training_latest_version, mxnet_training_latest_py_version
):
    """Train and host an MXNet model in local mode from a CodeCommit repository.

    After ``fit`` the checked-out source directory must contain ``some_file``
    and ``mnist.py``, and the first dependency path must exist. The estimator
    is then deployed locally under ``LOCK_PATH`` and asked for one prediction.
    """
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {
        "repo": CODECOMMIT_REPO,
        "branch": CODECOMMIT_BRANCH,
        "username": "******",
        "password": "",  # TODO: assume a role to get temporary credentials
    }
    source_dir = "mxnet"
    dependencies = ["foo/bar.py"]

    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=dependencies,
        framework_version=mxnet_training_latest_version,
        py_version=mxnet_training_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    # os.listdir already returns a list; no comprehension needed.
    files = os.listdir(mx.source_dir)
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    with lock.lock(LOCK_PATH):
        # Pre-bind the name so that a deploy failure is not masked by an
        # unbound-name error raised from the finally clause.
        predictor = None
        try:
            predictor = mx.deploy(1, "local")

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            # Only tear down an endpoint that was actually created.
            if predictor is not None:
                predictor.delete_endpoint()
def test_attach_deploy(mxnet_training_job, sagemaker_session):
    """Attach to a finished job, deploy it on ``ml.m4.xlarge`` and send one prediction."""
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)

        payload = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(payload)
def test_tuning_mxnet(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Tune ``learning-rate`` over four jobs, then deploy the tuner and predict.

    The endpoint is cleaned up by name, using the best training job's name.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            framework_version=mxnet_training_latest_version,
            sagemaker_session=sagemaker_session,
        )

        objective_metric_name = "Validation-accuracy"
        hyperparameter_ranges = {"learning-rate": ContinuousParameter(0.01, 0.2)}
        metric_definitions = [
            {"Name": "Validation-accuracy", "Regex": "Validation-accuracy=([0-9\\.]+)"},
        ]

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=4,
            max_parallel_jobs=2,
        )

        channels = {}
        for channel in ("train", "test"):
            channels[channel] = estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix="integ-test-data/mxnet_mnist/" + channel,
            )

        tuning_job_name = unique_name_from_base("tune-mxnet", max_length=32)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)
        tuner.fit(channels, job_name=tuning_job_name)

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)
        predictor.predict(np.zeros(shape=(1, 1, 28, 28)))
def test_attach_deploy(mxnet_training_job, sagemaker_session, cpu_instance_type):
    """Attach to a finished job, deploy it and require a non-``None`` prediction."""
    endpoint_name = "test-mxnet-attach-deploy-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

        payload = numpy.zeros(shape=(1, 1, 28, 28))
        prediction = predictor.predict(payload)
        assert prediction is not None
def create_estimator(self):
    """Build an ``MXNet`` estimator from this object's configuration attributes.

    Returns:
        An ``MXNet`` estimator built from ``self.script`` and the role, job
        name, instance count/type, hyperparameters and Python version
        attributes of ``self``.
    """
    # Kept as a function-scope import, as in the original.
    from sagemaker.mxnet.estimator import MXNet

    entry_script = self.script
    estimator_kwargs = {
        "role": self.role_name,
        "base_job_name": self.job_name,
        "train_instance_count": self.instance_count,
        "train_instance_type": self.instance_type,
        "hyperparameters": self.hyperparameters,
        "py_version": self.python,
    }
    return MXNet(entry_script, **estimator_kwargs)
def test_jumpstart_mxnet_image_uri(patched_get_model_specs, session):
    """Compare JumpStart image URIs with those produced by the MXNet framework classes.

    For both the inference and training scopes, the URI from
    ``image_uris.retrieve`` must equal the one computed by ``MXNetModel`` /
    ``MXNet`` from the model specs, and must equal the pinned literal URI.
    """
    patched_get_model_specs.side_effect = get_prototype_model_spec

    model_id = "mxnet-semseg-fcn-resnet50-ade"
    model_version = "*"
    instance_type = "ml.p2.xlarge"
    region = "us-west-2"

    model_specs = accessors.JumpStartModelsAccessor.get_model_specs(
        region, model_id, model_version
    )

    # Arguments shared by both retrieve() calls; only image_scope differs.
    retrieve_kwargs = dict(
        framework=None,
        region=region,
        model_id=model_id,
        model_version=model_version,
        instance_type=instance_type,
    )

    # inference
    hosting_specs = model_specs.hosting_ecr_specs
    inference_uri = image_uris.retrieve(image_scope="inference", **retrieve_kwargs)
    inference_model = MXNetModel(
        role="mock_role",
        model_data="mock_data",
        entry_point="mock_entry_point",
        framework_version=hosting_specs.framework_version,
        py_version=hosting_specs.py_version,
        sagemaker_session=session,
    )
    framework_inference_uri = inference_model.serving_image_uri(region, instance_type)

    assert inference_uri == framework_inference_uri
    assert (
        inference_uri
        == "763104351884.dkr.ecr.us-west-2.amazonaws.com/mxnet-inference:1.7.0-gpu-py3"
    )

    # training
    training_specs = model_specs.training_ecr_specs
    training_uri = image_uris.retrieve(image_scope="training", **retrieve_kwargs)
    training_estimator = MXNet(
        role="mock_role",
        entry_point="mock_entry_point",
        framework_version=training_specs.framework_version,
        py_version=training_specs.py_version,
        instance_type=instance_type,
        instance_count=1,
        sagemaker_session=session,
    )
    framework_training_uri = training_estimator.training_image_uri(region=region)

    assert training_uri == framework_training_uri
    assert (
        training_uri
        == "763104351884.dkr.ecr.us-west-2.amazonaws.com/mxnet-training:1.7.0-gpu-py3"
    )