def mxnet_training_job(
    sagemaker_session,
    cpu_instance_type,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
):
    """Train the ``mnist_neo.py`` script on a single instance and return the job name."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        estimator = MXNet(
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist_neo.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        # Dict literals evaluate in order, so "train" is uploaded before "test".
        channels = {
            "train": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train",
            ),
            "test": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test",
            ),
        }

        estimator.fit(channels)
        return estimator.latest_training_job.name
def test_mxnet_distributed(sagemaker_session, ecr_image, instance_type, framework_version):
    """Train MNIST on two instances with the parameter server enabled, then deploy and predict."""
    mnist_dir = os.path.join(RESOURCE_PATH, 'mnist')
    estimator = MXNet(
        entry_point=os.path.join(mnist_dir, 'mnist.py'),
        role='SageMakerRole',
        train_instance_count=2,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        framework_version=framework_version,
        hyperparameters={'sagemaker_parameter_server_enabled': True},
    )

    # Timestamped prefix keeps each run's uploads separate in S3.
    s3_prefix = 'mxnet_mnist/{}'.format(sagemaker_timestamp())

    with timeout(minutes=15):
        channels = {
            'train': estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'train'), key_prefix=s3_prefix + '/train'),
            'test': estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'test'), key_prefix=s3_prefix + '/test'),
        }
        estimator.fit(channels)

    with timeout_and_delete_endpoint(estimator=estimator, minutes=30):
        predictor = estimator.deploy(initial_instance_count=1, instance_type=instance_type)
        # A single all-zero 28x28 image is enough to exercise the endpoint.
        predictor.predict(np.zeros(shape=(1, 1, 28, 28)))
def mxnet_training_job(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train MNIST from an uploaded ``sourcedir.tar.gz`` and return the training job name."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        s3_prefix = "integ-test-data/mxnet_mnist"
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        # The source archive is uploaded first; its S3 location becomes source_dir.
        uploaded_source = sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "sourcedir.tar.gz"),
            key_prefix="{}/src".format(s3_prefix),
        )

        estimator = MXNet(
            entry_point="mxnet_mnist/mnist.py",
            source_dir=uploaded_source,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        channels = {
            "train": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="{}/train".format(s3_prefix),
            ),
            "test": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="{}/test".format(s3_prefix),
            ),
        }

        estimator.fit(channels)
        return estimator.latest_training_job.name
def test_async_fit(sagemaker_session):
    """Launch training without waiting, re-attach to the job, then deploy and predict."""
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
        launcher = MXNet(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py'),
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        channels = {
            'train': launcher.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'train'),
                key_prefix='integ-test-data/mxnet_mnist/train'),
            'test': launcher.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'test'),
                key_prefix='integ-test-data/mxnet_mnist/test'),
        }

        # wait=False returns as soon as the job has been submitted.
        launcher.fit(channels, wait=False)
        training_job_name = launcher.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = MXNet.attach(training_job_name=training_job_name,
                                sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.predict(numpy.zeros(shape=(1, 1, 28, 28)))
def test_training(sagemaker_session, ecr_image, instance_type, instance_count):
    """Run an MNIST training job followed by a single-instance DGL training job."""
    # The parameter server is only switched on for multi-instance runs.
    hyperparameters = {}
    if instance_count > 1:
        hyperparameters['sagemaker_parameter_server_enabled'] = True
    hyperparameters['epochs'] = 1

    mnist_estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        hyperparameters=hyperparameters,
    )

    with timeout(minutes=15):
        s3_prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        channels = {
            'train': mnist_estimator.sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'train'), key_prefix=s3_prefix + '/train'),
            'test': mnist_estimator.sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'test'), key_prefix=s3_prefix + '/test'),
        }
        mnist_job = utils.unique_name_from_base('test-mxnet-image')
        mnist_estimator.fit(channels, job_name=mnist_job)

    dgl_estimator = MXNet(
        entry_point=DGL_SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
    )

    with timeout(minutes=15):
        dgl_job = utils.unique_name_from_base('test-mxnet-dgl-image')
        dgl_estimator.fit(job_name=dgl_job)
def test_mxnet_with_debugger_hook_config(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Train with an explicit ``DebuggerHookConfig`` and check it is echoed in the job description.

    After training, the ``DebugHookConfig`` entry of the described job must equal
    the request dict produced by the hook config, and no rule job may have errored.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # An S3 URI is not a filesystem path: os.path.join would insert the
        # platform separator (backslashes on Windows), so build it explicitly.
        tensors_uri = "s3://{}/{}/tensors".format(
            sagemaker_session.default_bucket(), uuid.uuid4()
        )
        debugger_hook_config = DebuggerHookConfig(s3_output_path=tensors_uri)

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
        )

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
def test_mxnet_with_rules_and_debugger_hook_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train with built-in debugger rules plus a hook config and verify the job description.

    Each rule must appear, in order, in ``DebugRuleConfigurations`` with matching
    name, evaluator image, volume size and ``rule_to_invoke``. The hook config and
    rule evaluation statuses must match what the estimator reports.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(), rule_parameters={"tensor_regex": ".*"}
            ),
            Rule.sagemaker(rule_configs.loss_not_decreasing()),
        ]

        # An S3 URI is not a filesystem path: os.path.join would insert the
        # platform separator (backslashes on Windows), so build it explicitly.
        tensors_uri = "s3://{}/{}/tensors".format(
            sagemaker_session.default_bucket(), uuid.uuid4()
        )
        debugger_hook_config = DebuggerHookConfig(s3_output_path=tensors_uri)

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
        )

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()

        for index, rule in enumerate(rules):
            described_rule = job_description["DebugRuleConfigurations"][index]
            assert described_rule["RuleConfigurationName"] == rule.name
            assert described_rule["RuleEvaluatorImage"] == rule.image_uri
            assert described_rule["VolumeSizeInGB"] == 0
            assert (
                described_rule["RuleParameters"]["rule_to_invoke"]
                == rule.rule_parameters["rule_to_invoke"]
            )
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        assert (
            job_description["DebugRuleEvaluationStatuses"]
            == mx.latest_training_job.rule_job_summary()
        )

        _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
def _test_training(ecr_image, sagemaker_session, instance_type, instance_count, framework_version):
    """Run the MNIST training script with smdebug-related hyperparameters on the given image."""
    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters={
            'random_seed': True,
            'num_steps': 50,
            'smdebug_path': '/tmp/ml/output/tensors',
            'epochs': 1,
        },
    )

    with timeout(minutes=15):
        # Timestamped prefix keeps each run's uploads separate in S3.
        s3_prefix = 'mxnet_mnist_gluon_basic_hook_demo/{}'.format(utils.sagemaker_timestamp())
        channels = {
            'train': sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'train'), key_prefix=s3_prefix + '/train'),
            'test': sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'test'), key_prefix=s3_prefix + '/test'),
        }
        training_job = utils.unique_name_from_base('test-mxnet-image')
        estimator.fit(channels, job_name=training_job)
def test_mxnet_with_debugger_hook_config_disabled(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train with ``debugger_hook_config=False`` and check no hook config is described."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        estimator = MXNet(
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=False,
        )

        channels = {
            "train": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train",
            ),
            "test": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test",
            ),
        }
        estimator.fit(channels)

        described = estimator.latest_training_job.describe()
        assert described.get("DebugHookConfig") is None
def _test_training(ecr_image, sagemaker_session, instance_type, instance_count, framework_version):
    """Run a one-epoch MNIST training job on the given image via ``_disable_sm_profiler``."""
    # The parameter server is only switched on for multi-instance runs.
    hyperparameters = {}
    if instance_count > 1:
        hyperparameters['sagemaker_parameter_server_enabled'] = True
    hyperparameters['epochs'] = 1

    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters=hyperparameters,
    )
    estimator = _disable_sm_profiler(sagemaker_session.boto_region_name, estimator)

    with timeout(minutes=15):
        s3_prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        channels = {
            'train': sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'train'), key_prefix=s3_prefix + '/train'),
            'test': sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, 'test'), key_prefix=s3_prefix + '/test'),
        }
        training_job = utils.unique_name_from_base('test-mxnet-image')
        estimator.fit(channels, job_name=training_job)
def test_async_fit(sagemaker_session, mxnet_full_version):
    """Launch parameter-server training without waiting, re-attach, then deploy and predict."""
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
        launcher = MXNet(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py'),
            role='SageMakerRole',
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            framework_version=mxnet_full_version,
            distributions={'parameter_server': {'enabled': True}},
        )

        channels = {
            'train': launcher.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'train'),
                key_prefix='integ-test-data/mxnet_mnist/train'),
            'test': launcher.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'test'),
                key_prefix='integ-test-data/mxnet_mnist/test'),
        }

        # wait=False returns as soon as the job has been submitted.
        launcher.fit(channels, wait=False)
        training_job_name = launcher.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = MXNet.attach(training_job_name=training_job_name,
                                sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.predict(numpy.zeros(shape=(1, 1, 28, 28)))
def test_git_support_codecommit_with_mxnet(sagemaker_local_session):
    """Train locally from a CodeCommit repo, then serve the artifacts through ``MXNetModel``.

    Checks that the cloned source directory holds the expected files and that the
    dependency path exists, then deploys a local endpoint and runs one prediction.
    """
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {
        "repo": CODECOMMIT_REPO,
        "branch": CODECOMMIT_BRANCH,
        "username": "******",
        "password": "******",
    }
    source_dir = "mxnet"
    dependencies = ["foo/bar.py"]
    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=dependencies,
        framework_version=MXNet.LATEST_VERSION,
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    files = os.listdir(mx.source_dir)
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    # Stays None until deploy succeeds, so the cleanup below cannot raise a
    # NameError that would hide the original failure.
    predictor = None
    with lock.lock(LOCK_PATH):
        try:
            client = sagemaker_local_session.sagemaker_client
            desc = client.describe_training_job(TrainingJobName=mx.latest_training_job.name)
            model_data = desc["ModelArtifacts"]["S3ModelArtifacts"]
            model = MXNetModel(
                model_data,
                "SageMakerRole",
                entry_point=script_path,
                source_dir=source_dir,
                dependencies=dependencies,
                py_version=PYTHON_VERSION,
                sagemaker_session=sagemaker_local_session,
                framework_version=MXNet.LATEST_VERSION,
                git_config=git_config,
            )
            predictor = model.deploy(1, "local")

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            if predictor is not None:
                predictor.delete_endpoint()
def test_mxnet_with_default_profiler_config_and_profiler_rule(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start a job with default profiling and verify the described profiler settings.

    Also checks that calling ``enable_default_profiling`` on the estimator raises
    ``ValueError`` with the "already enabled" message.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
        )

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(
            inputs={"train": train_input, "test": test_input},
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert (
            job_description["ProfilerConfig"]
            == ProfilerConfig(
                s3_output_path=mx.output_path, system_monitor_interval_millis=500
            )._to_request_dict()
        )
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
        assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
            mx.sagemaker_session.boto_region_name
        )
        assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}

        with pytest.raises(ValueError) as error:
            mx.enable_default_profiling()
        # Check the exception itself: str() of the ExceptionInfo wrapper is a
        # pytest-version-dependent representation, not the error message.
        assert "Debugger monitoring is already enabled." in str(error.value)
def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start a job with a customised ProfilerReport rule and verify how it is described."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        custom_profiler_report_rule = ProfilerRule.sagemaker(
            rule_configs.ProfilerReport(CPUBottleneck_threshold=90),
            name="CustomProfilerReportRule",
        )
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=[custom_profiler_report_rule],
        )

        channels = {
            "train": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train",
            ),
            "test": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test",
            ),
        }

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        estimator.fit(inputs=channels, job_name=training_job_name, wait=False)

        described = estimator.latest_training_job.describe()
        assert described.get("ProfilingStatus") == "Enabled"

        expected_config = ProfilerConfig(
            s3_output_path=estimator.output_path, system_monitor_interval_millis=500
        )._to_request_dict()
        assert described.get("ProfilerConfig") == expected_config

        described_rule = described.get("ProfilerRuleConfigurations")[0]
        assert described_rule["RuleConfigurationName"] == "CustomProfilerReportRule"
        assert described_rule["RuleEvaluatorImage"] == estimator.rules[0].image_uri
        # The threshold is compared as the string "90" although it was passed as an int.
        assert described_rule["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
            "CPUBottleneck_threshold": "90",
        }
def test_failed_training_job(sagemaker_session, mxnet_full_version):
    """Fit the failure script and expect a ``ValueError`` naming ``ExecuteUserScriptError``."""
    with timeout():
        failing_estimator = MXNet(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py'),
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        with pytest.raises(ValueError) as excinfo:
            failing_estimator.fit()
        assert 'ExecuteUserScriptError' in str(excinfo.value)
def test_dgl_training(sagemaker_session, ecr_image, instance_type):
    """Run the DGL script as a single-instance training job on the given image."""
    estimator_kwargs = dict(
        entry_point=DGL_SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
    )
    dgl_estimator = MXNet(**estimator_kwargs)

    with timeout(minutes=15):
        dgl_estimator.fit(job_name=utils.unique_name_from_base('test-dgl-image'))
def test_nlp_training(sagemaker_session, ecr_image, instance_type):
    """Run the NLP script as a single-instance training job with ``train_max_run`` of 300."""
    estimator_kwargs = dict(
        entry_point=NLP_SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        train_max_run=5 * 60,
    )
    nlp_estimator = MXNet(**estimator_kwargs)

    nlp_estimator.fit(job_name=utils.unique_name_from_base('test-nlp-image'))
def test_mxnet_with_disable_profiler_then_enable_default_profiling(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start a job with profiling disabled, then enable default profiling on it."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        estimator = MXNet(
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            disable_profiler=True,
        )

        channels = {
            "train": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train",
            ),
            "test": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test",
            ),
        }

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        estimator.fit(inputs=channels, job_name=training_job_name, wait=False)

        # With the profiler disabled, no profiler settings should be described.
        before = estimator.latest_training_job.describe()
        assert before.get("ProfilerConfig") is None
        assert before.get("ProfilerRuleConfigurations") is None
        assert before.get("ProfilingStatus") == "Disabled"

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

        estimator.enable_default_profiling()

        after = estimator.latest_training_job.describe()
        assert after["ProfilerConfig"]["S3OutputPath"] == estimator.output_path
def test_mxnet_with_custom_rule_and_actions(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
    actions,
):
    """Train with a custom debugger rule carrying ``actions`` and verify the job description."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [_get_custom_rule(sagemaker_session, actions)]
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
        )

        channels = {
            "train": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train",
            ),
            "test": estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test",
            ),
        }
        estimator.fit(channels)

        described = estimator.latest_training_job.describe()

        # Rules are expected in the described job in the order they were passed.
        for position, expected_rule in enumerate(rules):
            described_rule = described["DebugRuleConfigurations"][position]
            assert described_rule["RuleConfigurationName"] == expected_rule.name
            assert described_rule["RuleEvaluatorImage"] == expected_rule.image_uri
            assert described_rule["VolumeSizeInGB"] == 30

        reported_statuses = _get_rule_evaluation_statuses(described)
        assert reported_statuses == estimator.latest_training_job.rule_job_summary()

        _wait_and_assert_that_no_rule_jobs_errored(training_job=estimator.latest_training_job)
def test_private_github(
    sagemaker_local_session, mxnet_training_latest_version, mxnet_training_latest_py_version
):
    """Train locally from a private Git repo, then deploy with a custom serving script.

    Checks that the cloned source directory holds the expected files and that the
    dependency path exists, then deploys a local endpoint and runs one prediction.
    """
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {
        "repo": PRIVATE_GIT_REPO,
        "branch": PRIVATE_BRANCH,
        "commit": PRIVATE_COMMIT,
        "2FA_enabled": False,
        "username": "******",
        "password": "",  # TODO: find a secure approach
    }
    source_dir = "mxnet"
    dependencies = ["foo/bar.py"]
    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=dependencies,
        framework_version=mxnet_training_latest_version,
        py_version=mxnet_training_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    files = os.listdir(mx.source_dir)
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    # Stays None until deploy succeeds, so the cleanup below cannot raise a
    # NameError that would hide the original failure.
    predictor = None
    with lock.lock(LOCK_PATH):
        try:
            serving_script_path = "mnist_hosting_with_custom_handlers.py"
            predictor = mx.deploy(1, "local", entry_point=serving_script_path)

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            if predictor is not None:
                predictor.delete_endpoint()
def test_async_fit(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_inference_latest_py_version,
    cpu_instance_type,
):
    """Launch parameter-server training without waiting, re-attach, deploy, and predict."""
    endpoint_name = "test-mxnet-attach-deploy-{}".format(sagemaker_timestamp())

    with timeout(minutes=5):
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        launcher = MXNet(
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py"),
            role="SageMakerRole",
            py_version=mxnet_inference_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            framework_version=mxnet_training_latest_version,
            distribution={"parameter_server": {"enabled": True}},
        )

        channels = {
            "train": launcher.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train",
            ),
            "test": launcher.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test",
            ),
        }

        # wait=False returns as soon as the job has been submitted.
        launcher.fit(channels, wait=False)
        training_job_name = launcher.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = MXNet.attach(
            training_job_name=training_job_name, sagemaker_session=sagemaker_session
        )
        predictor = attached.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
        prediction = predictor.predict(numpy.zeros(shape=(1, 1, 28, 28)))
        assert prediction is not None
def test_failed_training_job(sagemaker_session, mxnet_full_version):
    """Fit the failure script on uploaded data and expect its message in a ``ValueError``."""
    with timeout(minutes=15):
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
        failing_estimator = MXNet(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py'),
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        # This test uploads under its own "train-failure" key prefix.
        uploaded_train = failing_estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train-failure',
        )

        with pytest.raises(ValueError) as excinfo:
            failing_estimator.fit(uploaded_train)
        assert 'This failure is expected' in str(excinfo.value)
def test_failed_training_job(sagemaker_session, mxnet_full_version):
    """Check that fitting ``failure_script.py`` raises ``ValueError`` with the expected text."""
    with timeout(minutes=15):
        estimator_kwargs = dict(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py'),
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )
        train_dir = os.path.join(os.path.join(DATA_DIR, 'mxnet_mnist'), 'train')
        doomed = MXNet(**estimator_kwargs)

        s3_train = doomed.sagemaker_session.upload_data(
            path=train_dir, key_prefix='integ-test-data/mxnet_mnist/train-failure')

        with pytest.raises(ValueError) as raised:
            doomed.fit(s3_train)
        assert 'This failure is expected' in str(raised.value)
def mxnet_training_job(sagemaker_session, mxnet_full_version):
    """Train ``mnist.py`` on one ``ml.c4.xlarge`` instance and return the job name."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
        estimator = MXNet(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py'),
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        # Dict literals evaluate in order, so "train" is uploaded before "test".
        channels = {
            'train': estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'train'),
                key_prefix='integ-test-data/mxnet_mnist/train'),
            'test': estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'test'),
                key_prefix='integ-test-data/mxnet_mnist/test'),
        }

        estimator.fit(channels)
        return estimator.latest_training_job.name
def mxnet_training_job(sagemaker_session, mxnet_full_version):
    """Run the MNIST training job (15-minute timeout) and return its job name."""
    with timeout(minutes=15):
        estimator_kwargs = dict(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py'),
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
        trainer = MXNet(**estimator_kwargs)

        s3_train = trainer.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train')
        s3_test = trainer.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/mxnet_mnist/test')

        trainer.fit({'train': s3_train, 'test': s3_test})
        return trainer.latest_training_job.name
def mxnet_training_job(sagemaker_session):
    """Run the MNIST training job without pinning a framework version; return the job name."""
    with timeout(minutes=15):
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
        trainer = MXNet(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py'),
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        channels = {
            'train': trainer.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'train'),
                key_prefix='integ-test-data/mxnet_mnist/train'),
            'test': trainer.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, 'test'),
                key_prefix='integ-test-data/mxnet_mnist/test'),
        }

        trainer.fit(channels)
        return trainer.latest_training_job.name
def test_codecommit(
    sagemaker_local_session, mxnet_training_latest_version, mxnet_training_latest_py_version
):
    """Train locally from a CodeCommit repo, then deploy the estimator and predict.

    Checks that the cloned source directory holds the expected files and that the
    dependency path exists, then deploys a local endpoint and runs one prediction.
    """
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {
        "repo": CODECOMMIT_REPO,
        "branch": CODECOMMIT_BRANCH,
        "username": "******",
        "password": "",  # TODO: assume a role to get temporary credentials
    }
    source_dir = "mxnet"
    dependencies = ["foo/bar.py"]
    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=dependencies,
        framework_version=mxnet_training_latest_version,
        py_version=mxnet_training_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    files = os.listdir(mx.source_dir)
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    # Stays None until deploy succeeds, so the cleanup below cannot raise a
    # NameError that would hide the original failure.
    predictor = None
    with lock.lock(LOCK_PATH):
        try:
            predictor = mx.deploy(1, "local")

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            if predictor is not None:
                predictor.delete_endpoint()
def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version):
    """Train locally from a Git repo pinned to a commit, then deploy and predict.

    Checks that the cloned source directory holds the expected files and that the
    dependency path exists, then deploys a local endpoint and runs one prediction.
    ``mxnet_full_version`` is accepted but not used by the body; the estimator
    uses ``MXNet.LATEST_VERSION``.
    """
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT}
    dependencies = ["foo/bar.py"]
    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir="mxnet",
        dependencies=dependencies,
        framework_version=MXNet.LATEST_VERSION,
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    files = os.listdir(mx.source_dir)
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    # Stays None until deploy succeeds, so the cleanup below cannot raise a
    # NameError that would hide the original failure.
    predictor = None
    with lock.lock(LOCK_PATH):
        try:
            predictor = mx.deploy(initial_instance_count=1, instance_type="local")

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            if predictor is not None:
                predictor.delete_endpoint()
def _test_distributed_training_horovod(ecr_image, sagemaker_session, instance_type, tmpdir, framework_version):
    """Run the Horovod MNIST script on two instances and check the model artifact names.

    Every file listed for the estimator's model data must be named ``model.tar.gz``.
    ``tmpdir`` is accepted but not used by the body.
    """
    mpi_options = '-verbose -x orte_base_help_aggregate=0'
    mpi_hyperparameters = {
        'sagemaker_mpi_enabled': True,
        'sagemaker_mpi_custom_mpi_options': mpi_options,
        'sagemaker_mpi_num_of_processes_per_host': 1,
    }
    horovod_script = os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py')

    estimator = MXNet(
        entry_point=horovod_script,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=2,
        image_uri=ecr_image,
        framework_version=framework_version,
        py_version='py3',
        hyperparameters=mpi_hyperparameters,
        sagemaker_session=sagemaker_session,
    )

    estimator.fit(job_name=unique_name_from_base('test-mx-horovod'))

    artifacts = sagemaker.local.data.get_data_source_instance(
        estimator.model_data, sagemaker_session)

    for artifact_path in artifacts.get_file_list():
        assert os.path.basename(artifact_path) == 'model.tar.gz'
def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start a job with a custom ``ProfilerConfig``, then update its rule and interval.

    The described job is checked once right after ``fit(wait=False)`` and once
    after ``update_profiler`` is called with a ``CPUBottleneck`` rule and a
    500 ms monitoring interval. Both times the first profiler rule
    configuration is expected to be the ``ProfilerReport`` rule.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        profiler_config = ProfilerConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
        )

        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        mx = MXNet(
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        # Dict entries are evaluated in order, so "train" is uploaded before "test".
        channels = {
            "train": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train",
            ),
            "test": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test",
            ),
        }

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(inputs=channels, job_name=training_job_name, wait=False)

        # State of the job as submitted.
        submitted = mx.latest_training_job.describe()
        assert submitted.get("ProfilerConfig") == profiler_config._to_request_dict()
        assert submitted.get("ProfilingStatus") == "Enabled"

        report_rule = submitted.get("ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*", report_rule["RuleConfigurationName"])
        assert report_rule["RuleEvaluatorImage"] == get_rule_container_image_uri(
            mx.sagemaker_session.boto_region_name
        )
        assert report_rule["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}

        _wait_until_training_can_be_updated(
            sagemaker_session.sagemaker_client, training_job_name
        )

        mx.update_profiler(
            rules=[ProfilerRule.sagemaker(rule_configs.CPUBottleneck())],
            system_monitor_interval_millis=500,
        )

        # State of the job after the profiler update.
        updated = mx.latest_training_job.describe()
        updated_profiler = updated["ProfilerConfig"]
        assert updated_profiler["S3OutputPath"] == profiler_config.s3_output_path
        assert updated["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500

        updated_report_rule = updated.get("ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*", updated_report_rule["RuleConfigurationName"])
        assert updated_report_rule["RuleEvaluatorImage"] == get_rule_container_image_uri(
            mx.sagemaker_session.boto_region_name
        )
        assert updated_report_rule["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
def test_training(sagemaker_session, ecr_image, instance_type, instance_count):
    """Check that a trial component exists for a started training job and can join a trial.

    Creates an experiment and a trial, starts an MXNet training job without
    waiting for it, polls ``TrialComponent.list`` for a component whose source
    is the training job ARN, adds that component to the trial and finally
    deletes the component, trial and experiment.
    """
    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "mxnet-container-integ-test-{}".format(int(time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-mxnet-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "mxnet-container-integ-test-{}".format(int(time.time()))

    trial = Trial.create(
        experiment_name=experiment_name,
        trial_name=trial_name,
        sagemaker_boto_client=sm_client,
    )

    hyperparameters = {
        "random_seed": True,
        "num_steps": 50,
        "smdebug_path": "/opt/ml/output/tensors",
        "epochs": 1,
    }

    mx = MXNet(
        entry_point=SCRIPT_PATH,
        role="SageMakerRole",
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        hyperparameters=hyperparameters,
    )

    training_job_name = utils.unique_name_from_base("test-mxnet-image")

    # create a training job (fit is called with wait=False, so it returns
    # without waiting for the job to finish)
    with timeout(minutes=15):
        prefix = "mxnet_mnist_gluon_basic_hook_demo/{}".format(utils.sagemaker_timestamp())
        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, "train"), key_prefix=prefix + "/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, "test"), key_prefix=prefix + "/test"
        )

        mx.fit({"train": train_input, "test": test_input}, job_name=training_job_name, wait=False)

    training_job = sm_client.describe_training_job(TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    #
    # The previous `while True` loop only broke out when a component was found:
    # once the retry budget was used up it kept polling forever without
    # sleeping, and the assertion below could never fail. The loop is now
    # bounded: one initial poll plus up to `max_retries` polls after a wait.
    trial_component_summary = None
    max_retries = 10
    for attempt in range(max_retries + 1):
        trial_components = list(
            TrialComponent.list(source_arn=training_job_arn, sagemaker_boto_client=sm_client)
        )

        if trial_components:
            trial_component_summary = trial_components[0]
            break

        if attempt < max_retries:
            # NOTE(review): if `sleep` is time.sleep this waits 500 seconds per
            # retry; the value is kept as-is -- confirm it is intended.
            sleep(500)

    assert trial_component_summary is not None

    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    experiment.delete()
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start a job with debugger and profiler rules, then disable framework metrics.

    The job is started with two debugger rules, a ``ProfilerReport`` rule named
    ``CustomProfilerReportRule``, a ``DebuggerHookConfig`` and a
    ``ProfilerConfig`` carrying a ``FrameworkProfile``. The described job is
    checked against those configs, then
    ``update_profiler(disable_framework_metrics=True)`` is called and the
    described ``ProfilingParameters`` must be empty.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Keep a handle on the profiler rule so its evaluator image can be
        # compared against this rule rather than against a debugger rule.
        profiler_report_rule = ProfilerRule.sagemaker(
            rule_configs.ProfilerReport(), name="CustomProfilerReportRule"
        )
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(), rule_parameters={"tensor_regex": ".*"}
            ),
            profiler_report_rule,
        ]
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors",
        )
        profiler_config = ProfilerConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(
            inputs={"train": train_input, "test": test_input},
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"] == profiler_config._to_request_dict()
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration["RuleConfigurationName"] == "CustomProfilerReportRule"
        # Previously compared with mx.rules[0], which is the vanishing-gradient
        # debugger rule; the profiler rule configuration is now checked against
        # the profiler rule that was submitted.
        assert profiler_rule_configuration["RuleEvaluatorImage"] == profiler_report_rule.image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        for index, rule in enumerate(mx.debugger_rules):
            debug_rule_configuration = job_description["DebugRuleConfigurations"][index]
            assert debug_rule_configuration["RuleConfigurationName"] == rule.name
            assert debug_rule_configuration["RuleEvaluatorImage"] == rule.image_uri

        _wait_until_training_can_be_updated(
            sagemaker_session.sagemaker_client, training_job_name
        )

        mx.update_profiler(disable_framework_metrics=True)

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"]["ProfilingParameters"] == {}
def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start a job with a ``FrameworkProfile``, then replace it via ``update_profiler``.

    The job starts with ``FrameworkProfile(start_step=1, num_steps=5)``. After
    the job can be updated, a ``FrameworkProfile`` built from a
    ``DetailedProfilingConfig`` is applied and the described
    ``ProfilingParameters`` must match it. The first profiler rule
    configuration is expected to be the ``ProfilerReport`` rule.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        initial_profile = FrameworkProfile(start_step=1, num_steps=5)
        profiler_config = ProfilerConfig(framework_profile_params=initial_profile)

        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        mx = MXNet(
            entry_point=os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        # Dict entries are evaluated in order, so "train" is uploaded before "test".
        channels = {
            "train": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train",
            ),
            "test": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test",
            ),
        }

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(inputs=channels, job_name=training_job_name, wait=False)

        # State of the job as submitted.
        submitted = mx.latest_training_job.describe()
        submitted_parameters = submitted["ProfilerConfig"]["ProfilingParameters"]
        assert submitted_parameters == profiler_config._to_request_dict()["ProfilingParameters"]
        assert submitted.get("ProfilingStatus") == "Enabled"

        _wait_until_training_can_be_updated(
            sagemaker_session.sagemaker_client, training_job_name
        )

        detailed_config = DetailedProfilingConfig(profile_default_steps=True)
        updated_framework_profile = FrameworkProfile(detailed_profiling_config=detailed_config)
        mx.update_profiler(framework_profile_params=updated_framework_profile)

        # State of the job after the profiler update.
        updated = mx.latest_training_job.describe()
        updated_parameters = updated["ProfilerConfig"]["ProfilingParameters"]
        assert updated_parameters == updated_framework_profile.profiling_parameters

        report_rule = updated.get("ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*", report_rule["RuleConfigurationName"])
        assert report_rule["RuleEvaluatorImage"] == get_rule_container_image_uri(
            mx.sagemaker_session.boto_region_name
        )
        assert report_rule["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}