def test_mxnet_with_debugger_hook_config(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Run an MXNet training job with an explicit DebuggerHookConfig and verify
    the service echoes the configuration back in the job description.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Write debug tensors under a unique S3 prefix per test run.
        hook_config = DebuggerHookConfig(
            s3_output_path=os.path.join(
                "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()), "tensors"
            )
        )

        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        estimator = MXNet(
            entry_point=os.path.join(mnist_dir, "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=hook_config,
        )

        train_channel = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_channel = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )
        estimator.fit({"train": train_channel, "test": test_channel})

        description = estimator.latest_training_job.describe()
        # The training job should report exactly the hook config we requested.
        assert description["DebugHookConfig"] == hook_config._to_request_dict()

        _wait_and_assert_that_no_rule_jobs_errored(training_job=estimator.latest_training_job)
def test_mxnet_with_rules_and_debugger_hook_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train MXNet with built-in Debugger rules plus a hook config, then verify
    that the job description reflects every rule and the hook configuration.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        debug_rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(),
                rule_parameters={"tensor_regex": ".*"},
            ),
            Rule.sagemaker(rule_configs.loss_not_decreasing()),
        ]
        # Debug tensors go to a unique S3 prefix per run.
        hook_config = DebuggerHookConfig(
            s3_output_path=os.path.join(
                "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()), "tensors"
            )
        )

        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        estimator = MXNet(
            entry_point=os.path.join(mnist_dir, "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=debug_rules,
            debugger_hook_config=hook_config,
        )

        train_channel = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_channel = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )
        estimator.fit({"train": train_channel, "test": test_channel})

        description = estimator.latest_training_job.describe()
        for idx, rule in enumerate(debug_rules):
            rule_config = description["DebugRuleConfigurations"][idx]
            assert rule_config["RuleConfigurationName"] == rule.name
            assert rule_config["RuleEvaluatorImage"] == rule.image_uri
            # Managed rules run without an attached EBS volume.
            assert rule_config["VolumeSizeInGB"] == 0
            assert (
                rule_config["RuleParameters"]["rule_to_invoke"]
                == rule.rule_parameters["rule_to_invoke"]
            )
        assert description["DebugHookConfig"] == hook_config._to_request_dict()
        assert (
            description["DebugRuleEvaluationStatuses"]
            == estimator.latest_training_job.rule_job_summary()
        )

        _wait_and_assert_that_no_rule_jobs_errored(training_job=estimator.latest_training_job)
def tensorflow_estimator():
    """Build a TensorFlow estimator fixture with a debug hook and a mocked
    SageMaker session (no AWS calls are made).
    """
    estimator = TensorFlow(
        entry_point='tf_train.py',
        role=EXECUTION_ROLE,
        framework_version='1.13',
        instance_count=1,
        instance_type='ml.p2.xlarge',
        output_path='s3://sagemaker/models',
        source_dir='s3://sagemaker/source',
        image_uri=TENSORFLOW_IMAGE,
        model_dir=False,
        hyperparameters={
            'training_steps': 1000,
            'evaluation_steps': 100,
            'checkpoint_path': 's3://sagemaker/models/sagemaker-tensorflow/checkpoints',
        },
    )
    estimator.debugger_hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/models/debug'
    )
    # Mock out the session so the fixture stays offline.
    mock_session = MagicMock()
    mock_session.boto_region_name = 'us-east-1'
    mock_session._default_bucket = 'sagemaker'
    estimator.sagemaker_session = mock_session
    return estimator
def sklearn_preprocessor():
    """Create an SKLearn preprocessor fixture backed by a mocked SageMaker session."""
    mock_session = MagicMock()
    mock_session.boto_region_name = 'us-east-1'
    preprocessor = SKLearn(
        entry_point='sklearn_abalone_featurizer.py',
        role=SAGEMAKER_EXECUTION_ROLE,
        train_instance_type="ml.c4.xlarge",
        source_dir='s3://sagemaker/source',
        sagemaker_session=mock_session,
    )
    preprocessor.debugger_hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/source/debug'
    )
    return preprocessor
def pca_estimator_with_debug_hook():
    """PCA estimator fixture configured with a debug hook, a confusion rule,
    and a mocked SageMaker session.
    """
    hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/output/debug',
        hook_parameters={"save_interval": "1"},
        collection_configs=[
            CollectionConfig("hyperparameters"),
            CollectionConfig("metrics"),
        ],
    )
    debug_rules = [
        Rule.sagemaker(
            rule_configs.confusion(),
            rule_parameters={
                "category_no": "15",
                "min_diag": "0.7",
                "max_off_diag": "0.3",
                "start_step": "17",
                "end_step": "19",
            },
        )
    ]
    pca = sagemaker.estimator.Estimator(
        PCA_IMAGE,
        role=EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        output_path='s3://sagemaker/models',
        debugger_hook_config=hook_config,
        rules=debug_rules,
    )
    pca.set_hyperparameters(
        feature_dim=50000,
        num_components=10,
        subtract_mean=True,
        algorithm_mode='randomized',
        mini_batch_size=200,
    )
    # Mock out the session so the fixture never talks to AWS.
    pca.sagemaker_session = MagicMock()
    pca.sagemaker_session.boto_region_name = 'us-east-1'
    pca.sagemaker_session._default_bucket = 'sagemaker'
    return pca
def _validate_and_set_debugger_configs(self):
    """Disable Debugger Hook Config for ParameterServer (PS) as it is not
    supported in smdebug.

    Else, set default HookConfig
    """
    ps_settings = self.distribution.get("parameter_server", {})
    if ps_settings.get("enabled", False):
        # PS distribution is on: drop any Debugger configuration.
        if self.debugger_hook_config is not None or self.debugger_rule_configs is not None:
            logger.info(
                "Amazon SageMaker Debugger does not currently support "
                "Parameter Server distribution"
            )
        self.debugger_hook_config = None
        self.debugger_rule_configs = None
        return
    region = self.sagemaker_session.boto_session.region_name
    if self.debugger_hook_config is None and fw._region_supports_debugger(region):
        # Default: write debug tensors alongside the training output.
        self.debugger_hook_config = DebuggerHookConfig(s3_output_path=self.output_path)
def create_model(image: str, hyperparameters: dict, instance_type: str, output_path: str,
                 region_name: str, role: str, s3_train: str, s3_validation: str, job_name: str):
    """Train a built-in-algorithm estimator on spot instances with Debugger enabled.

    NOTE(review): the debug output path uses module-level ``bucket`` and
    ``prefix`` globals — verify they are defined where this is called.
    Returns the fitted estimator.
    """
    # XGBoost needs an explicit framework version; other algorithms use the default.
    container = (
        get_image_uri(region_name, image, '0.90-2')
        if image == 'xgboost'
        else get_image_uri(region_name, image)
    )

    save_interval = '1'
    collections = [
        CollectionConfig(name=collection_name, parameters={'save_interval': save_interval})
        for collection_name in ('metrics', 'feature_importance', 'full_shap', 'average_shap')
    ]
    estimator = sagemaker.estimator.Estimator(
        container,
        role=role,
        train_instance_count=1,
        train_instance_type=instance_type,
        train_use_spot_instances=True,
        train_max_run=300,
        train_max_wait=600,
        output_path=output_path,
        debugger_hook_config=DebuggerHookConfig(
            s3_output_path=f's3://{bucket}/{prefix}/debug',
            collection_configs=collections,
        ),
        rules=[
            Rule.sagemaker(
                rule_configs.class_imbalance(),
                rule_parameters={'collection_names': 'metrics'},
            )
        ],
    )
    estimator.set_hyperparameters(**hyperparameters)

    channels = {
        'train': s3_input(s3_train, content_type='text/csv'),
        'validation': s3_input(s3_validation, content_type='text/csv'),
    }
    estimator.fit(channels, job_name=job_name)
    return estimator
def _validate_and_set_debugger_configs(self):
    """Disable Debugger Hook Config for PS and Horovod as they are not
    supported in smdebug 0.4.13, the current latest version of smdebug.

    Else, set default HookConfig
    """
    dist = self.distributions
    ps_enabled = "parameter_server" in dist and dist["parameter_server"].get("enabled", False)
    mpi_enabled = "mpi" in dist and dist["mpi"].get("enabled", False)
    if ps_enabled or mpi_enabled:
        # Distributed modes unsupported by smdebug: drop any Debugger configuration.
        if self.debugger_hook_config is not None or self.debugger_rule_configs is not None:
            logger.info(
                "Amazon SageMaker Debugger does not currently support "
                "Parameter Server and MPI distributions"
            )
        self.debugger_hook_config = None
        self.debugger_rule_configs = None
        return
    if self.debugger_hook_config is None:
        # Default: write debug tensors alongside the training output.
        self.debugger_hook_config = DebuggerHookConfig(s3_output_path=self.output_path)
def linear_learner_estimator():
    """Linear Learner estimator fixture with a debug hook and a mocked session."""
    mock_session = MagicMock()
    mock_session.boto_region_name = 'us-east-1'
    estimator = sagemaker.estimator.Estimator(
        LINEAR_LEARNER_IMAGE,
        SAGEMAKER_EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        train_volume_size=20,
        train_max_run=3600,
        input_mode='File',
        output_path='s3://sagemaker/models',
        sagemaker_session=mock_session,
    )
    estimator.debugger_hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/models/debug'
    )
    estimator.set_hyperparameters(
        feature_dim=10,
        predictor_type='regressor',
        mini_batch_size=32,
    )
    return estimator
def test_training_job_with_debugger_and_profiler(
    sagemaker_session,
    pipeline_name,
    role,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    """Run a pipeline TrainingStep with Debugger rules and a hook config, then
    verify the resulting training job's debug and default profiler settings.
    """
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    debug_rules = [
        Rule.sagemaker(rule_configs.vanishing_gradient()),
        Rule.sagemaker(
            base_config=rule_configs.all_zero(),
            rule_parameters={"tensor_regex": ".*"},
        ),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ]
    # Debug tensors go to a unique S3 prefix per run.
    hook_config = DebuggerHookConfig(
        s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors"
    )

    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    training_data = sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=training_data)

    estimator = PyTorch(
        entry_point=os.path.join(mnist_dir, "mnist.py"),
        role="SageMakerRole",
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        rules=debug_rules,
        debugger_hook_config=hook_config,
    )
    step_train = TrainingStep(name="pytorch-train", estimator=estimator, inputs=inputs)
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[step_train],
        sagemaker_session=sagemaker_session,
    )

    try:
        create_arn = pipeline.create(role)["PipelineArn"]
        execution = pipeline.start()
        assert execution.describe()["PipelineArn"] == create_arn

        try:
            execution.wait(delay=10, max_attempts=60)
        except WaiterError:
            # Fall through and assert on whatever state the execution reached.
            pass

        steps = execution.list_steps()
        assert len(steps) == 1
        assert steps[0].get("FailureReason", "") == ""
        assert steps[0]["StepName"] == "pytorch-train"
        assert steps[0]["StepStatus"] == "Succeeded"

        job_arn = steps[0]["Metadata"]["TrainingJob"]["Arn"]
        job_description = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=job_arn.split("/")[1]
        )
        for idx, rule in enumerate(debug_rules):
            rule_config = job_description["DebugRuleConfigurations"][idx]
            assert rule_config["RuleConfigurationName"] == rule.name
            assert rule_config["RuleEvaluatorImage"] == rule.image_uri
            # Managed rules run without an attached EBS volume.
            assert rule_config["VolumeSizeInGB"] == 0
            assert (
                rule_config["RuleParameters"]["rule_to_invoke"]
                == rule.rule_parameters["rule_to_invoke"]
            )
        assert job_description["DebugHookConfig"] == hook_config._to_request_dict()
        # Profiling is enabled by default with a 500 ms sampling interval.
        assert job_description["ProfilingStatus"] == "Enabled"
        assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
    finally:
        try:
            pipeline.delete()
        except Exception:
            # Best-effort cleanup; the pipeline may not have been created.
            pass
# S3 locations for the training/validation data and all job outputs.
input_train_path = "s3://{}/{}/data/train".format(bucket_name, prefix)
input_validation_path = "s3://{}/{}/data/val".format(bucket_name, prefix)
model_output_path = "s3://{}/{}/model".format(bucket_name, prefix)
debug_output_path = 's3://{0}/{1}/model/debug'.format(bucket_name, prefix)
model_code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)

# Training script shipped to the container.
entry_point = 'train_xgboost.py'
source_dir = 'workflow/training/'
# TODO: Upload source files here given we are not calling fit

# Capture these tensor collections every step (save_interval == "1").
debug_hook_config = DebuggerHookConfig(
    s3_output_path=debug_output_path,
    hook_parameters={"save_interval": "1"},
    collection_configs=[
        CollectionConfig("hyperparameters"),
        CollectionConfig("metrics"),
        CollectionConfig("predictions"),
        CollectionConfig("labels"),
        CollectionConfig("feature_importance"),
    ])

# Built-in confusion rule evaluated between steps 17 and 19.
debug_rules = [
    Rule.sagemaker(rule_configs.confusion(),
                   rule_parameters={
                       "category_no": "15",
                       "min_diag": "0.7",
                       "max_off_diag": "0.3",
                       "start_step": "17",
                       "end_step": "19",
                   })
]
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start MXNet training with profiler + debugger configured, verify the job
    metadata, then disable framework metrics and check the profiler update.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        debug_rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(),
                rule_parameters={"tensor_regex": ".*"},
            ),
            ProfilerRule.sagemaker(
                rule_configs.ProfilerReport(), name="CustomProfilerReportRule"
            ),
        ]
        # Unique S3 prefixes per run for debug tensors and system metrics.
        hook_config = DebuggerHookConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors",
        )
        profiler_config = ProfilerConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )

        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        estimator = MXNet(
            entry_point=os.path.join(mnist_dir, "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=debug_rules,
            debugger_hook_config=hook_config,
            profiler_config=profiler_config,
        )

        train_channel = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_channel = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )
        job_name = unique_name_from_base("test-profiler-mxnet-training")
        # Start asynchronously; the assertions below read the job description directly.
        estimator.fit(
            inputs={"train": train_channel, "test": test_channel},
            job_name=job_name,
            wait=False,
        )

        description = estimator.latest_training_job.describe()
        assert description["ProfilerConfig"] == profiler_config._to_request_dict()
        assert description["DebugHookConfig"] == hook_config._to_request_dict()
        assert description.get("ProfilingStatus") == "Enabled"

        profiler_rule_config = description.get("ProfilerRuleConfigurations")[0]
        assert profiler_rule_config["RuleConfigurationName"] == "CustomProfilerReportRule"
        assert profiler_rule_config["RuleEvaluatorImage"] == estimator.rules[0].image_uri
        assert profiler_rule_config["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        for idx, rule in enumerate(estimator.debugger_rules):
            debug_rule_config = description["DebugRuleConfigurations"][idx]
            assert debug_rule_config["RuleConfigurationName"] == rule.name
            assert debug_rule_config["RuleEvaluatorImage"] == rule.image_uri

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, job_name)
        # Turning off framework metrics should leave no profiling parameters.
        estimator.update_profiler(disable_framework_metrics=True)
        description = estimator.latest_training_job.describe()
        assert description["ProfilerConfig"]["ProfilingParameters"] == {}
# TODO: change me BUCKET_NAME = "MY_BUCKET" REPO_NAME = "REPO_NAME" s3_output_location = f"s3://{BUCKET_NAME}/sagemaker/{REPO_NAME}" tensorboard_output_config = TensorBoardOutputConfig( s3_output_path=f"{s3_output_location}/tensorboard", container_local_output_path="/opt/ml/output/tensorboard", ) hook_config = DebuggerHookConfig( s3_output_path=s3_output_location, collection_configs=[ CollectionConfig("weights"), CollectionConfig("gradients"), CollectionConfig("biases") ], ) sess = sagemaker.Session(default_bucket=BUCKET_NAME) role = os.environ["SAGEMAKER_ROLE"] tag = os.environ.get("CIRCLE_BRANCH") or "latest" account_url = os.environ["AWS_ECR_ACCOUNT_URL"] tf_estimator = Estimator( role=role, train_instance_count=1, train_instance_type="ml.m5.large", base_job_name=tag, sagemaker_session=sess,