def test_mxnet_with_all_rules_and_configs(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Integration test: train an MXNet estimator with built-in and custom
    Debugger rules, a debugger hook config, and TensorBoard output, then
    verify the training-job description reflects every configuration.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Three SageMaker-managed rules plus one custom rule container.
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(base_config=rule_configs.all_zero(), rule_parameters={"tensor_regex": ".*"}),
            Rule.sagemaker(rule_configs.loss_not_decreasing()),
            _get_custom_rule(sagemaker_session),
        ]
        # Tensors and TensorBoard data go under unique (uuid4) S3 prefixes so
        # concurrent test runs cannot collide.
        debugger_hook_config = DebuggerHookConfig(s3_output_path=os.path.join(
            "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()), "tensors"))
        tensorboard_output_config = TensorBoardOutputConfig(
            s3_output_path=os.path.join("s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()), "tensorboard"))
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")
        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            # NOTE(review): train_instance_* are the legacy (SDK v1) parameter
            # names; the sibling test below uses instance_count/instance_type.
            # Confirm which SDK version this file targets.
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            tensorboard_output_config=tensorboard_output_config,
        )
        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test")
        mx.fit({"train": train_input, "test": test_input})
        job_description = mx.latest_training_job.describe()
        # Every configured rule must appear in the job description, in order,
        # with the name and evaluator image we requested.
        for index, rule in enumerate(rules):
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleConfigurationName"] == rule.name)
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleEvaluatorImage"] == rule.image_uri)
        # Hook / TensorBoard configs round-trip through the CreateTrainingJob
        # request unchanged.
        assert job_description[
            "DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert (job_description["TensorBoardOutputConfig"] ==
                tensorboard_output_config._to_request_dict())
        assert (job_description["DebugRuleEvaluationStatuses"] ==
                mx.latest_training_job.rule_job_summary())
    # Rule evaluation jobs keep running after training finishes; wait for them
    # outside the training timeout.
    _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
def test_mxnet_with_tensorboard_output_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train an MXNet estimator with a TensorBoard output config and verify
    the config round-trips into the training-job description.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Unique (uuid4) S3 prefix so concurrent runs cannot collide.
        tb_config = TensorBoardOutputConfig(
            s3_output_path=os.path.join(
                "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()), "tensorboard"
            )
        )
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
        estimator = MXNet(
            entry_point=os.path.join(mnist_dir, "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            tensorboard_output_config=tb_config,
        )
        train_channel = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_channel = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )
        estimator.fit({"train": train_channel, "test": test_channel})
        description = estimator.latest_training_job.describe()
        assert description["TensorBoardOutputConfig"] == tb_config._to_request_dict()
    # Rule jobs may outlive training; wait for them outside the timeout.
    _wait_and_assert_that_no_rule_jobs_errored(training_job=estimator.latest_training_job)
def my_aws_app(cfg: DictConfig) -> None:
    """Launch a SageMaker PyTorch training job driven by a Hydra config.

    Rewrites the config's paths for the SageMaker container layout, dumps it
    to a temporary JSON file that is shipped with the sources, and starts the
    estimator. The temp file is always removed, even when fit() fails.
    """
    # Hydra changes the CWD per run; resolve the original script folder.
    # (The previous dead assignment `script_folder = "."` was removed.)
    script_folder = hydra.utils.get_original_cwd()
    as_dict = OmegaConf.to_container(cfg, resolve=False)

    # Override s3 datapath.
    aws_bucket = cfg.aws.bucket_prefix
    try:
        aws_root_path = aws_bucket + cfg.aws.root_path
    except errors.ConfigAttributeError:
        # Fall back to the top-level root_path when aws.root_path is absent.
        aws_root_path = aws_bucket + cfg.root_path

    # S3 locations to load from / save to.
    aws_out_path = aws_root_path + "/" + as_dict["output_subdir"]
    aws_data_path = aws_root_path + "/" + as_dict["data_subdir"]

    # Rewrite the job config so paths point at SageMaker's container-local dirs.
    as_dict["root_path"] = "/opt/ml/"
    as_dict["data_subdir"] = "input/data/train"
    as_dict["output_subdir"] = "output/data"

    # Local dir for TensorBoard inside the container, mirrored to S3.
    tb_log_dir = "/opt/ml/output/tensorboard/"
    as_dict["tb_log_dir"] = tb_log_dir
    tensorboard_output_config = TensorBoardOutputConfig(
        s3_output_path=aws_out_path,
        container_local_output_path=tb_log_dir,
    )

    print(OmegaConf.to_yaml(cfg))
    print("Overriden Root Path: " + aws_root_path)

    # Save json file to a tmp location so it is uploaded with the sources.
    tmp_relative_path = "tmp/tmp_job.json"
    tmp_path = script_folder + "/" + tmp_relative_path
    with open(tmp_path, "w") as json_file:
        json.dump(as_dict, json_file)

    try:
        env = {
            # Path relative to `source_dir` below.
            "SAGEMAKER_REQUIREMENTS": "requirements.txt",
        }
        # Using SageMaker prebuilt PyTorch container.
        pytorch_estimator = PyTorch(
            entry_point="run.py",
            source_dir=script_folder,
            hyperparameters={"config_file": tmp_relative_path},
            role=cfg.aws.role,
            env=env,
            instance_count=cfg.aws.instance_count,
            py_version="py3",
            framework_version="1.5.0",
            output_path=aws_out_path,
            base_job_name=cfg.experiment_name,
            instance_type=cfg.aws.instance_type,
            tensorboard_output_config=tensorboard_output_config,
        )
        pytorch_estimator.fit({"train": aws_data_path}, wait=cfg.aws.wait)
    finally:
        # Clean up the temp config even if fit() raises (the original leaked
        # the file on failure).
        os.remove(tmp_path)
import os

import sagemaker
from sagemaker.debugger import (
    CollectionConfig,
    DebuggerHookConfig,
    TensorBoardOutputConfig,
)

# TODO: change me
BUCKET_NAME = "MY_BUCKET"
REPO_NAME = "REPO_NAME"

s3_output_location = f"s3://{BUCKET_NAME}/sagemaker/{REPO_NAME}"

# Mirror TensorBoard event files from the container-local path to S3.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=f"{s3_output_location}/tensorboard",
    container_local_output_path="/opt/ml/output/tensorboard",
)
# Debugger hook: capture the standard weight/gradient/bias collections.
hook_config = DebuggerHookConfig(
    s3_output_path=s3_output_location,
    collection_configs=[
        CollectionConfig("weights"),
        CollectionConfig("gradients"),
        CollectionConfig("biases"),
    ],
)

sess = sagemaker.Session(default_bucket=BUCKET_NAME)
# `import os` was missing from this script's import block even though
# os.environ is read below; added above.
role = os.environ["SAGEMAKER_ROLE"]
# CI branch name doubles as the image tag; default to "latest" locally.
tag = os.environ.get("CIRCLE_BRANCH") or "latest"
account_url = os.environ["AWS_ECR_ACCOUNT_URL"]
def runTrainingJob(
    self,
    framework,
    source_dir,
    entry_point,
    dependencies,
    hyperparameters,
    instance_type=constants.DEFAULT_INSTANCE_TYPE_TRAINING,
    instance_count=constants.DEFAULT_INSTANCE_COUNT,
    role_name=constants.DEFAULT_IAM_ROLE,
    additional_inputs=None,
    model_uri=None,
    use_spot_instances=constants.DEFAULT_USE_SPOT,
    max_wait_mins=constants.DEFAULT_MAX_WAIT,
    volume_size=constants.DEFAULT_VOLUME_SIZE,
    max_run_mins=constants.DEFAULT_MAX_RUN,
    tags=None,
    input_distribution="FullyReplicated",
    metric_definitions=None,
    enable_sagemaker_metrics=False,
    **additionalEstimatorArgs,
):
    """
    Runs a training job and waits for it to finish.

    Arguments:
        framework - "pytorch" or "tensorflow"; selects the estimator class
        source_dir - local/s3 path holding the training sources
        entry_point - entry point script within source_dir
        dependencies - additional local dependencies (directories) to be
            copied to the code path; the caller's list is NOT modified
        hyperparameters - dict passed through to the estimator
        additional_inputs - optional dict of extra input channels
        tags - optional dict of job tags (converted to Key/Value pairs)
        metric_definitions - optional dict of metric name -> regex
        model_uri - local/s3 URI of a pre-trained model
        ...

    Returns the job name.
    """
    # The original signature used mutable defaults (dict()) and mutated
    # `tags` in place, leaking state across calls; use None sentinels and
    # defensive copies instead (backward-compatible for all callers).
    additional_inputs = dict(additional_inputs) if additional_inputs else {}
    tags = dict(tags) if tags else {}
    metric_definitions = dict(metric_definitions) if metric_definitions else {}

    logger.info(
        f"===== Running a training job {self.task_name} source_dir={source_dir} "
        f"entry_point={entry_point} hyperparameters={hyperparameters}... ====="
    )
    job_name = self._getJobName()

    # Append the internal dependencies on a copy, keeping the caller's list
    # intact (the original .extend() mutated the argument).
    dependencies = list(dependencies) + list(self.internalDependencies)

    tags["SimpleSagemakerTask"] = self.task_name
    tags["SimpleSagemakerVersion"] = VERSION
    tags = [{"Key": k, "Value": v} for k, v in tags.items()]

    metric_definitions = [
        {"Name": k, "Regex": v} for k, v in metric_definitions.items()
    ]

    # Zero max_wait_mins if not using spot instances.
    if not use_spot_instances:
        max_wait_mins = 0
    # If using spot and max_wait_mins isn't specified -> set it to max_run_mins.
    elif not max_wait_mins:
        max_wait_mins = max_run_mins

    classes = {
        "pytorch": PyTorch,
        "tensorflow": TensorFlow,
    }
    estimator_class = classes[framework]

    # Configure TensorBoard output, mirrored to the task's S3 prefix.
    tensorboard_output_config = TensorBoardOutputConfig(
        s3_output_path=self.baseTaskS3Uri,
        container_local_output_path="/opt/ml/output/tensorboard",
    )

    estimator = estimator_class(
        entry_point=entry_point,
        source_dir=source_dir,
        hyperparameters=hyperparameters,
        image_uri=self.image_uri,
        role=role_name,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=self.smSession,
        checkpoint_s3_uri=self.stateS3Uri,
        checkpoint_local_path=self.stateLocalPath,
        output_path=self.baseTaskS3Uri,
        code_location=self.baseTaskS3Uri,
        dependencies=dependencies,
        container_log_level=logging.INFO,
        volume_size=volume_size,
        max_run=max_run_mins * 60,
        model_uri=model_uri,
        use_spot_instances=use_spot_instances,
        max_wait=max_wait_mins * 60,
        tags=tags,
        metric_definitions=metric_definitions,
        enable_sagemaker_metrics=enable_sagemaker_metrics,
        tensorboard_output_config=tensorboard_output_config,
        # Debugger hooks are disabled deliberately for these jobs.
        debugger_hook_config=False,
        **additionalEstimatorArgs,
    )

    inputs = dict()
    if self.inputS3Uri:
        inputs.update({
            "data": TrainingInput(self.inputS3Uri, distribution=input_distribution)
        })
    if additional_inputs:
        inputs.update(additional_inputs)

    estimator.fit(inputs=inputs if inputs else None, job_name=job_name)

    training_job_description = self.smSession.describe_training_job(job_name)
    self.estimators.append(estimator)
    self.jobNames.append(job_name)
    self.descriptions.append(training_job_description)
    if "Completed" != training_job_description["TrainingJobStatus"]:
        logger.error(
            f"Task failed with status: {training_job_description['TrainingJobStatus']}"
        )
    return job_name