Code example #1
0
def test_mxnet_with_all_rules_and_configs(sagemaker_session,
                                          mxnet_full_version,
                                          cpu_instance_type):
    """Integration test: train an MXNet estimator with built-in and custom
    Debugger rules, a debugger hook config, and a TensorBoard output config,
    then assert that the training-job description echoes each of them back.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            # tensor_regex ".*" applies the all_zero rule to every tensor.
            Rule.sagemaker(base_config=rule_configs.all_zero(),
                           rule_parameters={"tensor_regex": ".*"}),
            Rule.sagemaker(rule_configs.loss_not_decreasing()),
            _get_custom_rule(sagemaker_session),
        ]
        # Build S3 URIs with f-strings instead of os.path.join: the latter
        # would emit backslash separators on Windows, which are invalid in
        # S3 URIs. (On POSIX the resulting string is identical.)
        bucket = sagemaker_session.default_bucket()
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=f"s3://{bucket}/{uuid.uuid4()}/tensors")
        tensorboard_output_config = TensorBoardOutputConfig(
            s3_output_path=f"s3://{bucket}/{uuid.uuid4()}/tensorboard")

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            tensorboard_output_config=tensorboard_output_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()

        # Every rule should appear in the job description, in order, with
        # its configuration name and evaluator image preserved.
        for index, rule in enumerate(rules):
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleConfigurationName"] == rule.name)
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleEvaluatorImage"] == rule.image_uri)
        assert job_description[
            "DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert (job_description["TensorBoardOutputConfig"] ==
                tensorboard_output_config._to_request_dict())
        assert (job_description["DebugRuleEvaluationStatuses"] ==
                mx.latest_training_job.rule_job_summary())

        _wait_and_assert_that_no_rule_jobs_errored(
            training_job=mx.latest_training_job)
Code example #2
0
def test_mxnet_with_tensorboard_output_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Integration test: train an MXNet estimator with only a TensorBoard
    output config and assert the training-job description echoes it back.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Build the S3 URI with an f-string instead of os.path.join: the
        # latter would emit backslash separators on Windows, which are
        # invalid in S3 URIs. (On POSIX the resulting string is identical.)
        tensorboard_output_config = TensorBoardOutputConfig(
            s3_output_path=(
                f"s3://{sagemaker_session.default_bucket()}/"
                f"{uuid.uuid4()}/tensorboard"))

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            tensorboard_output_config=tensorboard_output_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()
        assert (job_description["TensorBoardOutputConfig"] ==
                tensorboard_output_config._to_request_dict())

        _wait_and_assert_that_no_rule_jobs_errored(
            training_job=mx.latest_training_job)
Code example #3
0
File: run_aws.py — Project: jkhouja/experimenter
    def my_aws_app(cfg: DictConfig) -> None:
        """Package the hydra config into a temp JSON job file and launch a
        SageMaker PyTorch training job that reads it.

        NOTE(review): assumes this is nested inside a hydra.main-decorated
        entry point — confirm against the surrounding file.
        """
        # Hydra changes the CWD at runtime; recover the original project
        # folder so relative paths resolve. (The previous placeholder
        # assignment of "." was dead code and has been removed.)
        script_folder = hydra.utils.get_original_cwd()

        as_dict = OmegaConf.to_container(cfg, resolve=False)

        # Override s3 datapath: prefer the aws-scoped root_path, falling
        # back to the top-level one when the aws section omits it.
        aws_bucket = cfg.aws.bucket_prefix
        try:
            aws_root_path = aws_bucket + cfg.aws.root_path
        except errors.ConfigAttributeError:
            aws_root_path = aws_bucket + cfg.root_path

        # S3 locations to load from / save to.
        aws_out_path = aws_root_path + "/" + as_dict["output_subdir"]
        aws_data_path = aws_root_path + "/" + as_dict["data_subdir"]

        # Override the job json file with SageMaker's in-container dirs.
        as_dict["root_path"] = "/opt/ml/"
        as_dict["data_subdir"] = "input/data/train"
        as_dict["output_subdir"] = "output/data"

        # Local dir for tensorboard, mirrored to S3 via the output config.
        tb_log_dir = "/opt/ml/output/tensorboard/"
        as_dict["tb_log_dir"] = tb_log_dir
        tensorboard_output_config = TensorBoardOutputConfig(
            s3_output_path=aws_out_path,
            container_local_output_path=tb_log_dir,
        )

        print(OmegaConf.to_yaml(cfg))
        print("Overriden Root Path: " + aws_root_path)

        # Save the json file to a tmp location to be uploaded with the
        # script (source_dir below includes it).
        tmp_relative_path = "tmp/tmp_job.json"
        tmp_path = script_folder + "/" + tmp_relative_path
        with open(tmp_path, "w") as json_file:
            json.dump(as_dict, json_file)

        wait = cfg.aws.wait
        role = cfg.aws.role
        instance_count = cfg.aws.instance_count
        instance_type = cfg.aws.instance_type
        env = {
            "SAGEMAKER_REQUIREMENTS":
            "requirements.txt",  # path relative to `source_dir` below.
        }

        # Using Sagemaker prebuilt Pytorch container
        pytorch_estimator = PyTorch(
            entry_point="run.py",
            source_dir=script_folder,
            hyperparameters={"config_file": tmp_relative_path},
            role=role,
            env=env,
            instance_count=instance_count,
            py_version="py3",
            framework_version="1.5.0",
            output_path=aws_out_path,
            base_job_name=cfg.experiment_name,
            instance_type=instance_type,
            tensorboard_output_config=tensorboard_output_config,
        )

        pytorch_estimator.fit({"train": aws_data_path}, wait=wait)
        # Remove the temp job file only after the launch so the code
        # upload that fit() performs can still see it.
        os.remove(tmp_path)
Code example #4
0
import os

import sagemaker
from sagemaker.debugger import (
    TensorBoardOutputConfig,
    DebuggerHookConfig,
    CollectionConfig,
)

# TODO: change me
BUCKET_NAME = "MY_BUCKET"
REPO_NAME = "REPO_NAME"

# Base S3 prefix under which all job artifacts are written.
s3_output_location = f"s3://{BUCKET_NAME}/sagemaker/{REPO_NAME}"

# Mirror TensorBoard event files from the container to S3.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=f"{s3_output_location}/tensorboard",
    container_local_output_path="/opt/ml/output/tensorboard",
)

# Capture weight, gradient and bias tensors with the Debugger hook.
hook_config = DebuggerHookConfig(
    s3_output_path=s3_output_location,
    collection_configs=[
        CollectionConfig("weights"),
        CollectionConfig("gradients"),
        CollectionConfig("biases"),
    ],
)

sess = sagemaker.Session(default_bucket=BUCKET_NAME)
# `os` was used below without being imported in the visible header —
# fixed by adding `import os` above.
role = os.environ["SAGEMAKER_ROLE"]
# Tag images by CI branch, defaulting to "latest" outside CI.
tag = os.environ.get("CIRCLE_BRANCH") or "latest"
account_url = os.environ["AWS_ECR_ACCOUNT_URL"]
Code example #5
0
    def runTrainingJob(
        self,
        framework,
        source_dir,
        entry_point,
        dependencies,
        hyperparameters,
        instance_type=constants.DEFAULT_INSTANCE_TYPE_TRAINING,
        instance_count=constants.DEFAULT_INSTANCE_COUNT,
        role_name=constants.DEFAULT_IAM_ROLE,
        additional_inputs=None,
        model_uri=None,
        use_spot_instances=constants.DEFAULT_USE_SPOT,
        max_wait_mins=constants.DEFAULT_MAX_WAIT,
        volume_size=constants.DEFAULT_VOLUME_SIZE,
        max_run_mins=constants.DEFAULT_MAX_RUN,
        tags=None,
        input_distribution="FullyReplicated",
        metric_definitions=None,
        enable_sagemaker_metrics=False,
        **additionalEstimatorArgs,
    ):
        """
        Runs a training job.

        Fix: the dict-valued parameters previously defaulted to `dict()`,
        a mutable default that is created once at definition time. `tags`
        is mutated below, so entries leaked across calls. They now default
        to None and are copied on entry; passing a dict still works.

        Arguments:
            framework - "pytorch" or "tensorflow"; selects the estimator class
            source_dir - local/s3
            entry_point - entry point
            dependencies - additional local dependencies (directories) to be
                copied to the code path; the caller's list is not mutated
            hyperparameters - dict passed through to the estimator
            instance_type -
            instance_count -
            additional_inputs - extra named input channels for fit()
            model_uri - local/s3
            tags - extra AWS tags to attach to the job
            metric_definitions - metric name -> regex mapping
            ...

        Returns the job name; the estimator is kept in self.estimators.
        """
        # Copy mutable arguments so neither defaults nor caller data are
        # mutated by this method.
        additional_inputs = dict(additional_inputs) if additional_inputs else {}
        tags = dict(tags) if tags else {}
        metric_definitions = dict(metric_definitions) if metric_definitions else {}

        logger.info(
            f"===== Running a training job {self.task_name} source_dir={source_dir} "
            f"entry_point={entry_point} hyperparameters={hyperparameters}... ====="
        )
        job_name = self._getJobName()

        # Append the internal dependencies on a copy (original code called
        # dependencies.extend(...), mutating the caller's list).
        dependencies = list(dependencies) + list(self.internalDependencies)

        tags["SimpleSagemakerTask"] = self.task_name
        tags["SimpleSagemakerVersion"] = VERSION
        tags = [{"Key": k, "Value": v} for k, v in tags.items()]

        metric_definitions = [{
            "Name": k,
            "Regex": v
        } for k, v in metric_definitions.items()]

        # zero max_wait_mins if not using spot instances
        if not use_spot_instances:
            max_wait_mins = 0
        # if using spot, and max_wait_mins isn't specified -> set it to max_run_mins
        elif not max_wait_mins:
            max_wait_mins = max_run_mins

        classes = {
            "pytorch": PyTorch,
            "tensorflow": TensorFlow,
        }
        estimator_class = classes[framework]

        # Configure TensorBoard output mirroring to the task's S3 prefix.
        tensorboard_output_config = TensorBoardOutputConfig(
            s3_output_path=self.baseTaskS3Uri,
            container_local_output_path="/opt/ml/output/tensorboard",
        )

        estimator = estimator_class(
            entry_point=entry_point,
            source_dir=source_dir,
            hyperparameters=hyperparameters,
            image_uri=self.image_uri,
            role=role_name,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=self.smSession,
            checkpoint_s3_uri=self.stateS3Uri,
            checkpoint_local_path=self.stateLocalPath,
            output_path=self.baseTaskS3Uri,
            code_location=self.baseTaskS3Uri,
            dependencies=dependencies,
            container_log_level=logging.INFO,
            volume_size=volume_size,
            max_run=max_run_mins * 60,
            model_uri=model_uri,
            use_spot_instances=use_spot_instances,
            max_wait=max_wait_mins * 60,
            tags=tags,
            metric_definitions=metric_definitions,
            enable_sagemaker_metrics=enable_sagemaker_metrics,
            tensorboard_output_config=tensorboard_output_config,
            debugger_hook_config=False,  # Debugger off; TensorBoard only.
            **additionalEstimatorArgs,
        )

        inputs = dict()
        if self.inputS3Uri:
            inputs["data"] = TrainingInput(
                self.inputS3Uri, distribution=input_distribution)
        if additional_inputs:
            inputs.update(additional_inputs)

        estimator.fit(inputs=inputs if inputs else None, job_name=job_name)
        training_job_description = self.smSession.describe_training_job(
            job_name)

        self.estimators.append(estimator)
        self.jobNames.append(job_name)
        self.descriptions.append(training_job_description)
        if "Completed" != training_job_description["TrainingJobStatus"]:
            logger.error(
                f"Task failed with status: {training_job_description['TrainingJobStatus']}"
            )
        return job_name