Example 1
0
    def __init__(self,
                 instance_type,
                 instance_count,
                 image,
                 sagemaker_session=None):
        """Create a SageMakerContainer.

        A :class:`sagemaker.session.Session` is consulted for general user
        configuration, such as looking up the default SageMaker S3 bucket,
        but this class never calls any of the SageMaker APIs itself.

        Args:
            instance_type (str): The instance type to use. Either 'local' or 'local_gpu'.
            instance_count (int): The number of instances to create.
            image (str): Docker image to use.
            sagemaker_session (sagemaker.session.Session): A SageMaker session to use
                when interacting with SageMaker.
        """
        from sagemaker.local.local_session import LocalSession

        self.sagemaker_session = sagemaker_session or LocalSession()
        self.instance_type = instance_type
        self.instance_count = instance_count
        self.image = image
        # All jobs share a single docker network, so tag the container names
        # with a random suffix; this lets multiple jobs run in parallel.
        random_tag = ''.join(random.choice(string.ascii_uppercase + string.digits)
                             for _ in range(5))
        self.hosts = ['{}-{}-{}'.format(CONTAINER_PREFIX, host_index, random_tag)
                      for host_index in range(1, self.instance_count + 1)]
        self.container_root = None
        self.container = None
Example 2
0
    def __init__(self, role, train_instance_count, train_instance_type,
                 train_volume_size=30, train_max_run=24 * 60 * 60, input_mode='File',
                 output_path=None, output_kms_key=None, base_job_name=None, sagemaker_session=None):
        """Initialize an ``EstimatorBase`` instance.

        Args:
            role (str): An AWS IAM role (either name or full ARN). Amazon SageMaker
                training jobs and the APIs that create SageMaker endpoints use this
                role to access training data and model artifacts. After the endpoint
                is created, the inference code might use the IAM role if it needs to
                access an AWS resource.
            train_instance_count (int): Number of Amazon EC2 instances to use for training.
            train_instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'. The values 'local' and 'local_gpu'
                select local mode instead of managed training.
            train_volume_size (int): Size in GB of the EBS volume used to store input
                data during training (default: 30). Must be large enough to hold the
                training data when File Mode (the default) is used.
            train_max_run (int): Timeout in seconds for training (default: 24 * 60 * 60).
                Once exceeded, Amazon SageMaker terminates the job regardless of its
                current status.
            input_mode (str): Input mode the algorithm supports (default: 'File').
                Valid modes: 'File' - SageMaker copies the training dataset from S3
                to a local directory; 'Pipe' - SageMaker streams data directly from
                S3 to the container via a Unix-named pipe.
            output_path (str): S3 location for saving the training result (model
                artifacts and output files). When omitted, results go to a default
                bucket; a missing bucket is created during
                :meth:`~sagemaker.estimator.EstimatorBase.fit`.
            output_kms_key (str): Optional KMS key ID for encrypting the training
                output (default: None).
            base_job_name (str): Prefix for the training job name generated when
                :meth:`~sagemaker.estimator.EstimatorBase.fit` launches. When
                omitted, a default name is derived from the training image name and
                the current timestamp.
            sagemaker_session (sagemaker.session.Session): Session object managing
                interactions with SageMaker APIs and other AWS services. When
                omitted, one is created from the default AWS configuration chain.

        Raises:
            RuntimeError: If 'local_gpu' is requested with more than one instance.
        """
        self.role = role
        self.train_instance_count = train_instance_count
        self.train_instance_type = train_instance_type
        self.train_volume_size = train_volume_size
        self.train_max_run = train_max_run
        self.input_mode = input_mode

        # 'local' / 'local_gpu' run via docker on this machine instead of
        # launching managed SageMaker training instances.
        self.local_mode = train_instance_type in ('local', 'local_gpu')
        if self.local_mode:
            # Multi-instance GPU training cannot be emulated locally.
            if train_instance_type == 'local_gpu' and train_instance_count > 1:
                raise RuntimeError("Distributed Training in Local GPU is not supported")
            self.sagemaker_session = sagemaker_session or LocalSession()
        else:
            self.sagemaker_session = sagemaker_session or Session()

        self.base_job_name = base_job_name
        self._current_job_name = None
        self.output_path = output_path
        self.output_kms_key = output_kms_key
        self.latest_training_job = None
Example 3
0
    def __init__(
        self,
        instance_type,
        instance_count,
        image,
        sagemaker_session=None,
        container_entrypoint=None,
        container_arguments=None,
    ):
        """Create a SageMakerContainer.

        A :class:`sagemaker.session.Session` is consulted for general user
        configuration, such as looking up the default SageMaker S3 bucket,
        but this class never calls any of the SageMaker APIs itself.

        Args:
            instance_type (str): The instance type to use. Either 'local' or
                'local_gpu'.
            instance_count (int): The number of instances to create.
            image (str): Docker image to use.
            sagemaker_session (sagemaker.session.Session): A SageMaker session
                to use when interacting with SageMaker.
            container_entrypoint (str): The container entrypoint to execute.
            container_arguments (str): Arguments for the container entrypoint.

        Raises:
            ImportError: If 'docker-compose' is not installed on this machine.
        """
        from sagemaker.local.local_session import LocalSession

        # Local Mode orchestrates the containers through docker-compose, so
        # fail fast when it is missing from PATH.
        if find_executable("docker-compose") is None:
            raise ImportError(
                "'docker-compose' is not installed. "
                "Local Mode features will not work without docker-compose. "
                "For more information on how to install 'docker-compose', please, see "
                "https://docs.docker.com/compose/install/")

        self.sagemaker_session = sagemaker_session or LocalSession()
        self.instance_type = instance_type
        self.instance_count = instance_count
        self.image = image
        self.container_entrypoint = container_entrypoint
        self.container_arguments = container_arguments
        # All jobs share a single docker network, so tag the container names
        # with a random suffix; this lets multiple jobs run in parallel.
        random_tag = "".join(random.choice(string.ascii_lowercase + string.digits)
                             for _ in range(5))
        self.hosts = ["{}-{}-{}".format(CONTAINER_PREFIX, host_index, random_tag)
                      for host_index in range(1, self.instance_count + 1)]
        self.container_root = None
        self.container = None
Example 4
0
    def __init__(
        self,
        model_db_client: ModelDbClient,
        experiment_id,
        model_id,
        image=None,
        role=None,
        instance_config=None,
        boto_session=None,
        algor_config=None,
        train_state=None,
        evaluation_job_name=None,
        eval_state=None,
        eval_scores=None,
        input_model_id=None,
        rl_estimator=None,
        input_data_s3_prefix=None,
        manifest_file_path=None,
        eval_data_s3_path=None,
        s3_model_output_path=None,
        training_start_time=None,
        training_end_time=None,
    ):
        """Initialize a model entity in the current experiment

        Args:
            model_db_client (ModelDbClient): A DynamoDB client
                to query the model table. The 'Model' entity use this client
                to read/update the model state.
            experiment_id (str): A unique id for the experiment. The created/loaded
                model will be associated with the given experiment.
            model_id (str): A unique id for the model. The model table uses
                model id to manage associated model metadata.
            image (str): The container image to use for training/evaluation.
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs will use this role to access AWS resources.
            instance_config (dict): A dictionary that specifies the resource
                configuration for the model training/evaluation job
                (default: None, treated as an empty dict).
            boto_session (boto3.session.Session): A session that stores configuration
                state and allows you to create service clients and resources.
            algor_config (dict): A dictionary that specifies the algorithm type
                and hyper parameters of the training/evaluation job
                (default: None, treated as an empty dict).
            train_state (str): State of the model training job.
            evaluation_job_name (str): Job name for latest evaluation job for this model.
            eval_state (str): State of the model evaluation job.
            eval_scores (dict): Scores of the model evaluation job
                (default: None, treated as an empty dict).
            input_model_id (str): A unique model id to specify which model to use
                as a pre-trained model for the model training job.
            rl_estimator (sagemaker.rl.estimator.RLEstimator): A SageMaker RLEstimator
                entity that handles Reinforcement Learning (RL) execution within
                a SageMaker training job.
            input_data_s3_prefix (str): Input data path for the data source of the
                model training job.
            manifest_file_path (str): Path to the manifest file describing the
                model training job's input data.
            eval_data_s3_path (str): Input data path for the data source of the
                model evaluation job.
            s3_model_output_path (str): Output data path of model artifact for the
                model training job.
            training_start_time (str): Starting timestamp of the model training job.
            training_end_time (str): Finished timestamp of the model training job.

        Raises:
            UnhandledWorkflowException: If persisting a brand-new model record
                fails for any reason other than the record already existing.

        Returns:
            orchestrator.model_manager.ModelManager: A ``Model`` object associated
            with the given experiment.
        """
        # Normalize mutable defaults here rather than in the signature: a
        # literal ``{}`` default would be one shared dict reused (and possibly
        # mutated) across every call to this constructor.
        instance_config = {} if instance_config is None else instance_config
        algor_config = {} if algor_config is None else algor_config
        eval_scores = {} if eval_scores is None else eval_scores

        self.model_db_client = model_db_client
        self.experiment_id = experiment_id
        self.model_id = model_id

        # Currently we are not storing image/role and other model params in ModelDb
        self.image = image
        self.role = role
        self.instance_config = instance_config
        self.algor_config = algor_config

        # load configs, falling back to single-instance local mode
        self.instance_type = self.instance_config.get("instance_type", "local")
        self.instance_count = self.instance_config.get("instance_count", 1)
        self.algor_params = self.algor_config.get("algorithms_parameters", {})

        # create a local ModelRecord object.
        self.model_record = ModelRecord(
            experiment_id,
            model_id,
            train_state,
            evaluation_job_name,
            eval_state,
            eval_scores,
            input_model_id,
            input_data_s3_prefix,
            manifest_file_path,
            eval_data_s3_path,
            s3_model_output_path,
            training_start_time,
            training_end_time,
        )

        # try to save this record file. if it throws RecordAlreadyExistsException
        # reload the record from ModelDb, and recreate
        try:
            self.model_db_client.create_new_model_record(
                self.model_record.to_ddb_record())
        except RecordAlreadyExistsException:
            logger.debug("Model already exists. Reloading from model record.")
            model_record = self.model_db_client.get_model_record(
                experiment_id, model_id)
            self.model_record = ModelRecord.load_from_ddb_record(model_record)
        except Exception as e:
            # Lazy %-formatting avoids building the message unless it is logged;
            # ``from e`` preserves the original traceback for debugging.
            logger.error("Unhandled Exception! %s", e)
            raise UnhandledWorkflowException(
                "Something went wrong while creating a new model") from e

        if boto_session is None:
            boto_session = boto3.Session()
        self.boto_session = boto_session

        # Local mode uses a LocalSession (docker-based); otherwise talk to the
        # real SageMaker service through the boto session.
        if self.instance_type == "local":
            self.sagemaker_session = LocalSession()
        else:
            self.sagemaker_session = sagemaker.session.Session(
                self.boto_session)
        self.sagemaker_client = self.sagemaker_session.sagemaker_client