def main():
    """Make sure a local copy of MNIST exists, then upload it to S3.

    Command-line arguments:
        --data_dir: Local directory holding the dataset (default ``data/``).
            When it does not exist yet, MNIST is fetched via
            ``mnist.load_data()`` and stored there as ``train.npz`` and
            ``test.npz`` (arrays ``image`` and ``label``).

    Environment:
        S3_BUCKET_NAME: Target bucket. When unset, the SageMaker session's
            default bucket is used.

    The directory content is uploaded under the ``dataset/mnist`` key prefix.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='data/')
    args = parser.parse_args()

    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        # Fetch before creating the directory: a failed download must not
        # leave an empty directory behind, or later runs would skip this step.
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        # parents=True so nested paths such as "work/data" can be used.
        data_dir.mkdir(parents=True, exist_ok=True)
        # np.savez appends the ".npz" extension to the given file name.
        np.savez(str(data_dir / 'train'), image=x_train, label=y_train)
        np.savez(str(data_dir / 'test'), image=x_test, label=y_test)

    session = Session()
    # Only ask the session for its default bucket when no bucket was
    # configured; passing it as the getenv() default would evaluate it on
    # every run, even when S3_BUCKET_NAME is set.
    s3_bucket_name = os.getenv('S3_BUCKET_NAME')
    if s3_bucket_name is None:
        s3_bucket_name = session.default_bucket()
    session.upload_data(path=str(data_dir),
                        bucket=s3_bucket_name,
                        key_prefix='dataset/mnist')
# Example #2
# 0
if __name__ == '__main__':
    # --- Job identity and local paths -------------------------------------
    project_name = "housing-price-prediction"
    # Job name is the project name plus a UTC timestamp (minute resolution).
    job_name = project_name + time.strftime("-%Y-%m-%d-%H-%M", time.gmtime())
    local_output_path = "file://"
    local_data_path = os.getenv("DATA_PATH")

    # --- AWS settings, all read from the environment ----------------------
    role = os.getenv("AWS_SM_ROLE")
    aws_id = os.getenv("AWS_ID")
    region = os.getenv("AWS_REGION")
    instance_type = os.getenv("AWS_DEFAULT_INSTANCE")
    bucket_name = os.getenv("AWS_BUCKET")

    # ECR location of the training image for this account and region.
    image_uri = f"{aws_id}.dkr.ecr.{region}.amazonaws.com/aws-train"
    print(f"Training image uri:{image_uri}")

    # Bucket creation is disabled; enable the two lines below when the
    # bucket named by AWS_BUCKET does not exist yet.
    # s3 = boto3.client('s3', region_name=region)
    # s3.create_bucket(Bucket=bucket_name,CreateBucketConfiguration={'LocationConstraint': region})

    # --- Upload the local data through a region-bound SageMaker session ---
    boto_session = boto3.Session(region_name=region)
    sm_session = Session(boto_session=boto_session)
    data_uri = sm_session.upload_data(
        local_data_path,
        bucket=bucket_name,
        key_prefix='data',
        extra_args=None,
    )
    print(data_uri)

    run(mode="sagemaker")
class Sagemaker:
    """
    Class to provide AWS specific execution of the models.

    Bundles a boto session, a SageMaker session, the role, the S3 bucket and
    prefix, and the default keyword arguments used for models, transformers
    and deployments.

    In the future, we can make a superclass that defines the basic methods
    (such as uploading data to the right folder/location, loading models etc).
    For now, we will only have AWS.
    """

    # Instance defaults that seed the kwargs dicts built by the
    # ``_default_*_kwargs`` helpers; override them through the constructor's
    # ``default_*_kwargs`` arguments.
    training_instance_count = 1
    training_instance_type = "ml.m4.xlarge"
    transformer_instance_count = 1
    transformer_instance_type = "ml.c4.xlarge"
    deploy_instance_count = 1
    deploy_instance_type = "ml.c4.xlarge"

    def __init__(
        self,
        bucket: Optional[str] = None,
        role: Optional[str] = None,
        prefix: Optional[str] = None,
        default_model_kwargs: Optional[Dict] = None,
        default_transfomer_kwargs: Optional[Dict] = None,
        default_deploy_kwargs: Optional[Dict] = None,
    ) -> None:
        """
        Initializes the AWS object

        Credentials are read from the AWS_ACCESS_KEY_ID and
        AWS_SECRET_ACCESS_KEY environment variables; the region is fixed to
        "eu-west-1".

        Arguments:
            bucket: The bucket name. Defaulted to the session default bucket
            role: The role name to assume. Default is getting from
                AWS_DEFAULT_ROLE of the env variables
            prefix: The prefix to use in the bucket. Defaulted to 'data'
            default_model_kwargs: Dict of overrides for the default kwargs of
                any sagemaker model. Default contains role, sagemaker_session,
                train_instance_count and train_instance_type.
            default_transfomer_kwargs: Dict of overrides for the default
                kwargs of any sagemaker transformer (the parameter name is
                spelled without the second "r"; it is kept for backward
                compatibility). Default contains role, instance_count and
                instance_type.
            default_deploy_kwargs: Dict of overrides for the default kwargs
                of any sagemaker deployment. Default contains
                initial_instance_count and instance_type.
        """
        LOGGER.info("Initializing Sagemaker executor")
        self.boto_session = BotoSession(
            aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
            region_name="eu-west-1",
        )
        self.region = self.boto_session.region_name
        self.session = Session(boto_session=self.boto_session)
        self.role = role if role is not None else os.environ.get(
            "AWS_DEFAULT_ROLE")
        # The session's default bucket is only requested when none was given.
        self.bucket = (bucket if bucket is not None else
                       self.session.default_bucket())
        self.prefix = prefix if prefix is not None else "data"
        self.default_model_kwargs = self._default_model_kwargs(
            self.role, self.session, default_model_kwargs)
        self.default_transformer_kwargs = self._default_transformer_kwargs(
            self.role, self.session, default_transfomer_kwargs)
        self.default_deploy_kwargs = self._default_deploy_kwargs(
            self.role, self.session, default_deploy_kwargs)

    def _default_model_kwargs(self, role, session, input_default) -> Dict:
        """
        Builds the model kwargs: role, session and the training instance
        defaults, with entries from ``input_default`` (if given) taking
        precedence.
        """
        initial = {
            "role": role,
            "sagemaker_session": session,
            "train_instance_count": self.training_instance_count,
            "train_instance_type": self.training_instance_type,
        }
        if input_default is not None:
            initial.update(input_default)
        return initial

    def _default_transformer_kwargs(self, role, session,
                                    input_default) -> Dict:
        """
        Builds the transformer kwargs: role and the transformer instance
        defaults, with entries from ``input_default`` (if given) taking
        precedence. ``session`` is accepted for a uniform signature but is
        not part of the result.
        """
        initial = {
            "role": role,
            "instance_count": self.transformer_instance_count,
            "instance_type": self.transformer_instance_type,
        }
        if input_default is not None:
            initial.update(input_default)
        return initial

    def _default_deploy_kwargs(self, role, session, input_default) -> Dict:
        """
        Builds the deployment kwargs: the deploy instance defaults, with
        entries from ``input_default`` (if given) taking precedence. ``role``
        and ``session`` are accepted for a uniform signature but are not part
        of the result.
        """
        initial = {
            "initial_instance_count": self.deploy_instance_count,
            "instance_type": self.deploy_instance_type,
        }
        if input_default is not None:
            initial.update(input_default)
        return initial

    def upload_data(
        self,
        local_data_file: str,
        bucket: Optional[str] = None,
        prefix: Optional[str] = None,
    ) -> str:
        """
        Uploads the data from the local data file to S3. Returns the location

        Arguments:
            local_data_file: the location of the data
            bucket: The bucket to upload to. Defaulted to the own default bucket
            prefix: The key prefix to upload under. Defaulted to the own
                default prefix

        Returns:
            The s3 data location, as returned by the session's upload_data
        """
        if bucket is None:
            bucket = self.bucket
        if prefix is None:
            prefix = self.prefix
        LOGGER.info("Uploading data to S3")
        return self.session.upload_data(local_data_file,
                                        bucket=bucket,
                                        key_prefix=prefix)

    def download_data(
        self,
        file_name: str,
        local_file_directory: str,
        bucket: Optional[str] = None,
        prefix: Optional[str] = None,
    ) -> str:
        """
        Downloads the S3 data and stores it to the local file location.

        The object key is ``<prefix>/<file_name>``; the file is written to
        ``<local_file_directory>/<file_name>``. The local directory is created
        when it does not exist.

        Arguments:
            file_name: the name of the file
            local_file_directory: the directory to store the data to
            bucket: The bucket to download from. Defaulted to the own default
                bucket
            prefix: The key prefix to download from. Defaulted to the own
                default prefix

        Returns:
            The local file location.
        """
        # Honour an explicitly passed bucket; fall back to the instance's own.
        if bucket is None:
            bucket = self.bucket
        if prefix is None:
            prefix = self.prefix
        key = f"{prefix}/{file_name}"
        local_file_name = os.path.join(local_file_directory, file_name)
        LOGGER.info(
            f"Downloading data from s3: from s3://{bucket}/{key} to {local_file_name}"
        )
        # An empty directory means "current working directory": nothing to
        # create. exist_ok avoids the check-then-create race.
        if local_file_directory:
            os.makedirs(local_file_directory, exist_ok=True)
        s3_client = self.boto_session.client("s3")
        s3_client.download_file(Bucket=bucket,
                                Key=key,
                                Filename=local_file_name)
        return local_file_name