Code Example #1
    def create_model(self):

        if not self.multimodel:

            self.sagemakermodel = Model(
                name="model-" + self.name,
                model_data=self.modelpath[0],
                image=self.image,
                role=sagemaker.get_execution_role(),
                # sagemaker_session=self.session,
                predictor_cls=sagemaker.predictor.RealTimePredictor,
            )

        else:

            self.sagemakermodel = MultiDataModel(
                name="model-" + self.name,
                model_data_prefix="/".join(self.modelpath[0].split("/")[:-1]) +
                "/",
                image=self.image,
                role=sagemaker.get_execution_role(),
                # sagemaker_session=self.session,
                predictor_cls=sagemaker.predictor.RealTimePredictor,
            )

            for path in self.modelpath:
                self.add_model(path, "serving/")

            self.ei = False
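The model_data_prefix expression above simply strips the final component off the first model's S3 path, so every tarball under that prefix becomes invokable on the multi-model endpoint. A quick illustration (the path value is hypothetical):

modelpath = ["s3://mybucket/ezsmdeploy/model-abc123/model1.tar.gz"]
prefix = "/".join(modelpath[0].split("/")[:-1]) + "/"
print(prefix)  # prints: s3://mybucket/ezsmdeploy/model-abc123/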
Code Example #2
def test_deploy_multi_data_framework_model(sagemaker_session, mxnet_model):
    model = MultiDataModel(
        name=MODEL_NAME,
        model_data_prefix=VALID_MULTI_MODEL_DATA_PREFIX,
        sagemaker_session=sagemaker_session,
        model=mxnet_model,
    )

    predictor = model.deploy(
        initial_instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        endpoint_name=MULTI_MODEL_ENDPOINT_NAME,
    )

    # Assert that create_model is called with the wrapped mxnet_model's parameters
    sagemaker_session.create_model.assert_called_with(
        MODEL_NAME,
        MXNET_ROLE,
        model.prepare_container_def(INSTANCE_TYPE),
        vpc_config=None,
        enable_network_isolation=True,
        tags=None,
    )
    sagemaker_session.endpoint_from_production_variants.assert_called_with(
        name=MULTI_MODEL_ENDPOINT_NAME,
        wait=True,
        tags=None,
        kms_key=None,
        data_capture_config_dict=None,
        production_variants=EXPECTED_PROD_VARIANT,
    )
    sagemaker_session.create_endpoint_config.assert_not_called()
    assert isinstance(predictor, MXNetPredictor)
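The mxnet_model fixture used above is defined elsewhere in the test module; a plausible construction is sketched below for orientation only (the S3 location, entry point, and framework version are assumptions, not taken from the source):

from sagemaker.mxnet.model import MXNetModel

mxnet_model = MXNetModel(
    model_data="s3://mybucket/mxnet/model.tar.gz",  # assumed location
    role=MXNET_ROLE,
    entry_point="inference.py",                     # assumed entry point
    framework_version="1.4.1",
    sagemaker_session=sagemaker_session,
)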
Code Example #3
def test_deploy_multi_data_model(sagemaker_session):
    model = MultiDataModel(
        name=MODEL_NAME,
        model_data_prefix=VALID_MULTI_MODEL_DATA_PREFIX,
        image=IMAGE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        env={"EXTRA_ENV_MOCK": "MockValue"},
    )
    model.deploy(
        initial_instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        endpoint_name=MULTI_MODEL_ENDPOINT_NAME,
    )

    sagemaker_session.create_model.assert_called_with(
        MODEL_NAME,
        ROLE,
        model.prepare_container_def(INSTANCE_TYPE),
        vpc_config=None,
        enable_network_isolation=False,
        tags=None,
    )
    sagemaker_session.endpoint_from_production_variants.assert_called_with(
        name=MULTI_MODEL_ENDPOINT_NAME,
        wait=True,
        tags=None,
        kms_key=None,
        data_capture_config_dict=None,
        production_variants=EXPECTED_PROD_VARIANT,
    )
Code Example #4
def multi_data_model(sagemaker_session):
    return MultiDataModel(
        name=MODEL_NAME,
        model_data_prefix=VALID_MULTI_MODEL_DATA_PREFIX,
        image=IMAGE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
    )
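The test snippets in this section reference module-level constants defined elsewhere in the test file. A minimal sketch of plausible stand-ins, for orientation only (the actual values in the source may differ):

MODEL_NAME = "test-multi-data-model"
VALID_MULTI_MODEL_DATA_PREFIX = "s3://mybucket/path/"
IMAGE = "123456789012.dkr.ecr.us-west-2.amazonaws.com/my-image:latest"
ROLE = "arn:aws:iam::123456789012:role/SageMakerRole"
INSTANCE_COUNT = 1
INSTANCE_TYPE = "ml.c5.xlarge"
MULTI_MODEL_ENDPOINT_NAME = "multimodel-endpoint"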
Code Example #5
def test_multi_data_model_create_with_model_arg_only(mxnet_model):
    model = MultiDataModel(
        name=MODEL_NAME, model_data_prefix=VALID_MULTI_MODEL_DATA_PREFIX, model=mxnet_model
    )

    assert model.model_data_prefix == VALID_MULTI_MODEL_DATA_PREFIX
    assert model.model == mxnet_model
    assert hasattr(model, "role") is False
    assert hasattr(model, "image") is False
Code Example #6
def test_multi_data_model_create_with_invalid_model_data_prefix():
    invalid_model_data_prefix = "https://mybucket/path/"
    with pytest.raises(ValueError) as ex:
        MultiDataModel(
            name=MODEL_NAME, model_data_prefix=invalid_model_data_prefix, image=IMAGE, role=ROLE
        )
    err_msg = 'ValueError: Expecting S3 model prefix beginning with "s3://". Received: "{}"'.format(
        invalid_model_data_prefix
    )
    assert err_msg in str(ex)
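The validation this test exercises amounts to a straightforward prefix check; a minimal sketch of the assumed logic inside MultiDataModel.__init__ (the SDK's exact code may differ):

if not model_data_prefix.startswith("s3://"):
    raise ValueError(
        'Expecting S3 model prefix beginning with "s3://". '
        'Received: "{}"'.format(model_data_prefix)
    )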
Code Example #7
def test_deploy_model_update(sagemaker_session):
    model = MultiDataModel(
        name=MODEL_NAME,
        model_data_prefix=VALID_MULTI_MODEL_DATA_PREFIX,
        image=IMAGE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
    )

    model.deploy(
        initial_instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        endpoint_name=MULTI_MODEL_ENDPOINT_NAME,
        update_endpoint=True,
    )

    sagemaker_session.create_model.assert_called()
    sagemaker_session.create_endpoint_config.assert_called_with(
        name=model.name,
        model_name=model.name,
        initial_instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        accelerator_type=None,
        tags=None,
        kms_key=None,
        data_capture_config_dict=None,
    )

    config_name = sagemaker_session.create_endpoint_config(
        name=model.name,
        model_name=model.name,
        initial_instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        accelerator_type=None,
    )
    sagemaker_session.update_endpoint.assert_called_with(
        MULTI_MODEL_ENDPOINT_NAME, config_name, wait=True
    )
    sagemaker_session.create_endpoint.assert_not_called()
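With update_endpoint=True, deploy() registers a fresh endpoint config and points the existing endpoint at it instead of creating a new endpoint. The equivalent raw boto3 calls look roughly like this (the config and variant names are illustrative):

import boto3

sm = boto3.client("sagemaker")
sm.create_endpoint_config(
    EndpointConfigName="new-config",
    ProductionVariants=[{
        "VariantName": "AllTraffic",
        "ModelName": MODEL_NAME,
        "InitialInstanceCount": INSTANCE_COUNT,
        "InstanceType": INSTANCE_TYPE,
    }],
)
sm.update_endpoint(EndpointName=MULTI_MODEL_ENDPOINT_NAME,
                   EndpointConfigName="new-config")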
Code Example #8
def test_prepare_container_def_mxnet(sagemaker_session, mxnet_model):
    expected_container_env_keys = [
        "SAGEMAKER_CONTAINER_LOG_LEVEL",
        "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS",
        "SAGEMAKER_PROGRAM",
        "SAGEMAKER_REGION",
        "SAGEMAKER_SUBMIT_DIRECTORY",
    ]
    model = MultiDataModel(
        name=MODEL_NAME,
        model_data_prefix=VALID_MULTI_MODEL_DATA_PREFIX,
        sagemaker_session=sagemaker_session,
        model=mxnet_model,
    )

    container_def = model.prepare_container_def(INSTANCE_TYPE)

    assert container_def["Image"] == MXNET_IMAGE
    assert container_def["ModelDataUrl"] == VALID_MULTI_MODEL_DATA_PREFIX
    assert container_def["Mode"] == MULTI_MODEL_CONTAINER_MODE
    # Check that the environment variables defined for the wrapped MXNetModel
    # are part of the MultiDataModel container definition
    assert set(container_def["Environment"].keys()) == set(expected_container_env_keys)
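For reference, the container definition asserted here corresponds to a Containers entry of the CreateModel API in multi-model mode; its rough shape (all values illustrative):

container_def = {
    "Image": "123456789012.dkr.ecr.us-west-2.amazonaws.com/mxnet-serving:latest",  # assumed
    "ModelDataUrl": "s3://mybucket/path/",  # the S3 prefix, not a single tarball
    "Mode": "MultiModel",                   # what MULTI_MODEL_CONTAINER_MODE stands for
    "Environment": {
        "SAGEMAKER_PROGRAM": "inference.py",  # plus the other SAGEMAKER_* keys asserted above
    },
}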
Code Example #9
def test_multi_data_model_create_with_invalid_arguments(sagemaker_session, mxnet_model):
    with pytest.raises(ValueError) as ex:
        MultiDataModel(
            name=MODEL_NAME,
            model_data_prefix=VALID_MULTI_MODEL_DATA_PREFIX,
            image=IMAGE,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            model=mxnet_model,
        )
    assert (
        "Parameters image, role or kwargs are not permitted when model parameter is passed."
        in str(ex)
    )
Code Example #10
def test_multi_data_model_create(sagemaker_session):
    model = MultiDataModel(
        name=MODEL_NAME,
        model_data_prefix=VALID_MULTI_MODEL_DATA_PREFIX,
        image=IMAGE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
    )

    assert model.sagemaker_session == sagemaker_session
    assert model.name == MODEL_NAME
    assert model.model_data_prefix == VALID_MULTI_MODEL_DATA_PREFIX
    assert model.role == ROLE
    assert model.image == IMAGE
    assert model.vpc_config is None
Code Example #11
def test_multi_data_model_deploy_pretrained_models_update_endpoint(
    container_image, sagemaker_session, cpu_instance_type, alternative_cpu_instance_type
):
    timestamp = sagemaker_timestamp()
    endpoint_name = "test-multimodel-endpoint-{}".format(timestamp)
    model_name = "test-multimodel-{}".format(timestamp)

    # Define pretrained model local path
    pretrained_model_data_local_path = os.path.join(DATA_DIR, "sparkml_model", "mleap_model.tar.gz")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model_data_prefix = os.path.join(
            "s3://", sagemaker_session.default_bucket(), "multimodel-{}/".format(timestamp)
        )
        multi_data_model = MultiDataModel(
            name=model_name,
            model_data_prefix=model_data_prefix,
            image_uri=container_image,
            role=ROLE,
            sagemaker_session=sagemaker_session,
        )

        # Add model before deploy
        multi_data_model.add_model(pretrained_model_data_local_path, PRETRAINED_MODEL_PATH_1)
        # Deploy model to an endpoint
        multi_data_model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
        # Add model after deploy
        multi_data_model.add_model(pretrained_model_data_local_path, PRETRAINED_MODEL_PATH_2)

        # List model assertions
        endpoint_models = []
        for model_path in multi_data_model.list_models():
            endpoint_models.append(model_path)
        assert PRETRAINED_MODEL_PATH_1 in endpoint_models
        assert PRETRAINED_MODEL_PATH_2 in endpoint_models

        predictor = Predictor(
            endpoint_name=endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=NumpySerializer(),
            deserializer=string_deserializer,
        )

        data = numpy.zeros(shape=(1, 1, 28, 28))
        result = predictor.predict(data, target_model=PRETRAINED_MODEL_PATH_1)
        assert result == "Invoked model: {}".format(PRETRAINED_MODEL_PATH_1)

        result = predictor.predict(data, target_model=PRETRAINED_MODEL_PATH_2)
        assert result == "Invoked model: {}".format(PRETRAINED_MODEL_PATH_2)

        endpoint_desc = sagemaker_session.sagemaker_client.describe_endpoint(
            EndpointName=endpoint_name
        )
        old_config_name = endpoint_desc["EndpointConfigName"]

        # Update endpoint
        predictor.update_endpoint(
            initial_instance_count=1, instance_type=alternative_cpu_instance_type
        )

        endpoint_desc = sagemaker_session.sagemaker_client.describe_endpoint(
            EndpointName=endpoint_name
        )
        new_config_name = endpoint_desc["EndpointConfigName"]

        new_config = sagemaker_session.sagemaker_client.describe_endpoint_config(
            EndpointConfigName=new_config_name
        )
        assert old_config_name != new_config_name
        assert new_config["ProductionVariants"][0]["InstanceType"] == alternative_cpu_instance_type
        assert new_config["ProductionVariants"][0]["InitialInstanceCount"] == 1

        # Cleanup
        sagemaker_session.sagemaker_client.delete_endpoint_config(
            EndpointConfigName=old_config_name
        )
        sagemaker_session.sagemaker_client.delete_endpoint_config(
            EndpointConfigName=new_config_name
        )
        multi_data_model.delete_model()

    # Each call must raise on its own; asserts placed inside a pytest.raises
    # block after the raising call would never execute.
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model_name)
    assert "Could not find model" in str(exception.value)
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_endpoint_config(EndpointConfigName=old_config_name)
    assert "Could not find endpoint" in str(exception.value)
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_endpoint_config(EndpointConfigName=new_config_name)
    assert "Could not find endpoint" in str(exception.value)
Code Example #12
def test_multi_data_model_deploy_train_model_from_amazon_first_party_estimator(
    container_image, sagemaker_session, cpu_instance_type
):
    timestamp = sagemaker_timestamp()
    endpoint_name = "test-multimodel-endpoint-{}".format(timestamp)
    model_name = "test-multimodel-{}".format(timestamp)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        rcf_model_v1 = __rcf_training_job(
            sagemaker_session, container_image, cpu_instance_type, 50, 20
        )

        model_data_prefix = os.path.join(
            "s3://", sagemaker_session.default_bucket(), "multimodel-{}/".format(timestamp)
        )
        multi_data_model = MultiDataModel(
            name=model_name,
            model_data_prefix=model_data_prefix,
            model=rcf_model_v1,
            sagemaker_session=sagemaker_session,
        )

        # Add model before deploy
        multi_data_model.add_model(rcf_model_v1.model_data, PRETRAINED_MODEL_PATH_1)
        # Deploy model to an endpoint
        multi_data_model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
        # Train another model
        rcf_model_v2 = __rcf_training_job(
            sagemaker_session, container_image, cpu_instance_type, 70, 20
        )
        # Deploy newly trained model
        multi_data_model.add_model(rcf_model_v2.model_data, PRETRAINED_MODEL_PATH_2)

        # List model assertions
        endpoint_models = []
        for model_path in multi_data_model.list_models():
            endpoint_models.append(model_path)
        assert PRETRAINED_MODEL_PATH_1 in endpoint_models
        assert PRETRAINED_MODEL_PATH_2 in endpoint_models

        # Define a predictor that uses `NumpySerializer` instead of the default
        # serializer of the predictor returned by deploy().
        # Since we are using a placeholder container image, the prediction results are not accurate.
        predictor = Predictor(
            endpoint_name=endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=NumpySerializer(),
            deserializer=string_deserializer,
        )

        data = numpy.random.rand(1, 14)
        # Prediction result for the first model
        result = predictor.predict(data, target_model=PRETRAINED_MODEL_PATH_1)
        assert result == "Invoked model: {}".format(PRETRAINED_MODEL_PATH_1)

        # Prediction result for the second model
        result = predictor.predict(data, target_model=PRETRAINED_MODEL_PATH_2)
        assert result == "Invoked model: {}".format(PRETRAINED_MODEL_PATH_2)

        # Cleanup
        sagemaker_session.sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
        multi_data_model.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model_name)
    assert "Could not find model" in str(exception.value)
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_endpoint_config(EndpointConfigName=endpoint_name)
    assert "Could not find endpoint" in str(exception.value)
Code Example #13
def test_multi_data_model_deploy_pretrained_models_local_mode(container_image, sagemaker_session):
    timestamp = sagemaker_timestamp()
    endpoint_name = "test-multimodel-endpoint-{}".format(timestamp)
    model_name = "test-multimodel-{}".format(timestamp)

    # Define pretrained model local path
    pretrained_model_data_local_path = os.path.join(DATA_DIR, "sparkml_model", "mleap_model.tar.gz")

    with timeout(minutes=30):
        model_data_prefix = os.path.join(
            "s3://", sagemaker_session.default_bucket(), "multimodel-{}/".format(timestamp)
        )
        multi_data_model = MultiDataModel(
            name=model_name,
            model_data_prefix=model_data_prefix,
            image_uri=container_image,
            role=ROLE,
            sagemaker_session=sagemaker_session,
        )

        # Add model before deploy
        multi_data_model.add_model(pretrained_model_data_local_path, PRETRAINED_MODEL_PATH_1)
        # Deploy model to an endpoint
        multi_data_model.deploy(1, "local", endpoint_name=endpoint_name)
        # Add models after deploy
        multi_data_model.add_model(pretrained_model_data_local_path, PRETRAINED_MODEL_PATH_2)

        endpoint_models = []
        for model_path in multi_data_model.list_models():
            endpoint_models.append(model_path)
        assert PRETRAINED_MODEL_PATH_1 in endpoint_models
        assert PRETRAINED_MODEL_PATH_2 in endpoint_models

        predictor = Predictor(
            endpoint_name=endpoint_name,
            sagemaker_session=multi_data_model.sagemaker_session,
            serializer=NumpySerializer(),
            deserializer=string_deserializer,
        )

        data = numpy.zeros(shape=(1, 1, 28, 28))
        result = predictor.predict(data, target_model=PRETRAINED_MODEL_PATH_1)
        assert result == "Invoked model: {}".format(PRETRAINED_MODEL_PATH_1)

        result = predictor.predict(data, target_model=PRETRAINED_MODEL_PATH_2)
        assert result == "Invoked model: {}".format(PRETRAINED_MODEL_PATH_2)

        # Cleanup
        multi_data_model.sagemaker_session.sagemaker_client.delete_endpoint_config(
            EndpointConfigName=endpoint_name
        )
        multi_data_model.sagemaker_session.delete_endpoint(endpoint_name)
        multi_data_model.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=multi_data_model.name)
    assert "Could not find model" in str(exception.value)
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_endpoint_config(EndpointConfigName=endpoint_name)
    assert "Could not find endpoint" in str(exception.value)
Code Example #14
class Deploy(object):
    def __init__(
        self,
        model,
        script,
        framework=None,
        requirements=None,
        name=None,
        autoscale=False,
        autoscaletarget=1000,
        wait=True,
        bucket=None,
        session=None,
        image=None,
        dockerfilepath=None,
        instance_type=None,
        instance_count=1,
        budget=100,
        ei=None,
        monitor=False,
    ):

        self.frameworklist = ["tensorflow", "pytorch", "mxnet", "sklearn"]
        self.frameworkinstalls = {
            "tensorflow": ["tensorflow"],
            "pytorch": ["torch"],
            "mxnet": ["mxnet", "gluon"],
            "sklearn": ["sklearn"],
        }

        self.wait = wait
        self.budget = budget
        self.instance_count = instance_count
        self.instance_type = instance_type
        self.image = image
        self.dockerfilepath = dockerfilepath
        self.ei = ei
        self.monitor = monitor
        self.deployed = False
        self.autoscaletarget = autoscaletarget

        # ------ load cost types dict ---------
        costpath = pkg_resources.resource_filename("ezsmdeploy",
                                                   "data/cost.csv")
        self.costdict = {}
        with open(costpath, mode="r") as infile:
            reader = csv.reader(infile)
            for rows in reader:
                # cost for each instance
                self.costdict[rows[0]] = float(rows[1])

        # ------- basic instance type check --------

        if self.instance_type is None:
            # automatic instance selection will not pick a GPU instance
            self.gpu = False
            self.multimodel = True
        else:

            if (self.instance_type in self.costdict) or ("local" in self.instance_type):

                if "local" in self.instance_type:
                    if self.instance_type == "local_gpu":
                        # useful for local testing; behaves the same as "local"
                        self.gpu = True
                        self.multimodel = False
                        self.instance_type = "local"
                    else:
                        self.gpu = False
                        self.multimodel = True

                else:
                    if self.instance_type.split(".")[1][0] in ["p", "g"]:  # GPU instance
                        self.gpu = True
                        self.multimodel = False
                    else:
                        self.gpu = False
                        self.multimodel = True

            else:  # throw wrong instance error
                raise ValueError(
                    "Please choose an instance type in",
                    list(self.costdict.keys()),
                    ", or choose local for local testing. Don't pass in any instance or pass in None if you want to automatically choose an instance type.",
                )

        # ------- Model checks --------
        if isinstance(model, str):
            self.model = [model]
            self.multimodel = False

        elif isinstance(model, list):
            self.model = model
            self.multimodel = True
        elif model is None:  # assume you are loading from a hub or from a dockerfile
            with open("tmpmodel", "w") as fp:
                pass
            self.model = ["tmpmodel"]
            self.multimodel = False
        else:
            raise ValueError(
                "model must be a single serialized file (like 'model.pkl') or a "
                "list of files (['model.pkl', 'model2.pkl']). If you are downloading "
                "a model in the script or packaging it with the container, pass in model=None")

        # ------- Script checks ---------
        if not script.endswith(".py"):
            raise ValueError(
                "please provide a valid python script with .py extension. " +
                script + " is invalid")
        else:
            self.script = script

        filename = self.script
        with open(filename) as file:
            node = ast.parse(file.read())
            functions = [
                n.name for n in node.body if isinstance(n, ast.FunctionDef)
            ]

        if ("load_model" not in functions) and ("predict" not in functions):
            raise ValueError("please implement a load_model(modelpath) that \
                returns a loaded model, and predict(inputdata) function that returns a prediction in your"
                             + script)

        # ------- session checks --------
        if session is None:
            self.session = sagemaker.session.Session()
        else:
            self.session = session  # accept a user-supplied session, e.g. for local testing

        # ------- name checks --------
        if name is None:
            self.name = shortuuid.uuid().lower()
        elif isinstance(name, str):
            self.name = name
            if not name.islower():
                raise ValueError(
                    "please enter a name with lower case letters; this name is used for "
                    "S3 bucket prefixes, model names, ECR repository names, etc., which "
                    "have various restrictions"
                )

        else:
            raise ValueError(
                "enter a string for a name or don't pass in a name; type of name passed in is "
                + str(type(name)))

        # ------- bucket checks --------
        if bucket is None:
            self.bucket = self.session.default_bucket()
        else:
            self.bucket = bucket

        self.requirements = requirements

        # ------- framework --------
        if requirements is None and framework in self.frameworklist:
            self.framework = framework
            self.requirements = self.frameworkinstalls[framework]
        elif requirements is None and framework not in self.frameworklist:
            raise ValueError(
                "If requirements=None, please provide a value for framework; "
                "choice should be one of 'tensorflow','pytorch','mxnet','sklearn'"
            )

        self.autoscale = autoscale

        self.deploy()

    def process_instance_type(self):
        # ------ instance checks --------

        self.instancedict = {}

        if self.instance_type is None:
            # ------ load instance types dict ---------
            instancetypepath = pkg_resources.resource_filename(
                "ezsmdeploy", "data/instancetypes.csv")
            with open(instancetypepath, mode="r") as infile:
                reader = csv.reader(infile)
                for rows in reader:  # memGb / vcpu, cost, cost/memGb-per-vcpu
                    self.instancedict[rows[0]] = (
                        float(rows[2]) / (2 * float(rows[1])),
                        self.costdict[rows[0]],
                        self.costdict[rows[0]] / float(rows[2]) /
                        (2 * float(rows[1])),
                    )

            # ------ auto instance selection ---------
            self.choose_instance_type()

        else:

            if (self.instance_type in list(
                    self.costdict.keys())) or (self.instance_type
                                               in ["local", "local_gpu"]):
                if self.instance_type not in ["local", "local_gpu"]:
                    self.costperhour = self.costdict[self.instance_type]

                    if self.ei is not None:
                        eicosts = {
                            "ml.eia2.medium": 0.12,
                            "ml.eia2.large": 0.24,
                            "ml.eia2.xlarge": 0.34,
                            "ml.eia.medium": 0.13,
                            "ml.eia.large": 0.26,
                            "ml.eia.xlarge": 0.52,
                        }
                        self.costperhour = self.costperhour + eicosts[self.ei]

                else:
                    self.costperhour = 0
            else:
                raise ValueError(
                    "Please choose an instance type in",
                    list(self.costdict.keys()),
                    ", or choose local for local testing.",
                )

    def choose_instance_type(self):
        # TO DO : add heuristic for auto selection of instance size
        size = self.get_size(self.bucket,
                             "ezsmdeploy/model-" + self.name + "/")

        self.instancetypespath = pkg_resources.resource_filename(
            "ezsmdeploy", "data/instancetypes.csv")

        # Assume you need at least 4 workers; each model is deployed redundantly to every vCPU,
        # so we base this decision on memory available per vCPU. If the model is being downloaded
        # from a hub, one should ideally pass in an instance type since we don't know the model size.
        # The candidate list includes some extremely large CPU instances and all GPU instances. For
        # instances with the same memory per vCPU, ties are broken by minimum cost per total vCPUs.
        # 'd' instances are preferred at the same cost since their NVMe storage gives faster model
        # load times. If a budget is supplied, we try to satisfy it.

        choseninstance = None
        mincost = 1000

        for instance in list(self.instancedict.keys()):
            # cost and memory per worker
            memperworker = self.instancedict[instance][0]
            cost = self.instancedict[instance][1]
            costpermem = self.instancedict[instance][2]
            #
            if self.budget == 100:
                # even though budget is unlimited, minimize cost
                if memperworker > size and cost < mincost:
                    mincost = cost
                    choseninstance = instance
                    # print("instance ={}, size={}, memperworker={}, choseninstance = {}, mincost = {}".format(instance, size, memperworker, choseninstance,mincost))
            else:
                if memperworker > size and cost <= self.budget:
                    choseninstance = instance
                    break

        if choseninstance is None and self.budget != 100:
            raise ValueError(
                "Could not find an instance that satisfies your budget of " +
                str(self.budget) +
                " per hour and can host your models with a total size of " +
                str(size) + " GB. Please choose a higher budget per hour.")
        elif choseninstance is None and self.budget == 100:
            raise ValueError(
                "You may be using large models with a total size of " +
                str(size) +
                " GB. Please choose a high-memory GPU instance and launch without multiple models (if applicable)"
            )

        self.instance_type = choseninstance

        self.costperhour = self.costdict[self.instance_type]

    def add_model(self, s3path, relativepath):
        self.sagemakermodel.add_model(s3path, relativepath)

    def create_model(self):

        if not self.multimodel:

            self.sagemakermodel = Model(
                name="model-" + self.name,
                model_data=self.modelpath[0],
                image=self.image,
                role=sagemaker.get_execution_role(),
                # sagemaker_session=self.session,
                predictor_cls=sagemaker.predictor.RealTimePredictor,
            )

        else:

            self.sagemakermodel = MultiDataModel(
                name="model-" + self.name,
                model_data_prefix="/".join(self.modelpath[0].split("/")[:-1]) +
                "/",
                image=self.image,
                role=sagemaker.get_execution_role(),
                # sagemaker_session=self.session,
                predictor_cls=sagemaker.predictor.RealTimePredictor,
            )

            for path in self.modelpath:
                self.add_model(path, "serving/")

            self.ei = False

    def deploy_model(self):

        if self.monitor:
            from sagemaker.model_monitor import DataCaptureConfig

            data_capture_config = DataCaptureConfig(
                enable_capture=True,
                sampling_percentage=100,
                destination_s3_uri="s3://{}/ezsmdeploy/model-{}/datacapture".format(
                    self.bucket, self.name),
            )
        else:
            data_capture_config = None

        self.predictor = self.sagemakermodel.deploy(
            initial_instance_count=self.instance_count,
            instance_type=self.instance_type,
            accelerator_type=self.ei,
            endpoint_name="ezsmdeploy-endpoint-" + self.name,
            update_endpoint=False,
            wait=self.wait,
            data_capture_config=data_capture_config,
        )

        self.endpoint_name = "ezsmdeploy-endpoint-" + self.name

    def get_size(self, bucket, path):
        s3 = boto3.resource("s3")
        my_bucket = s3.Bucket(bucket)
        total_size = 0.0

        for obj in my_bucket.objects.filter(Prefix=path):
            total_size = total_size + obj.size

        return total_size / ((1024.0)**3)

    def upload_model(self):
        i = 1
        self.modelpath = []
        for name in self.model:
            self.modelpath.append(
                self.session.upload_data(
                    path="model{}.tar.gz".format(i),
                    bucket=self.bucket,
                    key_prefix="ezsmdeploy/model-" + self.name,
                ))
            i += 1

    def tar_model(self):

        i = 1
        for name in self.model:

            if "s3" in name:
                # download and uncompress
                self.session.download_data(
                    path="./downloads/{}".format(i),
                    bucket=name.split("/")[2],
                    key_prefix="/".join(name.split("/")[3:]),
                )

                with tarfile.open(
                        glob.glob(
                            "./downloads/{}/*.tar.gz".format(i))[0]) as tar:
                    tar.extractall("./extractedmodel/{}/".format(i))

                name = "extractedmodel/{}/".format(i)

            tar = tarfile.open("model{}.tar.gz".format(i), "w:gz")
            if "/" in name:
                tar.add(name, arcname=".")
            else:
                tar.add(name)
            tar.close()
            i += 1

    def makedir_safe(self, directory):
        # remove any existing directory, then recreate it
        try:
            shutil.rmtree(directory)
        except OSError:
            pass

        try:
            if not os.path.exists(directory):
                os.makedirs(directory)
        except OSError as err:
            if err.errno != 17:  # 17 == EEXIST
                print(err.errno)
                raise

    def handle_requirements(self):
        # ------- requirements checks -------
        self.makedir_safe("src")

        if isinstance(self.requirements, str):
            if os.path.exists(self.requirements):
                # move the provided requirements file into src
                shutil.move(self.requirements, "src/requirements.txt")

            else:
                raise ValueError(self.requirements + " does not exist!")

        elif isinstance(self.requirements, list):
            with open("src/requirements.txt", "w") as f:
                f.writelines(req + "\n" for req in self.requirements)

        else:
            raise ValueError(
                "pass in a path/to/requirements.txt or a list of requirements ['scikit-learn',...,...]"
            )

    #     def build_docker(self):
    #         cmd = 'chmod +x src/build-docker.sh  & sudo ./src/build-docker.sh {}'
    #         p = os.popen(cmd.format(self.name)).read()
    #         #print(output.decode())
    #         acct = os.popen('aws sts get-caller-identity --query Account --output text').read().split('\n')[0]
    #         region = os.popen('aws configure get region').read().split('\n')[0]
    #         self.image = "{}.dkr.ecr.{}.amazonaws.com/ezsmdeploy-image-{}:latest".format(acct,region,self.name)

    #         #container = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account_id, region, 'demo-sagemaker-multimodel')

    #         while not os.path.exists('src/done.txt'):
    #             time.sleep(1)

    def build_docker(self):
        # build-docker.sh writes src/done.txt when the image build finishes
        cmd = "chmod +x src/build-docker.sh && sudo ./src/build-docker.sh {}"
        p = subprocess.Popen(cmd.format(self.name),
                             stdout=subprocess.PIPE,
                             shell=True)
        acct = (os.popen(
            "aws sts get-caller-identity --query Account --output text").read(
            ).split("\n")[0])
        region = os.popen("aws configure get region").read().split("\n")[0]
        self.image = "{}.dkr.ecr.{}.amazonaws.com/ezsmdeploy-image-{}".format(
            acct, region, self.name)

        while not os.path.exists("src/done.txt"):
            time.sleep(3)
        # communicate() can only be called once; read the build output after the build completes
        self.dockeroutput = p.communicate()[0].decode()

    def autoscale_endpoint(self):
        response = boto3.client("sagemaker").describe_endpoint(
            EndpointName=self.endpoint_name)

        in1 = response["EndpointName"]
        in2 = response["ProductionVariants"][0]["VariantName"]

        client = boto3.client("application-autoscaling")
        response = client.register_scalable_target(
            ServiceNamespace="sagemaker",
            ResourceId="endpoint/{}/variant/{}".format(in1, in2),
            ScalableDimension="sagemaker:variant:DesiredInstanceCount",
            MinCapacity=1,
            MaxCapacity=10,
        )

        response = client.put_scaling_policy(
            PolicyName="scaling-policy-{}".format(self.name),
            ServiceNamespace="sagemaker",
            ResourceId="endpoint/{}/variant/{}".format(in1, in2),
            ScalableDimension="sagemaker:variant:DesiredInstanceCount",
            PolicyType="TargetTrackingScaling",
            TargetTrackingScalingPolicyConfiguration={
                "TargetValue": self.autoscaletarget,
                "PredefinedMetricSpecification": {
                    "PredefinedMetricType":
                    "SageMakerVariantInvocationsPerInstance",
                },
                "ScaleOutCooldown": 600,
                "ScaleInCooldown": 600,
                "DisableScaleIn": False,
            },
        )

        self.scalingresponse = response

    def test(self,
             input_data,
             target_model=None,
             usercount=10,
             hatchrate=5,
             timeoutsecs=5):

        if self.multimodel and target_model is None:
            raise ValueError(
                "since this is a multimodel endpoint, please pass in a target model that you wish to test"
            )

        if self.deployed:

            path1 = pkg_resources.resource_filename("ezsmdeploy",
                                                    "data/smlocust.py")
            shutil.copy(path1, "src/smlocust.py")

            start = datetime.datetime.now()

            with yaspin(Spinners.point, color="green", text="") as sp:

                sp.hide()
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | Starting test with Locust")
                sp.show()

                if self.multimodel:
                    with open("src/locustdata.txt", "w") as outfile:
                        json.dump(
                            {
                                "endpoint_name": self.endpoint_name,
                                "target_model": "model1.tar.gz",
                            },
                            outfile,
                        )
                else:
                    with open("src/locustdata.txt", "w") as outfile:
                        json.dump(
                            {
                                "endpoint_name": self.endpoint_name,
                                "target_model": ""
                            },
                            outfile,
                        )

                pickle.dump(input_data, open("src/testdata.p", "wb"))

                cmd = "locust -f src/smlocust.py --no-web -c {} -r {} --run-time {}s --csv=src/locuststats; touch src/testdone.txt".format(
                    usercount, hatchrate, timeoutsecs)
                p = os.system(cmd)
                while not os.path.exists("src/testdone.txt"):
                    time.sleep(3)

                os.remove("src/testdone.txt")

                sp.hide()
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | Done! Please see the src folder for locuststats* files"
                )
                sp.show()

        else:
            raise ValueError("Deploy model to endpoint first before testing")

    def deploy(self):
        # print(self.__dict__)
        start = datetime.datetime.now()

        with yaspin(Spinners.point, color="green", text="") as sp:

            try:
                shutil.rmtree("src/")
            except:
                pass

            # compress model files
            self.tar_model()
            sp.hide()
            if self.model == ["tmpmodel"]:
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | No model was passed. Assuming you are downloading a model in the script or in the container"
                )
            else:
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | compressed model(s)")
            sp.show()

            # upload model file(s)
            self.upload_model()

            # Process instance type
            self.process_instance_type()
            sp.hide()
            sp.write(
                str(datetime.datetime.now() - start) +
                " | uploaded model tarball(s) ; check returned modelpath")
            sp.show()

            #                 if self.gpu and self.image == None:
            #                     raise ValueError("The default container image used here is based on the multi-model server which does not support GPU instances. Please provide a docker image (ECR repository link) to proceed with model build and deployment.")

            # else:
            # handle requirements
            if self.requirements is None:
                rtext = (str(datetime.datetime.now() - start) +
                         " | no additional requirements found")
                self.makedir_safe("src")
            else:
                self.handle_requirements()
                rtext = (str(datetime.datetime.now() - start) +
                         " | added requirements file")
            sp.hide()
            sp.write(rtext)
            sp.show()

            # move script to src
            shutil.copy(self.script, "src/transformscript.py")
            sp.hide()
            sp.write(
                str(datetime.datetime.now() - start) + " | added source file")
            sp.show()

            # ------ Dockerfile checks -------
            if self.dockerfilepath is None and self.multimodel:
                self.dockerfilepath = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/Dockerfile")
            elif self.dockerfilepath is None and not self.multimodel:
                self.dockerfilepath = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/Dockerfile_flask")

            # move Dockerfile to src
            shutil.copy(self.dockerfilepath, "src/Dockerfile")
            sp.hide()
            sp.write(
                str(datetime.datetime.now() - start) + " | added Dockerfile")
            sp.show()

            # move model_handler and build scripts to src

            if self.multimodel:
                # Use multi model
                path1 = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/model_handler.py")
                path2 = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/dockerd-entrypoint.py")
                path3 = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/build-docker.sh")

                shutil.copy(path1, "src/model_handler.py")
                shutil.copy(path2, "src/dockerd-entrypoint.py")
                shutil.copy(path3, "src/build-docker.sh")

                self.ei = None

            else:
                # Use Flask stack
                path1 = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/nginx.conf")
                path2 = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/predictor.py")
                path3 = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/serve")
                path4 = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/train")
                path5 = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/wsgi.py")
                path6 = pkg_resources.resource_filename(
                    "ezsmdeploy", "data/build-docker.sh")

                shutil.copy(path1, "src/nginx.conf")
                shutil.copy(path2, "src/predictor.py")
                shutil.copy(path3, "src/serve")
                shutil.copy(path4, "src/train")
                shutil.copy(path5, "src/wsgi.py")
                shutil.copy(path6, "src/build-docker.sh")

                if self.gpu and self.ei is not None:
                    self.ei = None
                    sp.hide()
                    sp.write(
                        str(datetime.datetime.now() - start) +
                        " | Setting Elastic Inference to None since you selected a GPU instance"
                    )
                    sp.show()

            sp.hide()
            sp.write(
                str(datetime.datetime.now() - start) +
                " | added model_handler and docker utils")
            sp.show()

            # build docker container
            if self.image is None:
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | building docker container")
                self.build_docker()
                sp.hide()
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | built docker container")
                sp.show()

            # create sagemaker model
            self.create_model()
            sp.hide()
            sp.write(
                str(datetime.datetime.now() - start) +
                " | created model(s). Now deploying on " + self.instance_type)
            sp.show()

            # deploy model
            self.deploy_model()
            sp.hide()
            sp.write(
                str(datetime.datetime.now() - start) + " | deployed model")
            sp.show()

            if self.autoscale and self.instance_type not in [
                    "local", "local_gpu"
            ]:
                self.autoscale_endpoint()
                sp.hide()
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | set up autoscaling")
                sp.show()
            elif self.autoscale and self.instance_type in [
                    "local", "local_gpu"
            ]:
                sp.hide()
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | not setting up autoscaling; deploying locally")
                sp.show()

            if self.instance_type not in ["local", "local_gpu"]:
                sp.hide()
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | estimated cost is $" + str(self.costperhour) +
                    " per hour")
                sp.show()

            if self.monitor:
                sp.hide()
                sp.write(
                    str(datetime.datetime.now() - start) +
                    " | model monitor data capture location is " +
                    "s3://{}/ezsmdeploy/model-{}/datacapture".format(
                        self.bucket, self.name))
                sp.show()

            # finalize
            sp.green.ok(str(datetime.datetime.now() - start) + " | Done! ✔")

            self.deployed = True

            try:
                # Cleanup; directories need shutil.rmtree, not os.remove
                os.remove("src/done.txt")
                shutil.rmtree("src", ignore_errors=True)
                shutil.rmtree("downloads", ignore_errors=True)
                shutil.rmtree("extractedmodel", ignore_errors=True)
                os.remove("tmpmodel")
            except OSError:
                pass

            return self.predictor
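For orientation, here is a hypothetical invocation of the Deploy class above; every argument value is illustrative rather than taken from the source:

import ezsmdeploy

# Two serialized models -> multimodel=True, served behind one endpoint.
ez = ezsmdeploy.Deploy(
    model=["model1.pkl", "model2.pkl"],
    script="transformscript.py",        # must define load_model() and predict()
    requirements=["scikit-learn", "numpy"],
    instance_type=None,                 # None triggers automatic instance selection
    autoscale=True,
)

# payload is whatever your predict(inputdata) implementation expects.
result = ez.predictor.predict(payload, target_model="model1.tar.gz")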
Code Example #15
def test_multi_data_model_deploy_trained_model_from_framework_estimator(
        container_image, sagemaker_session, cpu_instance_type):
    timestamp = sagemaker_timestamp()
    endpoint_name = "test-multimodel-endpoint-{}".format(timestamp)
    model_name = "test-multimodel-{}".format(timestamp)
    mxnet_version = "1.4.1"

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        mxnet_model_1 = __mxnet_training_job(sagemaker_session,
                                             container_image, mxnet_version,
                                             cpu_instance_type, 0.1)
        model_data_prefix = os.path.join("s3://",
                                         sagemaker_session.default_bucket(),
                                         "multimodel-{}/".format(timestamp))
        multi_data_model = MultiDataModel(
            name=model_name,
            model_data_prefix=model_data_prefix,
            model=mxnet_model_1,
            sagemaker_session=sagemaker_session,
        )

        # Add model before deploy
        multi_data_model.add_model(mxnet_model_1.model_data,
                                   PRETRAINED_MODEL_PATH_1)
        # Deploy model to an endpoint
        multi_data_model.deploy(1,
                                cpu_instance_type,
                                endpoint_name=endpoint_name)

        # Train another model
        mxnet_model_2 = __mxnet_training_job(sagemaker_session,
                                             container_image, mxnet_version,
                                             cpu_instance_type, 0.01)
        # Deploy newly trained model
        multi_data_model.add_model(mxnet_model_2.model_data,
                                   PRETRAINED_MODEL_PATH_2)

        endpoint_models = []
        for model_path in multi_data_model.list_models():
            endpoint_models.append(model_path)
        assert PRETRAINED_MODEL_PATH_1 in endpoint_models
        assert PRETRAINED_MODEL_PATH_2 in endpoint_models

        # Define a predictor to set `serializer` parameter with npy_serializer
        # instead of `json_serializer` in the default predictor returned by `MXNetPredictor`
        # Since we are using a placeholder container image the prediction results are not accurate.
        predictor = RealTimePredictor(
            endpoint=endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=npy_serializer,
            deserializer=string_deserializer,
        )

        data = numpy.zeros(shape=(1, 1, 28, 28))
        # Prediction result for the first model
        result = predictor.predict(data, target_model=PRETRAINED_MODEL_PATH_1)
        assert result == "Invoked model: {}".format(PRETRAINED_MODEL_PATH_1)

        # Prediction result for the second model
        result = predictor.predict(data, target_model=PRETRAINED_MODEL_PATH_2)
        assert result == "Invoked model: {}".format(PRETRAINED_MODEL_PATH_2)

        # Cleanup
        sagemaker_session.sagemaker_client.delete_endpoint_config(
            EndpointConfigName=endpoint_name)
        multi_data_model.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model_name)
    assert "Could not find model" in str(exception.value)
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_endpoint_config(
            EndpointConfigName=endpoint_name)
    assert "Could not find endpoint" in str(exception.value)