Example #1
    def run(self):
        """
        The main method for the MasterManager.
        """
        logger.info("Running the MasterManager!")
        attr_str = self.cluster_info.get_master_config(USER_DATA_FILE_S3)
        if attr_str is not None:
            self.attributes = json.loads(attr_str)
            self.attributes['user_data_file'] = USER_DATA_FILE_S3

        # Check if the master is running. Update the self.master_instance object.
        try:
            instance_id = self.discover_master()
            if instance_id is not None:
                self.master_instance = self.ec2.Instance(instance_id)
                logger.info("Master instance discovered: %s",
                            self.master_instance.instance_id)

                # This will retry for a while and then raise an exception if the master API server is unreachable.
                self.check_master_api_server()

                if not self.attributes:
                    # This is needed only for first startup when cluster is created.
                    logger.debug("Populating attributes")
                    self.populate_attributes()
                    logger.debug("Saving master's config into S3")
                    self.save_master_config(USER_DATA_FILE_NEW)
                    logger.info("Master config uploaded to s3")
        except Exception as e:
            raise AXPlatformException("Failed to discover master: " + str(e))

        while True:
            if self.master_instance is not None:
                self.wait_for_termination()
                message = "Master instance with id " + \
                    self.master_instance.instance_id + " terminated. A " + \
                    "new master instance will be created. This should " + \
                    "take a few minutes"
            else:
                logger.info("Master not running")
                message = "Master instance not found" + \
                    "A new master instance will be created. This should " + \
                    "take a few minutes."

            self.send_notification(CODE_PLATFORM_ERROR, message)
            new_master = self.launch_new_master()
            self.master_instance = self.ec2.Instance(new_master.instance_id)
            logger.info("New master instance %s running",
                        self.master_instance.instance_id)
            self.send_notification(CODE_PLATFORM_CRITICAL, "New master " + \
                                   "instance with id {} started".format(
                                       self.master_instance.instance_id))
            logger.info("Wait for {} minutes before running checks...".format(
                WAIT_TIME_POST_RESTART_MIN))
            time.sleep(WAIT_TIME_POST_RESTART_MIN * const.SECONDS_PER_MINUTE)
            logger.info("Done waiting. Now back to checks")
Example #2
def delete_webhook():
    webhook_svc_name = "axops-webhook"
    try:
        kubectl.api.delete_namespaced_service(namespace="axsys",
                                              name=webhook_svc_name)
    except ApiException as ae:
        if ae.status != 404:
            raise AXPlatformException("Unable to delete webhook",
                                      detail=str(ae))
    return jsonify(result="ok")
Example #3
    def search(self, searchstr=None):
        if searchstr is None or searchstr == "":
            raise AXPlatformException(
                "Docker hub search string needs to be a non-empty string")
        response = self._conn.search(searchstr)
        return [{
            "ctime": "",
            "repo": x['name'],
            "tag": "latest"
        } for x in response or []]
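
A hedged usage sketch of the snippet above; `hub` stands in for an instance of the class this method was extracted from:

results = hub.search("redis")
# Expected shape, per the return expression above:
# [{"ctime": "", "repo": "redis", "tag": "latest"}, ...]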
Example #4
    def __init__(self, name, client=None):
        self.name = name
        if client is None:
            self._client = KubernetesApiClient(use_proxy=True)
        else:
            self._client = client

        self._registry_spec = None
        self._software_info = SoftwareInfo()
        if self._software_info.registry_is_private():
            secret = KubeObjectConfigFile(
                DEFAULT_SECRET_YAML_PATH,
                {"REGISTRY_SECRETS": self._software_info.registry_secrets})
            for obj in secret.get_swagger_objects():
                if isinstance(obj, swagger_client.V1Secret):
                    self._registry_spec = obj
            assert self._registry_spec, "Argo registry specification is missing"

        self._am_service_spec = None
        self._am_deployment_spec = None

        # AA-2471: Hack to add AXOPS_EXT_DNS to Application Manager
        elb = InternalRoute("axops", "axsys", client=self._client)
        elb_status = elb.status(with_loadbalancer_info=True)["loadbalancer"][0]
        if not elb_status:
            raise AXPlatformException(
                "Could not get axops elb address {}".format(elb_status))

        k = KubeObjectConfigFile(
            DEFAULT_AM_YAML_PATH, {
                "NAMESPACE": self._software_info.image_namespace,
                "VERSION": self._software_info.image_version,
                "REGISTRY": self._software_info.registry,
                "APPLICATION_NAME": self.name,
                "AXOPS_EXT_DNS": elb_status
            })
        for obj in k.get_swagger_objects():
            if isinstance(obj, swagger_client.V1Service):
                self._am_service_spec = obj
            elif isinstance(obj, swagger_client.V1beta1Deployment):
                self._am_deployment_spec = obj
                self._add_pod_metadata("deployment",
                                       self._am_deployment_spec.metadata.name,
                                       is_label=True)
                self._add_pod_metadata(
                    "ax_costid",
                    json.dumps({
                        "app": self.name,
                        "service": "axam-deployment",
                        "user": "******"
                    }))
            else:
                logger.debug("Ignoring specification of type {}".format(
                    type(obj)))
        assert self._am_service_spec and self._am_deployment_spec, \
            "Application monitor specification is missing"
Example #5
    def _update_cluster_bucket(self):
        bucket_name = AXClusterConfigPath(name_id=self._name_id).bucket()
        cluster_bucket = Cloud().get_bucket(bucket_name,
                                            aws_profile=self._aws_profile,
                                            region=self._aws_region)

        if not cluster_bucket.create():
            raise AXPlatformException("Failed to create S3 bucket {}".format(
                cluster_bucket.get_bucket_name()))
        logger.info("Created %s bucket ... DONE",
                    cluster_bucket.get_bucket_name())
Example #6
    def modify_asg(self, min_size, max_size):
        logger.info("Modifying autoscaling group ...")

        asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                       self._aws_profile)

        asg = asg_manager.get_variable_asg()
        if not asg:
            raise AXPlatformException(
                "Failed to get variable autoscaling group for cluster {}".
                format(self._cluster_name_id))
        asg_name = asg["AutoScalingGroupName"]
        try:
            asg_manager.set_asg_spec(name=asg_name, minsize=min_size,
                                     maxsize=max_size)
        except ClientError as ce:
            raise AXPlatformException(
                "Failed to set cluster's variable autoscaling group min/max. Error: {}"
                .format(ce))

        logger.info("Modifying cluster autoscaling group ... DONE")
Example #7
    def upload_staging_info(self, stage, msg):
        assert stage in ["stage1", "stage2"], \
            "Only stage1 and stage2 information is available"
        logger.info("Uploading Argo install %s info to s3 ...", stage)
        if not self._bucket.put_object(key=self._staging_info[stage],
                                       data=msg):
            raise AXPlatformException(
                "Failed to upload Argo install {} info for {}".format(
                    stage, self._cluster_name_id))
        logger.info("Uploading Argo install %s info %s to s3 ... DONE", stage,
                    msg)
Example #8
    def upload_cluster_metadata(self):
        logger.info("Uploading Argo cluster metadata ...")
        with open(self._metadata_file, "r") as f:
            data = f.read()
        # User pods should be able to curl it so we have to set ACL to public-read
        if not self._bucket.put_object(
                self._s3_cluster_meta, data, ACL="public-read"):
            raise AXPlatformException(
                "Failed to upload cluster metadata for {}".format(
                    self._cluster_name_id))
        logger.info("Uploading Argo cluster metadata ... DONE")
Example #9
    def _update_data_bucket(self):
        data_bucket = Cloud().get_bucket(AXClusterDataPath(name_id=self._name_id).bucket(),
                                         aws_profile=self._aws_profile, region=self._aws_region)

        if not data_bucket.create():
            raise AXPlatformException("Failed to create S3 bucket {}".format(data_bucket.get_bucket_name()))
        # Update CORS config for data bucket too.
        logger.info("Checking CORS config for %s.", data_bucket.get_bucket_name())
        data_bucket.put_cors(DATA_CORS_CONFIG)

        logger.info("Created %s bucket ... DONE", data_bucket.get_bucket_name())
Example #10
    def get_main_container_name(self):
        if not hasattr(self, "containers"):
            self.build_attributes()
        for c in self.containers:
            if c["name"] not in [SIDEKICK_WAIT_CONTAINER_NAME,
                                 DIND_CONTAINER_NAME]:
                return c["name"]

        raise AXPlatformException(
            "Pod for a task needs to have a non-wait container")
Example #11
    def save_config(self):
        """
        Upload config to s3
        :return:
        """
        logger.info("Uploading cluster config to s3 at %s ...", self._cluster_config_key)
        if not self._conf:
            logger.warning("AXClusterConfig is not initialized with a valid config, NOT uploading.")
            return
        if not self._bucket.put_object(key=self._cluster_config_key, data=json.dumps(self._conf)):
            raise AXPlatformException("Failed to upload cluster config for {}".format(self._cluster_name_id))
        logger.info("Uploading cluster config to s3 ... DONE")
Example #12
    def add_ref(self, ref, exclusive=False):
        """
        Add a reference to the EBS volume
        Args:
            ref: string
            exclusive: Boolean for exclusive access

        Returns: the array of refs after addition of ref
        """
        state = self._get_from_provider()
        if state is None:
            raise AXPlatformException("Missing volume for {}".format(
                self.name))

        if state.metadata.annotations.get("ax_deletion", "False") == "True":
            raise AXVolumeException(
                "Cannot add ref to a volume that is marked for deletion")

        # refs is an array of ref
        refs_str = state.metadata.annotations['ax_refs']
        refs = ast.literal_eval(refs_str)

        curr_excl = state.metadata.annotations['ax_exclusive'] == 'True'
        single_ref = len(refs) == 1
        ref_exists = ref in refs

        if len(refs) == 0:
            # trivially add ref
            refs.append(ref)
            self._add_state_and_update(exclusive, refs)
            return refs

        if exclusive:
            if single_ref and ref_exists:
                if not curr_excl:
                    # update to exclusive
                    refs.append(ref)
                    self._add_state_and_update(exclusive, refs)
                return refs
            else:
                raise AXVolumeException(
                    "Cannot lock volume {} for ref {} in state [Exclusive {} Refs {}]"
                    .format(self.name, ref, curr_excl, refs))
        else:
            if not curr_excl:
                if not ref_exists:
                    refs.append(ref)
                    self._add_state_and_update(exclusive, refs)
                return refs
            else:
                raise AXVolumeException(
                    "Cannot add ref {} to volume {} as it is in state [Exclusive {} Refs {}]"
                    .format(ref, self.name, curr_excl, refs))
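
The locking rules above are easier to see in a short sketch. This is illustrative only, assuming a hypothetical `volume` instance of the class the method was extracted from:

refs = volume.add_ref("workflow-a")               # empty ref list: trivially added
refs = volume.add_ref("workflow-b")               # non-exclusive refs may coexist
try:
    volume.add_ref("workflow-c", exclusive=True)  # cannot lock a volume that already has other refs
except AXVolumeException as e:
    logger.error("expected failure: %s", e)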
Example #13
    def delete_images(self, images):
        for image in images or []:
            d_img = DockerImage(fullname=image)
            _server, artifact, tag = d_img.docker_names()

            if _server != self.servername:
                raise AXPlatformException(
                    "Delete only supports deletion from {} registry".format(
                        self.servername))

            digest = self._get_digest(artifact, tag)
            if digest is None:
                raise AXPlatformException(
                    "Could not find digest for image {}:{}".format(
                        artifact, tag))

            # now try to delete
            if not self._delete_digest(artifact, tag, digest):
                raise AXPlatformException(
                    "Could not delete image {}:{} with digest {}".format(
                        artifact, tag, digest))
Example #14
    def _set_ext_dns(self):
        axops_eip = self._get_eip_from_config_map() or self._get_svc_eip(svclabel="app=axops",
                                                                         namespace=AXNameSpaces.AXSYS)

        if not axops_eip:
            logger.error("Platform Start Failed: cannot find External IP for AXOPS")
            raise AXPlatformException("AXOPS elastic IP does not exist")

        self.cluster_dns_name = axops_eip[0]
        # Don't change format of this message. Portal parses this line to get cluster IP/DNS.
        logger.info("\n\n%s>>>>> Starting Argo platform... cluster DNS: %s%s\n", COLOR_GREEN, self.cluster_dns_name,
                    COLOR_NORM)
        self._replacing["AXOPS_EXT_DNS"] = self.cluster_dns_name
Example #15
# This snippet is the inner function of a decorator; the outer wrapper and
# imports are reconstructed so it runs standalone (the name
# aws_exception_handler is illustrative, not from the source).
import functools

import botocore.exceptions


def aws_exception_handler(func):
    @functools.wraps(func)
    def exception_handler(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except botocore.exceptions.ClientError as e:
            code = e.response["ResponseMetadata"]["HTTPStatusCode"]
            # ClientError has no .message attribute on Python 3; take the
            # message from the error response instead.
            msg = e.response["Error"]["Message"]
            if code == 409:
                raise AXConflictException(msg)
            elif code in (404, 400):
                # 400 also seems to be used for "not found" by AWS
                raise AXNotFoundException(msg)
            else:
                raise AXPlatformException(msg)
    return exception_handler
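
Applied to a boto3 call, the decorator maps AWS HTTP status codes onto platform exceptions. A minimal sketch, with an illustrative function name:

@aws_exception_handler
def describe_asg(client, name):
    # Raises AXNotFoundException if AWS answers 400/404 for this group.
    return client.describe_auto_scaling_groups(AutoScalingGroupNames=[name])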
Example #16
    def wait_for_kube_object(self, kube_obj=None, timeout=None, waiter=None):
        """
        kube_obj example:
        {
            "kind": "pods",
            "name": "my-pod",
            "validator": <some_lambda_function>
        }

        :param kube_obj:
        :param timeout: timeout value in seconds
        :param waiter: AXWaiter object
        :return:
        """
        if not self._validate_kube_object(kube_obj):
            msg = "Invalid kube_object: {}".format(str(kube_obj))
            raise AXPlatformException(msg)

        if not waiter:
            raise AXPlatformException(
                "No waiter specified for wait_for_kube_object")

        uid = str(uuid.uuid4())
        if timeout:
            timer = Timer(timeout, self.process_timeout,
                          (kube_obj["name"], uid))
        else:
            timer = None
        status = KubeObjStatus(name=kube_obj["name"],
                               kind=kube_obj["kind"],
                               validator=kube_obj["validator"],
                               waiter=waiter,
                               uid=uid,
                               timer=timer)
        if timer:
            timer.start()

        self.record.post_kubeobj_status(status)
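
A hedged usage sketch based on the docstring above; `monitor` and `waiter` stand in for real objects, and the validator signature is assumed:

# Illustrative only: wait up to 60 seconds for a pod to reach Running.
monitor.wait_for_kube_object(
    kube_obj={
        "kind": "pods",
        "name": "my-pod",
        "validator": lambda obj: obj and obj.status.phase == "Running"
    },
    timeout=60,
    waiter=waiter)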
Example #17
    def create(self):
        """
        Create a new object using kubectl create -f <config_file>
        """
        if not self._kube_conf_file:
            raise ValueError("Cannot create object without config file")
        self._kube_conf_file.generate_tmp_file()
        stdout, stderr = self._call_kubectl("create")

        # Some of our yaml files contain multiple kubernetes objects (e.g.
        # Service, Deployment), so the output of the command can span
        # multiple lines.
        if stdout:
            for line in stdout.splitlines():
                logger.info(line)
        if stderr:
            logger.warning("Failed to create object %s due to %s",
                           self._config_file, stderr)
            if "already exists" in stderr:
                # As a temp work around for AA-3209
                logger.warning(
                    "Object %s already exists, which is not expected. Deleting the object and retrying create",
                    self._config_file)
                self._call_kubectl("delete")
                retry_stdout, retry_stderr = self._call_kubectl("create")
                if retry_stdout:
                    for line in retry_stdout.splitlines():
                        logger.info(line)
                if retry_stderr:
                    logger.error("Object %s cannot be created after retry: %s",
                                 self._config_file, retry_stderr)
                    raise AXPlatformException(
                        "Object {} cannot be created after retry".format(
                            self._config_file))
            else:
                raise AXPlatformException(
                    "Unrecognized error during create: {}".format(stderr))
        self._kube_conf_file.delete_tmp_file()
Example #18
    def upload_cluster_status_before_pause(self, status):
        """
        We upload cluster asg configures once for idempotency. i.e. when pause cluster failed but we have already
        scaled asg to 0, the next time we execute pause-cluster should use the status it uploaded before it even
        tried to scale cluster down
        """
        logger.info("Uploading Argo cluster status before pause ...")
        if self._bucket.get_object(key=self._s3_cluster_state_before_pause):
            logger.info("Status before pause already uploaded")
            return

        if not self._bucket.put_object(key=self._s3_cluster_state_before_pause, data=status):
            raise AXPlatformException("Failed to upload cluster status before pause")
        logger.info("Uploading Argo cluster status before pause ... DONE")
Example #19
    def check_in_cloud(self):
        """
        This function checks if the volume exists in the cloud provider
        On error it raises AXPlatformException
        """
        state = self._get_from_provider()
        if state is None:
            raise AXPlatformException("Missing volume for {}".format(
                self.name))

        # This is due to a bug in Kubernetes. See AA-1764.
        # We only check when adding a ref, to minimize the number of boto
        # calls; the disk's existence only matters when a ref is being added.
        self._check_volume_in_cloud_provider(state)
Example #20
File: rest.py  Project: nuaays/argo
def axmon_api_get_portal():
    """
    Get portal connection information

    Returns the portal connection information as a json object
    """
    try:
        portal = {
            "cluster_name_id": os.getenv("AX_CLUSTER_NAME_ID"),
            "customer_id": AXCustomerId().get_customer_id()
        }
        return jsonify(portal)
    except Exception as e:
        raise AXPlatformException("Critical environment variable missing: {}".format(e))
Example #21
File: pod.py  Project: zhan849/argo
    def enable_docker(self, size_in_mb):
        if "main" not in self.cmap:
            raise AXPlatformException("Pod needs to have main container before enabling docker")

        # create the dind sidecar container
        dind_c = SidecarDockerDaemon(size_in_mb)
        if Cloud().in_cloud_aws():
            dind_c.args = ["--storage-driver=overlay2"]
        elif Cloud().in_cloud_gcp():
            # Current GKE defaults to overlay.
            dind_c.args = ["--storage-driver=overlay"]
        self.cmap["dind"] = dind_c
        self.cmap["main"].add_env("DOCKER_HOST", value="tcp://localhost:2375")

        return dind_c
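
A hedged usage sketch; `pod` stands in for an instance of the class this method belongs to:

dind_c = pod.enable_docker(size_in_mb=2048)
# The main container can now reach the dind sidecar via
# DOCKER_HOST=tcp://localhost:2375, so plain `docker` commands work inside it.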
Example #22
    def delete_ref(self, ref):
        state = self._get_from_provider()
        if state is None:
            raise AXPlatformException("Missing volume for {}".format(
                self.name))

        # refs is an array of ref
        refs_str = state.metadata.annotations['ax_refs']
        refs = ast.literal_eval(refs_str)

        ref_exists = ref in refs
        if not ref_exists:
            return refs
        newrefs = [r for r in refs if r != ref]
        self._add_state_and_update(False, newrefs)
        return newrefs
Example #23
    def get_last_pod(self):
        try:
            pod_list = self.get_pod_list()
            if len(pod_list.items) == 0:
                raise ValueError("No Pod found for job {}".format(
                    self.jobname))
            p = None
            for i in pod_list.items:
                if p is None or i.status.startTime > p.status.startTime:
                    p = i
            pod = Pod(p.metadata.name)
            pod.build_attributes()
            return pod

        except swagger_client.rest.ApiException as e:
            details = json.loads(e.body)
            raise AXPlatformException(message=details["message"])
Example #24
        def _upload(s, d, meta_data):
            # To be consistent with file uploaded from container_outer_executor
            #   - StorageClass is by default "STANDARD", and
            #   - ContentLength is used only when file size cannot be automatically
            #     determined, but as we upload .gz files after IN_CLOSE_WRITE, we
            #     don't need to set it
            extra_args = {
                "Metadata": meta_data,
                "ContentDisposition": "attachment; filename={}".format(full_name)
            }
            if AXS3Bucket.supports_encryption():
                extra_args["ACL"] = "bucket-owner-full-control"
                extra_args["ServerSideEncryption"] = "AES256"

            logger.info("about to upload log %s to %s (%s) to s3", s, d, artifact_uuid)
            if not bucket.put_file(local_file_name=s, s3_key=d, ExtraArgs=extra_args):
                raise AXPlatformException("Failed to put object {} to s3 {}".format(s, d))
            logger.debug("upload %s done", artifact_uuid)
Example #25
    def stop_all_pods(self, delete_pod, force):
        if (not delete_pod) and force:
            logger.warning(
                "stop_all_pods(%s, delete_pod=%s, force=%s) has no effect.",
                self.jobname, delete_pod, force)
            return

        try:
            pods = self.get_pod_list()
            if not force:
                for p in pods.items:
                    if p.status.phase in ["Pending", "Running"]:
                        pod = Pod(p.metadata.name)
                        pod.stop(self.jobname)
                    else:
                        logger.debug("Don't need to stop [%s][%s], status=%s",
                                     self.jobname, p.metadata.name,
                                     p.status.phase)
            else:
                logger.debug("Force delete pods for [%s]", self.jobname)

            if delete_pod:
                logger.debug("Deleting all pods for [%s]", self.jobname)
                self.client.api.deletecollection_namespaced_pod(
                    namespace=self.kube_namespace,
                    label_selector="job-name={}".format(self.jobname))
                logger.debug("Deleted all pods for [%s]", self.jobname)
            else:
                logger.debug("Don't delete pods for [%s]", self.jobname)

            for p in pods.items:
                pod = Pod(p.metadata.name)
                logger.debug("Delete volumes for [%s][%s]", self.jobname,
                             p.metadata.name)
                pod._delete_volumes_for_pod(p)

            time.sleep(DELETE_TASK_GRACE_PERIOD)

        except swagger_client.rest.ApiException as e:
            logger.exception("delete_all_pods")
            details = json.loads(e.body)
            raise AXPlatformException(message=details["message"])
Example #26
    def get_container_detail(self, cid):
        container = self._containers.get(cid, None)
        if not container:
            raise AXPlatformException(
                "No container record for container id {}".format(cid))
        container_config = dict(
            id=cid,
            name=container["name"],
            host_id=self._host_info['id'],
            host_name=self._host_info['name'],
            cost_id=self._service_config.get_cost_id(cid),
            service_id=self._service_config.get_service_id(cid))

        # TODO: add reserved cpu/mem when upper layer is handling them
        # Discussed with Ying that we should not post these two fields now
        #
        # resources = self._service_config.get_resources(cname)
        # container_config['cpu'] = getattr(resources, 'cpu_cores', 0.0) if resources else 0.0
        # container_config['mem'] = getattr(resources, 'mem_mib', 0.0) if resources else 0.0
        return container_config
Example #27
    def restart_master(self):
        started_master_id = self.discover_master(state=[EC2InstanceState.Running])
        if started_master_id:
            logger.info("Master %s is already running", started_master_id)
            return

        stopped_master_id = self.discover_master(state=[EC2InstanceState.Stopped])
        if not stopped_master_id:
            raise AXPlatformException("Cannot find a previously stopped master instance")

        # As we can always start a "stopped" instance, any other exception is propagated.
        self.client.start_instances(InstanceIds=[stopped_master_id])

        logger.info("Waiting for master %s to get into state \"running\"", stopped_master_id)
        while True:
            running_master_id = self.discover_master(state=[EC2InstanceState.Running])
            if running_master_id:
                logger.info("Master %s successfully started", running_master_id)
                return
            else:
                time.sleep(5)
Example #28
    def get_user_data(self):
        """
        Get's the user-data for the current master. Note that the user-data is base64 encoded when it is
        downloaded. Writes the data into a file.
        """
        # The user-data is base64 encoded.
        user_data = self.client.describe_instance_attribute(
            Attribute='userData', InstanceId=self.master_instance.instance_id)['UserData']['Value']
        # Download the user-data and store it in a temporary file. This data is base64 encoded.
        # It is better to use a well-known location for this file rather than one generated by mkstemp (or variants).
        # That way, this file can be populated the first time this pod runs, or even later, by simply downloading
        # the user-data from S3.
        try:
            user_data = self.user_data_fixup(user_data)
        except Exception as e:
            logger.exception("Failed while fixing up user-data")
            raise AXPlatformException("Failed while fixing up user-data: " + str(e))

        with open(USER_DATA_FILE_NEW, "w") as f:
            f.write(user_data)
        return USER_DATA_FILE_NEW
Example #29
    def create_objects(self, objects):
        """
        Start kubernetes objects based on records.
        Wait for all of them.

        :param objects: AXPlatformObjectGroup
        """
        if objects is None or len(objects.object_set) == 0:
            return

        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_create_group(
                policy=objects.policy,
                policy_predicate=objects.policy_predicate,
                consistency=objects.consistency):
            logger.debug(
                "Skipping object group (%s) creation based on policy (%s), policy predicate (%s), consistency (%s)",
                objects.name, objects.policy, objects.policy_predicate,
                objects.consistency)
            return
        logger.info("Create step: %s", objects.name)
        logger.info("Creating platform objects\n\n%s",
                    self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            namespace = obj.namespace
            async_results[name] = pool.apply_async(
                self.start_one, args=(name, ), kwds={"namespace": namespace})
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Create")
        logger.info(report)

        if failed:
            raise AXPlatformException("Failed to create platform objects.")
Example #30
    def stop_master(self):
        stop_master_requested = False
        master_instance_id = self.discover_master(
            state=[EC2InstanceState.Stopping, EC2InstanceState.Stopped])
        if master_instance_id:
            stop_master_requested = True

        if not stop_master_requested:
            master_instance_id = self.discover_master(state=["*"])
            if not master_instance_id:
                raise AXPlatformException("Cannot find master instance")
            try:
                self.client.stop_instances(InstanceIds=[master_instance_id])
            except ClientError as ce:
                if "UnsupportedOperation" in str(
                        ce) and "StopInstances" in str(ce):
                    logger.warning(
                        "Master instance %s a spot instance, which cannot be stopped."
                    )
                    return
                elif "IncorrectInstanceState" in str(ce):
                    # Master could be in "terminating", "terminated", or "stopped" state. It does not
                    # make sense that first 2 states could kick in, unless there is some human intervention
                    # so the code will stuck in waiting for master to go into "stopped" state, which is
                    # a good indication for checking manually
                    pass
                else:
                    raise ce
        logger.info("Waiting for master %s to get into state \"stopped\"",
                    master_instance_id)
        while True:
            stopped_master = self.discover_master(
                state=[EC2InstanceState.Stopped])
            if stopped_master:
                logger.info("Master %s successfully stopped",
                            master_instance_id)
                return
            else:
                time.sleep(5)