Esempio n. 1
0
class Deployment(object):
    """
    This class creates and manages a single deployment object
    A deployment consists of the following specifications in kubernetes
    1. A kubernetes deployment spec
    2. Zero or more kubernetes service specs
    3. Zero or more ingress rules

    All functions in the object need to be idempotent.
    """

    def __init__(self, name, application):
        """
        Bind this object to one named deployment inside one application.

        Each deployment has a name and needs to be part of an application.
        Application maps to a kubernetes namespace and the deployment will
        be created in this namespace. The constructor does not call create();
        it only builds the helper objects used by the other methods.

        Args:
            name: deployment name
            application: the application that this deployment runs under
        """
        self.name = name
        self.application = application
        # API client used for every kubernetes call made by this class
        self.client = KubernetesApiClient(use_proxy=True)
        # NOTE(review): _nameid is not read anywhere else in this class
        self._nameid = AXClusterId().get_cluster_name_id()
        # Supplies image_namespace / image_version for the artifacts container
        self._software_info = SoftwareInfo()

        # Application wrapper; status() uses it to fetch events
        self._app_obj = Application(application)

        # Collects the supplemental resources (routes, secrets, named volumes)
        # created on behalf of this deployment
        self._resources = AXResources()
        # Deployment specification; stays None until create() stores one
        self.spec = None

    def create(self, spec):
        """
        Create a deployment from the template specified.

        The supplemental resources are created first, then the kubernetes
        deployment spec is generated, the resources are recorded in it and
        the result is submitted to kubernetes.

        Idempotency: This function is idempotent. A create of identical spec will
        have no impact if the deployment already exists. If the spec is different
        then the existing deployment will be updated.

        Args:
            spec: deployment specification; stored in self.spec and read by
                  the helper methods of this class
        """
        @retry_unless(status_code=[404, 422])
        def create_in_provider(deployment_spec):
            try:
                logger.info("Creating deployment %s in Kubernetes namespace %s", self.name, self.application)
                self.client.apisappsv1beta1_api.create_namespaced_deployment(deployment_spec, self.application)
                logger.info("Done creating deployment %s in Kubernetes namespace %s", self.name, self.application)
            except swagger_client.rest.ApiException as api_error:
                if api_error.status != 409:
                    raise api_error
                # 409 from create: fall back to replacing the deployment of the same name
                self.client.apisappsv1beta1_api.replace_namespaced_deployment(
                    deployment_spec, self.application, self.name)

        with DeploymentOperation(self):
            self.spec = spec

            # Validate the template before anything is created
            self._template_checks()

            # Supplemental resources (routes, ingress rules, ...) go first
            self._create_deployment_resources()

            # Generate the deployment spec and store the resources in it
            deployment_spec = self._create_deployment_spec()
            self._resources.finalize(deployment_spec)

            # Finally create the deployment object in kubernetes
            create_in_provider(deployment_spec)

    def delete(self, timeout=None):
        """
        Delete the deployment.

        The deployment is first scaled to 0 and this method waits (polling
        every 2 seconds) until the scale reports 0 replicas or the scale
        object is gone. It then deletes the resources recorded on the
        deployment, the deployment itself and its replica sets.

        Idempotency: This function is idempotent. If deployment does not exist then
        delete will silently fail without raising any exceptions.
        Args:
            timeout: In seconds or None for infinite
        """
        options = swagger_client.V1DeleteOptions()
        options.grace_period_seconds = 1
        options.orphan_dependents = False

        # The retry decorator takes its delays in milliseconds (see wait_fixed
        # below) while the public timeout is documented in seconds, so convert.
        max_wait_ms = None if timeout is None else int(timeout * 1000)

        def check_result(result):
            # True for retry False for done
            return not result

        @retry_unless(swallow_code=[404])
        def get_scale_from_provider():
            return self.client.apisappsv1beta1_api.read_namespaced_scale_scale(self.application, self.name)

        @retry(retry_on_result=check_result, wait_fixed=2000, stop_max_delay=max_wait_ms)
        def wait_for_scale_to_zero():
            logger.debug("Wait for scale of deployment to 0 for {} {}".format(self.application, self.name))

            scale = get_scale_from_provider()
            # A None scale is treated as "nothing left to wait for"
            # (presumably the 404 swallowed by retry_unless).
            return scale is None or scale.status.replicas == 0

        @retry_unless(swallow_code=[404, 409])
        def delete_in_provider():
            logger.debug("Deleting deployment for {} {}".format(self.application, self.name))
            self.client.apisappsv1beta1_api.delete_namespaced_deployment(options, self.application, self.name)

        def delete_rs_in_provider():
            logger.debug("Deleting replica set for {} {}".format(self.application, self.name))
            self.client.extensionsv1beta1.deletecollection_namespaced_replica_set(self.application, label_selector="deployment={}".format(self.name))

        # now delete deployment object and replication set
        with DeploymentOperation(self):
            # Read the deployment before scaling down so its recorded
            # resources can still be cleaned up afterwards.
            dep_obj = self._deployment_status()
            self._scale_to(0)
            wait_for_scale_to_zero()

            if dep_obj:
                resources = AXResources(existing=dep_obj)
                resources.delete_all()

            delete_in_provider()
            delete_rs_in_provider()

    def status(self):
        """
        Get the status of the deployment.

        Returns: a dict with selected deployment fields (name, generation,
        desired_replicas, available_replicas, unavailable_replicas), the
        deployment's events under "events" and one entry per pod under "pods".
        Each pod entry carries its own "events" and a "failure" field.

        If deployment is not found then this will raise an AXNotFoundException (404)
        """
        # STEP 1: Get status of deployment
        stat = self._deployment_status()
        if stat is None:
            raise AXNotFoundException("Deployment {} not found in application {}".format(self.name, self.application))

        dep_field_map = {
            "name": "metadata.name",
            "generation": "metadata.annotations.ax_generation",
            "desired_replicas": "status.replicas",
            "available_replicas": "status.available_replicas",
            "unavailable_replicas": "status.unavailable_replicas"
        }
        ret = KubeObject.swagger_obj_extract(stat, dep_field_map, serializable=True)

        # STEP 2: Get the pods for the deployment and events associated
        # _deployment_pods() swallows 404 just like _deployment_status(), so it
        # can hand back None; treat that the same as an empty pod list.
        pod_listing = self._deployment_pods()
        podlist = pod_listing.items if pod_listing is not None else None
        dep_events = self._app_obj.events(name=self.name)
        event_field_map = {
            "message": "message",
            "reason": "reason",
            "source": "source.component",
            "host": "source.host",
            "firstTS": "first_timestamp",
            "lastTS": "last_timestamp",
            "count": "count",
            "container": "involved_object.field_path",
            "type": "type"
        }
        ret["events"] = [
            KubeObject.swagger_obj_extract(event, event_field_map, serializable=True)
            for event in dep_events
        ]

        ret["pods"] = []
        for pod in podlist or []:
            # fill pod status and containers
            pstatus = Pod.massage_pod_status(pod)

            # fill events for pod
            pstatus["events"] = [
                KubeObject.swagger_obj_extract(event, event_field_map, serializable=True)
                for event in self._app_obj.events(name=pod.metadata.name)
            ]

            # fill pod failure information for pod based on events
            pstatus["failure"] = Deployment._pod_failed_info(pstatus)

            ret["pods"].append(pstatus)

        # STEP 3: From the deployment spec get the resources created by deployment
        # TODO: Add this when services are created by deployment

        return ret

    def get_labels(self):
        """
        Return the selector labels (spec.selector.match_labels) of this deployment.

        Raises AXNotFoundException when the deployment cannot be read.
        """
        deployment = self._deployment_status()
        if deployment is not None:
            extracted = KubeObject.swagger_obj_extract(deployment, {"labels": "spec.selector.match_labels"})
            return extracted['labels']

        raise AXNotFoundException("Did not find deployment {} in application {}".format(self.name, self.application))

    @staticmethod
    def _pod_failed_info(pod_status):
        if pod_status["phase"] != "Pending":
            return None

        for ev in pod_status["events"] or []:
            if ev["reason"] == "Failed" and ev["source"] == "kubelet" and ev["type"] == "Warning" and \
                            "Failed to pull image" in ev["message"] and ev["count"] > 5:
                return {
                    "reason": "ImagePullFailure",
                    "message": ev["message"]
                }
        return None

    def scale(self, replicas):
        """
        Scale the deployment to the requested number of replicas.

        Args:
            replicas: desired replica count

        Raises AXApiForbiddenReq when more than one replica is requested for
        an existing deployment that has a resource whose type starts with
        "ax.platform.volumes".
        """
        with DeploymentOperation(self):
            # Deployments with volumes can't be scaled to > 1.
            if replicas > 1:
                existing = self._deployment_status()
                if existing:
                    tracked = AXResources(existing=existing)
                    uses_volumes = any(
                        resource_type.startswith("ax.platform.volumes")
                        for resource_type in tracked.get_all_types()
                    )
                    if uses_volumes:
                        raise AXApiForbiddenReq("Deployments with volumes can't be scaled to > 1 ({})".format(replicas))

            self._scale_to(replicas)

    @retry_unless(swallow_code=[404])
    def _deployment_status(self):
        """
        Read this deployment object from kubernetes.

        Callers treat a None result as "deployment not found"; presumably
        that is what retry_unless yields when it swallows the 404.
        """
        apps_api = self.client.apisappsv1beta1_api
        return apps_api.read_namespaced_deployment(self.application, self.name)

    @retry_unless(swallow_code=[404])
    def _deployment_pods(self):
        """
        List the pods in the application namespace that carry the label
        deployment=<this deployment's name>.
        """
        selector = "deployment={}".format(self.name)
        return self.client.api.list_namespaced_pod(self.application, label_selector=selector)

    def _create_deployment_spec(self):
        """
        Build the kubernetes deployment object for self.spec.

        Assembles a PodSpec (main container, artifacts container, optional
        docker sidecar, volumes, annotations and labels) and wraps it with
        _generate_deployment_spec_for_pod(). Secret resources created along
        the way are recorded in self._resources.

        Returns: the object produced by _generate_deployment_spec_for_pod()
        Raises: ValueError if the main container sets both executor_spec and
                docker_spec
        """

        pod_spec = PodSpec(self.name, namespace=self.application)
        main_container = self.spec.template.get_main_container()

        main_container_spec = self._create_main_container_spec(main_container)
        pod_spec.add_main_container(main_container_spec)

        # Volumes declared as inputs of the main container
        container_vols = self._get_main_container_vols()
        main_container_spec.add_volumes(container_vols)

        # Resource constraints are added with limit=None for both cpu and memory
        hw_res = main_container.get_resources()
        main_container_spec.add_resource_constraints("cpu_cores", hw_res.cpu_cores, limit=None)
        main_container_spec.add_resource_constraints("mem_mib", hw_res.mem_mib, limit=None)

        artifacts_container = pod_spec.enable_artifacts(self._software_info.image_namespace, self._software_info.image_version,
                                                        None, main_container.to_dict())
        # Configs are added to the artifacts container as volumes; the returned
        # resources are tracked so they can be cleaned up with the deployment
        secret_resources = artifacts_container.add_configs_as_vols(main_container.get_all_configs(), self.name, self.application)
        self._resources.insert_all(secret_resources)

        # Set up special circumstances based on annotations
        # Check if we need to circumvent the executor script. This is needed for containers that run
        # special init processes such as systemd as these processes like to be pid 1
        if main_container.executor_spec:
            main_container_spec.command = None
            if main_container.docker_spec is not None:
                raise ValueError("We do not support ax_ea_docker_enable with ax_ea_executor")

        # Does this container need to be privileged
        main_container_spec.privileged = main_container.privileged

        # Check if docker daemon sidecar needs to be added
        if main_container.docker_spec:
            # NOTE(review): an earlier comment here said the graph storage size is
            # in GiB, but the attribute passed is graph_storage_size_mib - confirm
            # which unit enable_docker expects
            dind_container_spec = pod_spec.enable_docker(main_container.docker_spec.graph_storage_size_mib)
            dind_container_spec.add_volumes(pod_spec.get_artifact_vols())
            dind_container_spec.add_resource_constraints("cpu_cores", main_container.docker_spec.cpu_cores, limit=None)
            dind_container_spec.add_resource_constraints("mem_mib", main_container.docker_spec.mem_mib, limit=None)
            # The sidecar also gets the main container's volumes
            dind_container_spec.add_volumes(container_vols)

        # Do we only need docker graph storage volume for the main container
        if main_container.graph_storage:
            dgs_vol = ContainerVolume("graph-storage-vol-only", main_container.graph_storage.mount_path)
            dgs_vol.set_type("DOCKERGRAPHSTORAGE", main_container.graph_storage.graph_storage_size_mib)
            main_container_spec.add_volume(dgs_vol)

        # set the pod hostname to value provided in main container spec
        pod_spec.hostname = main_container.hostname

        # TODO: This needs fixup. job name is used in init container to ask permission to start
        # TODO: Don't know if this is needed in deployment or not?
        artifacts_container.add_env("AX_JOB_NAME", value=self.application)
        artifacts_container.add_env("AX_DEPLOYMENT_NEW", value="True")

        if len(container_vols) > 0:
            # Work on a deep copy so the mount paths of the main container's
            # volumes are left untouched; the copies are mounted in the
            # artifacts container under an "/ax/fix" prefix
            tmp_container_vols = copy.deepcopy(container_vols)
            volume_paths = []
            for v in tmp_container_vols:
                v.set_mount_path("/ax/fix" + v.volmount.mount_path)
                volume_paths.append(v.volmount.mount_path)
            artifacts_container.add_volumes(tmp_container_vols)
            logger.info("Volumes to chmod: %s", volume_paths)
            # The env value is the str() of the python list of paths
            artifacts_container.add_env("AX_VOL_MOUNT_PATHS", value=str(volume_paths))

        # add annotation for service env which will show up in artifacts container
        pod_spec.add_annotation("AX_SERVICE_ENV", self._generate_service_env(self.spec.template))
        pod_spec.add_annotation("AX_IDENTIFIERS", self._get_identifiers())
        if self.spec.costid:
            pod_spec.add_annotation("ax_costid", json.dumps(self.spec.costid))

        # The "deployment" label is what the deployment selector and the pod /
        # replica set lookups in this class match on
        pod_spec.add_label("deployment", self.name)
        pod_spec.add_label("application", self.application)
        pod_spec.add_label("tier", "user")
        pod_spec.add_label("deployment_id", self.spec.id)

        # now that pod is ready get its spec and wrap it in a deployment
        k8s_spec = self._generate_deployment_spec_for_pod(pod_spec.get_spec())

        logger.info("Generated Kubernetes spec for deployment %s", self.name)
        return k8s_spec

    def _create_main_container_spec(self, container_template):
        """
        Build the Container spec for the main container of the deployment.

        Adds the handshake and pod-information environment variables, the
        environment from the user template (secret-backed entries create a
        SecretResource that is recorded in self._resources) and the applet
        socket volume.

        :type container_template: argo.template.v1.container.ContainerTemplate
        :rtype Container
        """
        logger.debug("Container template is {}".format(container_template))

        spec = Container(string_to_dns_label(container_template.name),
                         container_template.image,
                         pull_policy=container_template.image_pull_policy)
        spec.parse_probe_spec(container_template)

        # Necessary envs for handshake
        spec.add_env("AX_HANDSHAKE_VERSION", value=CUR_RECORD_VERSION)

        # Envs introduced to user, each sourced from a pod field path
        pod_field_envs = (
            ("AX_POD_NAME", "metadata.name"),
            ("AX_POD_IP", "status.podIP"),
            ("AX_POD_NAMESPACE", "metadata.namespace"),
            ("AX_NODE_NAME", "spec.nodeName"),
        )
        for env_name, field_path in pod_field_envs:
            spec.add_env(env_name, value_from=field_path)
        spec.add_env("AX_CLUSTER_META_URL_V1", value=CLUSTER_META_URL_V1)

        # envs from user spec
        for user_env in container_template.env:
            cfg_ns, cfg_name, cfg_key = user_env.get_config()
            if cfg_ns is None:
                # plain literal value
                spec.add_env(user_env.name, value=user_env.value)
                continue

            # config-backed value: create the secret and reference it
            secret = SecretResource(cfg_ns, cfg_name, self.name, self.application)
            secret.create()
            self._resources.insert(secret)
            spec.add_env(user_env.name, value_from_secret=(secret.get_resource_name(), cfg_key))

        # Unix socket for applet
        applet_volume = ContainerVolume("applet", "/tmp/applatix.io/")
        applet_volume.set_type("HOSTPATH", "/var/run/")
        spec.add_volume(applet_volume)

        return spec

    @staticmethod
    def _get_valid_name_from_axrn(axrn):
        # AXRN's will have non-alphanumeric characters such as : / @, etc which K8S doesn't
        # like in its PVC name. Replace all non-alphanumeric characters with -.
        name_regex = re.compile(r"\W+")
        return name_regex.sub("-", axrn).replace("_", "-")

    def _get_main_container_vols(self):
        """
        Build a ContainerVolume of type AWS_EBS for every input volume of the
        main container.

        Each volume's details must contain "resource_id" and "filesystem"
        (enforced with assertions).

        Returns: list of ContainerVolume, one per input volume
        """
        main_container = self.spec.template.get_main_container()
        volumes = []

        for raw_name, volume in iteritems(main_container.inputs.volumes):
            # sanitize the volume name for kubernetes
            k8s_name = string_to_dns_label(raw_name)
            container_volume = ContainerVolume(k8s_name, volume.mount_path)

            details = volume.details
            assert "resource_id" in details, "Volume resource-id absent in volume details"
            assert "filesystem" in details, "Volume filesystem absent in volume details"

            container_volume.set_type("AWS_EBS", k8s_name, details["resource_id"], details["filesystem"])
            logger.debug("Volume {} {} mounted at {}".format(k8s_name, details, volume.mount_path))
            volumes.append(container_volume)

        return volumes

    def _generate_service_env(self, template):
        return base64.b64encode(json.dumps(template.to_dict()))

    def _get_identifiers(self):
        return {
            "application_id": self.spec.app_generation,
            "deployment_id": self.spec.id,
            "static": {
                "application_id": self.spec.app_id,
                "deployment_id": self.spec.deployment_id
            }
        }

    def _generate_deployment_spec_for_pod(self, pod_spec):
        """
        Wrap a pod template in a V1beta1Deployment named after this deployment.

        The strategy comes from _get_strategy(), the replica count from the
        template's scale.min and the selector matches the "deployment" label.
        min_ready_seconds is only set when the template provides a truthy value.

        Args:
            pod_spec: pod template to use as the deployment's template

        Returns: swagger_client.V1beta1Deployment
        """
        template = self.spec.template

        object_meta = swagger_client.V1ObjectMeta()
        object_meta.name = self.name

        deployment_spec = swagger_client.V1beta1DeploymentSpec()
        deployment_spec.strategy = self._get_strategy()

        min_ready = template.min_ready_seconds
        if min_ready:
            deployment_spec.min_ready_seconds = min_ready

        label_selector = swagger_client.V1LabelSelector()
        label_selector.match_labels = {
            "deployment": self.name
        }
        deployment_spec.selector = label_selector

        deployment_spec.replicas = template.scale.min
        deployment_spec.template = pod_spec

        deployment = swagger_client.V1beta1Deployment()
        deployment.metadata = object_meta
        deployment.spec = deployment_spec
        return deployment

    def _create_deployment_resources(self):
        """
        Create the supplemental resources of the deployment and record each
        one in self._resources.

        Three kinds of resources are handled, in this order: internal routes
        (skipped when their port spec is empty), external routes and the
        named volumes used by the main container.

        Raises AXNotFoundException when the ELB for an external route's
        visibility cannot be found.
        """
        template = self.spec.template

        for internal in template.internal_routes:
            # ignore empty port spec
            if len(internal.ports) == 0:
                logger.debug("Skipping internal route {} as port spec is empty".format(internal.name))
                continue
            internal_route = InternalRoute(internal.name, self.application)
            internal_route.create(internal.to_dict()["ports"], selector={"deployment": self.name}, owner=self.name)
            self._resources.insert(internal_route)
            logger.debug("Created route {}".format(internal_route))

        for external in template.external_routes:
            # drop a single trailing dot from the dns name
            dns_name = external.dns_name()
            if dns_name.endswith("."):
                dns_name = dns_name[:-1]

            external_route = ExternalRoute(dns_name, self.application, {"deployment": self.name},
                                           external.target_port, external.ip_white_list, external.visibility)
            try:
                elb_addr = visibility_to_elb_addr(external.visibility)
                elb_name = visibility_to_elb_name(external.visibility)
            except AXNotFoundException:
                # Re-raise with a message tailored to the requested visibility
                if external.visibility == ExternalRouteVisibility.VISIBILITY_WORLD:
                    raise AXNotFoundException("Could not find the public ELB. Please report this error to Applatix Support at [email protected]")
                assert external.visibility == ExternalRouteVisibility.VISIBILITY_ORGANIZATION, "Only world and organization are currently supported as visibility attributes"
                raise AXNotFoundException("Please create a private ELB using the template named 'ax_private_elb_creator_workflow' before using 'visibility=organization'")

            created_name = external_route.create(elb_addr, elb_name=elb_name)
            self._resources.insert(external_route)
            logger.debug("Created external route {} for {}/{}/{}".format(created_name, self.application, self.name, dns_name))

        for key_name, vol in iteritems(template.get_main_container().inputs.volumes):
            details = vol.details
            assert "resource_id" in details, "Volume resource_id absent in volume details"
            axrn = details.get("axrn", None)
            resource_id = details.get("resource_id", None)
            assert axrn is not None and resource_id is not None, "axrn and resource_id are required details for volume {}".format(key_name)
            named_volume = AXNamedVolumeResource(axrn, resource_id)
            named_volume.create()
            self._resources.insert(named_volume)
            logger.debug("Using named volume resource {} in application {}".format(axrn, self.application))

    @retry_unless(status_code=[422], swallow_code=[400, 404])
    def _scale_to(self, replicas):
        """
        Replace the scale sub-resource of this deployment so that it asks
        for the given number of replicas.

        Args:
            replicas: desired replica count
        """
        logger.debug("Scaling deployment to {} for {} {}".format(replicas, self.application, self.name))

        scale_request = swagger_client.V1beta1Scale()

        scale_spec = swagger_client.V1beta1ScaleSpec()
        scale_spec.replicas = replicas
        scale_request.spec = scale_spec

        object_meta = swagger_client.V1ObjectMeta()
        object_meta.name = self.name
        object_meta.namespace = self.application
        scale_request.metadata = object_meta

        self.client.apisappsv1beta1_api.replace_namespaced_scale_scale(scale_request, self.application, self.name)

    def _template_checks(self):
        if self.spec.template.scale and self.spec.template.scale.min > 1 and len(self.spec.template.volumes) >= 1:
            raise ValueError("Deployments with volumes can't have scale > 1 ({})".format(self.spec.template.scale.min))

    def _get_strategy(self):
        """
        Translate the template's strategy into a V1beta1DeploymentStrategy.

        A template strategy type of "rolling_update" becomes "RollingUpdate"
        with max_unavailable / max_surge copied over; any other type becomes
        "Recreate".
        """
        strategy = swagger_client.V1beta1DeploymentStrategy()
        template_strategy = self.spec.template.strategy

        if template_strategy.type != "rolling_update":
            strategy.type = "Recreate"
            return strategy

        strategy.type = "RollingUpdate"
        rolling = swagger_client.V1beta1RollingUpdateDeployment()
        rolling.max_unavailable = template_strategy.rolling_update.max_unavailable
        rolling.max_surge = template_strategy.rolling_update.max_surge
        strategy.rolling_update = rolling
        return strategy
Esempio n. 2
0
class Task(object):
    """
    The job of this object is to track the creation and deletion of a step in a workflow.
    Callers can create this object with the specification from service template, followed
    by call to create, wait_for_start etc
    """
    def __init__(self, name, namespace="axuser"):
        """
        Set up a task bound to a name and a kubernetes namespace.

        :param name: task name; the other methods use it as the pod name
        :param namespace: namespace in which the pod is created and looked up
        """
        self.name = name
        self.namespace = namespace
        # API client used by start() to create the pod
        self.client = KubernetesApiClient(use_proxy=True)

        self.service = None  # this is the argo.services.service.Service object
        # NOTE(review): _host_vols and the _s3_* attributes below are not read
        # in the visible part of this class
        self._host_vols = []
        self._name_id = AXClusterId().get_cluster_name_id()
        # Bucket / key prefix information derived from the cluster name id
        self._s3_bucket_ax_is_external = AXLogPath(self._name_id).is_external()
        self._s3_bucket_ax = AXLogPath(self._name_id).bucket()
        self._s3_key_prefix_ax = AXLogPath(self._name_id).artifact()
        self._s3_bucket = AXClusterDataPath(self._name_id).bucket()
        self._s3_key_prefix = AXClusterDataPath(self._name_id).artifact()

        # Supplies image_namespace / image_version for the artifacts container
        self.software_info = SoftwareInfo()
        # Collects the resources (e.g. secrets) created on behalf of this task
        self._resources = AXResources()

    def create(self, conf):
        """
        Build the pod specification for this task. Nothing is submitted to
        kubernetes here; pass the result to start() for that.

        :param conf: conf data from DevOps
        :return: swagger_client.V1Pod built from the parsed service
        """
        logger.debug("Task create for {}".format(json.dumps(conf)))
        self.service = Service()
        self.service.parse(conf)

        context = self.service.service_context
        labels = {
            "app": self.name,
            "service_instance_id": context.service_instance_id,
            "root_workflow_id": context.root_workflow_id,
            "leaf_full_path": string_to_k8s_label(context.leaf_full_path or "no_path"),
            "tier": "devops",
            "role": "user",
        }

        pod_template = self._container_to_pod(labels)

        # convert V1PodTemplateSpec to V1Pod
        pod = swagger_client.V1Pod()
        pod.metadata = pod_template.metadata
        pod.spec = pod_template.spec
        # record the resources created so far in the pod spec
        self._resources.finalize(pod)

        logger.debug("pod_spec {}".format(json.dumps(pod.to_dict())))
        return pod

    def start(self, spec):
        """
        Start a task by creating its pod in the task's namespace.

        :param spec: The swagger specification for Pod
        :type spec: swagger_client.V1Pod
        :return: whatever the pod creation call returns
        """
        spec_is_pod = isinstance(spec, swagger_client.V1Pod)
        assert spec_is_pod, "Unexpected object {} in Task.start".format(type(spec))

        @retry_unless(status_code=[409, 422])
        def create_in_provider():
            return self.client.api.create_namespaced_pod(spec, self.namespace)

        with TaskOperation(self):
            return create_in_provider()

    def status(self, status_obj=None):
        """
        Return the status of the job with the pass name
        Args:
            status_obj: if passed the job status will be used instead of queried from provider

        Returns: a json dict with task status. Keys are "active", "succeeded",
                 "failed", "reason" and "message". Note that "succeeded" is
                 True for any finished pod (phase Succeeded or Failed); use
                 "failed" to tell the two apart.

        Raises: AXPlatformException when the pod phase is "Unknown"
        """
        if status_obj:
            assert isinstance(status_obj, swagger_client.V1Pod), \
                "Unexpected status object {} in Task.status".format(type(status_obj))
            status = status_obj
        else:
            status = Pod(self.name, self.namespace)._get_status_obj()

        # Pod phases as described in pkg/api/v1/types.go of the kubernetes source:
        #   Pending   - accepted by the system, but one or more containers has not
        #               been started (includes scheduling and image pull time)
        #   Running   - bound to a node and all containers have been started; at
        #               least one is still running or being restarted
        #   Succeeded - all containers terminated voluntarily with exit code 0
        #   Failed    - all containers terminated and at least one failed
        #   Unknown   - state could not be obtained, typically a communication
        #               problem with the host of the pod
        phase = status.status.phase
        pending = phase == "Pending"
        failed = phase == "Failed"
        active = pending or phase == "Running"
        completed = failed or phase == "Succeeded"

        if phase == "Unknown":
            raise AXPlatformException(
                "Status of task {} could not be found due to some temporary problem. Please retry"
                .format(self.name))

        ret_status = {
            "active": active,
            "succeeded": completed,
            # TODO: Should this be removed
            "message": "",
            "reason": "",
            "failed": failed
        }

        if pending:
            # If the pod is pending it is likely stuck on container image pull failure
            try:
                for init_status in status.status.init_container_statuses or []:
                    waiting = init_status.state.waiting
                    if waiting is not None:
                        # find the first init container that is stuck on waiting and stuff the reason and message
                        ret_status["reason"] = waiting.reason or ""
                        ret_status["message"] = waiting.message or ""
                        break
            except Exception:
                # Reason/message are best effort only: never fail the status
                # call over them, but leave a trace instead of hiding the error.
                logger.debug("Could not extract waiting reason for pending task %s",
                             self.name, exc_info=True)

        return ret_status

    def delete(self, force=False):
        """
        Delete the task from kubernetes and returns the final status.

        The pod is stopped first unless force is set, then the resources
        recorded on the pod are deleted and finally the pod itself.

        :param force: when True the pod is deleted without being stopped first
        Returns: Last status of the job
        """
        logger.debug("Task delete for {}".format(self.name))
        with TaskOperation(self):
            # capture the final state before anything is torn down
            last_pod_obj = Pod(self.name, self.namespace)._get_status_obj()
            final_status = self.status(status_obj=last_pod_obj)

            pod = Pod(self.name, self.namespace)
            if not force:
                pod.stop()

            # delete dependents
            AXResources(existing=last_pod_obj).delete_all()

            # finally delete pod
            pod.delete()
            return final_status

    def get_log_endpoint(self):
        """
        Return the first of the two log urls reported by the task's pod.
        """
        pod = Pod(self.name, self.namespace)
        run_url, _unused = pod.get_log_urls()
        return run_url

    def stop(self):
        """
        Stop the pod of this task.
        """
        pod = Pod(self.name, self.namespace)
        pod.stop()

    def _container_to_pod(self, labels):
        """
        Build the pod template for the parsed service (self.service).

        The pod gets the main container with its input volumes, a wait
        container, the artifacts container and, when the template has a
        docker_spec, a docker sidecar. Annotations and the given labels are
        added last.

        :param labels: dict of label name -> value to put on the pod (may be None)
        :return: result of PodSpec.get_spec()
        """

        # generate the service environment
        # NOTE(review): the return value is discarded here and the method is
        # called again for the AX_SERVICE_ENV annotation below - confirm whether
        # this first call is needed for a side effect
        self._gen_service_env()

        pod_spec = PodSpec(self.name)
        pod_spec.restart_policy = "Never"

        main_container = self._container_spec()

        for vol_tag, vol in iteritems(self.service.template.inputs.volumes):
            # sanitize name for kubernetes
            vol_tag = string_to_dns_label(vol_tag)
            cvol = ContainerVolume(vol_tag, vol.mount_path)
            assert "resource_id" in vol.details and "filesystem" in vol.details, "resource_id and filesystem are required fields in volume details"
            cvol.set_type("AWS_EBS", vol_tag, vol.details["resource_id"],
                          vol.details["filesystem"])
            main_container.add_volume(cvol)
            logger.info("Mounting volume {} {} in {}".format(
                vol_tag, vol.details, vol.mount_path))

        pod_spec.add_main_container(main_container)
        wait_container = self._generate_wait_container_spec()
        pod_spec.add_wait_container(wait_container)

        # cpu gets no limit while memory is limited to the requested amount
        (cpu, mem, d_cpu, d_mem) = self._container_resources()
        main_container.add_resource_constraints("cpu_cores", cpu, limit=None)
        main_container.add_resource_constraints("mem_mib", mem, limit=mem)

        # handle artifacts
        self_sid = None
        if self.service.service_context:
            self_sid = self.service.service_context.service_instance_id

        # TODO: This function calls ax_artifact and needs to be rewritten. Ugly code.
        artifacts_container = pod_spec.enable_artifacts(
            self.software_info.image_namespace,
            self.software_info.image_version, self_sid,
            self.service.template.to_dict())
        artifacts_container.add_env("AX_JOB_NAME", value=self.name)
        # Configs are added as volumes; the returned resources are tracked so
        # delete() can clean them up
        secret_resources = artifacts_container.add_configs_as_vols(
            self.service.template.get_all_configs(), self.name, self.namespace)
        self._resources.insert_all(secret_resources)

        if self.service.template.docker_spec:
            # docker sidecar: shares the artifact volumes, memory limited like
            # the main container
            dind_c = pod_spec.enable_docker(
                self.service.template.docker_spec.graph_storage_size_mib)
            dind_c.add_volumes(pod_spec.get_artifact_vols())
            dind_c.add_resource_constraints("cpu_cores", d_cpu, limit=None)
            dind_c.add_resource_constraints("mem_mib", d_mem, limit=d_mem)

        # Same value as self_sid above, recomputed for the annotation
        service_id = None
        if self.service.service_context:
            service_id = self.service.service_context.service_instance_id
        pod_spec.add_annotation("ax_serviceid", service_id)
        pod_spec.add_annotation("ax_costid", json.dumps(self.service.costid))
        pod_spec.add_annotation("AX_SERVICE_ENV", self._gen_service_env())

        for k in labels or []:
            pod_spec.add_label(k, labels[k])

        return pod_spec.get_spec()

    def _container_spec(self):
        """
        Converts service template to a Container spec.

        Adds the task identification and pod-information environment
        variables followed by the environment from the template. Entries
        backed by a config create a SecretResource that is recorded in
        self._resources.
        """
        template = self.service.template
        context = self.service.service_context

        spec = Container(template.name, template.image,
                         pull_policy=template.image_pull_policy)

        spec.add_env("AX_CONTAINER_NAME", value=self.name)
        spec.add_env("AX_ROOT_SERVICE_INSTANCE_ID", value=context.root_workflow_id)
        spec.add_env("AX_SERVICE_INSTANCE_ID", value=context.service_instance_id)

        # Envs introduced to user, each sourced from a pod field path
        pod_field_envs = (
            ("AX_POD_NAME", "metadata.name"),
            ("AX_POD_IP", "status.podIP"),
            ("AX_POD_NAMESPACE", "metadata.namespace"),
            ("AX_NODE_NAME", "spec.nodeName"),
        )
        for env_name, field_path in pod_field_envs:
            spec.add_env(env_name, value_from=field_path)
        spec.add_env("AX_CLUSTER_META_URL_V1", value=CLUSTER_META_URL_V1)

        for user_env in template.env:
            cfg_ns, cfg_name, cfg_key = user_env.get_config()
            if cfg_ns is None:  # checking one of them is enough
                spec.add_env(user_env.name, value=user_env.value)
                continue

            secret = SecretResource(cfg_ns, cfg_name, self.name, self.namespace)
            secret.create()
            self._resources.insert(secret)
            spec.add_env(user_env.name,
                         value_from_secret=(secret.get_resource_name(), cfg_key))

        return spec

    def _container_resources(self):
        """
        Work out the cpu and memory figures for the main and dind containers.

        Returns:
            (main_cpu, main_mem, dind_cpu, dind_mem) as floats. The two dind
            values are 0.0 when the template has no docker_spec.

        Raises:
            ValueError: docker is enabled and either the dind memory or the
                main container memory is below its minimum
        """
        template = self.service.template

        main_cpu = float(template.resources.cpu_cores)
        main_mem = float(template.resources.mem_mib)

        if not template.docker_spec:
            # no docker support requested: nothing is reserved for dind
            return main_cpu, main_mem, 0.0, 0.0

        dind_cpu = float(template.docker_spec.cpu_cores)
        dind_mem = float(template.docker_spec.mem_mib)

        if dind_mem < MEM_MIB_MIN_DOCKER_ENABLE:
            raise ValueError(
                "mem_mib must have a minimum value of {} for docker support"
                .format(MEM_MIB_MIN_DOCKER_ENABLE))

        # NOTE(review): the main container minimum is only enforced on the
        # docker path - confirm whether that is intentional.
        if main_mem < MEM_MIB_MIN:
            raise ValueError(
                "mem_mib must have a minimum value of {}MB".format(
                    MEM_MIB_MIN))

        return main_cpu, main_mem, dind_cpu, dind_mem

    def _generate_wait_container_spec(self):
        """
        Build the SidecarTask container that accompanies the main container.

        The sidecar is created from the main container name and the software
        image namespace/version, and is given the job name, customer id,
        region and cluster name id through its environment.

        Returns:
            the populated SidecarTask
        """
        main_name = self.service.template.name

        sidecar = SidecarTask(main_name,
                              self.software_info.image_namespace,
                              self.software_info.image_version)

        sidecar.add_env("AX_MAIN_CONTAINER_NAME", value=main_name)
        sidecar.add_env("AX_JOB_NAME", value=self.name)

        customer_id = AXCustomerId().get_customer_id()
        sidecar.add_env("AX_CUSTOMER_ID", customer_id)

        region = AXClusterConfig().get_region()
        sidecar.add_env("AX_REGION", region)

        sidecar.add_env("AX_CLUSTER_NAME_ID", self._name_id)

        return sidecar

    def _gen_service_env(self):
        """
        Serialize the runtime description of this service.

        The description holds the S3 locations stored on this object, whether
        docker is enabled, the keep_return_code flag, the container command
        and args, and - when present - the container inputs, outputs and the
        service context (with the service name added to it).

        Returns:
            the JSON document, base64 encoded, as a native str
        """
        container = self.service.template

        service_env = {
            "container": {
                "docker": {}
            },
            "s3_bucket": self._s3_bucket,
            "s3_key_prefix": self._s3_key_prefix,
            "s3_bucket_ax_is_external": self._s3_bucket_ax_is_external,
            "s3_bucket_ax": self._s3_bucket_ax,
            "s3_key_prefix_ax": self._s3_key_prefix_ax,
            "docker_enable": container.docker_spec is not None
        }

        # if container is not 'once', i.e. need to be restarted if failed,
        # set keep_return_code to be True so the inner_executor will pass-through the return code
        service_env["keep_return_code"] = not container.once

        container_env = service_env["container"]
        container_env["docker"]["commands"] = container.command
        container_env["docker"]["args"] = container.args
        if container.inputs.count() > 0:
            container_env["inputs"] = container.inputs.to_dict()
        if container.outputs.count() > 0:
            container_env["outputs"] = container.outputs.to_dict()
        if self.service.service_context:
            container_env["service_context"] = self.service.service_context.to_dict()
            container_env["service_context"]["name"] = self.service.name

        # base64 keeps every character of the JSON document intact.
        # b64encode only accepts bytes, so the text is encoded first; json.dumps
        # escapes non-ASCII characters by default, which makes UTF-8 lossless here.
        payload = json.dumps(service_env).encode("utf-8")
        encoded = base64.b64encode(payload)
        if not isinstance(encoded, str):
            # Python 3 hands back bytes; callers expect text
            encoded = encoded.decode("ascii")
        return encoded

    @staticmethod
    def generate_name(conf):
        """
        Derive the kubernetes job name for a service object.

        When the template is 'once' (the default), the name is taken from the
        service id. Otherwise the caller must supply the name itself; this path
        is currently only used by the workflow executor, which already
        generates a unique name for the workflow.

        :param conf: service object containing the "template" and either
                     "id" or "name"
        :return: job name string, converted with string_to_dns_label
        :raises ValueError: once is false and no name was supplied
        """
        if conf["template"].get("once", True):
            return string_to_dns_label(conf["id"])

        # name is fully specified by caller. No user jobs are expected to use
        # this code path.
        given_name = conf.get("name", None)
        if given_name is None:
            raise ValueError(
                "name is a required field in service object for once=false."
            )
        return string_to_dns_label(given_name)

    @staticmethod
    def insert_defaults(conf):
        """
        This function inserts default that are required for Task processing
        :param conf: input conf
        :return: output conf
        """
        if conf["template"].get("name", None) is None:
            conf["template"]["name"] = "main"
        else:
            conf["template"]["name"] = string_to_dns_label(
                conf["template"]["name"])
        return conf
Esempio n. 3
0
    def delete(self, timeout=None):
        """
        Delete the deployment.

        The deployment is scaled to zero and polled until no replicas remain.
        Then the AXResources built from the existing deployment object are
        deleted, followed by the deployment itself and its replica sets.

        Idempotency: This function is idempotent. If deployment does not exist then
        delete will silently fail without raising any exceptions.
        Args:
            timeout: In seconds or None for infinite
        """
        options = swagger_client.V1DeleteOptions()
        options.grace_period_seconds = 1
        options.orphan_dependents = False

        # The retry decorator takes its limits in milliseconds (wait_fixed=2000
        # below is the two second poll interval), while callers give the timeout
        # in seconds. Convert here; None is passed through and means no limit.
        # NOTE(review): assumes `retry` is the `retrying` package decorator.
        max_wait_ms = None if timeout is None else timeout * 1000

        def check_result(result):
            # True for retry False for done
            return not result

        @retry(retry_on_result=check_result,
               wait_fixed=2000,
               stop_max_delay=max_wait_ms)
        def wait_for_scale_to_zero():
            logger.debug("Wait for scale of deployment to 0 for {} {}".format(
                self.application, self.name))

            @retry_unless(swallow_code=[404])
            def get_scale_from_provider():
                return self.client.apisappsv1beta1_api.read_namespaced_scale_scale(
                    self.application, self.name)

            scale = get_scale_from_provider()
            # No scale object (the 404 was swallowed) counts as already gone
            return scale is None or scale.status.replicas == 0

        @retry_unless(swallow_code=[404, 409])
        def delete_in_provider():
            logger.debug("Deleting deployment for {} {}".format(
                self.application, self.name))
            self.client.apisappsv1beta1_api.delete_namespaced_deployment(
                options, self.application, self.name)

        # NOTE(review): unlike delete_in_provider this call is not wrapped in
        # retry_unless - confirm whether errors here are meant to propagate.
        def delete_rs_in_provider():
            logger.debug("Deleting replica set for {} {}".format(
                self.application, self.name))
            self.client.extensionsv1beta1.deletecollection_namespaced_replica_set(
                self.application,
                label_selector="deployment={}".format(self.name))

        # now delete deployment object and replication set
        with DeploymentOperation(self):
            # read the status before scaling down; it is used below to find
            # the resources that belong to this deployment
            dep_obj = self._deployment_status()
            self._scale_to(0)
            wait_for_scale_to_zero()

            if dep_obj:
                resources = AXResources(existing=dep_obj)
                resources.delete_all()

            delete_in_provider()
            delete_rs_in_provider()