def evict_pod(api_instance, name, namespace):
    """
    Request eviction of a single pod so it is rescheduled on a schedulable node.

    Failures are logged rather than raised: the node is going down regardless,
    so eviction is strictly best-effort.

    Parameters:
    api_instance (object): The K8S API object to use
    name (string): The name of the pod to evict
    namespace (string): The namespace the pod to evict is in
    """

    print("Evicting " + name + " in namespace " + namespace + "!")

    # We give pods 12 minutes to evict gracefully, plus 30 extra seconds so
    # that any stragglers can still be listed before they are terminated
    # ungracefully (12 * 60 + 30 = 750).
    delete_options = client.V1DeleteOptions(grace_period_seconds=750)

    body = client.V1beta1Eviction(
        api_version="policy/v1beta1",
        kind="Eviction",
        metadata=client.V1ObjectMeta(name=name, namespace=namespace),
        delete_options=delete_options,
    )

    try:
        api_instance.create_namespaced_pod_eviction(
            name=name, namespace=namespace, body=body)
    except ApiException as e:
        # Best-effort: log and keep going so the node teardown proceeds.
        print("Exception when evicting %s: %s\n" % (name, e))
Ejemplo n.º 2
0
def evict_pod(pod):
    """Evict *pod* with a 30s grace period (skipped under DRYRUN), then annotate it."""
    eviction = client.V1beta1Eviction(
        delete_options=client.V1DeleteOptions(grace_period_seconds=30),
        metadata=pod.metadata,
    )
    if not DRYRUN:
        v1.create_namespaced_pod_eviction(
            name=pod.metadata.name,
            namespace=pod.metadata.namespace,
            body=eviction,
        )
    else:
        print("DRYRUN: skipping evict_pod step")
    # Annotation happens in both dry-run and real mode.
    annotate_pod(pod)
Ejemplo n.º 3
0
def evict_pod(pod):
    """Evict *pod* unless it lives in a protected namespace or DRYRUN is set."""
    ns = pod.metadata.namespace
    pod_name = pod.metadata.name
    eviction = client.V1beta1Eviction(
        delete_options=client.V1DeleteOptions(grace_period_seconds=30),
        metadata=pod.metadata)

    if ns in ["kube-system"]:
        # Never touch system pods.
        print("PROTECTED: not evicting from {}: {}".format(ns, pod_name))
    elif DRYRUN:
        print("DRYRUN: skipping evict_pod step for {}/{}".format(ns, pod_name))
    else:
        v1.create_namespaced_pod_eviction(name=pod_name,
                                          namespace=ns,
                                          body=eviction)
Ejemplo n.º 4
0
def drain_node(node: str):
    """Cordon *node*, then evict every non-DaemonSet pod running on it."""
    # Mark the node unschedulable first so evicted pods land elsewhere.
    core_v1.patch_node(node, body={"spec": {"unschedulable": True}})

    selector = f'spec.nodeName={node}'
    for pod in core_v1.list_pod_for_all_namespaces(field_selector=selector).items:
        # DaemonSet pods would just be recreated on this node; skip them.
        if is_daemonset_pod(pod):
            continue
        eviction = client.V1beta1Eviction(
            metadata=client.V1ObjectMeta(name=pod.metadata.name,
                                         namespace=pod.metadata.namespace),
            delete_options=client.V1DeleteOptions(grace_period_seconds=60))
        core_v1.create_namespaced_pod_eviction(name=pod.metadata.name,
                                               namespace=pod.metadata.namespace,
                                               body=eviction)
Ejemplo n.º 5
0
def evict_pod(name, namespace):
    """
    Evict a pod from a node.

    This method evicts a single pod from a node.

    Parameters:
    name (string): name of the pod to evict
    namespace (string): namespace the pod lives in

    Raises:
    Exception: if the eviction API call fails for any reason.
    """
    logger = get_logger("evict_pod")
    api_instance = client.CoreV1Api()

    ev = client.V1beta1Eviction()
    ev.metadata = client.V1ObjectMeta()
    ev.metadata.name = name
    ev.metadata.namespace = namespace
    ev.delete_options = client.V1DeleteOptions()

    try:
        api_instance.create_namespaced_pod_eviction(name=name,
                                                    namespace=namespace,
                                                    body=ev)
    except Exception as e:
        logger.debug(e)
        # BUG FIX: the original concatenated the exception object itself
        # ("..." + e), which raises TypeError and masks the real error.
        # Use str(e) and chain with `from e` to preserve the traceback.
        raise Exception("Failed to evict pod " + name + ": " + str(e)) from e
def drain_nodes(name: str = None, label_selector: str = None,
                delete_pods_with_local_storage: bool = False,
                timeout: int = 120, secrets: Secrets = None) -> bool:
    """
    Drain nodes matching the given label or name, so that no pods are scheduled
    on them any longer and running pods are evicted.

    It does a similar job to `kubectl drain --ignore-daemonsets` or
    `kubectl drain --delete-local-data --ignore-daemonsets` if
    `delete_pods_with_local_storage` is set to `True`. There is no
    equivalent to the `kubectl drain --force` flag.

    You probably want to call `uncordon` from in your experiment's rollbacks.

    Parameters:
    name (str): drain the single node with this exact name
    label_selector (str): alternatively, drain every node matching this label
    delete_pods_with_local_storage (bool): also evict pods using emptyDir volumes
    timeout (int): seconds to wait, per node, for evicted pods to disappear
    secrets (Secrets): credentials used to build the K8S API client

    Returns:
    bool: True once every matched node has been drained

    Raises:
    FailedActivity: when no node matches, a pod is unmanaged, an eviction
        call fails, or pods remain after `timeout` seconds.
    """
    # first let's make the node unschedulable
    cordon_node(name=name, label_selector=label_selector, secrets=secrets)

    api = create_k8s_api_client(secrets)

    v1 = client.CoreV1Api(api)
    if name:
        ret = v1.list_node(field_selector="metadata.name={}".format(name))

        logger.debug("Found {d} node named '{s}'".format(
            d=len(ret.items), s=name))
    else:
        ret = v1.list_node(label_selector=label_selector)

        logger.debug("Found {d} node(s) labelled '{s}'".format(
            d=len(ret.items), s=label_selector))

    nodes = ret.items
    if not nodes:
        raise FailedActivity(
            "failed to find a node that matches selector {}".format(
                label_selector))

    for node in nodes:
        node_name = node.metadata.name
        ret = v1.list_pod_for_all_namespaces(
            include_uninitialized=True,
            field_selector="spec.nodeName={}".format(node_name))

        logger.debug("Found {d} pods on node '{n}'".format(
            d=len(ret.items), n=node_name))

        if not ret.items:
            continue

        # following the drain command from kubectl as best as we can
        eviction_candidates = []
        for pod in ret.items:
            # BUG FIX: this local was called `name`, shadowing the function
            # parameter; renamed to `pod_name`.
            pod_name = pod.metadata.name
            phase = pod.status.phase
            volumes = pod.spec.volumes
            annotations = pod.metadata.annotations

            # do not handle mirror pods
            if annotations and "kubernetes.io/config.mirror" in annotations:
                logger.debug("Not deleting mirror pod '{}' on "
                             "node '{}'".format(pod_name, node_name))
                continue

            if any(filter(lambda v: v.empty_dir is not None, volumes)):
                logger.debug(
                    "Pod '{}' on node '{}' has a volume made "
                    "of a local storage".format(pod_name, node_name))
                if not delete_pods_with_local_storage:
                    logger.debug("Not evicting a pod with local storage")
                    continue
                logger.debug("Deleting anyway due to flag")
                eviction_candidates.append(pod)
                continue

            if phase in ["Succeeded", "Failed"]:
                eviction_candidates.append(pod)
                continue

            # BUG FIX: owner_references may be None for unmanaged pods;
            # guard so the for/else fires instead of raising TypeError.
            for owner in (pod.metadata.owner_references or []):
                if owner.controller and owner.kind != "DaemonSet":
                    eviction_candidates.append(pod)
                    break
                elif owner.kind == "DaemonSet":
                    logger.debug(
                        "Pod '{}' on node '{}' is owned by a DaemonSet. Will "
                        "not evict it".format(pod_name, node_name))
                    break
            else:
                raise FailedActivity(
                    "Pod '{}' on node '{}' is unmanaged, cannot drain this "
                    "node. Delete it manually first?".format(
                        pod_name, node_name))

        if not eviction_candidates:
            # BUG FIX: this used to `return True`, silently skipping every
            # remaining node; move on to the next node instead.
            logger.debug("No pods to evict. Let's continue.")
            continue

        logger.debug("Found {} pods to evict".format(len(eviction_candidates)))
        for pod in eviction_candidates:
            eviction = client.V1beta1Eviction()

            eviction.metadata = client.V1ObjectMeta()
            eviction.metadata.name = pod.metadata.name
            eviction.metadata.namespace = pod.metadata.namespace

            eviction.delete_options = client.V1DeleteOptions()
            try:
                v1.create_namespaced_pod_eviction(
                    pod.metadata.name, pod.metadata.namespace, body=eviction)
            except ApiException as x:
                raise FailedActivity(
                    "Failed to evict pod {}: {}".format(
                        pod.metadata.name, x.body))

        # Poll until every evicted pod is gone (404) or was rescheduled
        # elsewhere (different uid), failing after `timeout` seconds.
        pods = eviction_candidates[:]
        started = time.time()
        while True:
            logger.debug("Waiting for {} pods to go".format(len(pods)))

            if time.time() - started > timeout:
                remaining_pods = "\n".join([p.metadata.name for p in pods])
                # BUG FIX: grammar in the error message ("did not completed")
                raise FailedActivity(
                    "Draining nodes did not complete within {}s. "
                    "Remaining pods are:\n{}".format(timeout, remaining_pods))

            pending_pods = pods[:]
            for pod in pods:
                try:
                    p = v1.read_namespaced_pod(
                        pod.metadata.name, pod.metadata.namespace)
                    # rescheduled elsewhere?
                    if p.metadata.uid != pod.metadata.uid:
                        pending_pods.remove(pod)
                        continue
                    logger.debug("Pod '{}' still around in phase: {}".format(
                        p.metadata.name, p.status.phase))
                except ApiException as x:
                    if x.status == 404:
                        # gone...
                        pending_pods.remove(pod)
            pods = pending_pods[:]
            if not pods:
                logger.debug("Evicted all pods we could")
                break

            time.sleep(10)

    # BUG FIX: this `return True` used to sit inside the node loop, so only
    # the first matched node was ever fully drained.
    return True
Ejemplo n.º 7
0
def drain_node(node_name):
    """
    Evict every evictable pod from *node_name*, then wait for them to go.

    Mirrors `kubectl drain` as closely as possible: mirror pods and
    DaemonSet-owned pods are skipped, local-storage pods only evicted when
    `delete_pods_with_local_storage` (free variable) is truthy.

    Returns True when the node has no evictable pods left.
    Raises ActivityFailed for unmanaged pods, failed evictions, or timeout.
    """
    ret = v1.list_pod_for_all_namespaces(
        field_selector="spec.nodeName={}".format(node_name))

    if not ret.items:
        # BUG FIX: this was `continue`, which is a SyntaxError outside a
        # loop. No pods on the node means it is already drained.
        return True

    # following the drain command from kubectl as best as we can
    eviction_candidates = []
    for pod in ret.items:
        name = pod.metadata.name
        phase = pod.status.phase
        volumes = pod.spec.volumes
        annotations = pod.metadata.annotations

        # do not handle mirror pods
        if annotations and "kubernetes.io/config.mirror" in annotations:
            logger.debug("Not deleting mirror pod '{}' on "
                         "node '{}'".format(name, node_name))
            continue

        if any(filter(lambda v: v.empty_dir is not None, volumes)):
            logger.debug("Pod '{}' on node '{}' has a volume made "
                         "of a local storage".format(name, node_name))
            if not delete_pods_with_local_storage:
                logger.debug("Not evicting a pod with local storage")
                continue
            logger.debug("Deleting anyway due to flag")
            eviction_candidates.append(pod)
            continue

        if phase in ["Succeeded", "Failed"]:
            eviction_candidates.append(pod)
            continue

        # BUG FIX: owner_references may be None for unmanaged pods; guard so
        # the for/else fires instead of raising TypeError.
        for owner in (pod.metadata.owner_references or []):
            if owner.controller and owner.kind != "DaemonSet":
                eviction_candidates.append(pod)
                break
            elif owner.kind == "DaemonSet":
                logger.debug(
                    "Pod '{}' on node '{}' is owned by a DaemonSet. Will "
                    "not evict it".format(name, node_name))
                break
        else:
            raise ActivityFailed(
                "Pod '{}' on node '{}' is unmanaged, cannot drain this "
                "node. Delete it manually first?".format(name, node_name))

    if not eviction_candidates:
        logger.debug("No pods to evict. Let's return.")
        return True

    logger.debug("Found {} pods to evict".format(len(eviction_candidates)))
    for pod in eviction_candidates:
        eviction = client.V1beta1Eviction()

        eviction.metadata = client.V1ObjectMeta()
        eviction.metadata.name = pod.metadata.name
        eviction.metadata.namespace = pod.metadata.namespace

        eviction.delete_options = client.V1DeleteOptions()
        try:
            v1.create_namespaced_pod_eviction(pod.metadata.name,
                                              pod.metadata.namespace,
                                              body=eviction)
        except ApiException as x:
            raise ActivityFailed("Failed to evict pod {}: {}".format(
                pod.metadata.name, x.body))

    # Poll until every evicted pod is gone (404) or was rescheduled elsewhere
    # (different uid), failing after `timeout` (free variable) seconds.
    pods = eviction_candidates[:]
    started = time.time()
    while True:
        logger.debug("Waiting for {} pods to go".format(len(pods)))
        if time.time() - started > timeout:
            remaining_pods = "\n".join([p.metadata.name for p in pods])
            # BUG FIX: grammar in the error message ("did not completed")
            raise ActivityFailed(
                "Draining nodes did not complete within {}s. "
                "Remaining pods are:\n{}".format(timeout, remaining_pods))

        pending_pods = pods[:]
        for pod in pods:
            try:
                p = v1.read_namespaced_pod(pod.metadata.name,
                                           pod.metadata.namespace)
                # rescheduled elsewhere?
                if p.metadata.uid != pod.metadata.uid:
                    pending_pods.remove(pod)
                    continue
                logger.debug("Pod '{}' still around in phase: {}".format(
                    p.metadata.name, p.status.phase))
            except ApiException as x:
                if x.status == 404:
                    # gone...
                    pending_pods.remove(pod)
        pods = pending_pods[:]
        if not pods:
            logger.debug("Evicted all pods we could")
            break

        time.sleep(10)

    return True
Ejemplo n.º 8
0
def toolkit_clean_evicted_pod(cluster, task_id=None):
    """
    Find every pod in the cluster that was Evicted and remove it via the
    eviction API, tracking progress/outcome in a toolKitKubernetesCleaner
    record and notifying the owner through dingding.

    Parameters:
        cluster: name of a KubernetesCluster record to operate on.
        task_id: id of an existing cleaner record to reuse; when None a new
            record is created for the "******" user.

    Returns:
        A summary string listing the "namespace/pod" entries that were evicted.

    Raises:
        Exception: re-wraps any failure after marking the cleaner FAILURE.
    """

    # Resolve the cluster name to its DB record.
    cluster = KubernetesCluster.objects.get(name=cluster)

    if task_id is None:
        # Fresh run: create a new cleaner record attached to a placeholder user.
        user = User.objects.get_or_create(username="******")[0]
        cleaner = toolKitKubernetesCleaner(
            user=user,
            status="RUNNING",
            cluster=cluster,
            task_id=toolkit_clean_evicted_pod.request.id)
    else:
        # Re-run: reuse the existing cleaner record and notify its owner.
        cleaner = toolKitKubernetesCleaner.objects.get(id=task_id)
        cleaner.status = "RUNNING"
        cleaner.task_id = toolkit_clean_evicted_pod.request.id
        dingding.send(
            title=cleaner.status,
            content=
            "### **{}** @{} \n > * TASK: {}\n > * ID: {} \n > * TASK_ID: {}".
            format(cleaner.status.upper(), cleaner.user.profile.phone,
                   'toolkit_clean_evicted_pod', cleaner.id, cleaner.task_id),
            users=[cleaner.user.profile.phone])

    try:
        cleaner.save()

        # NOTE(review): `client` here shadows any module-level kubernetes
        # client import — presumably get_client returns a module-like object
        # exposing AppsV1Api/CoreV1Api/V1beta1Eviction; confirm.
        client = KubernetesNamespace.get_client(cluster_config=cluster.config)
        # NOTE(review): v1 (AppsV1Api) is created but never used below.
        v1 = client.AppsV1Api()
        coreV1 = client.CoreV1Api()

        pods = coreV1.list_pod_for_all_namespaces().items
        __pod_list = []
        for pod in pods:
            # Only pods the kubelet marked as Evicted are targeted.
            if pod.status.phase == "Failed" and pod.status.reason == "Evicted":

                namespace = pod.metadata.namespace
                # NOTE(review): rebinding `pod` to the pod *name* shadows the
                # loop variable; harmless here since the loop restarts, but
                # fragile if more per-pod fields are needed later.
                pod = pod.metadata.name
                body = client.V1beta1Eviction(metadata=client.V1ObjectMeta(
                    name=pod, namespace=namespace))
                response = coreV1.create_namespaced_pod_eviction(
                    name=pod, namespace=namespace, body=body, pretty=True)
                __pod_list.append("{}/{}".format(namespace, pod))

    except Exception as err:
        # Record the failure and page the owner before re-raising.
        cleaner.status = "FAILURE"
        cleaner.save()
        dingding.send(
            title=cleaner.status,
            content=
            "### **{}** @{} \n > * TASK: {}\n > * ID: {} \n > * TASK_ID: {}".
            format(cleaner.status.upper(), cleaner.user.profile.phone,
                   'toolkit_clean_evicted_pod', cleaner.id, cleaner.task_id),
            users=[cleaner.user.profile.phone])
        # NOTE(review): wrapping in a new Exception drops the original
        # traceback; a bare `raise` (or `raise ... from err`) would keep it.
        raise Exception(err)

    else:
        # Success: persist the evicted-pod list and notify the owner.
        cleaner.status = "SUCCESS"
        cleaner.pods = __pod_list
        cleaner.save()
        dingding.send(
            title=cleaner.status,
            content=
            "### **{}** @{} \n > * TASK: {}\n > * ID: {} \n > * TASK_ID: {}".
            format(cleaner.status.upper(), cleaner.user.profile.phone,
                   'toolkit_clean_evicted_pod', cleaner.id, cleaner.task_id),
            users=[cleaner.user.profile.phone])
        return "Pod: {} is evicted!".format(__pod_list)