def get_kubernetes_pods_and_nodes(
    namespace: str, ) -> Tuple[Sequence[V1Pod], Sequence[V1Node]]:
    """Fetch every pod in *namespace* together with every node in the cluster.

    Args:
        namespace: Kubernetes namespace whose pods are listed.

    Returns:
        A ``(pods, nodes)`` tuple as reported by the Kubernetes API.
    """
    client = KubeClient()
    pods = get_all_pods(kube_client=client, namespace=namespace)
    nodes = get_all_nodes(client)
    return pods, nodes
# Example #2
def check_all_kubernetes_services_replication(soa_dir: str) -> None:
    """Run replication checks for every Kubernetes-deployed service instance.

    Iterates all services under *soa_dir*, loads their Kubernetes deployment
    configs for the current cluster, and invokes ``check_service_replication``
    for each instance that has a Docker image deployed; instances without an
    image are skipped with a debug log.
    """
    client = KubeClient()
    pods = get_all_pods(client)
    nodes = get_all_nodes(client)
    paasta_config = load_system_paasta_config()
    cluster = paasta_config.get_cluster()
    checker = KubeSmartstackReplicationChecker(
        nodes=nodes,
        system_paasta_config=paasta_config,
    )

    for service in list_services(soa_dir=soa_dir):
        config_loader = PaastaServiceConfigLoader(service=service,
                                                  soa_dir=soa_dir)
        for instance_config in config_loader.instance_configs(
                cluster=cluster,
                instance_type_class=kubernetes_tools.
                KubernetesDeploymentConfig,
        ):
            if not instance_config.get_docker_image():
                # Nothing is deployed for this instance, so there is nothing
                # to monitor.
                log.debug(
                    '%s is not deployed. Skipping replication monitoring.' %
                    instance_config.job_id)
                continue
            check_service_replication(
                instance_config=instance_config,
                all_pods=pods,
                smartstack_replication_checker=checker,
            )
def get_all_running_kubernetes_pods(kube_client: KubeClient,
                                    namespace: str) -> Iterable[V1Pod]:
    """Return the pods in *namespace* that are currently in the RUNNING state.

    Args:
        kube_client: client used to query the Kubernetes API.
        namespace: Kubernetes namespace whose pods are inspected.

    Returns:
        A list of ``V1Pod`` objects whose status is
        ``kubernetes_tools.PodStatus.RUNNING``.
    """
    # Single-pass filter; replaces the manual append loop (ruff PERF401).
    return [
        pod for pod in kubernetes_tools.get_all_pods(kube_client, namespace)
        if kubernetes_tools.get_pod_status(pod) ==
        kubernetes_tools.PodStatus.RUNNING
    ]
# Example #4
def assert_kube_pods_running(kube_client: KubeClient, ) -> HealthCheckResult:
    """Summarise cluster pod states as a health-check result.

    The check is considered healthy iff at least one pod is RUNNING.

    Args:
        kube_client: client used to list all pods in the cluster.

    Returns:
        A ``HealthCheckResult`` whose message reports the running / pending /
        failed counts.
    """
    # Local import so the module header is untouched.
    from collections import Counter

    # Single pass over the pods instead of three O(n) list.count() scans.
    status_counts = Counter(
        get_pod_status(pod) for pod in get_all_pods(kube_client))
    running = status_counts[PodStatus.RUNNING]
    pending = status_counts[PodStatus.PENDING]
    failed = status_counts[PodStatus.FAILED]
    return HealthCheckResult(
        message=f"Pods: running: {running} pending: {pending} failed: {failed}",
        healthy=running > 0,
    )
# Example #5
def evicted_pods_per_service(
    client: KubeClient, ) -> Mapping[str, Sequence[EvictedPod]]:
    """Group evicted pods by the service that owns them.

    Pods whose owning service cannot be determined are logged and skipped.

    Args:
        client: client used to list pods across all namespaces.

    Returns:
        A mapping from service name to the ``EvictedPod``s of that service.
    """
    pods = get_all_pods(kube_client=client, namespace="")
    evicted = get_evicted_pods(pods)
    log.info(
        f"Pods in evicted state: {[pod.metadata.name for pod in evicted]}"
    )
    by_service: Dict[str, List[EvictedPod]] = defaultdict(list)
    for pod in evicted:
        service = get_pod_service(pod)
        if not service:
            log.info(f"Could not get service name for pod {pod.metadata.name}")
            continue
        by_service[service].append(
            EvictedPod(pod.metadata.name, pod.metadata.namespace,
                       pod.status.message))
    return by_service
# Example #6
def check_all_kubernetes_based_services_replication(
    soa_dir: str,
    service_instances: Sequence[str],
    instance_type_class: Type[InstanceConfig_T],
    check_service_replication: CheckServiceReplication,
    namespace: str,
) -> None:
    """Check replication for the requested service instances in *namespace*.

    Args:
        soa_dir: directory holding per-service soa-configs.
        service_instances: ``f"{service}{SPACER}{instance}"`` names to check;
            an empty sequence means every instance is checked.
        instance_type_class: instance-config class loaded for each service.
        check_service_replication: callback invoked per deployed instance.
        namespace: Kubernetes namespace whose pods are handed to the callback.
    """
    client = KubeClient()
    pods = get_all_pods(kube_client=client, namespace=namespace)
    nodes = get_all_nodes(client)
    paasta_config = load_system_paasta_config()
    cluster = paasta_config.get_cluster()
    checker = KubeSmartstackReplicationChecker(
        nodes=nodes, system_paasta_config=paasta_config)
    wanted = set(service_instances)

    for service in list_services(soa_dir=soa_dir):
        loader = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in loader.instance_configs(
                cluster=cluster, instance_type_class=instance_type_class):
            full_name = f"{service}{SPACER}{instance_config.instance}"
            # An empty filter set means "check everything".
            if wanted and full_name not in wanted:
                continue
            if not instance_config.get_docker_image():
                # Nothing is deployed for this instance, so there is nothing
                # to monitor.
                log.debug(
                    "%s is not deployed. Skipping replication monitoring." %
                    instance_config.job_id)
                continue
            check_service_replication(
                instance_config=instance_config,
                all_pods=pods,
                smartstack_replication_checker=checker,
            )
# Example #7
def main():
    """Garbage-collect pods that finished (or errored) past their allowed age.

    Dry-run mode only logs what would be terminated. Termination failures are
    logged but never change the exit code, since this is not a critical
    process.
    """
    args = parse_args()
    setup_logging(args.verbose)

    client = KubeClient()
    pods = get_all_pods(client, args.namespace)

    max_uptime_minutes = args.minutes
    max_error_minutes = args.error_minutes

    stale_completed = []
    stale_errored = []

    for pod in pods:
        if is_pod_completed(pod) and _completed_longer_than_threshold(
                pod, max_uptime_minutes):
            stale_completed.append(pod)
        elif (
                # error-pod cleanup is currently optional
                max_error_minutes is not None
                # There's no direct way to get what type of "bad" state these
                # Pods ended up in (kubectl looks at phase and then container
                # statuses to give something descriptive) but, in the end, we
                # really just care that a Pod is in a Failed phase.
                and pod.status.phase == "Failed"):
            # ...and that said Pod has been around for a while (generally
            # longer than we'd leave Pods that exited successfully).
            # NOTE: we do this in a try-except since we're intermittently
            # seeing pods in an error state without a PodScheduled condition
            # (even though that should be impossible). This is not ideal, but
            # it's fine to skip these since this isn't a critical process.
            try:
                if _scheduled_longer_than_threshold(pod, max_error_minutes):
                    stale_errored.append(pod)
            except AttributeError:
                log.exception(
                    f"Unable to check {pod.metadata.name}'s schedule time. Pod status: {pod.status}.'"
                )

    if not stale_completed and not stale_errored:
        log.debug("No pods to terminate.")
        sys.exit(0)

    if args.dry_run:
        log.debug(
            "Dry run would have terminated the following completed pods:\n " +
            "\n ".join(pod.metadata.name for pod in stale_completed))
        log.debug(
            "Dry run would have terminated the following errored pods:\n " +
            "\n ".join(pod.metadata.name for pod in stale_errored))
        sys.exit(0)

    completed_ok, completed_failed = terminate_pods(stale_completed, client)
    errored_ok, errored_failed = terminate_pods(stale_errored, client)

    successes = {
        "completed": completed_ok,
        "errored": errored_ok,
    }
    errors = {
        "completed": completed_failed,
        "errored": errored_failed,
    }

    for typ, names in successes.items():
        if names:
            log.debug(f"Successfully terminated the following {typ} pods:\n" +
                      "\n ".join(names))

    # We've only really seen this fail recently due to the k8s API being flaky
    # and returning 404s for Pods that it's returning to us when we get all
    # Pods, so we just print the error here for now and don't exit with a
    # non-zero exit code since, again, this isn't a critical process.
    for typ, failures in errors.items():
        if failures:
            log.error(f"Failed to terminate the following {typ} pods:\n" +
                      "\n  ".join(f"{pod_name}: {error}"
                                  for pod_name, error in failures))