Example #1
    def __init__(self, org_name: str, course_id: str):
        """
        Helper class to launch grader notebooks within the kubernetes cluster

        Args:
          org_name: the organization name
          course_id: the course id

        Raises:
          ConfigException: if the kubernetes Python client does not have a valid configuration set.
        """
        try:
            # try to load the cluster credentials
            # Configs can be set in Configuration class directly or using helper utility
            config.load_incluster_config()
        except ConfigException:
            # next method uses the KUBECONFIG env var by default
            config.load_kube_config()
        self.apps_v1 = client.AppsV1Api()
        self.coreV1Api = client.CoreV1Api()
        self.course_id = course_id
        self.grader_name = f"grader-{self.course_id}"
        self.grader_token = token_hex(32)
        self.org_name = org_name

        # Course home directory, its parent should be the grader name
        self.course_dir = Path(
            f"{MNT_ROOT}/{self.org_name}/home/grader-{self.course_id}/{self.course_id}"
        )
        # set the exchange directory path
        self.exchange_dir = Path(EXCHANGE_MNT_ROOT, self.org_name, "exchange")
Example #2
def get_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION, name, namespace="default"):

    config.load_kube_config(config_file=PAI_KUBE_CONFIG_DEFAULT_LOCATION)
    api_instance = kubernetes.client.CoreV1Api()
    exact = True
    export = True
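    # NOTE (added): the 'exact' and 'export' parameters were deprecated
    # upstream and recent kubernetes Python clients no longer accept them on
    # read_namespaced_config_map; drop them if the call rejects these kwargs.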

    target_configmap_data = None
    target_configmap_metadata = None

    try:
        api_response = api_instance.read_namespaced_config_map(name,
                                                               namespace,
                                                               exact=exact,
                                                               export=export)
        target_configmap_data = api_response.data
        target_configmap_metadata = api_response.metadata

    except ApiException as e:
        if e.status == 404:
            logger.info("Couldn't find configmap named {0}".format(name))
            return None
        else:
            logger.error(
                "Exception when calling CoreV1Api->read_namespaced_config_map: {0}"
                .format(str(e)))
            sys.exit(1)

    ret = {
        "metadata": target_configmap_metadata,
        "data": target_configmap_data
    }

    return ret
Example #3
 def create_secret_in_namespace_if_not_exist(self, payload, namespace):
     if self.in_cluster:
         config.load_incluster_config()
     else:
         config.load_kube_config(config_file="~/.kube/config")
     try:
         api_instance = client.CoreV1Api()
         api_instance.read_namespaced_secret(payload['metadata']['name'],
                                             namespace)
     except ApiException as e:
         if e.status == 404:
             try:
                 api_instance = client.CoreV1Api()
                 meta_data = client.V1ObjectMeta()
                 meta_data.name = payload['metadata']['name']
                 body = client.V1Secret(metadata=meta_data,
                                        data=payload['data'])
                 api_instance.create_namespaced_secret(namespace, body)
             except ApiException as create_e:
                 logger.error(
                     "Exception when calling CoreV1Api->create_namespaced_secret: %s\n"
                     % create_e)
                 sys.exit(1)
         else:
             logger.error(
                 "Exception when calling CoreV1Api->read_namespaced_secret: %s\n"
                 % e)
             sys.exit(1)
Example #4
    def check_python_kubernetes(self):

        try_count = 0

        while True:

            try:
                self.logger.info(
                    "Trying to access the target kubernetes cluster")
                config.load_kube_config(
                    config_file=self.KUBE_CONFIG_DEFAULT_LOCATION)
                # Create the API client only after the config has been loaded
                # so that it picks up the loaded credentials.
                core_api_instance = client.CoreApi()
                api_response = core_api_instance.get_api_versions()
                self.logger.info(str(api_response))
                break

            except Exception as e:
                self.logger.error(
                    "Failed to connect to k8s with the python client: %s", e)
                try_count = try_count + 1

            if try_count == 3:
                self.logger.error(
                    "All 3 attempts to connect to k8s with the python client failed.")
                sys.exit(1)

            time.sleep(5)

        self.logger.info(
            "CHECKING SUCCESSFULLY: successfully accessed kubernetes through the python client."
        )
Example #5
def update_configmap(name, data_dict, namespace):
    config.load_kube_config()
    api_instance = client.CoreV1Api()

    meta_data = client.V1ObjectMeta()
    meta_data.namespace = namespace
    meta_data.name = name
    body = client.V1ConfigMap(metadata=meta_data, data=data_dict)

    try:
        api_response = api_instance.patch_namespaced_config_map(
            name, namespace, body)
        logger.info("configmap named {0} is updated.".format(name))
    except ApiException as e:
        if e.status == 404:
            try:
                logger.info(
                    "Couldn't find configmap named {0}. Create a new configmap"
                    .format(name))
                api_response = api_instance.create_namespaced_config_map(
                    namespace, body)
                logger.info("Configmap named {0} is created".format(name))
            except ApiException as ie:
                logger.error(
                    "Exception when calling CoreV1Api->create_namespaced_config_map: {0}"
                    .format(str(ie)))
                sys.exit(1)
        else:
            logger.error(
                "Exception when calling CoreV1Api->patch_namespaced_config_map: {0}"
                .format(str(e)))
            sys.exit(1)
Example #6
def main():
    level_str = os.getenv('LOG_LEVEL', 'WARNING').upper()
    format_str = os.getenv('LOG_FORMAT',
                           '%(asctime)s [%(levelname)s] %(message)s')
    console = logging.StreamHandler()
    console.setFormatter(RFC3339Formatter(format_str))
    LOGGER.addHandler(console)
    sleep_time = os.environ.get("SECONDS_BETWEEN_STREAMS", '30')
    sleep_time = int(sleep_time)
    try:
        logging.basicConfig(level=logging.getLevelName(level_str))
    except ValueError as err:
        LOGGER.error(err)
        sys.exit(1)

    try:
        config.load_kube_config()
    except (FileNotFoundError, ConfigException) as err:
        LOGGER.debug("Not able to use Kubeconfig: %s", err)
        try:
            config.load_incluster_config()
        except (FileNotFoundError, ConfigException) as err:
            LOGGER.error("Not able to use in-cluster config: %s", err)
            sys.exit(1)

    try:
        while True:
            hostess.Watcher(env=os.environ, config=configuration).execute()
            LOGGER.info("API closed connection, sleeping for %i seconds",
                        sleep_time)
            time.sleep(sleep_time)
    except RuntimeError as err:
        LOGGER.exception(err)
        sys.exit(1)
Example #7
def get_kubernetes_node_info_from_API():
    config.load_kube_config()
    api_instance = client.CoreV1Api()

    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_node
    pretty = 'true'
    timeout_seconds = 56

    ret = dict()
    try:
        api_response = api_instance.list_node(pretty=pretty,
                                              timeout_seconds=timeout_seconds)
        for node in api_response.items:
            ret[node.metadata.name] = {
                "cpu-resource":
                int(parse_quantity(node.status.allocatable['cpu'])),
                "mem-resource":
                int(
                    parse_quantity(node.status.allocatable['memory']) / 1024 /
                    1024),
                "gpu-resource":
                int(parse_quantity(node.status.allocatable['nvidia.com/gpu'])),
            }
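            # NOTE (added): nodes without GPUs typically lack the
            # 'nvidia.com/gpu' key in status.allocatable, so this lookup can
            # raise KeyError; Example #27 below guards it with an 'in' check.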
    except ApiException as e:
        logger.error("Exception when calling CoreV1Api->list_node: %s\n" % e)

    return ret
Example #8
def update_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION, name, data_dict, namespace = "default"):

    config.load_kube_config(config_file=PAI_KUBE_CONFIG_DEFAULT_LOCATION)
    api_instance = kubernetes.client.CoreV1Api()

    meta_data = kubernetes.client.V1ObjectMeta()
    meta_data.namespace = namespace
    meta_data.name = name
    body = kubernetes.client.V1ConfigMap(
                            metadata = meta_data,
                            data = data_dict)

    try:
        api_response = api_instance.replace_namespaced_config_map(name, namespace, body)
        logger.info("configmap named {0} is updated.".format(name))

    except ApiException as e:

        if e.status == 404:

            try:
                logger.info("Couldn't find configmap named {0}. Create a new configmap".format(name))
                api_response = api_instance.create_namespaced_config_map(namespace, body)
                logger.info("Configmap named {0} is created".format(name))

            except ApiException as ie:
                logger.error("Exception when calling CoreV1Api->create_namespaced_config_map: {0}".format(str(e)))
                sys.exit(1)

        else:
            logger.error("Exception when calling CoreV1Api->replace_namespaced_config_map: {0}".format(str(e)))
            sys.exit(1)
Example #9
    def __init__(self):

        logger.debug('Creating KubernetesClient')

        config_file = os.getenv('KUBECONFIG', None)
        context = os.getenv('KUBECONTEXT', DEFAULT_CONTEXT)

        logger.debug('Configuration file is: ' +
                     os.getenv('KUBECONFIG', 'None'))
        logger.debug('Configuration context is: ' + context)

        try:
            contexts, _ = config.list_kube_config_contexts(config_file)
            logger.debug('Available contexts: ' + str(contexts))
        except Exception as e:
            logger.debug('Unable to list kubeconfig contexts: ' + str(e))

        try:
            logger.debug('Trying to load config.load_incluster_config()')
            config.load_incluster_config()
        except Exception as e:
            s = str(e)
            logger.critical('Exception when config.load_incluster_config()',
                            exception=s)
            try:
                logger.debug('Trying to load config.load_kube_config()')
                config.load_kube_config(config_file, context)
            except Exception as e:
                s = str(e)
                logger.critical('Exception when config.load_kube_config()',
                                exception=s)

        # Clients
        self.ClientV1 = client.CoreV1Api()
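        # NOTE (added): ExtensionsV1beta1Api only exists in older kubernetes
        # Python clients; the extensions/v1beta1 API group was removed from
        # Kubernetes itself in v1.22.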
        self.ExtensionsV1beta1Api = client.ExtensionsV1beta1Api()
Example #10
def get_namespaced_secret(namespace):
    config.load_kube_config()
    try:
        api_instance = client.CoreV1Api()
        api_response = api_instance.list_namespaced_secret(namespace)
        return api_response.items
    except ApiException as e:
        if e.status == 404:
            return []
        logger.error(
            "Exception when calling CoreV1Api->list_namespaced_secret: %s\n" % e)
        sys.exit(1)
Example #11
 def list_all_secrets_from_namespace(self, namespace):
     if self.in_cluster:
         config.load_incluster_config()
     else:
         config.load_kube_config(config_file="~/.kube/config")
     try:
         api_instance = client.CoreV1Api()
         api_response = api_instance.list_namespaced_secret(namespace)
         return api_response.items
     except ApiException as e:
         if e.status == 404:
             return []
         logger.error('Exception when calling CoreV1Api->list_namespaced_secret: %s\n' % e)
         sys.exit(1)
Example #12
def list_running_job():
    config.load_kube_config('/root/.kube/config')
    configuration = client.Configuration()
    configuration.verify_ssl = False
    configuration.debug = False
    client.Configuration.set_default(configuration)
    v1 = client.CoreV1Api()
    pods = v1.list_namespaced_pod(namespace="dev").items
    count = 0
    for pod in pods:
        if pod.metadata.to_dict()["name"].startswith(
                "appname") and "exec" not in pod.metadata.to_dict()["name"]:
            count += 1
    logger.info("running job {}".format(count))
    return count
Example #13
def get_secret(name, namespace):
    confirm_namespace(namespace)

    config.load_kube_config()
    api_instance = client.CoreV1Api()

    try:
        api_response = api_instance.read_namespaced_secret(name, namespace)
    except ApiException as e:
        if e.status == 404:
            logger.info("Couldn't find secret named {0}.".format(name))
            return None
        else:
            logger.error("Exception when calling CoreV1Api->read_namespaced_config_map: {0}".format(str(e)))
            sys.exit(1)

    return api_response.data
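# NOTE (added): V1Secret.data values returned here are base64-encoded strings;
# the get_pai_users() snippet further below shows base64.b64decode() being
# applied to them.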
Example #14
 def replace_secret_in_namespace(self, payload, namespace):
     if self.in_cluster:
         config.load_incluster_config()
     else:
         config.load_kube_config(config_file="~/.kube/config")
     try:
         api_instance = client.CoreV1Api()
         meta_data = client.V1ObjectMeta()
         meta_data.name = payload['metadata']['name']
         body = client.V1Secret(metadata=meta_data, data=payload['data'])
         # don't use patch, which can't handle empty string: https://github.com/kubernetes/kubernetes/issues/37216
         api_instance.replace_namespaced_secret(payload['metadata']['name'],
                                                namespace, body)
     except ApiException as e:
         logger.error(
             "Exception when calling CoreV1Api->patch_namespaced_secret: %s\n"
             % e)
         sys.exit(1)
Example #15
def create_namespace_if_not_exist(namespace):
    config.load_kube_config()
    try:
        api_instance = client.CoreV1Api()
        api_instance.read_namespace(namespace)
    except ApiException as e:
        if e.status == 404:
            api_instance = client.CoreV1Api()
            meta_data = client.V1ObjectMeta()
            meta_data.name = namespace
            body = client.V1Namespace(
                metadata=meta_data
            )
            api_instance.create_namespace(body)
            return True
        logger.error("Failed to create namespace [{0}]".format(namespace))
        sys.exit(1)
    return False
Example #16
 def create_group_if_not_exist(self, name):
     if self.in_cluster:
         config.load_incluster_config()
     else:
         config.load_kube_config(config_file="~/.kube/config")
     try:
         api_instance = client.CoreV1Api()
         api_instance.read_namespace(name)
     except ApiException as e:
         if e.status == 404:
             api_instance = client.CoreV1Api()
             meta_data = client.V1ObjectMeta()
             meta_data.name = name
             body = client.V1Namespace(metadata=meta_data)
             api_instance.create_namespace(body)
             return True
         logger.error("Failed to create namespace [{0}]".format(name))
         sys.exit(1)
     return False
Example #17
def pod_is_ready_or_not(label_key, label_value, service_name, kubeconfig):

    label_selector_str = "{0}={1}".format(label_key, label_value)

    config.load_kube_config(config_file=kubeconfig)
    v1 = client.CoreV1Api()

    try:
        pod_list = v1.list_pod_for_all_namespaces(
            label_selector=label_selector_str, watch=False)
    except ApiException as e:
        logger.error(
            "Exception when calling CoreV1Api->list_pod_for_all_namespaces: %s\n"
            % e)
        return False

    if len(pod_list.items) == 0:
        logger.warning("No pod can be dectected.")
        return False

    ready = 0
    unready = 0
    for pod in pod_list.items:
        if pod.status.container_statuses is None:
            unready = unready + 1
            continue
        flag = True
        for container in pod.status.container_statuses:
            if not container.ready:
                unready = unready + 1
                flag = False
                break
        if flag:
            ready = ready + 1
    if unready != 0:
        logger.info("{0} is not ready.".format(service_name))
        logger.info("Total: {0}".format(ready + unready))
        logger.info("Ready: {0}".format(ready))
        return False

    return True
Example #18
def get_pai_users():
    users = []
    config.load_kube_config()
    api_instance = client.CoreV1Api()

    try:
        api_response = api_instance.list_namespaced_secret("pai-user")
        for item in api_response.items:
            users.append(base64.b64decode(item.data["username"]))

    except ApiException as e:
        if e.status == 404:
            logger.info("Couldn't find secret in namespace pai-user, exit")
            sys.exit(1)
        else:
            logger.error("Exception when calling CoreV1Api->list_namespaced_secret: {0}".format(str(e)))
            sys.exit(1)

    return users
Example #19
def get_kubernetes_pod_info_from_API():
    config.load_kube_config()
    api_instance = client.CoreV1Api()

    timeout_seconds = 56

    ret = dict()
    try:
        api_response = api_instance.list_pod_for_all_namespaces(
            timeout_seconds=timeout_seconds)
        for pod in api_response.items:
            if pod.spec.node_name not in ret:
                ret[pod.spec.node_name] = [get_pod_requests(pod)]
            else:
                ret[pod.spec.node_name].append(get_pod_requests(pod))
    except ApiException:
        logger.error("Exception when calling CoreV1Api->list_pod",
                     exc_info=True)
        raise
    return ret
Example #20
def save_and_clear_k8s(job_folder_name_map):
    config.load_kube_config('/root/.kube/config')
    configuration = client.Configuration()
    configuration.verify_ssl = False
    configuration.debug = False
    client.Configuration.set_default(configuration)
    v1 = client.CoreV1Api()
    pods = v1.list_namespaced_pod(namespace="dev").items
    for pod in pods:
        time.sleep(1)
        status = pod.status.phase
        pod_name = pod.metadata.to_dict()["name"]
        if not pod_name.startswith("appname"):
            continue
        job_id_match = re.search(r"appname(\d+)-", pod_name)
        if not job_id_match:
            continue
        job_id = job_id_match.group(1)
        if job_id not in job_folder_name_map:
            continue

        if status.lower() in ("failed", "succeeded"):
            os.makedirs("{}-{}".format(job_folder_name_map[job_id],
                                       status.lower()),
                        exist_ok=True,
                        mode=0o777)
            save_log(pod_name, job_folder_name_map[job_id], status.lower())
            save_yaml(pod.to_dict(), job_folder_name_map[job_id],
                      status.lower())
            if "exec" in pod_name:
                continue
            v1.delete_namespaced_pod(pod_name, "dev")
            logger.info("deleted pod {}".format(pod_name))
        else:
            os.makedirs("{}-{}".format(job_folder_name_map[job_id], "running"),
                        exist_ok=True,
                        mode=0o777)
            append_log(pod_name, job_folder_name_map[job_id], "running")
            save_yaml(pod.to_dict(), job_folder_name_map[job_id], "running")
Example #21
def delete_secret_content(name, key, namespace):
    confirm_namespace(namespace)
    
    config.load_kube_config()
    api_instance = client.CoreV1Api()
    try:
        api_response = api_instance.read_namespaced_secret(name, namespace)
        if api_response is not None and isinstance(api_response.data, dict):
            removed_content = api_response.data.pop(key, None)
            if removed_content is not None:
                meta_data = client.V1ObjectMeta()
                meta_data.namespace = namespace
                meta_data.name = name
                body = client.V1Secret(metadata=meta_data, data=api_response.data)
                api_instance.replace_namespaced_secret(name, namespace, body)
    except ApiException as e:
        if e.status == 404:
            logger.info("Couldn't find secret named {0}.".format(name))
        else:
            logger.error("Exception when try to delete {0} from {1}: reason: {2}".format(key, name, str(e)))
            sys.exit(1)
Example #22
def confirm_namespace(namespace):
    config.load_kube_config()
    api_instance = client.CoreV1Api()

    try:
        api_response = api_instance.read_namespace(namespace)

    except ApiException as e:
        if e.status == 404:
            logger.info("Couldn't find namespace {0}. Create new namespace".format(namespace))
            try:
                meta_data = client.V1ObjectMeta(name=namespace)
                body = client.V1Namespace(metadata=meta_data)
                api_response = api_instance.create_namespace(body)
                logger.info("Namespace {0} is created".format(namespace))
            except ApiException as ie:
                logger.error("Exception when calling CoreV1Api->create_namespace: {0}".format(str(ie)))
                sys.exit(1)
        else:
            logger.error("Exception when calling CoreV1Api->read_namespace: {0}".format(str(e)))
            sys.exit(1)
Example #23
def load_kubernetes_config():
    """
    Load the Kubernetes configuration, either in-cluster or from a kubeconfig file, so the Kubernetes API can be reached.
    :return:
    """
    config_loaded = False
    try:
        config.load_incluster_config()
        config_loaded = True
    except config.config_exception.ConfigException:
        logger.debug(
            "Unable to load in-cluster configuration; trying to load from Kube config file"
        )
        try:
            config.load_kube_config()
            config_loaded = True
        except (IOError, config.config_exception.ConfigException) as exc:
            logger.debug("Unable to load Kube config; reason={}".format(exc))

    if not config_loaded:
        logger.error("Unable to load in-cluster or Kube config")
        raise SystemExit(1)
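# Added usage sketch (not from the source), assuming `from kubernetes import
# client` as in the other snippets; the "default" namespace is illustrative.
def list_default_namespace_pods():
    load_kubernetes_config()
    v1 = client.CoreV1Api()
    return [pod.metadata.name for pod in v1.list_namespaced_pod("default").items]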
Example #24
def submit_job(job_list, index_start):
    config.load_kube_config('/root/.kube/config')
    configuration = client.Configuration()
    configuration.verify_ssl = False
    configuration.debug = False
    client.Configuration.set_default(configuration)
    v1 = client.CoreV1Api()
    running_num = list_running_job()
    end = index_start + (5 - running_num)
    if end > len(job_list):
        end = len(job_list)
    logger.info("submitting index: from {} to {}".format(index_start, end))
    for job_info in job_list[index_start:end]:
        time.sleep(2)
        time_stamp = int(time.time())
        create_pod_yaml(job_info, time_stamp)
        submit_pod(v1)
        job_id = str(job_info[1])
        uid = get_pod_uid(job_id, time_stamp)
        create_confmap_service(uid, job_info, time_stamp)
        submit_confmap_service(v1)
    return end
Example #25
    def __init__(self):
        config_loaded = False

        try:
            config.load_incluster_config()
            config_loaded = True
        except config.config_exception.ConfigException:
            logger.warning(
                "Unable to load in-cluster configuration; trying to load from Kube config file"
            )
            try:
                config.load_kube_config()
                config_loaded = True
            except (IOError, config.config_exception.ConfigException) as exc:
                logger.warning(
                    "Unable to load Kube config; reason={}".format(exc))

        if not config_loaded:
            logger.error("Unable to load in-cluster or Kube config")
            sys.exit(1)

        cli = client.CoreV1Api()
        cli.api_client.configuration.assert_hostname = False
        self.client = cli
Example #26
        labelnames=("fn_name",))

def record(fn):
    @functools.wraps(fn)
    def wrapped(*args, **kwargs):
        start = timeit.default_timer()
        try:
            return fn(*args, **kwargs)
        finally:
            elapsed = timeit.default_timer() - start
            job_deployer_fn_histogram.labels(fn.__name__).observe(elapsed)
    return wrapped


# The config will be loaded from default location.
config.load_kube_config()
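# NOTE (added): calling load_kube_config() at import time raises a
# ConfigException if no kubeconfig is available; a try/except falling back to
# load_incluster_config() (as in the load_kubernetes_config() example above)
# would keep this module importable inside a pod.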
k8s_client = client.CoreV1Api()


class JobDeployer:

    def __init__(self):
        self.v1 = k8s_client
        self.namespace = "default"
        self.pretty = "pretty_example"

    @record
    def create_pod(self, body):
        api_response = self.v1.create_namespaced_pod(
            namespace=self.namespace,
            body=body,
        )
        # NOTE (added): the source snippet is truncated here; returning the
        # response is an assumption made to close the method.
        return api_response
Example #27
def get_k8s_cluster_info(working_dir, dns_prefix, location):
    kube_config_path = "{0}/_output/{1}/kubeconfig/kubeconfig.{2}.json".format(
        working_dir, dns_prefix, location)
    master_string = "opmaster"
    worker_string = "opworker"

    config.load_kube_config(config_file=kube_config_path)
    api_instance = client.CoreV1Api()
    pretty = 'true'
    timeout_seconds = 56

    master = dict()
    worker = dict()
    sku = None
    gpu_enable = False
    master_ip = None
    master_ip_internal = None

    worker_count = 0
    worker_with_gpu = 0

    try:
        api_response = api_instance.list_node(pretty=pretty,
                                              timeout_seconds=timeout_seconds)
        for node in api_response.items:
            gpu_resource = 0
            if 'nvidia.com/gpu' in node.status.allocatable:
                gpu_resource = int(
                    parse_quantity(node.status.allocatable['nvidia.com/gpu']))
            if master_string in node.metadata.name:
                master[node.metadata.name] = {
                    "cpu-resource":
                    int(parse_quantity(node.status.allocatable['cpu'])) - 2,
                    "mem-resource":
                    int(
                        parse_quantity(node.status.allocatable['memory']) /
                        1024 / 1024) - 8 * 1024,
                    "gpu-resource":
                    gpu_resource,
                }
                master[node.metadata.name]["hostname"] = node.metadata.name
                for address in node.status.addresses:
                    if address.type == "Hostname":
                        continue
                    if master_ip is None:
                        master_ip = address.address
                    if address.type == "ExternalIP":
                        master_ip = address.address
                    if address.type == "InternalIP":
                        master[node.metadata.name]["ip"] = address.address
                        master_ip_internal = address.address
            elif worker_string in node.metadata.name:
                worker[node.metadata.name] = {
                    "cpu-resource":
                    int(parse_quantity(node.status.allocatable['cpu'])) - 2,
                    "mem-resource":
                    int(
                        parse_quantity(node.status.allocatable['memory']) /
                        1024 / 1024) - 8 * 1024,
                    "gpu-resource":
                    gpu_resource,
                }
                if sku is None:
                    sku = dict()
                    if gpu_resource != 0:
                        sku["gpu_resource"] = worker[
                            node.metadata.name]["gpu-resource"]
                        sku["mem-unit"] = int(
                            worker[node.metadata.name]["mem-resource"] /
                            worker[node.metadata.name]["gpu-resource"])
                        sku["cpu-unit"] = int(
                            worker[node.metadata.name]["cpu-resource"] /
                            worker[node.metadata.name]["gpu-resource"])
                    else:
                        sku["cpu_resource"] = worker[
                            node.metadata.name]["cpu-resource"]
                        sku["mem-unit"] = int(
                            worker[node.metadata.name]["mem-resource"] /
                            worker[node.metadata.name]["cpu-resource"])
                worker_count = worker_count + 1
                if worker[node.metadata.name]["gpu-resource"] != 0:
                    worker_with_gpu = worker_with_gpu + 1
                    gpu_enable = True
                worker[node.metadata.name]["hostname"] = node.metadata.name
                for address in node.status.addresses:
                    if address.type == "Hostname":
                        continue
                    if address.type == "InternalIP":
                        worker[node.metadata.name]["ip"] = address.address

    except ApiException as e:
        logger.error("Exception when calling CoreV1Api->list_node: %s\n" % e)

    return {
        "master":
        master,
        "worker":
        worker,
        "sku":
        sku,
        "gpu":
        gpu_enable,
        "gpu-ready":
        worker_count == worker_with_gpu,
        "master_ip":
        master_ip,
        "master_internal":
        master_ip_internal,
        "working_dir":
        "{0}/{1}".format(working_dir, TEMPORARY_DIR_NAME),
        "kube_config":
        "{0}/_output/{1}/kubeconfig/kubeconfig.{2}.json".format(
            working_dir, dns_prefix, location)
    }
Example #28
def get_kubernetes_corev1api(PAI_KUBE_CONFIG_PATH, **kwargs):

    config.load_kube_config(config_file=PAI_KUBE_CONFIG_PATH)
    api_instance = kubernetes.client.CoreV1Api()

    return api_instance
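# Added usage sketch (the kubeconfig path is hypothetical, not from the source):
#   core_v1 = get_kubernetes_corev1api("/path/to/kubeconfig")
#   print(len(core_v1.list_node().items))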
Example #29
def get_stuck_pods():
    # set up logging configuration
    config_file = os.path.join(sys.path[0], 'logging.conf')
    logging.config.fileConfig(config_file)
    logger = logging.getLogger('output')

    # filters for get pods
    label_selector = "jcx.inst.component=webapp"
    limit = 100

    # initializing stuck pod variables
    pod_list_stuck = []
    pod_list_starting = []
    pod_list_old = []
    starting_cutoff = timedelta(minutes=10)
    old_cutoff = timedelta(hours=8)

    # collecting list of webapp pods from jcx-prod-us-east
    config.load_kube_config(context="jcx-prod-us-east")
    api = client.CoreV1Api()
    pod_list_us = api.list_pod_for_all_namespaces(label_selector=label_selector, limit=limit)

    # collecting list of webapp pods from jcx-prod-eu
    config.load_kube_config(context="jcx-prod-eu")
    api = client.CoreV1Api()
    pod_list_eu = api.list_pod_for_all_namespaces(label_selector=label_selector, limit=limit)

    # combine eu and us pod lists
    pod_list = pod_list_us.items + pod_list_eu.items

    # filter pods into category lists based on age of pod
    if len(pod_list) != 0:
        for pod in pod_list:
            try:
                if not pod.status.container_statuses[0].ready:
                    age = datetime.now(pytz.utc) - pod.metadata.creation_timestamp
                    if age < starting_cutoff:
                        pod_list_starting.append(pod)
                    elif age > old_cutoff:
                        pod_list_old.append(pod)
                    else:
                        pod_list_stuck.append(pod)
            except Exception as e:
                logger.debug("failed on pod: ", pod.metadata.namespace, "  ", pod.metadata.name, e)

        # display results
        os.system('cls||clear')
        logger.debug("{:^33} {:^16} {:^19} {:^6} {:^20}".format("namespace",
                                                                "Pod",
                                                                "Age (H:M:S)",
                                                                "Loc",
                                                                "installation name"))

        logger.debug("{:^94}".format("--------------- STUCK PODS (>10 mins) ---------------"))
        print_pod_list(pod_list_stuck)

        logger.debug("{:^94}".format("------------ IN PROGRESS PODS (<10 mins) ------------"))
        print_pod_list(pod_list_starting)

        logger.debug("{:^94}".format("----------- VERY OLD DEAD PODS (>8 hours) -----------"))
        print_pod_list(pod_list_old)

    else:
        logger.debug("No stuck pods. Continue to monitor")
Example #30
def generate_layout(output_file):
    # init client
    config.load_kube_config()
    v1 = client.CoreV1Api()

    # api server url
    api_servers_url = v1.api_client.configuration.host
    # generate dashboard-url
    services = v1.list_service_for_all_namespaces(field_selector="metadata.name=kubernetes-dashboard", pretty=False, timeout_seconds=56, watch=False)
    dashboard_service = services.items[0]
    dashboard_url = "http://{0}:80".format(dashboard_service.spec.cluster_ip)

    # query k8s nodes
    nodes = v1.list_node(pretty=False, timeout_seconds=56, watch=False)
    addressesList = map(lambda node: node.status.addresses, nodes.items)
    machineList = []
    for addresses in addressesList:
        machine = dict()
        machine['machine-type'] = 'GENERIC'
        for address in addresses:
            if address.type == 'InternalIP':
                machine['hostip'] = address.address
            if address.type == 'Hostname':
                machine['hostname'] = address.address
                # TODO nodename == hostname on aks
                machine['nodename'] = address.address
        machineList.append(machine)
    machineList.sort(key=lambda k: k['hostname'])

    # assign pai-master
    master = machineList[0]
    master['pai-master'] = 'true'
    master['zkid'] = 1

    # assign pai-workers
    workers = machineList[1:] if len(machineList) > 1 else machineList
    for worker in workers:
        worker['pai-worker'] = 'true'

    # the default sku
    machineSku = yaml.load("""
GENERIC:
    mem: 1
    gpu:
        type: generic
        count: 1
    cpu:
        vcore: 1
    os: ubuntu16.04
    """, yaml.SafeLoader)

    layout = {
        "kubernetes": {
            "api-servers-url": api_servers_url,
            "dashboard-url": dashboard_url
        },
        "machine-sku": machineSku,
        "machine-list": machineList
    }
    # print(yaml.dump(layout, default_flow_style=False))
    with open(output_file, 'w') as outfile:
        yaml.dump(layout, outfile, default_flow_style=False)