Example No. 1
def check_pod_status():
    """Background Task to update status/phase of known pods
    """

    from api.models.kubepod import KubePod

    try:
        with PidFile('pod_status') as p:
            print(p.pidname)

            ns = os.environ.get('MLBENCH_NAMESPACE')

            config.load_incluster_config()
            v1 = client.CoreV1Api()

            pods = KubePod.objects.all()

            for pod in pods:
                ret = v1.read_namespaced_pod(pod.name, ns)
                phase = ret.status.phase

                if phase != pod.phase:
                    pod.phase = phase
                    pod.save()

    except PidFileError:
        return
Example No. 2
def main():
    global_options = {}

    logging.basicConfig(level=logging.DEBUG, format='%(asctime)-15s %(message)s')
    logging.getLogger('kubernetes').setLevel(logging.WARNING)

    try:
        k8s_config.load_kube_config()
        _, context = k8s_config.list_kube_config_contexts()
        region = context['context']['cluster']
        domain = 'cc.{}.cloud.sap'.format(region)
        global_options['own_namespace'] = 'kube-system' #context['context']['namespace']
    except IOError:
        from os import environ
        environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default'
        k8s_config.load_incluster_config()
        with open('/var/run/secrets/kubernetes.io/serviceaccount/namespace', 'r') as f:
            global_options['own_namespace'] = f.read()
        with open('/etc/resolv.conf', 'r') as f:
            for l in f:
                if re.match(r'^search\s+', l):
                    _, domain = l.rsplit(' ', 1)
                    domain = domain.strip()

    configurator = Configurator(domain, global_options)
    configurator.poll_config()
    discovery = DnsDiscovery(domain, configurator.global_options)
    discovery.register(re.compile(six.b(r'\Avc-[a-z]+-?\d+\Z')), configurator)

    while True:
        discovery.discover()
        configurator.poll()
        sleep(10)
Example No. 3
    def __init__(self, config):
        self._labels = config['labels']
        self._labels[config.get('scope_label', 'cluster-name')] = config['scope']
        self._label_selector = ','.join('{0}={1}'.format(k, v) for k, v in self._labels.items())
        self._namespace = config.get('namespace') or 'default'
        self._role_label = config.get('role_label', 'role')
        config['namespace'] = ''
        super(Kubernetes, self).__init__(config)
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(KubernetesRetriableException, HTTPException,
                                              HTTPError, socket.error, socket.timeout))
        self._ttl = None
        try:
            k8s_config.load_incluster_config()
        except k8s_config.ConfigException:
            k8s_config.load_kube_config(context=config.get('context', 'local'))

        self.__subsets = None
        use_endpoints = config.get('use_endpoints') and (config.get('patronictl') or 'pod_ip' in config)
        if use_endpoints:
            addresses = [k8s_client.V1EndpointAddress(ip=config['pod_ip'])]
            ports = []
            for p in config.get('ports', [{}]):
                port = {'port': int(p.get('port', '5432'))}
                port.update({n: p[n] for n in ('name', 'protocol') if p.get(n)})
                ports.append(k8s_client.V1EndpointPort(**port))
            self.__subsets = [k8s_client.V1EndpointSubset(addresses=addresses, ports=ports)]
        self._api = CoreV1ApiProxy(use_endpoints)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._leader_observed_record = {}
        self._leader_observed_time = None
        self._leader_resource_version = None
        self._leader_observed_subsets = []
        self.__do_not_watch = False
Example No. 4
def pytest_collection_modifyitems(config, items):
    c = Configuration()
    c.assert_hostname = False
    Configuration.set_default(c)
    k8sconfig.load_incluster_config()
    core_api = k8sclient.CoreV1Api()

    check_longhorn(core_api)

    if config.getoption(SKIP_RECURRING_JOB_OPT):
        skip_upgrade = pytest.mark.skip(reason="remove " +
                                               SKIP_RECURRING_JOB_OPT +
                                               " option to run")
        for item in items:
            if "recurring_job" in item.keywords:
                item.add_marker(skip_upgrade)

    using_csi = check_csi(core_api)
    if using_csi:
        skip_upgrade = pytest.mark.skip(reason="environment is not using " +
                                               "flexvolume")
        for item in items:
            if "flexvolume" in item.keywords:
                item.add_marker(skip_upgrade)

    else:
        skip_upgrade = pytest.mark.skip(reason="environment is not " +
                                               "using csi")
        for item in items:
            if "csi" in item.keywords:
                item.add_marker(skip_upgrade)

    all_nodes_support_mount_propagation = True
    for node in get_longhorn_api_client().list_node():
        node = wait_for_node_mountpropagation_condition(
            get_longhorn_api_client(), node["name"])
        if "conditions" not in node.keys():
            all_nodes_support_mount_propagation = False
        else:
            conditions = node["conditions"]
            for key, condition in conditions.items():
                if key == NODE_CONDITION_MOUNTPROPAGATION and \
                        condition["status"] != CONDITION_STATUS_TRUE:
                    all_nodes_support_mount_propagation = False
                    break
        if not all_nodes_support_mount_propagation:
            break

    if not all_nodes_support_mount_propagation:
        skip_upgrade = pytest.mark.skip(reason="environment does not " +
                                               "support base image")
        skip_node = pytest.mark.skip(reason="environment does not " +
                                            "support mount disk")

        for item in items:
            if "baseimage" in item.keywords:
                item.add_marker(skip_upgrade)
            elif "mountdisk" in item.keywords:
                item.add_marker(skip_node)
Example No. 5
def _load_kube_config(in_cluster, cluster_context, config_file):
    if not has_kubernetes:
        raise _import_err
    if in_cluster:
        config.load_incluster_config()
    else:
        config.load_kube_config(config_file=config_file, context=cluster_context)
    return client.CoreV1Api()
Example No. 6
def _load_kube_config(in_cluster):
    from kubernetes import config, client
    if in_cluster:
        config.load_incluster_config()
        return client.CoreV1Api()
    else:
        config.load_kube_config()
        return client.CoreV1Api()
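
For context, a helper like the one above is typically called once and the returned CoreV1Api reused. A minimal usage sketch (the pod-listing call is only an illustration, not part of the original example):

# Illustrative usage of the helper above; assumes it is importable in scope.
v1 = _load_kube_config(in_cluster=False)
for pod in v1.list_namespaced_pod("default").items:
    print(pod.metadata.name, pod.status.phase)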
Example No. 7
def main():
    config.load_incluster_config()

    v1 = client.CoreV1Api()
    print("Listing pods with their IPs:")
    ret = v1.list_pod_for_all_namespaces(watch=False)
    for i in ret.items:
        print("%s\t%s\t%s" %
              (i.status.pod_ip, i.metadata.namespace, i.metadata.name))
Example No. 8
def main():
    currNameSpace = sys.argv[1]
    config.load_incluster_config()
    v1 = client.CoreV1Api()
    podList = v1.list_pod_for_all_namespaces(watch=False)

    for pod in podList.items:
        if pod.metadata.namespace == currNameSpace:
            print("%s %s" % (pod.metadata.name, pod.status.pod_ip))
Example No. 9
def _load_kube_config(in_cluster, cluster_context):
    from kubernetes import config, client
    if in_cluster:
        config.load_incluster_config()
        return client.CoreV1Api()
    else:
        if cluster_context is None:
            config.load_kube_config()
            return client.CoreV1Api()
        else:
            return client.CoreV1Api(
                api_client=config.new_client_from_config(context=cluster_context))
Example No. 10
    def serve(self):
        # For deployed clusters, we should always be running inside
        # a Rook cluster.  For development convenience, also support
        # running outside (reading ~/.kube config)

        if self._in_cluster():
            config.load_incluster_config()
            cluster_name = os.environ['ROOK_CLUSTER_NAME']
        else:
            self.log.warning("DEVELOPMENT ONLY: Reading kube config from ~")
            config.load_kube_config()

            cluster_name = "rook"

            # So that I can do port forwarding from my workstation - jcsp
            from kubernetes.client import configuration
            configuration.verify_ssl = False

        self._k8s = client.CoreV1Api()

        try:
            # XXX mystery hack -- I need to do an API call from
            # this context, or subsequent API usage from handle_command
            # fails with SSLError('bad handshake').  Suspect some kind of
            # thread context setup in SSL lib?
            self._k8s.list_namespaced_pod(cluster_name)
        except ApiException:
            # Ignore here to make self.available() fail with a proper error message
            pass

        self._rook_cluster = RookCluster(
            self._k8s,
            cluster_name)


        # In case Rook isn't already clued in to this ceph
        # cluster's existence, initialize it.
        # self._rook_cluster.init_rook()

        self._initialized.set()

        while not self._shutdown.is_set():
            # XXX hack (or is it?) to kick all completions periodically,
            # in case we had a caller that wait()'ed on them long enough
            # to get persistence but not long enough to get completion

            global all_completions
            self.wait(all_completions)
            all_completions = filter(lambda x: not x.is_complete,
                                     all_completions)

            self._shutdown.wait(5)
Example No. 11
def _load_kube_config(in_cluster, cluster_context, config_file):
    from kubernetes import config, client
    if in_cluster:
        config.load_incluster_config()
    else:
        config.load_kube_config(config_file=config_file, context=cluster_context)
    if PY2:
        # For connect_get_namespaced_pod_exec
        from kubernetes.client import Configuration
        configuration = Configuration()
        configuration.assert_hostname = False
        Configuration.set_default(configuration)
    return client.CoreV1Api()
Example No. 12
def main():
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
    if len(sys.argv) != 4 or sys.argv[1] not in ('on_start', 'on_stop', 'on_role_change'):
        sys.exit('Usage: %s <action> <role> <cluster_name>' % sys.argv[0])

    action, role, cluster = sys.argv[1:4]

    k8s_config.load_incluster_config()
    k8s_api = CoreV1Api()

    namespace = os.environ['POD_NAMESPACE']

    if role == 'master' and action in ('on_start', 'on_role_change'):
        patch_master_endpoint(k8s_api, namespace, cluster)
Example No. 13
    def __init__(self, **kwargs):
        self.svcaccount = kwargs.get('svcaccount', 'default')
        self.namespace = kwargs.get('namespace', 'default')
        if kwargs.get('kubeconfig') == 'incluster':
            log.info('load incluster config')
            config.load_incluster_config()
        else:
            cfg = kwargs.get('kubeconfig')
            log.info('load config %s', cfg)
            if not cfg:
                config.load_kube_config()
            else:
                config.load_kube_config(cfg)
            import urllib3
            urllib3.disable_warnings()
Example No. 14
    def __init__(self, api_client=None, config_file=None):
        config = client.Configuration()
        if api_client:
            self.api_client = api_client
        else:
            if not config.api_client:
                if config_file is not None and config_file != "":
                    konfig.load_kube_config(config_file=config_file)
                else:
                    konfig.load_incluster_config()
                config.api_client = klient.ApiClient()
                # K8S python client doesn't provide any way to configure the
                # client pool size, so we inject the value here
                config.api_client.rest_client.pool_manager.connection_pool_kw[
                    'maxsize'] = 20
            self.api_client = config.api_client
        self._watch = None
Example No. 15
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Load kubernetes config here, since this is a Singleton and
        # so this __init__ will be run way before anything else gets run.
        try:
            config.load_incluster_config()
        except config.ConfigException:
            config.load_kube_config()
        self.api = shared_client(self.api_group_name)

        # FIXME: Protect against malicious labels?
        self.label_selector = ','.join(['{}={}'.format(k, v) for k, v in self.labels.items()])
        self.field_selector = ','.join(['{}={}'.format(k, v) for k, v in self.fields.items()])

        self.first_load_future = Future()
        self._stop_event = threading.Event()

        self.start()
Example No. 16
def check_new_pods():
    """Background Task to look for new pods available in cluster
    """

    from api.models.kubepod import KubePod

    try:
        with PidFile('new_pods') as p:
            print(p.pidname)
            config.load_incluster_config()
            v1 = client.CoreV1Api()

            release_name = os.environ.get('MLBENCH_KUBE_RELEASENAME')
            ns = os.environ.get('MLBENCH_NAMESPACE')

            ret = v1.list_namespaced_pod(
                ns,
                label_selector="component=worker,app=mlbench,release={}"
                .format(release_name))

            all_pods = list(KubePod.objects.all().values_list('name', flat=True))

            for i in ret.items:
                if KubePod.objects.filter(name=i.metadata.name).count() == 0:
                    ip = i.status.pod_ip
                    if ip is None:
                        ip = ""

                    pod = KubePod(name=i.metadata.name,
                                  labels=i.metadata.labels,
                                  phase=i.status.phase,
                                  ip=ip,
                                  node_name=i.spec.node_name)
                    pod.save()
                if i.metadata.name in all_pods:
                    all_pods.remove(i.metadata.name)

            KubePod.objects.filter(name__in=all_pods).delete()

    except PidFileError:
        return
Example No. 17
    def init(self):
        from kubernetes import config, client
        try:
            config.load_incluster_config()
        except Exception as e:
            self._incluster = False
            return

        configuration = client.Configuration()

        class MyApiClient(client.ApiClient):
            """
            A bug introduced by a fix.

            https://github.com/kubernetes-client/python/issues/411
            https://github.com/swagger-api/swagger-codegen/issues/6392
            """

            def __del__(self):
                pass

        self.api_instance = client.CoreV1Api(MyApiClient(configuration))

        # TODO: remove hardcoded part in the future.
        self.namespace = 'default'
        label_selector = 'component=master,app=mlbench'

        try:
            api_response = self.api_instance.list_namespaced_pod(
                self.namespace, label_selector=label_selector)
        except Exception as e:
            print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e)
            return

        assert len(api_response.items) == 1
        master_pod = api_response.items[0]
        ip = master_pod.status.pod_ip
        self.endpoint = "http://{ip}/api/metrics/".format(ip=ip)
        self._initialized = True
Example No. 18
def kube_v1():
    # Assume we got nothin'.
    k8s_api = None

    # XXX: is there a better way to check if we are inside a cluster or not?
    if "KUBERNETES_SERVICE_HOST" in os.environ:
        # If this goes horribly wrong and raises an exception (it shouldn't),
        # we'll crash, and Kubernetes will kill the pod. That's probably not an
        # unreasonable response.
        config.load_incluster_config()
        k8s_api = client.CoreV1Api()
    else:
        # Here, we might be running in docker, in which case we'll likely not
        # have any Kube secrets, and that's OK.
        try:
            config.load_kube_config()
            k8s_api = client.CoreV1Api()
        except FileNotFoundError:
            # Meh, just ride through.
            logger.info("No K8s")
            pass

    return k8s_api
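
The snippet above keys off the KUBERNETES_SERVICE_HOST environment variable to decide between the two loaders; several other examples in this collection instead try the in-cluster loader first and fall back to a local kubeconfig on ConfigException. A minimal sketch of that fallback pattern (the function name get_core_v1 is illustrative only):

from kubernetes import client, config

def get_core_v1():
    # Prefer in-cluster credentials (service account mounted into the pod).
    try:
        config.load_incluster_config()
    except config.ConfigException:
        # Fall back to ~/.kube/config for local development.
        config.load_kube_config()
    return client.CoreV1Api()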
Example No. 19
def kube():
    config.load_incluster_config()
    return client.CoreV1Api()
Example No. 20
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--name',
        dest='name',
        type=str,
        help='Experiment name.'
    )
    parser.add_argument(
        '--destination',
        dest='destination',
        type=str,
        help='The file which stores the best trial of the experiment.'
    )
    parser.add_argument(
        '--train_file_path',
        dest='train_file_path',
        type=str,
        help='Location where training data is located.'
    )
    parser.add_argument(
        '--validation_files_path',
        dest='validation_files_path',
        type=str,
        help='Location where validation data is located.'
    )
    parser.add_argument(
        '--validation_train_files_path',
        dest='validation_train_files_path',
        type=str,
        help='Location where validation of training data is located.'
    )
    parser.add_argument(
        '--es_host',
        dest='es_host',
        type=str,
        help='Name host of Elasticsearch.'
    )
    parser.add_argument(
        '--model_name',
        dest='model_name',
        type=str,
        help='Name of feature set saved in Elasticsearch.'
    )
    parser.add_argument(
        '--ranker',
        dest='ranker',
        type=str,
        help='RankLib algorithm to use.'
    )

    args = parser.parse_args()

    files = [f'{args.destination}/best_rank.txt', f'{args.destination}/best_model.txt']
    for file_ in files:
        if os.path.isfile(file_):
            os.remove(file_)

    exp_json_file = PATH / 'experiment.json'
    exp_def = json.loads(open(str(exp_json_file)).read())

    raw_template = json.dumps(
        exp_def['spec']['trialTemplate']['goTemplate']['rawTemplate']
    )
    raw_template = raw_template\
        .replace('{PROJECT_ID}', os.getenv('PROJECT_ID'))\
        .replace('{train_file_path}', args.train_file_path)\
        .replace('{validation_files_path}', args.validation_files_path)\
        .replace('{validation_train_files_path}', args.validation_train_files_path)\
        .replace('{es_host}', args.es_host)\
        .replace('{destination}', args.destination)\
        .replace('{model_name}', args.model_name)\
        .replace('{ranker}', args.ranker)

    exp_def['spec']['trialTemplate']['goTemplate']['rawTemplate'] = raw_template

    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    experiment = Experiment(client=api_client)
    exp_name = f'{args.name}-{uuid.uuid4().hex}'[:33]

    exp_def['spec']['parameters'] = get_ranker_parameters(args.ranker)
    exp_def['metadata']['name'] = exp_name
    print('this is exp_def: ', json.dumps(exp_def))

    create_response = experiment.create(exp_def)
    print('create response: ', create_response)

    expected_conditions = ["Succeeded", "Failed"]
    current_exp = experiment.wait_for_condition('kubeflow', exp_name,
                                                expected_conditions)
    print('current_exp: ', json.dumps(current_exp))

    expected, _ = experiment.is_expected_conditions(current_exp, ["Succeeded"])

    if expected:
        best_rank = current_exp["status"]["currentOptimalTrial"]["observation"][
            'metrics'][0]['value']
        print('Best Rank Found: ', best_rank)
        params = current_exp["status"]["currentOptimalTrial"]["parameterAssignments"]
        print(json.dumps(params))
        os.makedirs(os.path.dirname(args.destination), exist_ok=True)
        if os.path.isfile(args.destination):
            os.remove(args.destination)

    experiment.delete(exp_name, 'kubeflow')
Example No. 21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', action='store_true', default=False)
    parser.add_argument('--node',
                        action='append',
                        default=[],
                        help='Cilium pod names. Can specify multiple.')

    parser.add_argument('--selector',
                        action='append',
                        default=[],
                        help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    parser.add_argument('--pod',
                        action='append',
                        default=[],
                        help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Can specify multiple.')
    parser.add_argument('--endpoint',
                        action='append',
                        type=int,
                        default=[],
                        help='Cilium endpoint ids. Can specify multiple.')

    parser.add_argument('--to-selector',
                        action='append',
                        default=[],
                        help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Matches events that go to selected pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    parser.add_argument('--to-pod',
                        action='append',
                        default=[],
                        help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Matches events that go to specified pods. '
                        'Can specify multiple.')
    parser.add_argument('--to-endpoint',
                        action='append',
                        type=int,
                        default=[],
                        help='Cilium endpoint ids. '
                        'Matches events that go to specified endpoints. '
                        'Can specify multiple.')

    parser.add_argument('--from-selector',
                        action='append',
                        default=[],
                        help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Matches events that come from selected pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    parser.add_argument('--from-pod',
                        action='append',
                        default=[],
                        help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Matches events that come from specified pods. '
                        'Can specify multiple.')
    parser.add_argument('--from-endpoint',
                        action='append',
                        type=int,
                        default=[],
                        help='Cilium endpoint ids. '
                        'Matches events that come from specified endpoints. '
                        'Can specify multiple.')

    args = parser.parse_args()

    try:
        config.load_kube_config()
    except FileNotFoundError:
        config.load_incluster_config()

    c = Configuration()
    c.assert_hostname = False
    Configuration.set_default(c)
    api = core_v1_api.CoreV1Api()
    runner = MonitorRunner('kube-system', api)

    monitor_args = MonitorArgs(args.verbose, args.selector, args.pod,
                               args.endpoint, args.to_selector, args.to_pod,
                               args.to_endpoint, args.from_selector,
                               args.from_pod, args.from_endpoint)

    try:
        runner.run(monitor_args, args.node)
        ui(runner)
    except KeyboardInterrupt as e:
        pass
    finally:
        runner.finish()
Example No. 22
def connect():
    config_file = None

    if os.environ.get('RD_CONFIG_ENV') == 'incluster':
        config.load_incluster_config()
        return

    if os.environ.get('RD_CONFIG_CONFIG_FILE'):
        config_file = os.environ.get('RD_CONFIG_CONFIG_FILE')
    elif os.environ.get('RD_NODE_KUBERNETES_CONFIG_FILE'):
        config_file = os.environ.get('RD_NODE_KUBERNETES_CONFIG_FILE')

    url = None
    if os.environ.get('RD_CONFIG_URL'):
        url = os.environ.get('RD_CONFIG_URL')
    elif os.environ.get('RD_NODE_KUBERNETES_CLUSTER_URL'):
        url = os.environ.get('RD_NODE_KUBERNETES_CLUSTER_URL')

    verify_ssl = None
    if os.environ.get('RD_CONFIG_VERIFY_SSL'):
        verify_ssl = os.environ.get('RD_CONFIG_VERIFY_SSL')
    elif os.environ.get('RD_NODE_KUBERNETES_VERIFY_SSL'):
        verify_ssl = os.environ.get('RD_NODE_KUBERNETES_VERIFY_SSL')

    ssl_ca_cert = None
    if os.environ.get('RD_CONFIG_SSL_CA_CERT'):
        ssl_ca_cert = os.environ.get('RD_CONFIG_SSL_CA_CERT')
    elif os.environ.get('RD_NODE_KUBERNETES_SSL_CA_CERT'):
        ssl_ca_cert = os.environ.get('RD_NODE_KUBERNETES_SSL_CA_CERT')

    token = None
    if os.environ.get('RD_CONFIG_TOKEN'):
        token = os.environ.get('RD_CONFIG_TOKEN')
    elif os.environ.get('RD_NODE_KUBERNETES_API_TOKEN'):
        token = os.environ.get('RD_NODE_KUBERNETES_API_TOKEN')

    log.debug("config file")
    log.debug(config_file)
    log.debug("-------------------")

    if config_file:
        log.debug("getting settings from file %s", config_file)
        config.load_kube_config(config_file=config_file)
    else:

        if url:
            log.debug("getting settings from plugin configuration")

            configuration = Configuration()
            configuration.host = url

            if verify_ssl == 'true':
                configuration.verify_ssl = verify_ssl
            else:
                configuration.verify_ssl = None
                configuration.assert_hostname = False

            if ssl_ca_cert:
                configuration.ssl_ca_cert = ssl_ca_cert

            configuration.api_key['authorization'] = token
            configuration.api_key_prefix['authorization'] = 'Bearer'

            client.Configuration.set_default(configuration)
        else:
            log.debug("getting settings from default config file")
            config.load_kube_config()
Example No. 23
            #print("%s" % (i.metadata.name)) #디버그 용
            for j in i.status.conditions:
                #print("\t%s\t%s" % (j.type, j.status)) #디버그 용
                if (j.type == "Ready" and j.status != "True"):
                    if n_name in uk_node:
                        uk_node[n_name] += 1
                    else:
                        uk_node[n_name] = 0
                    print("unknown %s  count=%d" % (n_name, uk_node[n_name]))
                    # Remove the node once the counter has exceeded 3
                    if uk_node[n_name] > 3:
                        del uk_node[n_name]
                        node_delete(v1, i.metadata.name)
                # Reset the counter if the status comes back even once
                if (j.type == "Ready" and j.status == "True"):
                    if n_name in uk_node:
                        del uk_node[n_name]
    except ApiException as e:
        print("Exception when calling CoreV1Api->list_node: %s\n" % e)


## Main
if __name__ == '__main__':
    signal.signal(signal.SIGTERM, handler)  # signal handling
    config.load_incluster_config()  # obtain credentials
    v1 = client.CoreV1Api()  # instantiate the client
    # monitoring loop
    while True:
        node_monitor(v1)
        sleep(5)  # monitoring interval
Example No. 24
def main():

    config.load_incluster_config()


    api = client.CustomObjectsApi()

    credentials = pika.PlainCredentials("guest", "guest")
    connection = pika.BlockingConnection(pika.ConnectionParameters("rabbitmq-0.rabbitmq.rabbits.svc.cluster.local", "5672", '/', credentials ))    
    channel = connection.channel()

    channel.exchange_declare(exchange='topic_logs', exchange_type='topic')

    result = channel.queue_declare('', exclusive=True)
    queue_name = result.method.queue
    
    # binding_keys = sys.argv[1:]
    namespaces=['rabbits','team-a','team-b']
    # if not binding_keys:
    #     sys.stderr.write("Usage: %s [binding_key]...\n" % sys.argv[0])
    #     sys.exit(1)

    for namespace in namespaces:
        channel.queue_bind(
            exchange='topic_logs', queue=queue_name, routing_key=namespace)

    print(' [*] Waiting for logs. To exit press CTRL+C')


    def callback(ch, method, properties, body):
        print(" [x] %r:%r" % (method.routing_key, body))

        # namespace = request.json['namespace']
        # revision = request.json['command']
        # print(body.decode())
        # print(revision)

        api.create_namespaced_custom_object(
        group="tekton.dev",
        version="v1beta1",
        namespace=body.decode(),
        plural="taskruns",
        body={
            "apiVersion": "tekton.dev/v1beta1",
            "kind": "TaskRun",
            "metadata": {
                "generateName": "echo-hello-world-taskrun-",
                "namespace":body.decode()
            },
            "spec": {
                "serviceAccountName": "rabbitmq",
                "taskRef": {
                    "name":"echo-hello-world"
                }   
            },
        },
        )

        print("Resource created")

        # output = sp.getoutput(str('tkn taskrun list | grep Succeeded'))
        # tasks = len(output.splitlines())
        # if int(tasks)<2:

        # os.system(str(body.decode()))

            # ch.basic_ack(delivery_tag = method.delivery_tag)   


    channel.basic_consume(
        queue=queue_name, on_message_callback=callback, auto_ack=True)

    channel.start_consuming()
Example No. 25
    def __init__(self):
        config.load_incluster_config()
        self.kubecoreapi = client.CoreV1Api()
        self.kubebatchapi = client.BatchV1Api()
Example No. 26
    def __init__(self,
                 namespace=None,
                 service_type=None,
                 gs_image=None,
                 etcd_image=None,
                 zookeeper_image=None,
                 gie_graph_manager_image=None,
                 coordinator_name=None,
                 coordinator_service_name=None,
                 etcd_cpu=None,
                 etcd_mem=None,
                 zookeeper_cpu=None,
                 zookeeper_mem=None,
                 gie_graph_manager_cpu=None,
                 gie_graph_manager_mem=None,
                 engine_cpu=None,
                 engine_mem=None,
                 vineyard_cpu=None,
                 vineyard_mem=None,
                 vineyard_shared_mem=None,
                 image_pull_policy=None,
                 image_pull_secrets=None,
                 volumes=None,
                 num_workers=None,
                 instance_id=None,
                 log_level=None,
                 timeout_seconds=None,
                 waiting_for_delete=None,
                 delete_namespace=None,
                 **kwargs):
        try:
            kube_config.load_incluster_config()
        except:  # noqa: E722
            kube_config.load_kube_config()
        self._api_client = kube_client.ApiClient()
        self._core_api = kube_client.CoreV1Api(self._api_client)
        self._app_api = kube_client.AppsV1Api(self._api_client)

        self._instance_id = instance_id

        # random for multiple k8s cluster in the same namespace
        self._engine_name = self._engine_name_prefix + self._instance_id
        self._etcd_name = self._etcd_name_prefix + self._instance_id
        self._etcd_service_name = self._etcd_service_name_prefix + self._instance_id

        self._gie_graph_manager_name = (self._gie_graph_manager_name_prefix +
                                        self._instance_id)
        self._gie_graph_manager_service_name = (
            self._gie_graph_manager_service_name_prefix + self._instance_id)
        self._vineyard_service_name = (self._vineyard_service_name_prefix +
                                       self._instance_id)

        self._namespace = namespace
        self._service_type = service_type
        self._num_workers = num_workers

        self._coordinator_name = coordinator_name
        self._coordinator_service_name = coordinator_service_name

        self._resource_object = []

        # engine container info
        self._gs_image = gs_image
        self._engine_cpu = engine_cpu
        self._engine_mem = engine_mem

        # vineyard container info
        self._vineyard_cpu = vineyard_cpu
        self._vineyard_mem = vineyard_mem
        self._vineyard_shared_mem = vineyard_shared_mem

        # etcd pod info
        self._etcd_image = etcd_image
        self._etcd_cpu = etcd_cpu
        self._etcd_mem = etcd_mem

        # zookeeper pod info
        self._zookeeper_image = zookeeper_image
        self._zookeeper_cpu = zookeeper_cpu
        self._zookeeper_mem = zookeeper_mem

        # interactive engine graph manager info
        self._gie_graph_manager_image = gie_graph_manager_image
        self._gie_graph_manager_cpu = gie_graph_manager_cpu
        self._gie_graph_manager_mem = gie_graph_manager_mem

        self._image_pull_policy = image_pull_policy

        # image pull secrets
        self._etcd_endpoint = None
        if image_pull_secrets is not None:
            self._image_pull_secrets = image_pull_secrets.split(",")
        else:
            self._image_pull_secrets = []

        self._volumes = json.loads(volumes)

        self._host0 = None
        self._pod_name_list = None
        self._pod_ip_list = None
        self._pod_host_ip_list = None

        self._analytical_engine_endpoint = None
        self._vineyard_service_endpoint = None

        self._closed = False
        self._glog_level = parse_as_glog_level(log_level)
        self._timeout_seconds = timeout_seconds
        self._waiting_for_delete = waiting_for_delete
        self._delete_namespace = delete_namespace

        self._analytical_engine_process = None

        # 8000 ~ 9000 is exposed
        self._learning_engine_ports_usage = 8000
        self._graphlearn_services = dict()
        self._learning_instance_processes = {}
Example No. 27
def create_k8s_api_client(configuration: Configuration,
                          secrets: Secrets = None) -> client.ApiClient:
    """
    Create a Kubernetes client from:

    1. From a local configuration file if it exists (`~/.kube/config`). You
       can specify which context you want to use as well through the
       `KUBERNETES_CONTEXT` key in the environment or in the `secrets` object.
    2. From the cluster configuration if executed from a Kubernetes pod and
       the CHAOSTOOLKIT_IN_POD environment variable is set to `"true"`.
    3. From a mix of the following environment keys:

        * KUBERNETES_HOST: Kubernetes API address

        You can authenticate with a token via:
        * KUBERNETES_API_KEY: the API key to authenticate with
        * KUBERNETES_API_KEY_PREFIX: the key kind, if not set, defaults to
          "Bearer"

        Or via a username/password:
        * KUBERNETES_USERNAME
        * KUBERNETES_PASSWORD

        Or via SSL:
        * KUBERNETES_CERT_FILE
        * KUBERNETES_KEY_FILE

        Finally, you may disable SSL verification against HTTPS endpoints:
        * KUBERNETES_VERIFY_SSL: should we verify the SSL (unset means no)
        * KUBERNETES_CA_CERT_FILE: path to the CA certificate when verification
          is expected

        You may pass a secrets dictionary, in which case values will be looked
        up there before the environment.
    """
    env = os.environ
    secrets = secrets or {}

    def lookup(k: str, d: str = None) -> str:
        return secrets.get(k, env.get(k, d))

    if has_local_config_file():
        context = lookup("KUBERNETES_CONTEXT")
        logger.debug("Using Kubernetes context: {}".format(context
                                                           or "default"))
        return config.new_client_from_config(context=context)

    elif env.get("CHAOSTOOLKIT_IN_POD") == "true":
        config.load_incluster_config()
        return client.ApiClient()

    else:
        cfg = client.Configuration()
        cfg.debug = True
        cfg.host = lookup("KUBERNETES_HOST", "http://localhost")
        cfg.verify_ssl = lookup("KUBERNETES_VERIFY_SSL", False) is not False
        cfg.cert_file = lookup("KUBERNETES_CA_CERT_FILE")

        if "KUBERNETES_API_KEY" in env or "KUBERNETES_API_KEY" in secrets:
            cfg.api_key['authorization'] = lookup("KUBERNETES_API_KEY")
            cfg.api_key_prefix['authorization'] = lookup(
                "KUBERNETES_API_KEY_PREFIX", "Bearer")
        elif "KUBERNETES_CERT_FILE" in env or \
                "KUBERNETES_CERT_FILE" in secrets:
            cfg.cert_file = lookup("KUBERNETES_CERT_FILE")
            cfg.key_file = lookup("KUBERNETES_KEY_FILE")
        elif "KUBERNETES_USERNAME" in env or "KUBERNETES_USERNAME" in secrets:
            cfg.username = lookup("KUBERNETES_USERNAME")
            cfg.password = lookup("KUBERNETES_PASSWORD", "")

    return client.ApiClient(cfg)
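
As a usage illustration only (host, token, and other values below are made-up assumptions, not from the source), the factory above can be driven either by a local kubeconfig or by environment-style settings supplied through the secrets mapping:

# Hypothetical call; the token-based branch is used when no local kubeconfig exists.
secrets = {
    "KUBERNETES_HOST": "https://10.0.0.1:6443",
    "KUBERNETES_API_KEY": "<service-account-token>",
    "KUBERNETES_API_KEY_PREFIX": "Bearer",
}
api_client = create_k8s_api_client(configuration={}, secrets=secrets)
v1 = client.CoreV1Api(api_client=api_client)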
Example No. 28
def main():
    config.load_incluster_config()

    api_ext = client.ApiextensionsV1beta1Api()
    apps = client.AppsV1beta1Api()
    crds = client.CustomObjectsApi()

    # Create API controllers within our namespace, which we
    # get through the downward API.
    namespace = os.environ["MY_NAMESPACE"]
    api_controller_image = os.environ["API_IMAGE"]

    owner = apps.read_namespaced_deployment(os.environ["OWNER_NAME"],
                                            namespace)

    # Define our OwnerReference that we will add to the metadata of
    # objects we create so that they are garbage collected when this
    # controller is deleted.
    controller_ref = {
        "apiVersion": owner.api_version,
        "blockOwnerDeletion": True,
        "controller": True,
        "kind": owner.kind,
        "name": os.environ["OWNER_NAME"],
        "uid": owner.metadata.uid,
    }

    def owner_ref(obj, controller=False):
        return {
            "apiVersion": obj["apiVersion"],
            "blockOwnerDeletion": True,
            "controller": controller,
            "kind": obj["kind"],
            "name": obj["metadata"]["name"],
            "uid": obj["metadata"]["uid"],
        }

    def delete_meta(api, resource):
        logging.error("Deleting deployment: %s", resource.group())
        apps.delete_namespaced_deployment(resource.group(),
                                          namespace,
                                          body=client.V1DeleteOptions(
                                              propagation_policy='Foreground',
                                              grace_period_seconds=5))

        logging.error("Deleting CRD: %s", resource.name())
        api_ext.delete_custom_resource_definition(
            resource.name(),
            body=client.V1DeleteOptions(propagation_policy='Foreground',
                                        grace_period_seconds=5))

    def update_meta(api, resource):
        # TODO(mattmoor): Establish a better way to diff the actual/desired
        # object states and reconcile them.  For now, just check the image.
        controller = apps.read_namespaced_deployment(resource.group(),
                                                     namespace)
        if controller.spec.template.spec.containers[
                0].image == api_controller_image:
            logging.warn("Image for %s controller is up-to-date!",
                         resource.name())
            return
        logging.warn("Updating image for %s controller", resource.name())
        controller.spec.template.spec.containers[
            0].image = api_controller_image
        apps.replace_namespaced_deployment(resource.group(), namespace,
                                           controller)

    def create_meta(api, resource):
        api_ext.create_custom_resource_definition(
            resource.definition([controller_ref]))
        apps.create_namespaced_deployment(
            namespace,
            resource.controller(api_controller_image, [controller_ref]))

    def process_meta(t, api, obj):
        if t == "DELETED":
            logging.error("Delete event: %s", json.dumps(obj, indent=1))
            for resource in api.resources():
                delete_meta(api, resource)
        elif t == "MODIFIED" or t == "ADDED":
            for resource in api.resources():
                controller_namespace = resource.controller_namespace()
                if controller_namespace:
                    if controller_namespace != namespace:
                        # This is being controlled by a controller in another namespace.
                        logging.warn(
                            "Found resourced being managed by another "
                            "meta-controller, this is bound to create "
                            "contention.  Skipping %s", api.name())
                        return
                    else:
                        # This is being controlled by us, make sure it is up to date.
                        update_meta(api, resource)
                        continue

                # TODO(mattmoor): See if we can make the api-controller owned
                # by the CRD that spawned it.  Right now, this seems ineffective.
                # crd_ref = owner_ref(obj)
                # This has not been processed yet.
                create_meta(api, resource)

                # Annotate our object with our resource (and namespace)
                resource.annotate(obj, namespace)
                obj = crds.replace_namespaced_custom_object(
                    DOMAIN, VERSION, namespace, PLURAL,
                    obj["metadata"]["name"], obj)
        else:
            logging.error("Unrecognized type: %s", t)

    resource_version = ""
    while True:
        stream = watch.Watch().stream(crds.list_cluster_custom_object,
                                      DOMAIN,
                                      VERSION,
                                      PLURAL,
                                      resource_version=resource_version)
        for event in stream:
            try:
                t = event["type"]
                obj = event["object"]
                api = Api(obj)
                process_meta(t, api, obj)

                # Configure where to resume streaming.
                metadata = obj.get("metadata")
                if metadata:
                    resource_version = metadata["resourceVersion"]
            except:
                logging.exception("Error handling event")
Example No. 29
async def k8s_update_dn_info(app):
    """ update dn urls by querying k8s api.  Call each url to determine node_ids """
    log.info("k8s_update_dn_info")
    # TBD - find more elegant way to avoid this warning
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    # get the config from within the cluster and set it as the default config for all new clients
    k8s_config.load_incluster_config()
    c = k8s_client.Configuration()  # get a copy of the default config
    c.verify_ssl = False  # set verify_ssl to False in that config
    k8s_client.Configuration.set_default(c)  # make that config the default for all new clients
    v1 = k8s_client.CoreV1Api()
    k8s_namespace = config.get("k8s_namespace")
    if k8s_namespace:
        # get pods for given namespace
        log.info(f"getting pods for namespace: {k8s_namespace}")
        ret = v1.list_namespaced_pod(namespace=k8s_namespace)
    else:
        log.info("getting pods for all namespaces")
        ret = v1.list_pod_for_all_namespaces(watch=False)
    pod_ips = []
    dn_urls = []
    k8s_app_label = config.get("k8s_app_label")
    for i in ret.items:
        pod_ip = i.status.pod_ip
        if not pod_ip:
            continue
        labels = i.metadata.labels
        if labels and "app" in labels and labels["app"] == k8s_app_label:
            log.info(
                f"found hsds pod with app label: {k8s_app_label} - ip: {pod_ip}"
            )
            pod_ips.append(pod_ip)
    if not pod_ips:
        log.error("Expected to find at least one hsds pod")
        return
    pod_ips.sort()  # for assigning node numbers
    dn_port = config.get("dn_port")
    for pod_ip in pod_ips:
        dn_urls.append(f"http://{pod_ip}:{dn_port}")
    # call info on each dn container and get node ids
    dn_ids = []
    for dn_url in app["dn_urls"]:
        req = dn_url + "/info"
        log.debug(f"about to call: {req}")
        try:
            rsp_json = await http_get(app, req)
            if "node" not in rsp_json:
                log.error("Unexepected response from info (no node key)")
                continue
            node_json = rsp_json["node"]
            if "id" not in node_json:
                log.error("Unexepected response from info (no node/id key)")
                continue
            dn_ids.append(node_json["id"])
        except HTTPServiceUnavailable:
            log.warn("503 error from /info request")
        except Exception as e:
            log.error(f"Exception: {e} from /info request")
    log.info(f"node_info check dn_ids: {dn_ids}")

    # save to global
    app["dn_urls"] = dn_urls
    app["dn_ids"] = dn_ids
Example No. 30
    def __init__(self):
        config.load_incluster_config()
        self.k8s = client.CoreV1Api()
Example No. 31
def loadK8SConfig():
    if 'KUBERNETES_PORT' in os.environ:
        config.load_incluster_config()
    else:
        config.load_kube_config()
Example No. 32
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--timeout-monitors',
                        type=int,
                        default=0,
                        help='Will remove monitor output which did '
                        'not update in last `timeout` seconds. '
                        'Will not work on last monitor on screen.')
    parser.add_argument('--verbose', action='store_true', default=False)
    parser.add_argument('--hex', action='store_true', default=False)

    # taken from github.com/cilium/cilium/cmd/monitor.go
    type_choices = ['drop', 'debug', 'capture', 'trace']
    parser.add_argument('--type',
                        action='append',
                        default=[],
                        choices=type_choices)

    parser.add_argument('--node',
                        action='append',
                        default=[],
                        help='Specify which nodes monitor will be run on. '
                        'Can match either by cilium pod names or k8s node '
                        'names. Can specify multiple.')

    parser.add_argument('--selector',
                        action='append',
                        default=[],
                        help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    parser.add_argument('--pod',
                        action='append',
                        default=[],
                        help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Can specify multiple.')
    parser.add_argument('--endpoint',
                        action='append',
                        type=int,
                        default=[],
                        help='Cilium endpoint ids. Can specify multiple.')

    parser.add_argument('--to-selector',
                        action='append',
                        default=[],
                        help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Matches events that go to selected pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    parser.add_argument('--to-pod',
                        action='append',
                        default=[],
                        help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Matches events that go to specified pods. '
                        'Can specify multiple.')
    parser.add_argument('--to-endpoint',
                        action='append',
                        type=int,
                        default=[],
                        help='Cilium endpoint ids. '
                        'Matches events that go to specified endpoints. '
                        'Can specify multiple.')

    parser.add_argument('--from-selector',
                        action='append',
                        default=[],
                        help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Matches events that come from selected pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    parser.add_argument('--from-pod',
                        action='append',
                        default=[],
                        help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Matches events that come from specified pods. '
                        'Can specify multiple.')
    parser.add_argument('--from-endpoint',
                        action='append',
                        type=int,
                        default=[],
                        help='Cilium endpoint ids. '
                        'Matches events that come from specified endpoints. '
                        'Can specify multiple.')

    parser.add_argument('--send-command',
                        type=str,
                        default="",
                        help='Execute command as-provided in argument on '
                        'all specified nodes and show output.')

    parser.add_argument('--cilium-namespace',
                        type=str,
                        default="kube-system",
                        help='Specify namespace in which Cilium pods reside')

    parser.add_argument('--clear-monitors',
                        action='store_true',
                        default=False,
                        help='Kill all `cilium monitor` on Cilium nodes. '
                        'Helpful for debugging')

    parser.add_argument('--combine',
                        action='store_true',
                        default=False,
                        help='Prints all output retrieved from nodes to '
                        'stdout. Times out after timeout_monitors.')

    args = parser.parse_args()

    try:
        config.load_kube_config()
    except FileNotFoundError:
        config.load_incluster_config()

    c = Configuration()
    c.assert_hostname = False
    Configuration.set_default(c)
    api = core_v1_api.CoreV1Api()
    runner = MonitorRunner(args.cilium_namespace, api)

    monitor_args = MonitorArgs(args.verbose, args.hex, args.selector, args.pod,
                               args.endpoint, args.to_selector, args.to_pod,
                               args.to_endpoint, args.from_selector,
                               args.from_pod, args.from_endpoint, args.type)

    try:
        if args.clear_monitors:
            cmd = "pkill -f \"cilium monitor\""
        else:
            cmd = args.send_command

        runner.run(monitor_args, args.node, cmd)
        if args.combine:
            batch(runner, args.timeout_monitors)
        elif not args.clear_monitors:
            ui(runner, args.timeout_monitors)
    except KeyboardInterrupt as e:
        pass
    finally:
        runner.finish()
Example No. 33
from socket import *
from .container import ContainerProcessProxy
from kubernetes import client, config
from ..sessions.kernelsessionmanager import KernelSessionManager
import urllib3
urllib3.disable_warnings()

# Default logging level of kubernetes produces too much noise - raise to warning only.
logging.getLogger('kubernetes').setLevel(os.environ.get('EG_KUBERNETES_LOG_LEVEL', logging.WARNING))

enterprise_gateway_namespace = os.environ.get('EG_NAMESPACE', 'default')
default_kernel_service_account_name = os.environ.get('EG_DEFAULT_KERNEL_SERVICE_ACCOUNT_NAME', 'default')
kernel_cluster_role = os.environ.get('EG_KERNEL_CLUSTER_ROLE', 'cluster-admin')
shared_namespace = bool(os.environ.get('EG_SHARED_NAMESPACE', 'False').lower() == 'true')

config.load_incluster_config()


class KubernetesProcessProxy(ContainerProcessProxy):

    def __init__(self, kernel_manager, proxy_config):
        super(KubernetesProcessProxy, self).__init__(kernel_manager, proxy_config)

        self.kernel_namespace = None
        self.delete_kernel_namespace = False

    def launch_process(self, kernel_cmd, **kw):

        # Set env before superclass call so we see these in the debug output

        # Kubernetes relies on many internal env variables.  Since EG is running in a k8s pod, we will
Example No. 34
    def resolve_hostnames(self):

        orchestrator = os.getenv('KOLLAPS_ORCHESTRATOR', 'swarm')
        if orchestrator == 'kubernetes':
            # kubernetes version
            # we are only talking to the kubernetes API

            experimentUUID = environ.get('KOLLAPS_UUID', '')
            config.load_incluster_config()
            kubeAPIInstance = client.CoreV1Api()
            need_pods = kubeAPIInstance.list_namespaced_pod('default')
            for service in self.services:
                hosts = self.services[service]
                answers = []
                ips = []
                while len(ips) != len(hosts):
                    answers = []
                    need_pods = kubeAPIInstance.list_namespaced_pod('default')
                    try:
                        for pod in need_pods.items:  # loop through pods - much less elegant than using a DNS service
                            if pod.metadata.name.startswith(service + "-" + experimentUUID):
                                if pod.status.pod_ip is not None:  # LL
                                    answers.append(pod.status.pod_ip)
                        ips = [str(ip) for ip in answers]
                    except:
                        sleep(3)
                ips.sort()  # needed for deterministic behaviour
                for i in range(len(hosts)):
                    int_ip = ip2int(ips[i])
                    hosts[i].ip = int_ip
                    hosts[i].replica_id = i
                    self.hosts_by_ip[int_ip] = hosts[i]

        else:
            if orchestrator != 'swarm':
                print("Unrecognized orchestrator. Using default docker swarm.")

            # python's built in address resolver looks in /etc/hosts first
            # this is a problem since services with multiple replicas (same hostname)
            # will only have ONE entry in /etc/hosts, so the other hosts will never be found...
            # Solution: forcefully use dns queries that skip /etc/hosts (this pulls the dnspython dependency...)

            # Moreover, in some scenarios the /etc/resolv.conf is broken inside the containers
            # So to get the names to resolve properly we need to force to use dockers internal nameserver
            # 127.0.0.11

            experimentUUID = environ.get('KOLLAPS_UUID', '')
            docker_resolver = dns.resolver.Resolver(configure=False)
            docker_resolver.nameservers = ['127.0.0.11']
            for service in self.services:
                hosts = self.services[service]
                ips = []
                while len(ips) != len(hosts):
                    try:
                        answers = docker_resolver.query(service + "-" + experimentUUID, 'A')
                        ips = [str(ip) for ip in answers]
                        if len(ips) != len(hosts):
                            sleep(3)
                    except Exception:
                        # name not resolvable yet; retry shortly
                        sleep(3)
                        
                ips.sort()  # needed for deterministic behaviour
                for i in range(len(hosts)):
                    int_ip = ip2int(ips[i])
                    hosts[i].ip = int_ip
                    hosts[i].replica_id = i
                    self.hosts_by_ip[int_ip] = hosts[i]
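A note on the helper used above: ip2int is not shown in this example. A minimal sketch of what such a helper typically looks like (an assumption; the real Kollaps implementation may differ):

import socket
import struct

def ip2int(addr):
    """Convert a dotted-quad IPv4 string to its 32-bit integer form."""
    return struct.unpack("!I", socket.inet_aton(addr))[0]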
Ejemplo n.º 35
0
def monitor():
  global monitored_pods
  global zeek_pods

  print("Running on node %s as %s" % (my_node_name, my_zeek_node_type))

  config.load_incluster_config()

  v1 = client.CoreV1Api()
  w = watch.Watch()

  for event in w.stream(v1.list_pod_for_all_namespaces):
    event_type = event['type']
    labels = event['object'].metadata.labels
    metadata = event['object'].metadata
    key = metadata.namespace + '.' + metadata.name

    if event_type in ['ADDED', 'MODIFIED']:
      if 'zeek-monitor' in labels:
        monitored_pods[key] = event['object']
      if 'zeek-node' in labels:
        zeek_pods[key] = event['object']

    if event_type in ['DELETED']:
      if key in monitored_pods:
        del monitored_pods[key]
      if key in zeek_pods:
        del zeek_pods[key]

    zeek_topology = []
    zeek_workers = {}

    for key in zeek_pods.keys():
      pod = zeek_pods[key]
      pod_name = pod.metadata.name
      pod_ip = pod.status.pod_ip

      node_name = pod.spec.node_name

      zeek_node_type = pod.metadata.labels['zeek-node'].upper()

      if zeek_node_type in ['MANAGER']:
        zeek_node_name = 'manager'
        zeek_topology.append({ 'name': zeek_node_name, 'type': zeek_node_type, 'ip': pod_ip })
      if zeek_node_type in ['PROXY', 'LOGGER']:
        zeek_node_name = '%s-%s' % (zeek_node_type.lower(), pod_name)
        zeek_topology.append({ 'name': zeek_node_name, 'type': zeek_node_type, 'ip': pod_ip, 'manager': 'manager' })
      if zeek_node_type in ['WORKER']:
        zeek_workers[node_name] = pod

    for key in monitored_pods.keys():
      pod = monitored_pods[key]
      pod_name = pod.metadata.name
      pod_namespace = pod.metadata.namespace

      node_name = pod.spec.node_name

      if node_name in zeek_workers:
        zeek_worker_ip = zeek_workers[node_name].status.pod_ip   
        zeek_node_name = 'worker-%s-%s' % (pod_namespace, pod_name)
        interface_hash = hashlib.sha1(('%s.%s' % (pod_namespace, pod_name)).encode('utf-8'))
        zeek_interface = '%s%s' % (interface_prefix, interface_hash.hexdigest()[:11])
        zeek_topology.append({ 'name': zeek_node_name, 'type': 'WORKER', 'ip': zeek_worker_ip, 'manager': 'manager', 'interface': zeek_interface })

    zeek_topology.sort(key = lambda e: e['name'])
    port = 47761

    for element in zeek_topology:
      element['port'] = port
      port += 1

    with open(cluster_layout_template_file) as fi:
      template = Template(fi.read())

      if not os.path.isfile(cluster_layout_file):
        with open(cluster_layout_file, 'w') as fo:
          fo.write(template.render(zeek_topology=zeek_topology))
        sync_zeek()
      else:
        cluster_layout_file_temp = cluster_layout_file + '.tmp'

        with open(cluster_layout_file_temp, 'w') as fo:
          fo.write(template.render(zeek_topology=zeek_topology))

        equal = filecmp.cmp(cluster_layout_file, cluster_layout_file_temp)

        if not equal:
          os.rename(cluster_layout_file_temp, cluster_layout_file)
          sync_zeek()
Ejemplo n.º 36
0
    def get_conn(self) -> client.ApiClient:
        """Returns kubernetes api session for use with requests"""
        in_cluster = self._coalesce_param(
            self.in_cluster,
            self.conn_extras.get("extra__kubernetes__in_cluster") or None)
        cluster_context = self._coalesce_param(
            self.cluster_context,
            self.conn_extras.get("extra__kubernetes__cluster_context") or None)
        kubeconfig_path = self._coalesce_param(
            self.config_file,
            self.conn_extras.get("extra__kubernetes__kube_config_path")
            or None)

        kubeconfig = self.conn_extras.get(
            "extra__kubernetes__kube_config") or None
        num_selected_configuration = len(
            [o for o in [in_cluster, kubeconfig, kubeconfig_path] if o])

        if num_selected_configuration > 1:
            raise AirflowException(
                "Invalid connection configuration. Options kube_config_path, "
                "kube_config, in_cluster are mutually exclusive. "
                "You can only use one option at a time.")

        disable_verify_ssl = self._coalesce_param(
            self.disable_verify_ssl,
            _get_bool(self._get_field("disable_verify_ssl")))
        disable_tcp_keepalive = self._coalesce_param(
            self.disable_tcp_keepalive,
            _get_bool(self._get_field("disable_tcp_keepalive")))

        # BEGIN apply settings from core kubernetes configuration
        # this section should be removed in next major release
        deprecation_warnings: List[Tuple[str, Any]] = []
        if disable_verify_ssl is None and self._deprecated_core_disable_verify_ssl is True:
            deprecation_warnings.append(('verify_ssl', False))
            disable_verify_ssl = self._deprecated_core_disable_verify_ssl
        # by default, hook will try in_cluster first. so we only need to
        # apply core airflow config and alert when False and in_cluster not otherwise set.
        if in_cluster is None and self._deprecated_core_in_cluster is False:
            deprecation_warnings.append(
                ('in_cluster', self._deprecated_core_in_cluster))
            in_cluster = self._deprecated_core_in_cluster
        if not cluster_context and self._deprecated_core_cluster_context:
            deprecation_warnings.append(
                ('cluster_context', self._deprecated_core_cluster_context))
            cluster_context = self._deprecated_core_cluster_context
        if not kubeconfig_path and self._deprecated_core_config_file:
            deprecation_warnings.append(
                ('config_file', self._deprecated_core_config_file))
            kubeconfig_path = self._deprecated_core_config_file
        if disable_tcp_keepalive is None and self._deprecated_core_disable_tcp_keepalive is True:
            deprecation_warnings.append(('enable_tcp_keepalive', False))
            disable_tcp_keepalive = True
        if deprecation_warnings:
            self._deprecation_warning_core_param(deprecation_warnings)
        # END apply settings from core kubernetes configuration

        if disable_verify_ssl is True:
            _disable_verify_ssl()
        if disable_tcp_keepalive is not True:
            _enable_tcp_keepalive()

        if in_cluster:
            self.log.debug(
                "loading kube_config from: in_cluster configuration")
            self._is_in_cluster = True
            config.load_incluster_config()
            return client.ApiClient()

        if kubeconfig_path is not None:
            self.log.debug("loading kube_config from: %s", kubeconfig_path)
            self._is_in_cluster = False
            config.load_kube_config(
                config_file=kubeconfig_path,
                client_configuration=self.client_configuration,
                context=cluster_context,
            )
            return client.ApiClient()

        if kubeconfig is not None:
            with tempfile.NamedTemporaryFile() as temp_config:
                self.log.debug(
                    "loading kube_config from: connection kube_config")
                temp_config.write(kubeconfig.encode())
                temp_config.flush()
                self._is_in_cluster = False
                config.load_kube_config(
                    config_file=temp_config.name,
                    client_configuration=self.client_configuration,
                    context=cluster_context,
                )
            return client.ApiClient()

        return self._get_default_client(cluster_context=cluster_context)
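A hedged usage sketch for the hook above, assuming it is Airflow's KubernetesHook (or a compatible class): the ApiClient returned by get_conn() can be handed directly to any kubernetes API class.

from kubernetes import client

hook = KubernetesHook()           # resolves in_cluster / kubeconfig per the logic above
api_client = hook.get_conn()
v1 = client.CoreV1Api(api_client)
for pod in v1.list_namespaced_pod("default").items:
    print(pod.metadata.name, pod.status.phase)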
Ejemplo n.º 37
0
def main():
    print(f"{timestamp()} Starting collector")

    folder_annotation = os.getenv(FOLDER_ANNOTATION)
    if folder_annotation is None:
        print(f"{timestamp()} No folder annotation was provided, "
              "defaulting to k8s-sidecar-target-directory")
        folder_annotation = "k8s-sidecar-target-directory"

    label = os.getenv(LABEL)
    if label is None:
        print(
            f"{timestamp()} Should have added {LABEL} as environment variable! Exit"
        )
        return -1

    label_value = os.getenv(LABEL_VALUE)
    if label_value:
        print(f"{timestamp()} Filter labels with value: {label_value}")

    target_folder = os.getenv(FOLDER)
    if target_folder is None:
        print(
            f"{timestamp()} Should have added {FOLDER} as environment variable! Exit"
        )
        return -1

    resources = os.getenv(RESOURCE, "configmap")
    resources = ("secret",
                 "configmap") if resources == "both" else (resources, )
    print(f"{timestamp()} Selected resource type: {resources}")

    method = os.getenv(REQ_METHOD)
    url = os.getenv(REQ_URL)
    payload = os.getenv(REQ_PAYLOAD)
    script = os.getenv(SCRIPT)

    # this is where kube_config is going to look for a config file
    kube_config = os.path.expanduser(KUBE_CONFIG_DEFAULT_LOCATION)
    if os.path.exists(kube_config):
        config.load_kube_config(kube_config)
    else:
        config.load_incluster_config()

    print(f"{timestamp()} Config for cluster api loaded...")
    current_namespace = open(
        "/var/run/secrets/kubernetes.io/serviceaccount/namespace").read()

    if os.getenv(SKIP_TLS_VERIFY) == "true":
        configuration = client.Configuration()
        configuration.verify_ssl = False
        configuration.debug = False
        client.Configuration.set_default(configuration)

    unique_filenames = os.getenv(UNIQUE_FILENAMES)
    if unique_filenames is not None and unique_filenames.lower() == "true":
        print(f"{timestamp()} Unique filenames will be enforced.")
        unique_filenames = True
    else:
        print(f"{timestamp()} Unique filenames will not be enforced.")
        unique_filenames = False

    if os.getenv(METHOD) == "LIST":
        for res in resources:
            list_resources(label, label_value, target_folder, url, method,
                           payload, current_namespace, folder_annotation, res,
                           unique_filenames, script)
    else:
        watch_for_changes(os.getenv(METHOD), label, label_value, target_folder,
                          url, method, payload, current_namespace,
                          folder_annotation, resources, unique_filenames,
                          script)
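list_resources is referenced above but not shown. A hypothetical, simplified sketch of the label-based lookup such a helper performs for ConfigMaps (the real function takes more arguments and also handles Secrets):

from kubernetes import client

def list_configmaps_by_label(namespace, label, label_value=None):
    # Build a label selector such as "my-label" or "my-label=my-value".
    selector = label if label_value is None else f"{label}={label_value}"
    v1 = client.CoreV1Api()
    return v1.list_namespaced_config_map(namespace, label_selector=selector).items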
Ejemplo n.º 38
0
    def __init__(self, options):
        self.options = options
        self.sqs_client = boto3.client('sqs', region_name=options.aws_region)
        if not self.options.sqs_queue_url:
            # Derive the URL from the queue name
            self.options.sqs_queue_url = self.sqs_client.get_queue_url(QueueName=self.options.sqs_queue_name)['QueueUrl']
        config.load_incluster_config()
        self.apps_v1 = client.AppsV1Api()
        self.last_scale_up_time = time()
        self.last_scale_down_time = time()

    def message_count(self):
        response = self.sqs_client.get_queue_attributes(
            QueueUrl=self.options.sqs_queue_url,
            AttributeNames=['ApproximateNumberOfMessages']
        )
        return int(response['Attributes']['ApproximateNumberOfMessages'])


    def poll(self):
        message_count = self.message_count()
        t = time()
        if message_count >= self.options.scale_up_messages:
            if t - self.last_scale_up_time > self.options.scale_up_cool_down:
                self.scale_up()
                self.last_scale_up_time = t
            else:
                logger.debug("Waiting for scale up cooldown")
        if message_count <= self.options.scale_down_messages:
            if t - self.last_scale_down_time > self.options.scale_down_cool_down:
                self.scale_down()
                self.last_scale_down_time = t
            else:
                logger.debug("Waiting for scale down cooldown")

        # code for scale to use msg_count
        sleep(self.options.poll_period)

    def scale_up(self):
        deployment = self.deployment()
        if deployment.spec.replicas < self.options.max_pods:
            logger.info("Scaling up")
            deployment.spec.replicas += 1
            self.update_deployment(deployment)
        elif deployment.spec.replicas > self.options.max_pods:
            self.scale_down()
        else:
            logger.info("Max pods reached")

    def scale_down(self):
        deployment = self.deployment()
        if deployment.spec.replicas > self.options.min_pods:
            logger.info("Scaling Down")
            deployment.spec.replicas -= 1
            self.update_deployment(deployment)
        elif deployment.spec.replicas < self.options.min_pods:
            self.scale_up()
        else:
            logger.info("Min pods reached")

    def deployment(self):
        logger.debug("loading deployment: {} from namespace: {}".format(self.options.kubernetes_deployment, self.options.kubernetes_namespace))
        deployments = self.apps_v1.list_namespaced_deployment(self.options.kubernetes_namespace, label_selector="component={}".format(self.options.kubernetes_deployment))
        return deployments.items[0]

    def update_deployment(self, deployment):
        # Update the deployment
        api_response = self.apps_v1.patch_namespaced_deployment(
            name=self.options.kubernetes_deployment,
            namespace=self.options.kubernetes_namespace,
            body=deployment)
        logger.debug("Deployment updated. status='%s'" % str(api_response.status))

    def run(self):
        options = self.options
        logger.debug("Starting poll for {} every {}s".format(options.sqs_queue_url, options.poll_period))
        while True:
            self.poll()
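An alternative worth noting for update_deployment above: instead of patching the full Deployment object, the replica count alone can be patched through the Scale subresource. A sketch, assuming the same apps_v1 client:

def set_replicas(apps_v1, name, namespace, replicas):
    # Patches only spec.replicas via the Scale subresource.
    body = {"spec": {"replicas": replicas}}
    return apps_v1.patch_namespaced_deployment_scale(name, namespace, body)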
Ejemplo n.º 39
0
def launch_kubernetes_kernel(connection_file, response_addr, spark_context_init_mode):
    # Launches a containerized kernel as a kubernetes pod.

    config.load_incluster_config()

    # Capture keywords and their values.
    keywords = dict()

    # Factory values...
    # Since jupyter lower cases the kernel directory as the kernel-name, we need to capture its case-sensitive
    # value since this is used to locate the kernel launch script within the image.
    keywords['kernel_name'] = os.path.basename(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    keywords['eg_response_address'] = response_addr
    keywords['kernel_connection_filename'] = connection_file
    keywords['kernel_spark_context_init_mode'] = spark_context_init_mode

    # Walk env variables looking for names prefixed with KERNEL_.  When found, set corresponding keyword value
    # with name in lower case.
    for name, value in os.environ.items():
        if name.startswith('KERNEL_'):
            keywords[name.lower()] = value

    # Read the kernel-pod yaml file, stripping off any commented lines.  This allows instances of the
    # yaml file to comment out substitution parameters since we want to fail the launch if any are left
    # unsubstituted.  Otherwise, commented out parameters could fail the launch if they had no substitutions.
    #
    yaml_template = ''
    with open(os.path.join(os.path.dirname(__file__), "kernel-pod.yaml")) as f:
        for line in f:
            line = line.split('#', 1)[0]
            yaml_template = yaml_template + line

    # Perform substitutions, then verify all parameters have been replaced.  If any
    # parameters still exist, print their names and exit.  If all have been replaced,
    # iterate over each document, issue creation statements.
    #
    k8s_yaml = Template(yaml_template).safe_substitute(keywords)

    # Check for non-substituted parameters - exit if found.
    #
    missing_params = ['${' + param[1] + '}' for param in Formatter().parse(k8s_yaml) if param[1]]
    if missing_params:
        sys.exit("ERROR - The following parameters were not substituted - kernel launch terminating! {}".
                 format(missing_params))

    # For each k8s object (kind), call the appropriate API method.  Too bad there isn't a method
    # that can take a set of objects.
    #
    # Creation for additional kinds of k8s objects can be added below.  Refer to
    # https://github.com/kubernetes-client/python for API signatures.  Other examples can be found in
    # https://github.com/jupyter-incubator/enterprise_gateway/blob/master/enterprise_gateway/services/processproxies/k8s.py
    #
    kernel_namespace = keywords['kernel_namespace']
    k8s_objs = yaml.safe_load_all(k8s_yaml)  # safe_load_all: newer PyYAML requires an explicit Loader for plain load_all
    for k8s_obj in k8s_objs:
        if k8s_obj.get('kind'):
            if k8s_obj['kind'] == 'Pod':
                client.CoreV1Api(client.ApiClient()).create_namespaced_pod(body=k8s_obj, namespace=kernel_namespace)
            elif k8s_obj['kind'] == 'Secret':
                client.CoreV1Api(client.ApiClient()).create_namespaced_secret(body=k8s_obj, namespace=kernel_namespace)
            elif k8s_obj['kind'] == 'PersistentVolumeClaim':
                client.CoreV1Api(client.ApiClient()).create_namespaced_persistent_volume_claim(
                    body=k8s_obj, namespace=kernel_namespace)
            elif k8s_obj['kind'] == 'PersistentVolume':
                client.CoreV1Api(client.ApiClient()).create_persistent_volume(body=k8s_obj)
            else:
                sys.exit("ERROR - Unhandled Kubernetes object kind '{}' found in yaml file - kernel launch terminating!".
                      format(k8s_obj['kind']))
        else:
            sys.exit("ERROR - Unknown Kubernetes object '{}' found in yaml file - kernel launch terminating!".
                      format(k8s_obj))
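The per-kind dispatch above can often be replaced by kubernetes.utils.create_from_yaml, which walks a multi-document manifest and calls the matching create method for each object. A sketch under that assumption (the file name is hypothetical, and behaviour depends on the client version):

from kubernetes import client, config, utils

config.load_incluster_config()
# Raises utils.FailToCreateError if any document in the manifest cannot be created.
utils.create_from_yaml(client.ApiClient(), "rendered-kernel-pod.yaml", namespace="my-kernel-namespace")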
Ejemplo n.º 40
0
 def __init__(self):
     if is_running_in_k8s():
         config.load_incluster_config()
     else:
         config.load_kube_config()
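A common variation on the is_running_in_k8s() check above is to simply try the in-cluster loader first and fall back to the local kubeconfig; a minimal sketch:

from kubernetes import config

def load_any_config():
    try:
        config.load_incluster_config()
    except config.ConfigException:
        config.load_kube_config()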
Ejemplo n.º 41
0
def main():
    config.load_incluster_config()
    #config.load_kube_config()

    apps_beta1 = client.AppsV1beta1Api()
    crds = client.CustomObjectsApi()
    v1 = client.CoreV1Api()
    batch = client.BatchV2alpha1Api()

    def create_meta(app):
        controller_ref = {
            "apiVersion": app._apiversion.rstrip("/v1"),
            "blockOwnerDeletion": True,
            "kind": app._kind,
            "name": app.crd_name(),
            "uid": app._metadata["uid"],
        }
        job = batch.create_namespaced_cron_job(namespace="default", body=app.cronjob([controller_ref]))
        logging.warning("Created CronJob for App: %s", job.metadata.name)
        logging.warning("Owner's reference: %s", json.dumps(controller_ref))
        
    def update_meta(app):
        try:
            create_meta(app)
        except ApiException as e:
            if e.status != httplib.CONFLICT:
                raise e

        # Tear down any versions that shouldn't exist.
        #delete_meta(app.other_versions())
        
    def delete_meta(selector):
        # Handle random namespace later...
        namespace = "default"
        for job in batch.list_namespaced_cron_job(
                namespace, label_selector=selector).items:
            batch.delete_namespaced_cron_job(
                job.metadata.name, namespace, body=client.V1DeleteOptions(
                    propagation_policy='Foreground', grace_period_seconds=5))
            logging.warning("Deleted the CronJob for: %s", job.metadata.name)

    def process_meta(t, app, obj):
        if t == "DELETED":
            delete_meta(app.any_versions())
            logging.warning("Deleted CRD, check garbage collection")
        elif t in ["MODIFIED", "ADDED"]:
            update_meta(app)
        else:
            logging.error("Unrecognized type: %s", t)

    # hack, using default namespace, default service account to get a token for kubecfg to work
    token = v1.read_namespaced_service_account(namespace="default",name="default").secrets[0].name
    resource_version = ""
    while True:
        stream = watch.Watch().stream(crds.list_cluster_custom_object,
                                      DOMAIN, VERSION, PLURAL,
                                      resource_version=resource_version)
        for event in stream:
            try:
                t = event["type"]
                obj = event["object"]
                print(obj)
                app = App(obj, token)
                logging.warning("Apps %s, %s" % (app.crd_name(), t))
                process_meta(t, app, obj)

                # Configure where to resume streaming.
                metadata = obj.get("metadata")
                if metadata:
                    resource_version = metadata["resourceVersion"]
            except:
                logging.exception("Error handling event")
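Long-running watches like the one above eventually fail with "410 Gone" once the stored resourceVersion expires. A sketch of a loop that tolerates this by restarting from the current state (assumes the same crds client and DOMAIN/VERSION/PLURAL constants):

from kubernetes import watch
from kubernetes.client.rest import ApiException

def watch_apps(crds):
    resource_version = ""
    while True:
        try:
            stream = watch.Watch().stream(crds.list_cluster_custom_object,
                                          DOMAIN, VERSION, PLURAL,
                                          resource_version=resource_version)
            for event in stream:
                resource_version = event["object"]["metadata"]["resourceVersion"]
                yield event
        except ApiException as e:
            if e.status == 410:   # resourceVersion too old - start over from "now"
                resource_version = ""
            else:
                raise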
Ejemplo n.º 42
0
 def _load_config(self):
     """Load kubernetes configuration."""
     if self.in_cluster:
         config.load_incluster_config()
     else:
         config.load_kube_config(context=self.context)
Ejemplo n.º 43
0
async def k8s_register(app):
    log.info("k8s_register")
    # TBD - find more elegant way to avoid this warning
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    # get the config from within the cluster and set it as the default config for all new clients
    k8s_config.load_incluster_config()
    c = k8s_client.Configuration()  # go and get a copy of the default config
    c.verify_ssl = False  # set verify_ssl to false in that config
    k8s_client.Configuration.set_default(c)  # make that config the default for all new clients
    v1 = k8s_client.CoreV1Api()
    # TBD - use the async version
    ret = v1.list_pod_for_all_namespaces(watch=False)
    pod_ips = []
    sn_urls = {}
    dn_urls = {}
    for i in ret.items:
        pod_ip = i.status.pod_ip
        if not pod_ip:
            continue
        labels = i.metadata.labels
        if labels and "app" in labels and labels["app"] == "hsds":
            log.info(f"hsds pod - ip: {pod_ip}")
            pod_ips.append(pod_ip)
    if not pod_ips:
        log.error("Expected to find at least one hsds pod")
        return
    pod_ips.sort()  # for assigning node numbers
    node_count = len(pod_ips)
    ready_count = 0
    this_node_id = app["id"]
    sn_port = config.get("sn_port")
    dn_port = config.get("dn_port")
    for node_number in range(node_count):
        for port in (sn_port, dn_port):
            # send an info request to the node
            pod_ip = pod_ips[node_number]
            url = f"http://{pod_ip}:{port}"
            if port == sn_port:
                sn_urls[node_number] = url
            else:
                dn_urls[node_number] = url

            info_rsp = await get_info(app, url)
            if not info_rsp:
                # timeout or other failure
                continue
            if "node" not in info_rsp:
                log.error("expected to find node key in info resp")
                continue

            node_rsp = info_rsp["node"]
            log.debug(f"got info resp: {node_rsp}")
            for key in ("type", "id", "node_number", "node_count"):
                if key not in node_rsp:
                    log.error(
                        f"unexpected node type in node state, expected to find key: {key}"
                    )
                    continue
            if node_rsp["type"] not in ("sn", "dn"):
                log.error(
                    f"expected node_type to be sn or dn, type is {node_rsp['type']}"
                )
                continue
            node_id = node_rsp["id"]
            if node_id == this_node_id:
                # set node_number and node_count
                log.debug("got info_rsp for this node")
                if app["node_number"] != node_number:
                    old_number = app["node_number"]
                    log.info(
                        f"node_number has changed - old value was {old_number} new number is {node_number}"
                    )
                    if app["node_type"] == "dn":
                        meta_cache = app["meta_cache"]
                        chunk_cache = app["chunk_cache"]
                        if meta_cache.dirtyCount > 0 or chunk_cache.dirtyCount > 0:
                            # set the node state to waiting till the chunk cache have been flushed
                            if app["node_state"] == "READY":
                                log.info(
                                    "setting node_state to waiting while cache is flushing"
                                )
                                app["node_state"] = "WAITING"
                        else:
                            meta_cache.clearCache()
                            chunk_cache.clearCache()
                            log.info(
                                f"node number was: {old_number} setting to: {node_number}"
                            )
                            app["node_number"] = node_number
                            app['register_time'] = time.time()
                    else:
                        # SN nodes can update node_number immediately
                        log.info(
                            f"node number was: {old_number} setting to: {node_number}"
                        )
                        app["node_number"] = node_number
                        app['register_time'] = time.time()
                if app["node_count"] != node_count:
                    old_count = app["node_count"]
                    log.info(
                        f"node count was: {old_count} setting to: {node_count}"
                    )
                    app["node_count"] = node_count
            if node_number == node_rsp[
                    "node_number"] and node_count == node_rsp["node_count"]:
                ready_count += 1
                log.debug(f"incremented ready_count to {ready_count}")
            else:
                log.info(f"differing node_number/node_count for url: {url}")
                log.info(
                    f"expected node_number: {node_number} actual: {node_rsp['node_number']}"
                )
                log.info(
                    f"expected node_count: {node_count} actual: {node_rsp['node_count']}"
                )

    if ready_count == node_count * 2:
        if app["node_state"] != "READY":
            log.info("setting node state to READY")
            app["node_state"] = "READY"
        app["node_count"] = node_count
        app["sn_urls"] = sn_urls
        app["dn_urls"] = dn_urls
    else:
        log.info(
            f"not all pods ready - ready_count: {ready_count}/{node_count*2}")
        if app["node_state"] == "READY":
            log.info("setting node state to SCALING")
            app["node_state"] = "SCALING"
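The client-side label check above ("app" == "hsds") can also be pushed to the API server with a label_selector; a short sketch using the same v1 client:

ret = v1.list_pod_for_all_namespaces(label_selector="app=hsds", watch=False)
pod_ips = sorted(i.status.pod_ip for i in ret.items if i.status.pod_ip)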
Ejemplo n.º 44
0
def load_kube_config():
    if "AWS_WEB_IDENTITY_TOKEN_FILE" in os.environ and "eks.amazonaws.com" in os.environ[
            "AWS_WEB_IDENTITY_TOKEN_FILE"]:
        k8_config.load_incluster_config()
    else:
        k8_config.load_kube_config()
Ejemplo n.º 45
0
def create_config(in_cluster=False):
    if in_cluster:
        logging.info("Loading in-cluster config")
        return config.load_incluster_config()
    else:
        return config.load_kube_config()
Ejemplo n.º 46
0
                            type='Approved')
                        body.status.conditions = [approval_condition]
                        try:
                            certs_api.replace_certificate_signing_request_approval(
                                csr_name, body)
                        except Exception as e:
                            print(
                                "Hit %s when signing cert %s. This will be retried"
                                % (e, csr_name))
                        break
                continue


if __name__ == "__main__":
    if 'KUBERNETES_PORT' in os.environ:
        config.load_incluster_config()
    else:
        config.load_kube_config()
    configuration = client.Configuration()
    configuration.assert_hostname = False
    api_client = client.api_client.ApiClient(configuration=configuration)
    v1 = client.CoreV1Api()
    certs_api = client.CertificatesV1beta1Api()
    try:
        k8sfile = '/var/run/secrets/kubernetes.io/serviceaccount/namespace'
        namespace = open(k8sfile, 'r').read() if os.path.exists(
            k8sfile) else os.environ.get('NAMESPACE', 'default')
        config_map_name = os.environ.get('CONFIG_MAP', 'autorules')
        config_map = v1.read_namespaced_config_map(namespace=namespace,
                                                   name=config_map_name)
        config_map_data = config_map.to_dict().get('data', {})
Ejemplo n.º 47
0
    def _run_scale(self):
        # Var defs
        machineset_workers = []
        machine_spread = []
        extra = 0
        add_per = 0

        if self.incluster == "true":
            config.load_incluster_config()
            k8s_config = client.Configuration()
            k8s_client = client.api_client.ApiClient(configuration=k8s_config)
        elif self.kubeconfig:
            k8s_client = config.new_client_from_config(self.kubeconfig)
        else:
            k8s_client = config.new_client_from_config()

        try:
            dyn_client = DynamicClient(k8s_client)
        except Exception as err:
            logger.error("Could not configure client, failing the run")
            logger.error(err)
            exit(1)

        if self.is_rosa:
            self.rosa_machinepools = self._rosa_getmachinepools()
            logger.debug("ROSA MachinePools: %s" % self.rosa_machinepools)

        try:
            nodes = dyn_client.resources.get(api_version="v1", kind="Node")
            machinesets = dyn_client.resources.get(kind="MachineSet")
        except Exception as err:
            logger.error(
                "Could not get information on nodes/machinesets, failing the run"
            )
            logger.error(err)
            exit(1)

        worker_count = (len(
            nodes.get(
                label_selector=
                "node-role.kubernetes.io/worker,!node-role.kubernetes.io/master"
            ).attributes.items) or 0)
        workload_count = (len(
            nodes.get(label_selector="node-role.kubernetes.io/workload").
            attributes.items) or 0)
        master_count = len(
            nodes.get(label_selector="node-role.kubernetes.io/master").
            attributes.items) or 0
        infra_count = len(
            nodes.get(label_selector="node-role.kubernetes.io/infra").
            attributes.items) or 0
        init_workers = worker_count

        infra = dyn_client.resources.get(kind="Infrastructure")

        try:
            platform = infra.get().attributes.items[0].spec.platformSpec.type
        except Exception as err:
            logger.error(
                "Platform type not obtained through spec.platformSpec.type")
            logger.error("Trying to query status.platform")
            logger.error(err)

            try:
                platform = infra.get().attributes.items[0].status.platform
            except Exception as err:
                logger.error("Could not identify platform. Marking as Unknown")
                logger.error(err)
                platform = "Unknown"

        # Machine set name list
        machineset_all_list = machinesets.get(
            namespace="openshift-machine-api").attributes.items

        machineset_worker_list = []

        for i in range(len(machineset_all_list)):
            if (machineset_all_list[i].spec.template.metadata.
                    labels["machine.openshift.io/cluster-api-machine-role"] ==
                    "worker"):
                machineset_worker_list.append(machineset_all_list[i])
        # If we are already at the requested scale exit
        # Determine if we are scaling down or up
        action = "scale_nochange"
        if int(worker_count) == int(self.scale):
            logger.info("Already at requested worker count")
            return init_workers, worker_count, master_count, infra_count, workload_count, platform, action
        elif int(worker_count) > int(self.scale):
            action = "scale_down"
        else:
            action = "scale_up"

        logger.info("Current Worker count %s" % (worker_count))

        # Number of workers to add per machine set
        add_per = int(self.scale / len(machineset_worker_list))

        # Additional number of workers to add b/c math
        extra = self.scale % len(machineset_worker_list)

        logger.info("Number of machine sets %s" %
                    (len(machineset_worker_list)))

        for i in range(len(machineset_worker_list)):
            machineset_workers.append(machineset_worker_list[i].metadata.name)
            machine_spread.append(add_per)
        for i in range(extra):
            machine_spread[i] += 1

        logger.info("Machine sets: %s" % (machineset_workers))
        logger.info("New worker per machine set %s" % (machine_spread))

        logger.info("Starting Patching of machine sets")
        # Patch the machinesets
        if not self.is_rosa:
            for i in range(len(machineset_workers)):
                body = {"spec": {"replicas": machine_spread[i]}}
                machinesets.patch(
                    body=body,
                    namespace="openshift-machine-api",
                    name=machineset_workers[i],
                    content_type="application/merge-patch+json",
                )
        else:
            self._rosa_scale("Default")

        logger.info(
            "Waiting for worker machine set to show the appropriate ready replicas"
        )
        for i in range(len(machineset_worker_list)):
            new_machine_sets = machinesets.get(
                namespace="openshift-machine-api",
                name=machineset_worker_list[i].metadata.name)
            while new_machine_sets.status.readyReplicas != machine_spread[i]:
                if new_machine_sets.status.readyReplicas is None and machine_spread[
                        i] == 0:
                    break
                new_machine_sets = machinesets.get(
                    namespace="openshift-machine-api",
                    name=machineset_worker_list[i].metadata.name)
                logger.debug(
                    "Number of ready replicas for %s: %s. Waiting %d seconds for next check..."
                    % (
                        new_machine_sets.metadata.name,
                        str(new_machine_sets.status.readyReplicas),
                        self.poll_interval,
                    ))
                time.sleep(self.poll_interval)

        logger.info("Patching of machine sets complete")
        logger.info("Waiting for all workers to be schedulable")
        # Ensure all workers are not listed as unschedulable
        # If we don't do this it will auto-complete a scale-down even though the workers
        # have not been eliminated yet
        new_worker_list = nodes.get(
            label_selector="node-role.kubernetes.io/worker").attributes.items
        for i in range(len(new_worker_list)):
            while i < len(
                    new_worker_list) and new_worker_list[i].spec.unschedulable:
                new_worker_list = nodes.get(
                    label_selector="node-role.kubernetes.io/worker"
                ).attributes.items
                logger.debug(
                    "Number of ready workers: %d. Waiting %d seconds for next check..."
                    % (len(new_worker_list), self.poll_interval))
                time.sleep(self.poll_interval)
        logger.info("All workers schedulable")

        worker_count = (len(
            nodes.get(
                label_selector=
                "node-role.kubernetes.io/worker,!node-role.kubernetes.io/master"
            ).attributes.items) or 0)
        workload_count = (len(
            nodes.get(label_selector="node-role.kubernetes.io/workload").
            attributes.items) or 0)
        master_count = len(
            nodes.get(label_selector="node-role.kubernetes.io/master").
            attributes.items) or 0
        infra_count = len(
            nodes.get(label_selector="node-role.kubernetes.io/infra").
            attributes.items) or 0

        return init_workers, worker_count, master_count, infra_count, workload_count, platform, action
Ejemplo n.º 48
0
def main(argv=None):
  parser = argparse.ArgumentParser(description='ML Trainer')
  parser.add_argument(
      '--working-dir',
      help='Training job working directory.',
      required=True)
  parser.add_argument(
      '--train-files-dir',
      help='Path to training data',
      required=True)
  parser.add_argument(
      '--train-files-prefix',
      help='The prefix of the training input files.',
      required=True)

  parser.add_argument(
      '--tf-transform-dir',
      help='Tf-transform directory with model from preprocessing step',
      required=True)

  parser.add_argument(
      '--output-dir',
      help="""\
      Directory under which the serving model (under /serving_model_dir)\
      and the tf-model-analysis model (under /eval_model_dir) will be written\
      """,
      required=True)

  parser.add_argument(
      '--eval-files-dir',
      help='Path to evaluation data',
      required=True
  )
  parser.add_argument(
      '--eval-files-prefix',
      help='The prefix of the eval input files.',
      required=True)

  # Training arguments
  parser.add_argument(
      '--job-dir',
      help='GCS location to write checkpoints and export models',
      required=True)

  # Argument to turn on all logging
  parser.add_argument(
      '--verbosity',
      choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'],
      default='INFO',
  )
  # Experiment arguments
  parser.add_argument(
      '--train-steps',
      help='Count of steps to run the training job for',
      required=True,
      type=int)
  parser.add_argument(
      '--eval-steps',
      help='Number of steps to run evaluation for at each checkpoint',
      default=100,
      type=int)
  parser.add_argument('--workers', type=int, default=0)
  parser.add_argument('--pss', type=int, default=0)
  parser.add_argument('--cluster', type=str,
                      help='GKE cluster set up for kubeflow. If set, zone must be provided. ' +
                           'If not set, assuming this runs in a GKE container and current ' +
                           'cluster is used.')
  parser.add_argument('--zone', type=str, help='zone of the kubeflow cluster.')
  parser.add_argument('--kfversion', type=str,
                      default='v1beta1',
                      help='The version of the deployed kubeflow. ' +
                           'If not set, the default version is v1beta1')
  parser.add_argument('--tfjob-ns', type=str,
                      default='kubeflow',
                      help='The namespace where the tfjob is submitted. ' +
                           'If not set, the namespace is kubeflow')
  parser.add_argument('--tfjob-timeout-minutes', type=int,
                      default=20,
                      help='Time in minutes to wait for the TFJob to complete')
  args = parser.parse_args()

  logging.getLogger().setLevel(logging.INFO)
  args_dict = vars(args)
  if args.cluster and args.zone:
    cluster = args_dict.pop('cluster')
    zone = args_dict.pop('zone')
  else:
    # Get cluster name and zone from metadata
    metadata_server = "http://metadata/computeMetadata/v1/instance/"
    metadata_flavor = {'Metadata-Flavor' : 'Google'}
    cluster = requests.get(metadata_server + "attributes/cluster-name",
                           headers = metadata_flavor).text
    zone = requests.get(metadata_server + "zone",
                        headers = metadata_flavor).text.split('/')[-1]

  # logging.info('Getting credentials for GKE cluster %s.' % cluster)
  # subprocess.call(['gcloud', 'container', 'clusters', 'get-credentials', cluster,
                   # '--zone', zone])

  # Create metadata.json file for visualization.
  tb_dir = args_dict.pop('working_dir') # don't pass this arg to the training module
  metadata = {
    'outputs' : [{
      'type': 'tensorboard',
      'source': tb_dir,
    }]
  }
  with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
    json.dump(metadata, f)

  workers = args_dict.pop('workers')
  pss = args_dict.pop('pss')
  kf_version = args_dict.pop('kfversion')
  tfjob_ns = args_dict.pop('tfjob_ns')
  tfjob_timeout_minutes = args_dict.pop('tfjob_timeout_minutes')
  args_list = ['--%s=%s' % (k.replace('_', '-'),v)
               for k,v in six.iteritems(args_dict) if v is not None]
  logging.info('Generating training template.')
  template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'train.template.yaml')
  content_yaml = _generate_train_yaml(template_file, tfjob_ns, workers, pss, args_list)

  logging.info('Start training.')
  # Set up handler for k8s clients
  config.load_incluster_config()
  api_client = k8s_client.ApiClient()
  create_response = tf_job_client.create_tf_job(api_client, content_yaml, version=kf_version)
  job_name = create_response['metadata']['name']

  wait_response = tf_job_client.wait_for_job(
      api_client, tfjob_ns, job_name, kf_version,
      timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
  succ = True

  # TODO: update this failure checking after tf-operator has the condition checking function.
  if 'Worker' in wait_response['status']['replicaStatuses']:
    if 'Failed' in wait_response['status']['replicaStatuses']['Worker']:
      logging.error('Training failed since workers failed.')
      succ = False
  if 'PS' in wait_response['status']['replicaStatuses']:
    if 'Failed' in wait_response['status']['replicaStatuses']['PS']:
      logging.error('Training failed since PSs failed.')
      succ = False
  if 'Master' in wait_response['status']['replicaStatuses']:
    if 'Failed' in wait_response['status']['replicaStatuses']['Master']:
      logging.error('Training failed since Master failed.')
      succ = False

  # #TODO: remove this after kubeflow fixes the wait_for_job issue
  # # because the wait_for_job returns when the worker finishes but the master might not be complete yet.
  # if 'Master' in wait_response['status']['replicaStatuses'] and 'active' in wait_response['status']['replicaStatuses']['Master']:
  #   master_active = True
  #   while master_active:
  #     # Wait for master to finish
  #     time.sleep(2)
  #     wait_response = tf_job_client.wait_for_job(api_client, tfjob_ns, job_name, kf_version,
  #                                            timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
  #     if 'active' not in wait_response['status']['tfReplicaStatuses']['Master']:
  #       master_active = False

  if succ:
    logging.info('Training success.')

  tf_job_client.delete_tf_job(api_client, tfjob_ns, job_name, version=kf_version)
  with open('/output.txt', 'w') as f:
    f.write(args.job_dir)
Ejemplo n.º 49
0
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements
from kfserving import KFServingClient
from kfserving import constants
from kfserving import utils
from kfserving import V1alpha2EndpointSpec
from kfserving import V1alpha2PredictorSpec
from kfserving import V1alpha2InferenceServiceSpec
from kfserving import V1alpha2InferenceService
from kfserving import V1alpha2CustomSpec

from kubernetes import client as k8s_client
from kubernetes import config as k8s_config
from kubernetes.client.rest import ApiException

k8s_config.load_incluster_config()


def main():

    api_version = constants.KFSERVING_GROUP + '/' + constants.KFSERVING_VERSION
    default_endpoint_spec = V1alpha2EndpointSpec(
        predictor=V1alpha2PredictorSpec(custom=V1alpha2CustomSpec(
            container=V1Container(
                name="kfserving-container",
                image=FLAGS.image,
                env=[{
                    "name": "STORAGE_URI",
                    "value": "%s" % FLAGS.storage_uri
                }],
                resources=V1ResourceRequirements(
Ejemplo n.º 50
0
def init():
    # Set passed environment variables as global variables
    for variable in [
            "MARIADB_CLUSTER", "MARIADB_HOST", "MARIADB_USER",
            "MARIADB_PASSWORD"
    ]:
        checkAndSetEnvironmentVariablesAsGlobalVariables(variable)

    # Load the kubectl config and initialize the API
    config.load_incluster_config()
    global v1
    v1 = CoreV1Api()

    # Get k8s topology information about the cluster to test
    global serverPods
    serverPods = v1.list_namespaced_pod(
        NAMESPACE,
        watch=False,
        label_selector="mariadb=%s,server.mariadb" % (MARIADB_CLUSTER, ))
    global maxScalePods
    maxScalePods = v1.list_namespaced_pod(
        NAMESPACE,
        watch=False,
        label_selector="mariadb=%s,maxscale.mariadb" % (MARIADB_CLUSTER, ))
    global umPods
    umPods = v1.list_namespaced_pod(NAMESPACE,
                                    watch=False,
                                    label_selector="mariadb=%s,um.mariadb" %
                                    (MARIADB_CLUSTER, ))
    global pmPods
    pmPods = v1.list_namespaced_pod(NAMESPACE,
                                    watch=False,
                                    label_selector="mariadb=%s,pm.mariadb" %
                                    (MARIADB_CLUSTER, ))
    global topology
    global system
    global MARIADB_PORT
    if len(umPods.items) > 0 and len(pmPods.items) > 0:
        system = "columnstore"
        MARIADB_PORT = 3306
        if umPods.items[0].metadata.name == "%s-mdb-cs-single-0" % (
                MARIADB_CLUSTER, ):
            topology = "columnstore-standalone"
        else:
            topology = "columnstore"
    elif len(serverPods.items) > 0 and len(maxScalePods.items) > 0:
        system = "server"
        MARIADB_PORT = 4006
        if serverPods.items[0].metadata.name == "%s-mdb-galera-0" % (
                MARIADB_CLUSTER, ):
            topology = "galera"
        else:
            topology = "masterslave"
    elif len(serverPods.items) == 1 and len(maxScalePods.items) == 0:
        system = "server"
        topology = "standalone"
        MARIADB_PORT = 3306
    else:
        print(
            "error: no valid topology could be found in namespace %s.\nserver pods found: %d\nmaxscale pods found: %d\ncolumnstore um pods found: %d\ncolumnstore pm pods found: %d"
            % (NAMESPACE, len(serverPods.items), len(
                maxScalePods.items), len(umPods.items), len(pmPods.items)))
        sys.exit(666)

    # Wait for the database to be active
    if system == "columnstore":
        helper_functions.waitForColumnStoreActive(umPods.items[0], v1,
                                                  MARIADB_CLUSTER,
                                                  COLUMNSTORE_TIMEOUT)
    else:
        helper_functions.waitForServerActive(serverPods.items[0], v1,
                                             MARIADB_USER, MARIADB_PASSWORD,
                                             MARIADB_HOST, MARIADB_PORT,
                                             SERVER_TIMEOUT)
    print("")

    # Get a SQL connection, and prepare the test database
    error = False
    try:
        conn = mariadb.connect(user=MARIADB_USER,
                               password=MARIADB_PASSWORD,
                               host=MARIADB_HOST,
                               port=MARIADB_PORT)
        cursor = conn.cursor()
        cursor.execute("DROP DATABASE IF EXISTS %s" % (DB_NAME, ))
        cursor.execute("CREATE DATABASE IF NOT EXISTS %s" % (DB_NAME, ))
    except Exception as e:
        print("error: could not prepare test database '%s'\n%s" % (DB_NAME, e))
        error = True
    finally:
        try:
            if cursor: cursor.close()
            if conn: conn.close()
        except Exception:
            pass
    if error:
        sys.exit(666)
Ejemplo n.º 51
0
def load_config():
    if frappe.get_conf().get("developer_mode"):
        config.load_kube_config()
    else:
        config.load_incluster_config()
Ejemplo n.º 52
0
def main(argv=None):
    parser = argparse.ArgumentParser(description='Kubeflow StudyJob launcher')
    parser.add_argument('--name', type=str, help='StudyJob name.')
    parser.add_argument('--namespace',
                        type=str,
                        default='kubeflow',
                        help='StudyJob namespace.')
    parser.add_argument(
        '--optimizationtype',
        type=str,
        default='minimize',
        help='Direction of optimization. minimize or maximize.')
    parser.add_argument('--objectivevaluename',
                        type=str,
                        help='Objective value name which trainer optimizes.')
    parser.add_argument('--optimizationgoal',
                        type=float,
                        help='Stop studying once objectivevaluename value ' +
                        'exceeds optimizationgoal')
    parser.add_argument('--requestcount',
                        type=int,
                        default=1,
                        help='The times asking request to suggestion service.')
    parser.add_argument('--metricsnames',
                        type=strToList,
                        help='StudyJob metrics name list.')
    parser.add_argument('--parameterconfigs',
                        type=yamlOrJsonStr,
                        default={},
                        help='StudyJob parameterconfigs.')
    parser.add_argument('--nasConfig',
                        type=yamlOrJsonStr,
                        default={},
                        help='StudyJob nasConfig.')
    parser.add_argument('--workertemplatepath',
                        type=str,
                        default="",
                        help='StudyJob worker spec.')
    parser.add_argument('--mcollectortemplatepath',
                        type=str,
                        default="",
                        help='StudyJob metrics collector spec.')
    parser.add_argument('--suggestionspec',
                        type=yamlOrJsonStr,
                        default={},
                        help='StudyJob suggestion spec.')
    parser.add_argument(
        '--outputfile',
        type=str,
        default='/output.txt',
        help='The file which stores the best trial of the studyJob.')
    parser.add_argument(
        '--deleteAfterDone',
        type=strtobool,
        default=True,
        help=
        'When studyjob done, delete the studyjob automatically if it is True.')
    parser.add_argument(
        '--studyjobtimeoutminutes',
        type=int,
        default=10,
        help='Time in minutes to wait for the StudyJob to complete')

    args = parser.parse_args()

    logging.getLogger().setLevel(logging.INFO)

    logging.info('Generating studyjob template.')
    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'hp.template.yaml')
    content_yaml = _generate_studyjob_yaml(
        template_file, args.name, args.namespace, args.optimizationtype,
        args.objectivevaluename, args.optimizationgoal, args.requestcount,
        args.metricsnames, args.parameterconfigs, args.nasConfig,
        args.workertemplatepath, args.mcollectortemplatepath,
        args.suggestionspec)

    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    create_response = study_job_client.create_study_job(
        api_client, content_yaml)
    job_name = create_response['metadata']['name']
    job_namespace = create_response['metadata']['namespace']

    expected_condition = ["Completed", "Failed"]
    wait_response = study_job_client.wait_for_condition(
        api_client,
        job_namespace,
        job_name,
        expected_condition,
        timeout=datetime.timedelta(minutes=args.studyjobtimeoutminutes))
    succ = False
    if wait_response.get("status", {}).get("condition") == "Completed":
        succ = True
        trial = get_best_trial(wait_response["status"]["bestTrialId"])
        if not os.path.exists(os.path.dirname(args.outputfile)):
            os.makedirs(os.path.dirname(args.outputfile))
        with open(args.outputfile, 'w') as f:
            ps_dict = {}
            for ps in trial.parameter_set:
                ps_dict[ps.name] = ps.value
            f.write(json.dumps(ps_dict))
    if succ:
        logging.info('Study success.')
    if args.deleteAfterDone:
        study_job_client.delete_study_job(api_client, job_name, job_namespace)