Example #1
def _lambda_handler(env, k8s_config, k8s_client, event):
    kube_config_bucket = env['kube_config_bucket']
    cluster_name = env['cluster_name']

    if not os.path.exists(KUBE_FILEPATH):
        if kube_config_bucket:
            logger.info('No kubeconfig file found. Downloading...')
            s3.download_file(kube_config_bucket, env['kube_config_object'],
                             KUBE_FILEPATH)
        else:
            logger.info('No kubeconfig file found. Generating...')
            create_kube_config(eks, cluster_name)

    lifecycle_hook_name = event['detail']['LifecycleHookName']
    auto_scaling_group_name = event['detail']['AutoScalingGroupName']

    instance_id = event['detail']['EC2InstanceId']
    logger.info('Instance ID: ' + instance_id)
    instance = ec2.describe_instances(
        InstanceIds=[instance_id])['Reservations'][0]['Instances'][0]

    node_name = instance['PrivateDnsName']
    logger.info('Node name: ' + node_name)

    # Configure
    k8s_config.load_kube_config(KUBE_FILEPATH)
    configuration = k8s_client.Configuration()
    if not kube_config_bucket:
        configuration.api_key['authorization'] = get_bearer_token(
            cluster_name, region)
        configuration.api_key_prefix['authorization'] = 'Bearer'
    # API
    api = k8s_client.ApiClient(configuration)
    v1 = k8s_client.CoreV1Api(api)

    try:
        if not node_exists(v1, node_name):
            logger.error('Node not found.')
            abandon_lifecycle_action(asg, auto_scaling_group_name,
                                     lifecycle_hook_name, instance_id)
            return

        cordon_node(v1, node_name)

        remove_all_pods(v1, node_name)
        print("all pods terminated")
        logger.info("all pods terminated")
        asg.complete_lifecycle_action(
            LifecycleHookName=lifecycle_hook_name,
            AutoScalingGroupName=auto_scaling_group_name,
            LifecycleActionResult='CONTINUE',
            InstanceId=instance_id)
        print("lifecycling hooks over")
        logger.info("lifecycling hooks over")
    except ApiException:
        logger.exception(
            'There was an error removing the pods from the node {}'.format(
                node_name))
        abandon_lifecycle_action(asg, auto_scaling_group_name,
                                 lifecycle_hook_name, instance_id)
Example #2
def _lambda_handler(k8s_config, k8s_client, event):
    if not os.path.exists(KUBE_FILEPATH):
        if KUBE_CONFIG_BUCKET:
            logger.info('No kubeconfig file found. Downloading...')
            get_kube_config(s3)
        else:
            logger.info('No kubeconfig file found. Generating...')
            create_kube_config(eks)

    lifecycle_hook_name = event['detail']['LifecycleHookName']
    auto_scaling_group_name = event['detail']['AutoScalingGroupName']

    instance_id = event['detail']['EC2InstanceId']
    logger.info('Instance ID: ' + instance_id)
    instance = ec2.describe_instances(
        InstanceIds=[instance_id])['Reservations'][0]['Instances'][0]

    node_name = instance['PrivateDnsName']
    logger.info('Node name: ' + node_name)

    # Configure
    k8s_config.load_kube_config(KUBE_FILEPATH)
    configuration = k8s_client.Configuration()
    if CLUSTER_NAME:
        configuration.api_key['authorization'] = get_bearer_token(
            CLUSTER_NAME, REGION)
        configuration.api_key_prefix['authorization'] = 'Bearer'
    # API
    api = k8s_client.ApiClient(configuration)
    v1 = k8s_client.CoreV1Api(api)
    # Reuse the already-configured ApiClient: load_kube_config() returns None,
    # so it must not be passed in as api_client.
    version_api = k8s_client.VersionApi(api_client=api)
    k8s_version = version_api.get_code()

    try:
        if not node_exists(v1, node_name):
            logger.error('Node not found.')
            abandon_lifecycle_action(asg, auto_scaling_group_name,
                                     lifecycle_hook_name, instance_id)
            return

        cordon_node(v1, node_name)

        remove_all_pods(v1, node_name, k8s_version)

        asg.complete_lifecycle_action(
            LifecycleHookName=lifecycle_hook_name,
            AutoScalingGroupName=auto_scaling_group_name,
            LifecycleActionResult='CONTINUE',
            InstanceId=instance_id)
    except ApiException:
        logger.exception(
            'There was an error removing the pods from the node {}'.format(
                node_name))
        abandon_lifecycle_action(asg, auto_scaling_group_name,
                                 lifecycle_hook_name, instance_id)
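
Example 2 attaches a bearer token to the client configuration when CLUSTER_NAME is set. A common way to build such an EKS token (an illustrative sketch of what get_bearer_token might do, not the exact code behind this example) is to presign an STS GetCallerIdentity request with the cluster name in the x-k8s-aws-id header:

import base64

import boto3
from botocore.signers import RequestSigner


def get_bearer_token(cluster_name, region):
    # EKS auth token: a presigned STS GetCallerIdentity URL, base64url-encoded
    # and prefixed with 'k8s-aws-v1.'.
    session = boto3.session.Session()
    sts = session.client('sts', region_name=region)
    signer = RequestSigner(sts.meta.service_model.service_id, region, 'sts',
                           'v4', session.get_credentials(), session.events)
    params = {
        'method': 'GET',
        'url': 'https://sts.{}.amazonaws.com/'
               '?Action=GetCallerIdentity&Version=2011-06-15'.format(region),
        'body': {},
        'headers': {'x-k8s-aws-id': cluster_name},
        'context': {},
    }
    signed_url = signer.generate_presigned_url(params,
                                               region_name=region,
                                               expires_in=60,
                                               operation_name='')
    return 'k8s-aws-v1.' + base64.urlsafe_b64encode(
        signed_url.encode('utf-8')).decode('utf-8').rstrip('=')
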
Example #3
def _lambda_handler(env, k8s_config, k8s_client, event):
    kube_config_bucket = env['kube_config_bucket']
    cluster_name = env['cluster_name']

    if not os.path.exists(KUBE_FILEPATH):
        if kube_config_bucket:
            logger.info('No kubeconfig file found. Downloading...')
            s3.download_file(kube_config_bucket, env['kube_config_object'],
                             KUBE_FILEPATH)
        else:
            logger.info('No kubeconfig file found. Generating...')
            create_kube_config(eks, cluster_name)

    detail_type = event['detail-type']
    logger.info('Event Type: ' + detail_type)
    lifecycle_hook_name = event['detail']['LifecycleHookName']
    logger.info('Lifecycle Hook: ' + lifecycle_hook_name)
    auto_scaling_group_name = event['detail']['AutoScalingGroupName']
    instance_id = event['detail']['EC2InstanceId']
    logger.info('Instance ID: ' + instance_id)

    # Configure
    k8s_config.load_kube_config(KUBE_FILEPATH)
    configuration = k8s_client.Configuration()
    if not kube_config_bucket:
        configuration.api_key['authorization'] = get_bearer_token(
            cluster_name, REGION)
        configuration.api_key_prefix['authorization'] = 'Bearer'
    # API
    api = k8s_client.ApiClient(configuration)
    v1 = k8s_client.CoreV1Api(api)
    apps_v1 = k8s_client.AppsV1Api(api)
    batch_v1 = k8s_client.BatchV1Api(api)
    custom_obj_api = k8s_client.CustomObjectsApi(api)

    node_name = get_node_name_from_instance_id(v1, env, cluster_name,
                                               instance_id)

    if not node_name:
        logger.info('Node name not found. Unable to drain node.')
        k8s_utils.abandon_lifecycle_action(asg, auto_scaling_group_name,
                                           lifecycle_hook_name, instance_id)
        return

    logger.info('Node name: ' + node_name)

    if detail_type == ASG_ACTION['terminate']:
        logger.info('Processing terminate event...')

        try:
            # if not k8s_utils.node_exists(v1, node_name):
            #     logger.error('Node not found.')

            k8s_utils.cordon_node(v1, node_name)

            timeout = None if not env['pod_eviction_timeout'] else int(
                env['pod_eviction_timeout'])
            grace_period = None if not env['pod_delete_grace_period'] else int(
                env['pod_delete_grace_period'])
            k8s_utils.remove_all_pods(v1,
                                      node_name,
                                      pod_eviction_timeout=timeout,
                                      pod_delete_grace_period=grace_period)

            if env['delete_node'].lower() == 'true':
                # we don't check the status, because if it fails we would just continue anyway
                k8s_utils.delete_node(v1, node_name)

            logger.info('Completing lifecycle action')
            asg.complete_lifecycle_action(
                LifecycleHookName=lifecycle_hook_name,
                AutoScalingGroupName=auto_scaling_group_name,
                LifecycleActionResult='CONTINUE',
                InstanceId=instance_id)
        except ApiException as e:
            # the node can finish terminating (node not found) while we run the operations above,
            # continue if we have more to process
            if (e.status != 404 and
                ((env['detach_rook_volumes'].lower() == 'true'
                  and env['rook_ceph_volumes_namespace']) or
                 (env['update_ceph_crushmap'].lower() == 'true') or
                 (env['delete_rook_ceph_crashcollector'].lower() == 'true'))):
                if node_name:
                    logger.exception(
                        'There was an error removing the pods from the node {}'.format(
                            node_name))
                else:
                    logger.exception(
                        'There was an error removing the pods from the instance {} node'.format(
                            instance_id))
                logger.info('Abandoning lifecycle action')
                k8s_utils.abandon_lifecycle_action(asg,
                                                   auto_scaling_group_name,
                                                   lifecycle_hook_name,
                                                   instance_id)

        try:
            if (env['detach_rook_volumes'].lower() == 'true'
                    and env['rook_ceph_volumes_namespace']):
                k8s_utils.detach_node_rook_volumes(
                    custom_obj_api, env['rook_ceph_volumes_namespace'],
                    node_name)

            osd_ids = []
            if env['rook_ceph_osd_namespace']:
                osd_ids = k8s_utils.get_host_associated_osd_ids(
                    apps_v1, node_name, env['rook_ceph_osd_namespace'])

            if (env['update_ceph_crushmap'].lower() == 'true'
                    and env['rook_ceph_osd_namespace']
                    and env['rook_ceph_operator_namespace']):
                # TODO: add retries if received 500 status (in fact, add to any stream/exec api)

                k8s_utils.remove_host_and_osd_from_ceph_crushmap(
                    v1, node_name, osd_ids,
                    env['rook_ceph_operator_namespace'])

            if env['rook_ceph_osd_namespace']:
                k8s_utils.delete_rook_ceph_osd_deployment(
                    apps_v1, osd_ids, env['rook_ceph_osd_namespace'])
                k8s_utils.cleanup_rook_ceph_osd_status_configmaps(
                    v1, node_name, env['rook_ceph_osd_namespace'])
                k8s_utils.cleanup_rook_ceph_osd_prepare_jobs(
                    batch_v1, node_name, env['rook_ceph_osd_namespace'])

            if env['delete_rook_ceph_crashcollector'].lower() == 'true':
                k8s_utils.delete_rook_ceph_crashcollector(
                    apps_v1, env['rook_ceph_crashcollectors_namespace'],
                    node_name)

            if env['rook_ceph_mon_namespace']:
                k8s_utils.remove_node_from_mon_endpoints_configmap_and_secret(
                    v1, node_name, env['rook_ceph_mon_namespace'])

            if env['rook_ceph_mon_namespace']:
                # k8s_utils.scale_node_rook_ceph_mon_deployment(apps_v1, node_name, env['rook_ceph_mon_namespace'], 0)
                k8s_utils.delete_node_rook_ceph_mon_deployment(
                    apps_v1, node_name, env['rook_ceph_mon_namespace'])

            # There is an issue where the crashcollector keeps looking for nodes that are gone,
            # which stops the rook-ceph operator from continuing. Reloading the config makes rook
            # refresh and carry on. The simplest way to force that is to toggle the ceph version
            # allow-unsupported flag: it does not affect the cluster unless you explicitly run
            # unsupported ceph versions. Otherwise you need to bounce the operator.
            if (env['reload_rook_cephcluster'].lower() == 'true'
                    and env['rook_ceph_osd_namespace']):
                k8s_utils.toggle_rook_ceph_version_allow_unsupported_flag(
                    custom_obj_api, env['rook_ceph_osd_namespace'])

            # The crashcollector reconciler will get stuck in a loop if we don't bounce the operator pods.
            # if env['rook_ceph_operator_namespace']:
            #     k8s_utils.delete_rook_ceph_operator_pods(v1, env['rook_ceph_operator_namespace'])

            if env['wait_for_rook_ceph_health_ok_retries'] and env[
                    'rook_ceph_operator_namespace']:
                k8s_utils.wait_for_rook_ceph_health_ok(
                    v1,
                    env['rook_ceph_operator_namespace'],
                    retries=int(env['wait_for_rook_ceph_health_ok_retries']))

        except ApiException:
            logger.exception(
                'There was an error cleaning up rook resources on node {}'.
                format(node_name))
            k8s_utils.abandon_lifecycle_action(asg, auto_scaling_group_name,
                                               lifecycle_hook_name,
                                               instance_id)

        try:
            asg.complete_lifecycle_action(
                LifecycleHookName=lifecycle_hook_name,
                AutoScalingGroupName=auto_scaling_group_name,
                LifecycleActionResult='CONTINUE',
                InstanceId=instance_id)
        except Exception:
            # If you terminate an EC2 instance outside of ASG scaling, you can
            # sometimes get "No active Lifecycle Action found" here.
            pass
    else:
        logger.info('No event to process, continuing...')
        try:
            asg.complete_lifecycle_action(
                LifecycleHookName=lifecycle_hook_name,
                AutoScalingGroupName=auto_scaling_group_name,
                LifecycleActionResult='CONTINUE',
                InstanceId=instance_id)
        except Exception:
            # If you terminate an EC2 instance outside of ASG scaling, you can
            # sometimes get "No active Lifecycle Action found" here.
            pass
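
Example 3 forwards pod_eviction_timeout and pod_delete_grace_period into k8s_utils.remove_all_pods. A plausible shape for that helper, evicting every pod on the node through the Eviction API while skipping DaemonSet-managed pods (illustrative only; the real k8s_utils version will differ and would also poll until the pods are gone or the timeout expires):

from kubernetes import client as k8s


def remove_all_pods(v1, node_name, pod_eviction_timeout=None,
                    pod_delete_grace_period=None):
    # Evict every pod scheduled on the node; DaemonSet pods are skipped
    # because they are bound to the node and would simply be recreated.
    pods = v1.list_pod_for_all_namespaces(
        field_selector='spec.nodeName={}'.format(node_name)).items
    for pod in pods:
        owners = pod.metadata.owner_references or []
        if any(owner.kind == 'DaemonSet' for owner in owners):
            continue
        eviction = k8s.V1Eviction(  # V1beta1Eviction on older client releases
            metadata=k8s.V1ObjectMeta(name=pod.metadata.name,
                                      namespace=pod.metadata.namespace),
            delete_options=k8s.V1DeleteOptions(
                grace_period_seconds=pod_delete_grace_period))
        v1.create_namespaced_pod_eviction(name=pod.metadata.name,
                                          namespace=pod.metadata.namespace,
                                          body=eviction)
    # A production version would now wait, up to pod_eviction_timeout seconds,
    # for the evicted pods to actually disappear from the node.
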
Example #4
def _lambda_handler(env, k8s_config, k8s_client, event):
    kube_config_bucket = env['kube_config_bucket']
    cluster_name = env['cluster_name']

    if not os.path.exists(KUBE_FILEPATH):
        if kube_config_bucket:
            logger.info('No kubeconfig file found. Downloading...')
            s3.download_file(kube_config_bucket, env['kube_config_object'],
                             KUBE_FILEPATH)
        else:
            logger.info('No kubeconfig file found. Generating...')
            create_kube_config(eks, cluster_name)

    lifecycle_hook_name = event['detail']['LifecycleHookName']
    auto_scaling_group_name = event['detail']['AutoScalingGroupName']

    instance_id = event['detail']['EC2InstanceId']
    logger.info('Instance ID: ' + instance_id)
    instance = ec2.describe_instances(
        InstanceIds=[instance_id])['Reservations'][0]['Instances'][0]

    node_name = instance['PrivateDnsName']
    logger.info('Node name: ' + node_name)

    # Configure
    k8s_config.load_kube_config(KUBE_FILEPATH)
    configuration = k8s_client.Configuration()
    if not kube_config_bucket:
        configuration.api_key['authorization'] = get_bearer_token(
            cluster_name, REGION)
        configuration.api_key_prefix['authorization'] = 'Bearer'
    # API
    api = k8s_client.ApiClient(configuration)
    v1 = k8s_client.CoreV1Api(api)

    try:
        if not node_exists(v1, node_name):
            logger.error('Node not found.')
            abandon_lifecycle_action(asg, auto_scaling_group_name,
                                     lifecycle_hook_name, instance_id)
            return

        logger.info('Call cordon node')
        cordon_node(v1, node_name)

        logger.info('Adding exclude-balancer label')
        label_body = {
            "metadata": {
                "labels": {
                    "alpha.service-controller.kubernetes.io/exclude-balancer":
                    "true"
                }
            }
        }
        try:
            v1.patch_node(node_name, label_body)
        except ApiException:
            logger.exception('Failed to label node {}'.format(node_name))
            raise

        logger.info('Call remove all pods')
        remove_all_pods(v1, node_name)

        logger.info('Sleeping')
        time.sleep(360)

        logger.info('Call complete lifecycle action')
        asg.complete_lifecycle_action(
            LifecycleHookName=lifecycle_hook_name,
            AutoScalingGroupName=auto_scaling_group_name,
            LifecycleActionResult='CONTINUE',
            InstanceId=instance_id)
    except ApiException:
        logger.exception(
            'There was an error removing the pods from the node {}'.format(
                node_name))
        abandon_lifecycle_action(asg, auto_scaling_group_name,
                                 lifecycle_hook_name, instance_id)
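
Each of these private handlers is invoked from the Lambda's public entry point, which is not shown on this page. A typical wrapper (hypothetical wiring: the module-level names KUBE_FILEPATH, s3, ec2, eks and asg match the globals the examples reference, while the environment variable names are made up for illustration) might look like:

import logging
import os

import boto3
from kubernetes import client, config

KUBE_FILEPATH = '/tmp/kubeconfig'
REGION = os.environ['AWS_REGION']

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# boto3 clients are created once and reused across warm Lambda invocations
s3 = boto3.client('s3', region_name=REGION)
ec2 = boto3.client('ec2', region_name=REGION)
eks = boto3.client('eks', region_name=REGION)
asg = boto3.client('autoscaling', region_name=REGION)


def lambda_handler(event, _context):
    # Gather configuration from environment variables and delegate to the
    # private handler shown in the examples above.
    env = {
        'cluster_name': os.environ.get('CLUSTER_NAME', ''),
        'kube_config_bucket': os.environ.get('KUBE_CONFIG_BUCKET', ''),
        'kube_config_object': os.environ.get('KUBE_CONFIG_OBJECT', ''),
    }
    return _lambda_handler(env, config, client, event)
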