def _lambda_handler(env, k8s_config, k8s_client, event):
    kube_config_bucket = env['kube_config_bucket']
    cluster_name = env['cluster_name']
    if not os.path.exists(KUBE_FILEPATH):
        if kube_config_bucket:
            logger.info('No kubeconfig file found. Downloading...')
            s3.download_file(kube_config_bucket, env['kube_config_object'], KUBE_FILEPATH)
        else:
            logger.info('No kubeconfig file found. Generating...')
            create_kube_config(eks, cluster_name)

    lifecycle_hook_name = event['detail']['LifecycleHookName']
    auto_scaling_group_name = event['detail']['AutoScalingGroupName']

    instance_id = event['detail']['EC2InstanceId']
    logger.info('Instance ID: ' + instance_id)
    instance = ec2.describe_instances(InstanceIds=[instance_id])['Reservations'][0]['Instances'][0]

    node_name = instance['PrivateDnsName']
    logger.info('Node name: ' + node_name)

    # Configure
    k8s_config.load_kube_config(KUBE_FILEPATH)
    configuration = k8s_client.Configuration()
    if not kube_config_bucket:
        configuration.api_key['authorization'] = get_bearer_token(cluster_name, region)
        configuration.api_key_prefix['authorization'] = 'Bearer'

    # API
    api = k8s_client.ApiClient(configuration)
    v1 = k8s_client.CoreV1Api(api)

    try:
        if not node_exists(v1, node_name):
            logger.error('Node not found.')
            abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)
            return

        cordon_node(v1, node_name)

        remove_all_pods(v1, node_name)
        logger.info('All pods terminated')

        asg.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name,
                                      AutoScalingGroupName=auto_scaling_group_name,
                                      LifecycleActionResult='CONTINUE',
                                      InstanceId=instance_id)
        logger.info('Lifecycle hook completed')
    except ApiException:
        logger.exception('There was an error removing the pods from the node {}'.format(node_name))
        abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)
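# A minimal sketch of how a handler like the one above might be wired into the
# Lambda entry point. The entry-point name, client variables (s3, eks, ec2, asg),
# constants, and environment-variable names are assumptions based on the globals
# the handler references; only the event fields mirror what the handler reads
# from an Auto Scaling lifecycle-hook notification.
import logging
import os

import boto3
from kubernetes import client as k8s_client, config as k8s_config

KUBE_FILEPATH = '/tmp/kubeconfig'
region = os.environ.get('AWS_REGION', 'us-east-1')

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

s3 = boto3.client('s3', region_name=region)
eks = boto3.client('eks', region_name=region)
ec2 = boto3.client('ec2', region_name=region)
asg = boto3.client('autoscaling', region_name=region)


def lambda_handler(event, _context):
    # Hypothetical env mapping; a real deployment would populate these keys
    # from the Lambda environment variables it actually defines.
    env = {
        'kube_config_bucket': os.environ.get('KUBE_CONFIG_BUCKET', ''),
        'kube_config_object': os.environ.get('KUBE_CONFIG_OBJECT', 'config'),
        'cluster_name': os.environ.get('CLUSTER_NAME', ''),
    }
    return _lambda_handler(env, k8s_config, k8s_client, event)


# Example lifecycle-hook event shape consumed by the handler:
# {
#     "detail-type": "EC2 Instance-terminate Lifecycle Action",
#     "detail": {
#         "LifecycleHookName": "my-hook",
#         "AutoScalingGroupName": "my-asg",
#         "EC2InstanceId": "i-0123456789abcdef0"
#     }
# }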
def _lambda_handler(k8s_config, k8s_client, event):
    if not os.path.exists(KUBE_FILEPATH):
        if KUBE_CONFIG_BUCKET:
            logger.info('No kubeconfig file found. Downloading...')
            get_kube_config(s3)
        else:
            logger.info('No kubeconfig file found. Generating...')
            create_kube_config(eks)

    lifecycle_hook_name = event['detail']['LifecycleHookName']
    auto_scaling_group_name = event['detail']['AutoScalingGroupName']

    instance_id = event['detail']['EC2InstanceId']
    logger.info('Instance ID: ' + instance_id)
    instance = ec2.describe_instances(InstanceIds=[instance_id])['Reservations'][0]['Instances'][0]

    node_name = instance['PrivateDnsName']
    logger.info('Node name: ' + node_name)

    # Configure
    k8s_config.load_kube_config(KUBE_FILEPATH)
    configuration = k8s_client.Configuration()
    if CLUSTER_NAME:
        configuration.api_key['authorization'] = get_bearer_token(CLUSTER_NAME, REGION)
        configuration.api_key_prefix['authorization'] = 'Bearer'

    # API
    api = k8s_client.ApiClient(configuration)
    v1 = k8s_client.CoreV1Api(api)
    version_api = k8s_client.VersionApi(api)
    k8s_version = version_api.get_code()

    try:
        if not node_exists(v1, node_name):
            logger.error('Node not found.')
            abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)
            return

        cordon_node(v1, node_name)
        remove_all_pods(v1, node_name, k8s_version)

        asg.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name,
                                      AutoScalingGroupName=auto_scaling_group_name,
                                      LifecycleActionResult='CONTINUE',
                                      InstanceId=instance_id)
    except ApiException:
        logger.exception('There was an error removing the pods from the node {}'.format(node_name))
        abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)
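# The variant above authenticates to the API server with a bearer token instead
# of kubeconfig credentials. Below is a hedged sketch of one common way to mint
# such a token with a presigned STS GetCallerIdentity request; the project's
# actual get_bearer_token implementation may differ in details.
import base64
import re

import boto3
from botocore.signers import RequestSigner


def get_bearer_token(cluster_name, region):
    # Presign an STS GetCallerIdentity call scoped to the cluster via the
    # x-k8s-aws-id header, then wrap it in the k8s-aws-v1. token format.
    session = boto3.session.Session()
    sts = session.client('sts', region_name=region)
    service_id = sts.meta.service_model.service_id

    signer = RequestSigner(service_id, region, 'sts', 'v4',
                           session.get_credentials(), session.events)

    params = {
        'method': 'GET',
        'url': 'https://sts.{}.amazonaws.com/'
               '?Action=GetCallerIdentity&Version=2011-06-15'.format(region),
        'body': {},
        'headers': {'x-k8s-aws-id': cluster_name},
        'context': {},
    }

    signed_url = signer.generate_presigned_url(
        params, region_name=region, expires_in=60, operation_name='')

    encoded = base64.urlsafe_b64encode(signed_url.encode('utf-8')).decode('utf-8')
    # Strip base64 padding, as the API server does not accept it in the token.
    return 'k8s-aws-v1.' + re.sub(r'=*$', '', encoded)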
def _lambda_handler(env, k8s_config, k8s_client, event):
    kube_config_bucket = env['kube_config_bucket']
    cluster_name = env['cluster_name']
    if not os.path.exists(KUBE_FILEPATH):
        if kube_config_bucket:
            logger.info('No kubeconfig file found. Downloading...')
            s3.download_file(kube_config_bucket, env['kube_config_object'], KUBE_FILEPATH)
        else:
            logger.info('No kubeconfig file found. Generating...')
            create_kube_config(eks, cluster_name)

    detail_type = event['detail-type']
    logger.info('Event Type: ' + detail_type)
    lifecycle_hook_name = event['detail']['LifecycleHookName']
    logger.info('Lifecycle Hook: ' + lifecycle_hook_name)
    auto_scaling_group_name = event['detail']['AutoScalingGroupName']
    instance_id = event['detail']['EC2InstanceId']
    logger.info('Instance ID: ' + instance_id)

    # Configure
    k8s_config.load_kube_config(KUBE_FILEPATH)
    configuration = k8s_client.Configuration()
    if not kube_config_bucket:
        configuration.api_key['authorization'] = get_bearer_token(cluster_name, REGION)
        configuration.api_key_prefix['authorization'] = 'Bearer'

    # API
    api = k8s_client.ApiClient(configuration)
    v1 = k8s_client.CoreV1Api(api)
    apps_v1 = k8s_client.AppsV1Api(api)
    batch_v1 = k8s_client.BatchV1Api(api)
    custom_obj_api = k8s_client.CustomObjectsApi(api)

    node_name = get_node_name_from_instance_id(v1, env, cluster_name, instance_id)
    if not node_name:
        logger.info('Node name not found. Unable to drain node.')
        k8s_utils.abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)
        return
    logger.info('Node name: ' + node_name)

    if detail_type == ASG_ACTION['terminate']:
        logger.info('Processing terminate event...')

        try:
            # if not k8s_utils.node_exists(v1, node_name):
            #     logger.error('Node not found.')
            k8s_utils.cordon_node(v1, node_name)

            timeout = None if not env['pod_eviction_timeout'] else int(env['pod_eviction_timeout'])
            grace_period = None if not env['pod_delete_grace_period'] else int(env['pod_delete_grace_period'])
            k8s_utils.remove_all_pods(v1, node_name,
                                      pod_eviction_timeout=timeout,
                                      pod_delete_grace_period=grace_period)

            if env['delete_node'].lower() == 'true':
                # we don't check the status, because if it fails, we would just continue anyway
                k8s_utils.delete_node(v1, node_name)

            logger.info('Completing lifecycle action')
            asg.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name,
                                          AutoScalingGroupName=auto_scaling_group_name,
                                          LifecycleActionResult='CONTINUE',
                                          InstanceId=instance_id)
        except ApiException as e:
            # the node can finish terminating (node not found) while we run the operations above,
            # continue if we have more to process
            if (e.status != 404 and
                    ((env['detach_rook_volumes'].lower() == 'true' and env['rook_ceph_volumes_namespace'])
                     or (env['update_ceph_crushmap'].lower() == 'true')
                     or (env['delete_rook_ceph_crashcollector'].lower() == 'true'))):
                if node_name:
                    logger.exception('There was an error removing the pods from the node {}'.format(node_name))
                else:
                    logger.exception('There was an error removing the pods from the instance {} node'.format(instance_id))
                logger.info('Abandoning lifecycle action')
                k8s_utils.abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)

        try:
            if env['detach_rook_volumes'].lower() == 'true' and env['rook_ceph_volumes_namespace']:
                k8s_utils.detach_node_rook_volumes(custom_obj_api, env['rook_ceph_volumes_namespace'], node_name)

            osd_ids = []
            if env['rook_ceph_osd_namespace']:
                osd_ids = k8s_utils.get_host_associated_osd_ids(apps_v1, node_name, env['rook_ceph_osd_namespace'])

            if (env['update_ceph_crushmap'].lower() == 'true'
                    and env['rook_ceph_osd_namespace']
                    and env['rook_ceph_operator_namespace']):
                # TODO: add retries if received 500 status (in fact, add to any stream/exec api)
                k8s_utils.remove_host_and_osd_from_ceph_crushmap(v1, node_name, osd_ids,
                                                                 env['rook_ceph_operator_namespace'])

            if env['rook_ceph_osd_namespace']:
                k8s_utils.delete_rook_ceph_osd_deployment(apps_v1, osd_ids, env['rook_ceph_osd_namespace'])
                k8s_utils.cleanup_rook_ceph_osd_status_configmaps(v1, node_name, env['rook_ceph_osd_namespace'])
                k8s_utils.cleanup_rook_ceph_osd_prepare_jobs(batch_v1, node_name, env['rook_ceph_osd_namespace'])

            if env['delete_rook_ceph_crashcollector'].lower() == 'true':
                k8s_utils.delete_rook_ceph_crashcollector(apps_v1, env['rook_ceph_crashcollectors_namespace'], node_name)

            if env['rook_ceph_mon_namespace']:
                k8s_utils.remove_node_from_mon_endpoints_configmap_and_secret(v1, node_name,
                                                                              env['rook_ceph_mon_namespace'])

            if env['rook_ceph_mon_namespace']:
                # k8s_utils.scale_node_rook_ceph_mon_deployment(apps_v1, node_name, env['rook_ceph_mon_namespace'], 0)
                k8s_utils.delete_node_rook_ceph_mon_deployment(apps_v1, node_name, env['rook_ceph_mon_namespace'])

            # there is an issue with the crashcollector looking for nodes that are gone and
            # stopping the rook ceph operator from continuing
            # if we reload the config, rook will refresh and continue
            # the simplest way to do so is to toggle the ceph version unsupported flag, as it won't affect the
            # cluster unless you explicitly use unsupported ceph versions; otherwise you need to bounce the operator
            if env['reload_rook_cephcluster'].lower() == 'true' and env['rook_ceph_osd_namespace']:
                k8s_utils.toggle_rook_ceph_version_allow_unsupported_flag(custom_obj_api,
                                                                          env['rook_ceph_osd_namespace'])

            # crashcollector reconciler will get stuck in a loop if we don't bounce the operator pods
            # if env['rook_ceph_operator_namespace']:
            #     k8s_utils.delete_rook_ceph_operator_pods(v1, env['rook_ceph_operator_namespace'])

            if env['wait_for_rook_ceph_health_ok_retries'] and env['rook_ceph_operator_namespace']:
                k8s_utils.wait_for_rook_ceph_health_ok(v1, env['rook_ceph_operator_namespace'],
                                                       retries=int(env['wait_for_rook_ceph_health_ok_retries']))
        except ApiException:
            logger.exception('There was an error cleaning up rook resources on node {}'.format(node_name))
            k8s_utils.abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)

        try:
            asg.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name,
                                          AutoScalingGroupName=auto_scaling_group_name,
                                          LifecycleActionResult='CONTINUE',
                                          InstanceId=instance_id)
        except Exception:
            # if you terminate an EC2 instance outside of ASG scaling,
            # you can sometimes get "No active Lifecycle Action found"
            pass
    else:
        logger.info('No event to process, continuing...')

        try:
            asg.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name,
                                          AutoScalingGroupName=auto_scaling_group_name,
                                          LifecycleActionResult='CONTINUE',
                                          InstanceId=instance_id)
        except Exception:
            # if you terminate an EC2 instance outside of ASG scaling,
            # you can sometimes get "No active Lifecycle Action found"
            pass
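# A hedged sketch of the kind of helpers the variants above delegate to
# (cordon_node, node_exists, abandon_lifecycle_action). The real k8s_utils
# module is not shown here, so these bodies are assumptions that only mirror
# how the handlers call them.
def cordon_node(v1, node_name):
    # Mark the node unschedulable, the same effect as `kubectl cordon`.
    patch = {'spec': {'unschedulable': True}}
    return v1.patch_node(node_name, patch)


def node_exists(v1, node_name):
    # A field selector on metadata.name returns at most one node.
    nodes = v1.list_node(field_selector='metadata.name={}'.format(node_name))
    return len(nodes.items) > 0


def abandon_lifecycle_action(asg_client, auto_scaling_group_name, lifecycle_hook_name, instance_id):
    # Tell the Auto Scaling group to stop waiting and proceed with termination.
    asg_client.complete_lifecycle_action(
        LifecycleHookName=lifecycle_hook_name,
        AutoScalingGroupName=auto_scaling_group_name,
        LifecycleActionResult='ABANDON',
        InstanceId=instance_id)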
def _lambda_handler(env, k8s_config, k8s_client, event):
    kube_config_bucket = env['kube_config_bucket']
    cluster_name = env['cluster_name']
    if not os.path.exists(KUBE_FILEPATH):
        if kube_config_bucket:
            logger.info('No kubeconfig file found. Downloading...')
            s3.download_file(kube_config_bucket, env['kube_config_object'], KUBE_FILEPATH)
        else:
            logger.info('No kubeconfig file found. Generating...')
            create_kube_config(eks, cluster_name)

    lifecycle_hook_name = event['detail']['LifecycleHookName']
    auto_scaling_group_name = event['detail']['AutoScalingGroupName']

    instance_id = event['detail']['EC2InstanceId']
    logger.info('Instance ID: ' + instance_id)
    instance = ec2.describe_instances(InstanceIds=[instance_id])['Reservations'][0]['Instances'][0]

    node_name = instance['PrivateDnsName']
    logger.info('Node name: ' + node_name)

    # Configure
    k8s_config.load_kube_config(KUBE_FILEPATH)
    configuration = k8s_client.Configuration()
    if not kube_config_bucket:
        configuration.api_key['authorization'] = get_bearer_token(cluster_name, REGION)
        configuration.api_key_prefix['authorization'] = 'Bearer'

    # API
    api = k8s_client.ApiClient(configuration)
    v1 = k8s_client.CoreV1Api(api)

    try:
        if not node_exists(v1, node_name):
            logger.error('Node not found.')
            abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)
            return

        logger.info('Call cordon node')
        cordon_node(v1, node_name)

        logger.info('Adding exclude-balancer label')
        label_body = {
            "metadata": {
                "labels": {
                    "alpha.service-controller.kubernetes.io/exclude-balancer": "true"
                }
            }
        }
        try:
            v1.patch_node(node_name, label_body)
        except Exception:
            logger.exception('Failed to add exclude-balancer label to node {}'.format(node_name))
            raise

        logger.info('Call remove all pods')
        remove_all_pods(v1, node_name)

        logger.info('Sleeping')
        time.sleep(360)

        logger.info('Call complete lifecycle action')
        asg.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name,
                                      AutoScalingGroupName=auto_scaling_group_name,
                                      LifecycleActionResult='CONTINUE',
                                      InstanceId=instance_id)
    except ApiException:
        logger.exception('There was an error removing the pods from the node {}'.format(node_name))
        abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)
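# A rough sketch of a remove_all_pods-style drain loop using the Eviction API
# of the official kubernetes Python client. The helper name, the DaemonSet
# filter, and the retry policy are assumptions; the project's actual helper
# (with its timeout and grace-period options) may differ. Older client
# releases expose V1beta1Eviction instead of V1Eviction.
from kubernetes import client as k8s_client
from kubernetes.client.rest import ApiException


def evict_all_pods(v1, node_name, grace_period_seconds=None):
    # List every pod scheduled on the node, then evict the ones that are not
    # managed by a DaemonSet (those would be recreated on the same node anyway).
    field_selector = 'spec.nodeName=' + node_name
    pods = v1.list_pod_for_all_namespaces(watch=False, field_selector=field_selector)
    for pod in pods.items:
        owners = pod.metadata.owner_references or []
        if any(owner.kind == 'DaemonSet' for owner in owners):
            continue
        eviction = k8s_client.V1Eviction(
            metadata=k8s_client.V1ObjectMeta(
                name=pod.metadata.name,
                namespace=pod.metadata.namespace),
            delete_options=k8s_client.V1DeleteOptions(
                grace_period_seconds=grace_period_seconds))
        try:
            v1.create_namespaced_pod_eviction(
                pod.metadata.name, pod.metadata.namespace, eviction)
        except ApiException as e:
            # 429 means a PodDisruptionBudget blocked the eviction; a real
            # drainer would retry until the pod can be evicted.
            if e.status != 429:
                raise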