def _scale_up_for_outdated(cluster_name, asg_name, asg, outdated_instances):
    """Scale an ASG up by its number of outdated instances and return what scale_up_asg returns."""
    outdated_instance_count = len(outdated_instances)
    logger.info(
        f'Setting the scale of ASG {asg_name} based on {outdated_instance_count} outdated instances.'
    )
    return scale_up_asg(cluster_name, asg, outdated_instance_count)


def _cordon_outdated_nodes(k8s_nodes, outdated_instances):
    """Cordon the k8s node behind every outdated instance; terminate the process on the first failure."""
    for outdated in outdated_instances:
        # Kept as "" so the error message is still printable when the lookup itself fails.
        node_name = ""
        try:
            # get the k8s node name instead of instance id
            node_name = get_node_by_instance_id(k8s_nodes, outdated['InstanceId'])
            cordon_node(node_name)
        except Exception as cordon_exception:
            logger.error(
                f"Encountered an error when cordoning node {node_name}"
            )
            logger.error(cordon_exception)
            # Raise SystemExit directly instead of calling the `exit` helper:
            # `exit` is injected by the `site` module for interactive use and
            # is not guaranteed to exist (e.g. `python -S`, frozen builds).
            raise SystemExit(1) from cordon_exception


def _drain_and_terminate(k8s_nodes, instance_id, asg_name):
    """Drain and delete the node of one instance, then terminate and detach the instance.

    Raises:
        Exception: if the instance does not reach the terminated state, or
            (when ASG_WAIT_FOR_DETACHMENT is set) does not detach from the ASG.
    """
    # get the k8s node name instead of instance id
    node_name = get_node_by_instance_id(k8s_nodes, instance_id)
    drain_node(node_name)
    delete_node(node_name)

    terminate_instance(instance_id)
    if not instance_terminated(instance_id):
        raise Exception('Instance is failing to terminate. Cancelling out.')

    detach_instance(instance_id, asg_name)
    if app_config['ASG_WAIT_FOR_DETACHMENT'] and not instance_detached(instance_id):
        raise Exception(
            'Instance is failing to detach from ASG. Cancelling out.'
        )

    # Optional pause between nodes (applied after every node, including the last).
    between_nodes_wait = app_config['BETWEEN_NODES_WAIT']
    if between_nodes_wait != 0:
        logger.info(
            f'Waiting for {between_nodes_wait} seconds before continuing...'
        )
        time.sleep(between_nodes_wait)


def update_asgs(asgs, cluster_name):
    """Roll every outdated instance of the given ASGs.

    The order of operations depends on app_config['RUN_MODE']:

    * 1 - per ASG: scale up, cordon, then drain/terminate.
    * 2 - scale up all ASGs, cordon all outdated nodes, then drain/terminate
      ASG by ASG.
    * 3 - cordon all outdated nodes first, then per ASG: scale up and
      drain/terminate.

    After the outdated instances of an ASG are gone, the ASG is scaled back
    using the values returned by the scale-up step, AWS autoscaling is
    resumed and the bookkeeping tags are removed.

    Args:
        asgs: ASG descriptions, passed straight to plan_asgs().
        cluster_name: cluster name, passed to scale_up_asg().

    Raises:
        RollingUpdateException: draining, terminating or detaching an
            instance failed; carries the ASG name so the caller can resume
            AWS autoscaling. The original error is chained as __cause__.
        SystemExit: cordoning a node failed (exit status 1).
    """
    run_mode = app_config['RUN_MODE']

    # plan_asgs() yields {asg_name: (outdated_instances, asg)}.
    asg_outdated_instance_dict = plan_asgs(asgs)

    # {asg_name: (desired, original desired, original max)} as returned by scale_up_asg().
    asg_original_state_dict = {}

    if run_mode == 2:
        # Scale up all the ASGs with outdated nodes (by the number of outdated nodes)
        for asg_name, (outdated_instances, asg) in asg_outdated_instance_dict.items():
            asg_original_state_dict[asg_name] = _scale_up_for_outdated(
                cluster_name, asg_name, asg, outdated_instances)

    # Fetched after the mode-2 scale-up, exactly where the node list was read before.
    k8s_nodes = get_k8s_nodes()

    if run_mode in (2, 3):
        # Cordon everything up front so no workload lands on a node about to be replaced.
        for outdated_instances, _asg in asg_outdated_instance_dict.values():
            _cordon_outdated_nodes(k8s_nodes, outdated_instances)

    # Drain, Delete and Terminate the outdated nodes and return the ASGs back to their original state
    for asg_name, (outdated_instances, asg) in asg_outdated_instance_dict.items():
        if run_mode in (1, 3):
            asg_original_state_dict[asg_name] = _scale_up_for_outdated(
                cluster_name, asg_name, asg, outdated_instances)

        if run_mode == 1:
            _cordon_outdated_nodes(k8s_nodes, outdated_instances)

        if outdated_instances:
            # pause aws autoscaling so new instances dont try
            # to spawn while instances are being terminated
            modify_aws_autoscaling(asg_name, "suspend")

        # start draining and terminating
        for outdated in outdated_instances:
            # catch any failures so we can resume aws autoscaling
            try:
                _drain_and_terminate(k8s_nodes, outdated['InstanceId'], asg_name)
            except Exception as drain_exception:
                logger.info(drain_exception)
                raise RollingUpdateException(
                    "Rolling update on ASG failed", asg_name) from drain_exception

        # scaling cluster back down
        logger.info("Scaling asg back down to original state")
        # NOTE(review): the state dict is only filled for RUN_MODE 1, 2 or 3;
        # any other value makes this lookup raise KeyError - confirm RUN_MODE
        # is validated before this function is called.
        asg_desired_capacity, asg_orig_desired_capacity, asg_orig_max_capacity = (
            asg_original_state_dict[asg_name])
        scale_asg(asg_name, asg_desired_capacity, asg_orig_desired_capacity,
                  asg_orig_max_capacity)

        # resume aws autoscaling
        modify_aws_autoscaling(asg_name, "resume")

        # remove aws tag
        delete_asg_tags(asg_name, app_config["ASG_DESIRED_STATE_TAG"])
        delete_asg_tags(asg_name, app_config["ASG_ORIG_CAPACITY_TAG"])
        delete_asg_tags(asg_name, app_config["ASG_ORIG_MAX_CAPACITY_TAG"])
        logger.info(f'*** Rolling update of asg {asg_name} is complete! ***')

    logger.info('All asgs processed')
parser.add_argument( '--plan', '-p', nargs='?', const=True, help='perform a dry run to see which instances are out of date') args = parser.parse_args() # check kubectl is installed kctl = shutil.which('kubectl') if not kctl: logger.info('kubectl is required to be installed before proceeding') quit(1) filtered_asgs = get_asgs(args.cluster_name) # perform a dry run if args.plan: plan_asgs(filtered_asgs) else: # perform real update if app_config['K8S_AUTOSCALER_ENABLED']: # pause k8s autoscaler modify_k8s_autoscaler("pause") try: update_asgs(filtered_asgs, args.cluster_name) if app_config['K8S_AUTOSCALER_ENABLED']: # resume autoscaler after asg updated modify_k8s_autoscaler("resume") logger.info('*** Rolling update of all asg is complete! ***') except RollingUpdateException as e: logger.info( "Rolling update encountered an exception. Resuming aws autoscaling." )