def _setup_test_asg_to_be_deleted(self):
    """
    Create the fixture ASG for deletion tests and tag it for deletion.

    Creates a launch configuration, the 'my-lb' ELB and an autoscaling
    group that references both, then tags the group for deletion.

    Sets on self:
        test_asg_name: Name of the created ASG.
        test_autoscale: The autoscale connection used to create it.
        test_asg: The ASG as read back from the connection after tagging.
    """
    # pylint: disable=attribute-defined-outside-init
    self.test_asg_name = "test-asg-random-tags"
    autoscale_conn = boto.connect_autoscale()
    self.test_autoscale = autoscale_conn

    launch_config_kwargs = {
        'name': 'my-launch_config',
        'image_id': 'my-ami',
        'key_name': 'my_key_name',
        'security_groups': ['my_security_groups'],
    }
    launch_config = boto.ec2.autoscale.LaunchConfiguration(**launch_config_kwargs)
    autoscale_conn.create_launch_configuration(launch_config)

    group_kwargs = {
        'group_name': self.test_asg_name,
        'load_balancers': ['my-lb'],
        'availability_zones': ['us-east-1a', 'us-east-1b'],
        'launch_config': launch_config,
        'min_size': 4,
        'max_size': 8,
        'connection': autoscale_conn,
    }
    group = boto.ec2.autoscale.AutoScalingGroup(**group_kwargs)

    # The ELB named by the group is created before the group itself.
    create_elb('my-lb')
    autoscale_conn.create_auto_scaling_group(group)

    # NOTE(review): the second argument (0) is presumably the deletion delay -
    # confirm against ec2.tag_asg_for_deletion.
    ec2.tag_asg_for_deletion(self.test_asg_name, 0)

    # Read the group back so the stored object reflects the tagging above.
    matching_groups = autoscale_conn.get_all_groups([self.test_asg_name])
    self.test_asg = matching_groups[0]
def _setup_test_asg_to_be_deleted(self):
    """
    Prepare an autoscaling group that is already tagged for deletion.

    Steps, in order: open an autoscale connection, register a launch
    configuration, create the 'my-lb' load balancer, create the ASG
    attached to it, tag the ASG for deletion, and finally re-fetch the
    ASG and store it on the test instance as ``self.test_asg``.
    """
    # pylint: disable=attribute-defined-outside-init
    asg_name = "test-asg-random-tags"
    self.test_asg_name = asg_name
    self.test_autoscale = boto.connect_autoscale()

    autoscale = boto.ec2.autoscale

    config = autoscale.LaunchConfiguration(
        name='my-launch_config',
        image_id='my-ami',
        key_name='my_key_name',
        security_groups=['my_security_groups'],
    )
    self.test_autoscale.create_launch_configuration(config)

    zones = ['us-east-1a', 'us-east-1b']
    balancers = ['my-lb']
    new_group = autoscale.AutoScalingGroup(
        group_name=asg_name,
        load_balancers=balancers,
        availability_zones=zones,
        launch_config=config,
        min_size=4,
        max_size=8,
        connection=self.test_autoscale,
    )

    # Create the load balancer first, then the group that references it.
    create_elb('my-lb')
    self.test_autoscale.create_auto_scaling_group(new_group)

    # NOTE(review): 0 looks like a "delete immediately" delay value - verify
    # against the signature of ec2.tag_asg_for_deletion.
    ec2.tag_asg_for_deletion(asg_name, 0)

    # Store the freshly fetched group (first match for the name).
    self.test_asg = self.test_autoscale.get_all_groups([asg_name])[0]
def _red_black_deploy(
        new_cluster_asgs, baseline_cluster_asgs,
        secs_before_old_asgs_disabled=DISABLE_OLD_ASG_WAIT_TIME
):
    """
    Takes two dicts of autoscale groups, new and baseline.
    Each dict key is a cluster name.
    Each dict value is a list of ASGs for that cluster.
    Enables the new ASGs, then disables the old ASGs.

    Red/black deploy refers to:
        - Existing ASG is "red", meaning active.
        - New ASG begins as "black", meaning inactive.
        - The new ASG is added to the ELB, making it "red".
        - The baseline and new ASGs are now existing as "red/red".
        - The baseline ASG is removed from the ELB.
        - As traffic has ceased to be directed to the baseline ASG, it becomes "black".

    Workflow:
        - enable new ASGs
        - wait for instances to be healthy in the load balancer
        - sleep for secs_before_old_asgs_disabled seconds
        - ensure the new ASGs are not pending delete or disabled
        - tag and disable current asgs

    The input dicts are never mutated; all bookkeeping happens on deep copies.

    Args:
        new_cluster_asgs (dict): List of new ASGs to be added to the ELB, keyed by cluster.
        baseline_cluster_asgs (dict): List of existing ASGs already added to the ELB, keyed by cluster.
        secs_before_old_asgs_disabled (int): Seconds to sleep after the new ASGs are healthy in the
            ELBs and before the old ASGs are disabled.

    Returns:
        success (bool): True if red/black operation succeeded, else False.
        asgs_enabled (dict): List of ASGs that are added to the ELB, keyed by cluster.
        asgs_disabled (dict): List of ASGs that are removed from the ELB, keyed by cluster.
    """
    # Bookkeeping starts from the assumption that baseline ASGs are enabled and new ASGs are disabled.
    asgs_enabled = copy.deepcopy(baseline_cluster_asgs)
    asgs_disabled = copy.deepcopy(new_cluster_asgs)

    def _enable_cluster_asg(cluster, asg):
        """
        Enables the ASG, then records it as enabled.
        """
        enable_asg(asg)
        _move_asg_from_disabled_to_enabled(cluster, asg)

    def _disable_cluster_asg(cluster, asg):
        """
        Disables the ASG, then records it as disabled.
        """
        disable_asg(asg)
        _move_asg_from_enabled_to_disabled(cluster, asg)

    def _move_asg_from_disabled_to_enabled(cluster, asg):
        """
        Bookkeeping only: shifts ASG from the disabled list to the enabled list.
        """
        asgs_enabled[cluster].append(asg)
        asgs_disabled[cluster].remove(asg)

    def _move_asg_from_enabled_to_disabled(cluster, asg):
        """
        Bookkeeping only: shifts ASG from the enabled list to the disabled list.
        """
        asgs_enabled[cluster].remove(asg)
        asgs_disabled[cluster].append(asg)

    def _disable_clustered_asgs(clustered_asgs, failure_msg):
        """
        Disable all the ASGs in the lists, keyed by cluster.
        Best-effort: a failure is logged using failure_msg and the remaining ASGs are still processed.
        """
        for cluster, asgs in six.iteritems(clustered_asgs):
            for asg in asgs:
                try:
                    _disable_cluster_asg(cluster, asg)
                except:  # pylint: disable=bare-except
                    LOG.warning(failure_msg, asg, exc_info=True)

    def _disable_failed_asg(cluster, asg):
        """
        Best-effort disable of an ASG whose enabling failed part-way.

        The ASG may or may not have reached the enabled list before the failure, so the
        bookkeeping is only moved when the ASG is actually recorded as enabled. Failures are
        logged instead of raised so that the rollback of the other new ASGs still happens.
        """
        try:
            disable_asg(asg)
        except:  # pylint: disable=bare-except
            LOG.warning("Unable to disable ASG '%s' after failure.", asg, exc_info=True)
            return
        if asg in asgs_enabled.get(cluster, ()):
            _move_asg_from_enabled_to_disabled(cluster, asg)

    elbs_to_monitor = []
    newly_enabled_asgs = defaultdict(list)
    for cluster, asgs in six.iteritems(new_cluster_asgs):
        for asg in asgs:
            try:
                _enable_cluster_asg(cluster, asg)
                elbs_to_monitor.extend(elbs_for_asg(asg))
                newly_enabled_asgs[cluster].append(asg)
            except:  # pylint: disable=bare-except
                LOG.error("Error enabling ASG '%s'. Disabling traffic to all new ASGs.", asg, exc_info=True)
                # Disable the ASG which failed first.
                _disable_failed_asg(cluster, asg)
                # Then disable any new other ASGs that have been newly enabled.
                _disable_clustered_asgs(
                    newly_enabled_asgs,
                    "Unable to disable ASG '%s' after failure."
                )
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs {} are active and will be available after passing the healthchecks.".format(
        dict(newly_enabled_asgs)
    ))

    # Wait for all instances to be in service in all ELBs.
    try:
        ec2.wait_for_healthy_elbs(elbs_to_monitor, 600)
    except:  # pylint: disable=bare-except
        LOG.info("Some ASGs are failing ELB health checks. Disabling traffic to all new ASGs.", exc_info=True)
        _disable_clustered_asgs(
            newly_enabled_asgs,
            "Unable to disable ASG '%s' after waiting for healthy ELBs."
        )
        return (False, asgs_enabled, asgs_disabled)

    # Add a sleep delay here to wait and see how the new ASGs react to traffic.
    # A flawed release would likely make the new ASGs fail the health checks below
    # and, if any new ASGs fail the health checks, the old ASGs would *not* be disabled.
    time.sleep(secs_before_old_asgs_disabled)

    # Ensure the new ASGs are still healthy and not pending delete before disabling the old ASGs.
    for cluster, asgs in six.iteritems(newly_enabled_asgs):
        for asg in asgs:
            err_msg = None
            if is_asg_pending_delete(asg):
                err_msg = "New ASG '{}' is pending delete.".format(asg)
            elif not is_asg_enabled(asg):
                err_msg = "New ASG '{}' is not enabled.".format(asg)
            if err_msg:
                # The new ASGs are left as they are here; only the disabling of old ASGs is skipped.
                LOG.error("{} Aborting disabling of old ASGs.".format(err_msg))
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs have passed the healthchecks. Now disabling old ASGs.")

    for cluster, asgs in six.iteritems(baseline_cluster_asgs):
        for asg in asgs:
            try:
                if is_asg_enabled(asg):
                    try:
                        _disable_cluster_asg(cluster, asg)
                    except:  # pylint: disable=bare-except
                        LOG.warning("Unable to disable ASG '%s' after enabling new ASGs.", asg, exc_info=True)
                elif asg in asgs_enabled[cluster]:
                    # If the asg is not enabled, but we have it in the enabled list remove it. This may occur by
                    # pulling from 2 different sources of truth at different intervals. The asg could have been disabled
                    # in the intervening time.
                    _move_asg_from_enabled_to_disabled(cluster, asg)
            except ASGDoesNotExistException:
                # This operation should not fail if one of the baseline ASGs was removed during the deployment process
                LOG.info("ASG {asg} in cluster {cluster} no longer exists, removing it from the enabled cluster list"
                         .format(asg=asg, cluster=cluster))
                _move_asg_from_enabled_to_disabled(cluster, asg)

            # Tagging happens for every baseline ASG, whether or not the disable above succeeded.
            try:
                ec2.tag_asg_for_deletion(asg)
            except ASGDoesNotExistException:
                LOG.info("Unable to tag ASG '{}' as it no longer exists, skipping.".format(asg))

    return (True, asgs_enabled, asgs_disabled)
def _red_black_deploy(
        new_cluster_asgs, baseline_cluster_asgs,
        secs_before_old_asgs_disabled=DISABLE_OLD_ASG_WAIT_TIME
):
    """
    Takes two dicts of autoscale groups, new and baseline.
    Each dict key is a cluster name.
    Each dict value is a list of ASGs for that cluster.
    Enables the new ASGs, then disables the old ASGs.

    Red/black deploy refers to:
        - Existing ASG is "red", meaning active.
        - New ASG begins as "black", meaning inactive.
        - The new ASG is added to the ELB, making it "red".
        - The baseline and new ASGs are now existing as "red/red".
        - The baseline ASG is removed from the ELB.
        - As traffic has ceased to be directed to the baseline ASG, it becomes "black".

    Workflow:
        - enable new ASGs
        - wait for instances to be healthy in the load balancer
        - sleep for secs_before_old_asgs_disabled seconds
        - ensure the new ASGs are not pending delete or disabled
        - tag and disable current asgs

    The input dicts are never mutated; all bookkeeping happens on deep copies.

    Args:
        new_cluster_asgs (dict): List of new ASGs to be added to the ELB, keyed by cluster.
        baseline_cluster_asgs (dict): List of existing ASGs already added to the ELB, keyed by cluster.
        secs_before_old_asgs_disabled (int): Seconds to sleep after the new ASGs are healthy in the
            ELBs and before the old ASGs are disabled.

    Returns:
        success (bool): True if red/black operation succeeded, else False.
        asgs_enabled (dict): List of ASGs that are added to the ELB, keyed by cluster.
        asgs_disabled (dict): List of ASGs that are removed from the ELB, keyed by cluster.
    """
    # Bookkeeping starts from the assumption that baseline ASGs are enabled and new ASGs are disabled.
    asgs_enabled = copy.deepcopy(baseline_cluster_asgs)
    asgs_disabled = copy.deepcopy(new_cluster_asgs)

    def _enable_cluster_asg(cluster, asg):
        """
        Enables the ASG, then records it as enabled.
        """
        enable_asg(asg)
        _move_asg_from_disabled_to_enabled(cluster, asg)

    def _disable_cluster_asg(cluster, asg):
        """
        Disables the ASG, then records it as disabled.
        """
        disable_asg(asg)
        _move_asg_from_enabled_to_disabled(cluster, asg)

    def _move_asg_from_disabled_to_enabled(cluster, asg):
        """
        Bookkeeping only: shifts ASG from the disabled list to the enabled list.
        """
        asgs_enabled[cluster].append(asg)
        asgs_disabled[cluster].remove(asg)

    def _move_asg_from_enabled_to_disabled(cluster, asg):
        """
        Bookkeeping only: shifts ASG from the enabled list to the disabled list.
        """
        asgs_enabled[cluster].remove(asg)
        asgs_disabled[cluster].append(asg)

    def _disable_clustered_asgs(clustered_asgs, failure_msg):
        """
        Disable all the ASGs in the lists, keyed by cluster.
        Best-effort: a failure is logged (with traceback) using failure_msg and the
        remaining ASGs are still processed.
        """
        # .items() works on both Python 2 and Python 3, unlike .iteritems().
        for cluster, asgs in clustered_asgs.items():
            for asg in asgs:
                try:
                    _disable_cluster_asg(cluster, asg)
                except:  # pylint: disable=bare-except
                    LOG.warning(failure_msg.format(asg), exc_info=True)

    def _disable_failed_asg(cluster, asg):
        """
        Best-effort disable of an ASG whose enabling failed part-way.

        The ASG may or may not have reached the enabled list before the failure, so the
        bookkeeping is only moved when the ASG is actually recorded as enabled. Failures are
        logged instead of raised so that the rollback of the other new ASGs still happens.
        """
        try:
            disable_asg(asg)
        except:  # pylint: disable=bare-except
            LOG.warning("Unable to disable ASG '{}' after failure.".format(asg), exc_info=True)
            return
        if asg in asgs_enabled.get(cluster, ()):
            _move_asg_from_enabled_to_disabled(cluster, asg)

    elbs_to_monitor = []
    newly_enabled_asgs = defaultdict(list)
    for cluster, asgs in new_cluster_asgs.items():
        for asg in asgs:
            try:
                _enable_cluster_asg(cluster, asg)
                elbs_to_monitor.extend(elbs_for_asg(asg))
                newly_enabled_asgs[cluster].append(asg)
            except:  # pylint: disable=bare-except
                # exc_info=True attaches the traceback to this record instead of a second log call.
                LOG.error(
                    "Error enabling ASG '{}'. Disabling traffic to all new ASGs.".format(asg),
                    exc_info=True
                )
                # Disable the ASG which failed first.
                _disable_failed_asg(cluster, asg)
                # Then disable any new other ASGs that have been newly enabled.
                _disable_clustered_asgs(
                    newly_enabled_asgs,
                    "Unable to disable ASG '{}' after failure."
                )
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs {} are active and will be available after passing the healthchecks.".format(
        dict(newly_enabled_asgs)
    ))

    # Wait for all instances to be in service in all ELBs.
    try:
        ec2.wait_for_healthy_elbs(elbs_to_monitor, 600)
    except:  # pylint: disable=bare-except
        LOG.info("Some ASGs are failing ELB health checks. Disabling traffic to all new ASGs.", exc_info=True)
        _disable_clustered_asgs(
            newly_enabled_asgs,
            "Unable to disable ASG '{}' after waiting for healthy ELBs."
        )
        return (False, asgs_enabled, asgs_disabled)

    # Add a sleep delay here to wait and see how the new ASGs react to traffic.
    # A flawed release would likely make the new ASGs fail the health checks below
    # and, if any new ASGs fail the health checks, the old ASGs would *not* be disabled.
    time.sleep(secs_before_old_asgs_disabled)

    # Ensure the new ASGs are still healthy and not pending delete before disabling the old ASGs.
    for cluster, asgs in newly_enabled_asgs.items():
        for asg in asgs:
            err_msg = None
            if is_asg_pending_delete(asg):
                err_msg = "New ASG '{}' is pending delete.".format(asg)
            elif not is_asg_enabled(asg):
                err_msg = "New ASG '{}' is not enabled.".format(asg)
            if err_msg:
                # The new ASGs are left as they are here; only the disabling of old ASGs is skipped.
                LOG.error("{} Aborting disabling of old ASGs.".format(err_msg))
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs have passed the healthchecks. Now disabling old ASGs.")

    for cluster, asgs in baseline_cluster_asgs.items():
        for asg in asgs:
            try:
                if is_asg_enabled(asg):
                    try:
                        _disable_cluster_asg(cluster, asg)
                    except:  # pylint: disable=bare-except
                        LOG.warning(
                            "Unable to disable ASG '{}' after enabling new ASGs.".format(asg),
                            exc_info=True
                        )
                elif asg in asgs_enabled[cluster]:
                    # If the asg is not enabled, but we have it in the enabled list remove it. This may occur by
                    # pulling from 2 different sources of truth at different intervals. The asg could have been disabled
                    # in the intervening time.
                    _move_asg_from_enabled_to_disabled(cluster, asg)
            except ASGDoesNotExistException:
                # This operation should not fail if one of the baseline ASGs was removed during the deployment process
                LOG.info("ASG {asg} in cluster {cluster} no longer exists, removing it from the enabled cluster list"
                         .format(asg=asg, cluster=cluster))
                _move_asg_from_enabled_to_disabled(cluster, asg)

            # Tagging happens for every baseline ASG, whether or not the disable above succeeded.
            try:
                ec2.tag_asg_for_deletion(asg)
            except ASGDoesNotExistException:
                LOG.info("Unable to tag ASG '{}' as it no longer exists, skipping.".format(asg))

    return (True, asgs_enabled, asgs_disabled)