Ejemplo n.º 1
0
    def test_wait_for_healthy_elbs(self):
        # Scenario: two ELBs whose instances report InService on different polls. The wait is
        # expected to complete and return None.
        elb_names = ["healthy-lb-1", "healthy-lb-2"]
        first_lb, second_lb = [create_elb(name) for name in elb_names]
        health_target = "boto.ec2.elb.loadbalancer.LoadBalancer.get_instance_health"

        # Capture each ELB's instance health list before get_instance_health is patched below.
        first_instances = first_lb.get_instance_health()
        second_instances = second_lb.get_instance_health()

        # Values handed out, one per call, by the patched get_instance_health.
        # Intended polling sequence (per the test's design; depends on wait_for_healthy_elbs
        # dropping an ELB from its list once that ELB is healthy):
        #   poll 1: both ELBs are OutOfService
        #   poll 2: first ELB is InService, second is still OutOfService
        #   poll 3: only the second ELB is queried, and it is now InService
        poll_results = [
            clone_elb_instances_with_state(first_instances, "OutOfService"),
            clone_elb_instances_with_state(second_instances, "OutOfService"),
            clone_elb_instances_with_state(first_instances, "InService"),
            clone_elb_instances_with_state(second_instances, "OutOfService"),
            clone_elb_instances_with_state(second_instances, "InService"),
        ]

        # WAIT_SLEEP_TIME is patched down to 1 so the polling loop does not slow the test.
        with mock.patch(health_target, side_effect=poll_results), \
                mock.patch('tubular.ec2.WAIT_SLEEP_TIME', 1):
            outcome = ec2.wait_for_healthy_elbs(elb_names, 3)
            self.assertEqual(None, outcome)
Ejemplo n.º 2
0
    def test_wait_for_healthy_elbs(self):
        # Verifies that waiting on two ELBs returns None when both eventually report
        # every instance as InService.
        name_a, name_b = "healthy-lb-1", "healthy-lb-2"
        elb_a = create_elb(name_a)
        elb_b = create_elb(name_b)
        patched_method = "boto.ec2.elb.loadbalancer.LoadBalancer.get_instance_health"

        # Read the instance lists while get_instance_health is still unpatched.
        instances_a = elb_a.get_instance_health()
        instances_b = elb_b.get_instance_health()

        # Ordered (instances, state) schedule for the patched get_instance_health:
        # one entry is consumed per call. ELB A turns healthy on the second round
        # and ELB B on the third, so the third round only needs an entry for ELB B
        # (assuming the wait stops polling an ELB once it is healthy).
        schedule = (
            (instances_a, "OutOfService"),
            (instances_b, "OutOfService"),
            (instances_a, "InService"),
            (instances_b, "OutOfService"),
            (instances_b, "InService"),
        )
        responses = [
            clone_elb_instances_with_state(instances, state)
            for instances, state in schedule
        ]

        health_patch = mock.patch(patched_method, side_effect=responses)
        # Shorten the sleep between polls to keep the test quick.
        sleep_patch = mock.patch('tubular.ec2.WAIT_SLEEP_TIME', 1)
        with health_patch:
            with sleep_patch:
                result = ec2.wait_for_healthy_elbs([name_a, name_b], 3)
                self.assertEqual(None, result)
Ejemplo n.º 3
0
def _red_black_deploy(
        new_cluster_asgs, baseline_cluster_asgs,
        secs_before_old_asgs_disabled=DISABLE_OLD_ASG_WAIT_TIME
):
    """
    Takes two dicts of autoscale groups, new and baseline.
    Each dict key is a cluster name.
    Each dict value is a list of ASGs for that cluster.
    Enables the new ASGs, then disables the old ASGs.

    Red/black deploy refers to:
        - Existing ASG is "red", meaning active.
        - New ASG begins as "black", meaning inactive.
        - The new ASG is added to the ELB, making it "red".
            - The baseline and new ASGs are now existing as "red/red".
        - The baseline ASG is removed from the ELB.
            - As traffic has ceased to be directed to the baseline ASG, it becomes "black".

    Workflow:
        - enable new ASGs (if any enable step fails, disable all new ASGs again and stop)
        - wait for instances to be healthy in the load balancer
        - sleep for secs_before_old_asgs_disabled
        - ensure the new ASGs are not pending delete or disabled
        - tag and disable current asgs

    The enabled/disabled bookkeeping is done on deep copies of the two input dicts.

    Args:
        new_cluster_asgs (dict): List of new ASGs to be added to the ELB, keyed by cluster.
        baseline_cluster_asgs (dict): List of existing ASGs already added to the ELB, keyed by cluster.
        secs_before_old_asgs_disabled: Value passed to time.sleep() between the ELB health wait
            and the final check of the new ASGs.

    Returns:
        A 3-tuple of:
        success (bool): True if red/black operation succeeded, else False.
        asgs_enabled (dict): List of ASGs that are added to the ELB, keyed by cluster.
        asgs_disabled (dict): List of ASGs that are removed from the ELB, keyed by cluster.
    """
    asgs_enabled = copy.deepcopy(baseline_cluster_asgs)
    asgs_disabled = copy.deepcopy(new_cluster_asgs)

    def _enable_cluster_asg(cluster, asg):
        """
        Enables the ASG, then records it as enabled.
        """
        enable_asg(asg)
        _move_asg_from_disabled_to_enabled(cluster, asg)

    def _disable_cluster_asg(cluster, asg):
        """
        Disables the ASG, then records it as disabled.
        """
        disable_asg(asg)
        _move_asg_from_enabled_to_disabled(cluster, asg)

    def _move_asg_from_disabled_to_enabled(cluster, asg):
        """
        Shifts ASG from the disabled list to the enabled list (bookkeeping only).
        """
        asgs_enabled[cluster].append(asg)
        asgs_disabled[cluster].remove(asg)

    def _move_asg_from_enabled_to_disabled(cluster, asg):
        """
        Shifts ASG from the enabled list to the disabled list (bookkeeping only).
        """
        asgs_enabled[cluster].remove(asg)
        asgs_disabled[cluster].append(asg)

    def _disable_clustered_asgs(clustered_asgs, failure_msg):
        """
        Disable all the ASGs in the lists, keyed by cluster.
        Failures are logged with failure_msg and do not stop the loop.
        """
        for cluster, asgs in six.iteritems(clustered_asgs):
            for asg in asgs:
                try:
                    _disable_cluster_asg(cluster, asg)
                except:  # pylint: disable=bare-except
                    LOG.warning(failure_msg, asg, exc_info=True)

    def _rollback_failed_asg(cluster, asg):
        """
        Best-effort disable of an ASG whose enable step raised.

        Must never raise: the caller still has to disable the other newly-enabled ASGs
        and return the failure tuple.
        """
        try:
            disable_asg(asg)
        except:  # pylint: disable=bare-except
            LOG.warning("Unable to disable ASG '%s' after failure.", asg, exc_info=True)
            return
        # If enable_asg() itself raised, the ASG was never recorded as enabled, and an
        # unconditional list.remove() here would raise ValueError and abort the rollback.
        if asg in asgs_enabled.get(cluster, ()):
            _move_asg_from_enabled_to_disabled(cluster, asg)

    elbs_to_monitor = []
    newly_enabled_asgs = defaultdict(list)
    for cluster, asgs in six.iteritems(new_cluster_asgs):
        for asg in asgs:
            try:
                _enable_cluster_asg(cluster, asg)
                elbs_to_monitor.extend(elbs_for_asg(asg))
                newly_enabled_asgs[cluster].append(asg)
            except:  # pylint: disable=bare-except
                LOG.error("Error enabling ASG '%s'. Disabling traffic to all new ASGs.", asg, exc_info=True)
                # Disable the ASG which failed first.
                _rollback_failed_asg(cluster, asg)
                # Then disable any new other ASGs that have been newly enabled.
                _disable_clustered_asgs(
                    newly_enabled_asgs,
                    "Unable to disable ASG '%s' after failure."
                )
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs {} are active and will be available after passing the healthchecks.".format(
        dict(newly_enabled_asgs)
    ))

    # Wait for all instances to be in service in all ELBs.
    # NOTE(review): 600 is presumably a timeout in seconds -- confirm against ec2.wait_for_healthy_elbs.
    try:
        ec2.wait_for_healthy_elbs(elbs_to_monitor, 600)
    except:  # pylint: disable=bare-except
        LOG.info("Some ASGs are failing ELB health checks. Disabling traffic to all new ASGs.", exc_info=True)
        _disable_clustered_asgs(
            newly_enabled_asgs,
            "Unable to disable ASG '%s' after waiting for healthy ELBs."
        )
        return (False, asgs_enabled, asgs_disabled)

    # Add a sleep delay here to wait and see how the new ASGs react to traffic.
    # A flawed release would likely make the new ASGs fail the health checks below
    # and, if any new ASGs fail the health checks, the old ASGs would *not be disabled.
    time.sleep(secs_before_old_asgs_disabled)

    # Ensure the new ASGs are still healthy and not pending delete before disabling the old ASGs.
    for cluster, asgs in six.iteritems(newly_enabled_asgs):
        for asg in asgs:
            err_msg = None
            if is_asg_pending_delete(asg):
                err_msg = "New ASG '{}' is pending delete.".format(asg)
            elif not is_asg_enabled(asg):
                err_msg = "New ASG '{}' is not enabled.".format(asg)
            if err_msg:
                # Both old and new ASGs are left as they are; nothing is rolled back here.
                LOG.error("{} Aborting disabling of old ASGs.".format(err_msg))
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs have passed the healthchecks. Now disabling old ASGs.")

    for cluster, asgs in six.iteritems(baseline_cluster_asgs):
        for asg in asgs:
            try:
                if is_asg_enabled(asg):
                    try:
                        _disable_cluster_asg(cluster, asg)
                    except:  # pylint: disable=bare-except
                        LOG.warning("Unable to disable ASG '%s' after enabling new ASGs.", asg, exc_info=True)
                elif asg in asgs_enabled[cluster]:
                    # If the asg is not enabled, but we have it in the enabled list remove it. This may occur by
                    # pulling from 2 different sources of truth at different intervals. The asg could have been disabled
                    # in the intervening time.
                    _move_asg_from_enabled_to_disabled(cluster, asg)
            except ASGDoesNotExistException:
                # This operation should not fail if one of the baseline ASGs was removed during the deployment process
                LOG.info("ASG {asg} in cluster {cluster} no longer exists, removing it from the enabled cluster list"
                         .format(asg=asg, cluster=cluster))
                _move_asg_from_enabled_to_disabled(cluster, asg)

            # Tagging is attempted for every baseline ASG, whether or not the disable above succeeded.
            try:
                ec2.tag_asg_for_deletion(asg)
            except ASGDoesNotExistException:
                LOG.info("Unable to tag ASG '{}' as it no longer exists, skipping.".format(asg))

    return (True, asgs_enabled, asgs_disabled)
Ejemplo n.º 4
0
def _red_black_deploy(
        new_cluster_asgs, baseline_cluster_asgs,
        secs_before_old_asgs_disabled=DISABLE_OLD_ASG_WAIT_TIME
):
    """
    Takes two dicts of autoscale groups, new and baseline.
    Each dict key is a cluster name.
    Each dict value is a list of ASGs for that cluster.
    Enables the new ASGs, then disables the old ASGs.

    Red/black deploy refers to:
        - Existing ASG is "red", meaning active.
        - New ASG begins as "black", meaning inactive.
        - The new ASG is added to the ELB, making it "red".
            - The baseline and new ASGs are now existing as "red/red".
        - The baseline ASG is removed from the ELB.
            - As traffic has ceased to be directed to the baseline ASG, it becomes "black".

    Workflow:
        - enable new ASGs (if any enable step fails, disable all new ASGs again and stop)
        - wait for instances to be healthy in the load balancer
        - sleep for secs_before_old_asgs_disabled
        - ensure the new ASGs are not pending delete or disabled
        - tag and disable current asgs

    The enabled/disabled bookkeeping is done on deep copies of the two input dicts.

    Args:
        new_cluster_asgs (dict): List of new ASGs to be added to the ELB, keyed by cluster.
        baseline_cluster_asgs (dict): List of existing ASGs already added to the ELB, keyed by cluster.
        secs_before_old_asgs_disabled: Value passed to time.sleep() between the ELB health wait
            and the final check of the new ASGs.

    Returns:
        A 3-tuple of:
        success (bool): True if red/black operation succeeded, else False.
        asgs_enabled (dict): List of ASGs that are added to the ELB, keyed by cluster.
        asgs_disabled (dict): List of ASGs that are removed from the ELB, keyed by cluster.
    """
    asgs_enabled = copy.deepcopy(baseline_cluster_asgs)
    asgs_disabled = copy.deepcopy(new_cluster_asgs)

    def _enable_cluster_asg(cluster, asg):
        """
        Enables the ASG, then records it as enabled.
        """
        enable_asg(asg)
        _move_asg_from_disabled_to_enabled(cluster, asg)

    def _disable_cluster_asg(cluster, asg):
        """
        Disables the ASG, then records it as disabled.
        """
        disable_asg(asg)
        _move_asg_from_enabled_to_disabled(cluster, asg)

    def _move_asg_from_disabled_to_enabled(cluster, asg):
        """
        Shifts ASG from the disabled list to the enabled list (bookkeeping only).
        """
        asgs_enabled[cluster].append(asg)
        asgs_disabled[cluster].remove(asg)

    def _move_asg_from_enabled_to_disabled(cluster, asg):
        """
        Shifts ASG from the enabled list to the disabled list (bookkeeping only).
        """
        asgs_enabled[cluster].remove(asg)
        asgs_disabled[cluster].append(asg)

    def _disable_clustered_asgs(clustered_asgs, failure_msg):
        """
        Disable all the ASGs in the lists, keyed by cluster.
        Failures are logged with failure_msg and do not stop the loop.
        """
        # .items() rather than .iteritems(): the latter does not exist on Python 3 dicts.
        for cluster, asgs in clustered_asgs.items():
            for asg in asgs:
                try:
                    _disable_cluster_asg(cluster, asg)
                except:  # pylint: disable=bare-except
                    LOG.warning(failure_msg.format(asg))

    def _rollback_failed_asg(cluster, asg):
        """
        Best-effort disable of an ASG whose enable step raised.

        Must never raise: the caller still has to disable the other newly-enabled ASGs
        and return the failure tuple.
        """
        try:
            disable_asg(asg)
        except:  # pylint: disable=bare-except
            LOG.warning("Unable to disable ASG '{}' after failure.".format(asg))
            return
        # If enable_asg() itself raised, the ASG was never recorded as enabled, and an
        # unconditional list.remove() here would raise ValueError and abort the rollback.
        if asg in asgs_enabled.get(cluster, ()):
            _move_asg_from_enabled_to_disabled(cluster, asg)

    elbs_to_monitor = []
    newly_enabled_asgs = defaultdict(list)
    for cluster, asgs in new_cluster_asgs.items():
        for asg in asgs:
            try:
                _enable_cluster_asg(cluster, asg)
                elbs_to_monitor.extend(elbs_for_asg(asg))
                newly_enabled_asgs[cluster].append(asg)
            except:  # pylint: disable=bare-except
                LOG.error("Error enabling ASG '{}'. Disabling traffic to all new ASGs.".format(asg))
                LOG.error(traceback.format_exc())
                # Disable the ASG which failed first.
                _rollback_failed_asg(cluster, asg)
                # Then disable any new other ASGs that have been newly enabled.
                _disable_clustered_asgs(
                    newly_enabled_asgs,
                    "Unable to disable ASG '{}' after failure."
                )
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs {} are active and will be available after passing the healthchecks.".format(
        dict(newly_enabled_asgs)
    ))

    # Wait for all instances to be in service in all ELBs.
    # NOTE(review): 600 is presumably a timeout in seconds -- confirm against ec2.wait_for_healthy_elbs.
    try:
        ec2.wait_for_healthy_elbs(elbs_to_monitor, 600)
    except:  # pylint: disable=bare-except
        LOG.info("Some ASGs are failing ELB health checks. Disabling traffic to all new ASGs.")
        _disable_clustered_asgs(
            newly_enabled_asgs,
            "Unable to disable ASG '{}' after waiting for healthy ELBs."
        )
        return (False, asgs_enabled, asgs_disabled)

    # Add a sleep delay here to wait and see how the new ASGs react to traffic.
    # A flawed release would likely make the new ASGs fail the health checks below
    # and, if any new ASGs fail the health checks, the old ASGs would *not be disabled.
    time.sleep(secs_before_old_asgs_disabled)

    # Ensure the new ASGs are still healthy and not pending delete before disabling the old ASGs.
    for cluster, asgs in newly_enabled_asgs.items():
        for asg in asgs:
            err_msg = None
            if is_asg_pending_delete(asg):
                err_msg = "New ASG '{}' is pending delete.".format(asg)
            elif not is_asg_enabled(asg):
                err_msg = "New ASG '{}' is not enabled.".format(asg)
            if err_msg:
                # Both old and new ASGs are left as they are; nothing is rolled back here.
                LOG.error("{} Aborting disabling of old ASGs.".format(err_msg))
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs have passed the healthchecks. Now disabling old ASGs.")

    for cluster, asgs in baseline_cluster_asgs.items():
        for asg in asgs:
            try:
                if is_asg_enabled(asg):
                    try:
                        _disable_cluster_asg(cluster, asg)
                    except:  # pylint: disable=bare-except
                        LOG.warning("Unable to disable ASG '{}' after enabling new ASGs.".format(asg))
                elif asg in asgs_enabled[cluster]:
                    # If the asg is not enabled, but we have it in the enabled list remove it. This may occur by
                    # pulling from 2 different sources of truth at different intervals. The asg could have been disabled
                    # in the intervening time.
                    _move_asg_from_enabled_to_disabled(cluster, asg)
            except ASGDoesNotExistException:
                # This operation should not fail if one of the baseline ASGs was removed during the deployment process
                LOG.info("ASG {asg} in cluster {cluster} no longer exists, removing it from the enabled cluster list"
                         .format(asg=asg, cluster=cluster))
                _move_asg_from_enabled_to_disabled(cluster, asg)

            # Tagging is attempted for every baseline ASG, whether or not the disable above succeeded.
            try:
                ec2.tag_asg_for_deletion(asg)
            except ASGDoesNotExistException:
                LOG.info("Unable to tag ASG '{}' as it no longer exists, skipping.".format(asg))

    return (True, asgs_enabled, asgs_disabled)