Example #1
def wait_for_task_completion(task_url, timeout):
    """
    Arguments:
        task_url(str): The URL from which to retrieve task status.
        timeout(int): How many seconds to wait for task completion
                      before throwing an error.

    Returns:
        dict: Parsed json of the task completion or failure status.

    Raises:
        TimeoutException: When we time out waiting for the task to finish.
    """

    if not task_url.endswith('.json'):
        task_url += ".json"

    LOG.debug("Task URL: {}".format(task_url))
    end_time = datetime.utcnow() + timedelta(seconds=timeout)
    while end_time > datetime.utcnow():
        response = requests.get(task_url, params=ASGARD_API_TOKEN, timeout=REQUESTS_TIMEOUT)
        json_response = _parse_json(task_url, response)
        if json_response['status'] in ('completed', 'failed'):
            return json_response

        time.sleep(WAIT_SLEEP_TIME)

    raise TimeoutException("Timed out while waiting for task {}".format(task_url))
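A minimal usage sketch (not part of the original module): the task URL and the 300-second timeout below are hypothetical, and LOG, TimeoutException, and the module constants referenced by wait_for_task_completion are assumed to be defined alongside it.

# Hypothetical caller: poll an Asgard task endpoint for up to five minutes.
try:
    task_status = wait_for_task_completion(
        "http://asgard.example.com/task/show/12345", timeout=300)
    if task_status['status'] == 'failed':
        LOG.error("Task failed: {}".format(task_status))
except TimeoutException:
    LOG.error("Task did not finish within 300 seconds.")
    raise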
Example #2
def wait_for_in_service(all_asgs, timeout):
    """
    Wait for the ASGs and all instances in them to be healthy
    according to AWS metrics.

    Arguments:
        all_asgs(list<str>): A list of ASGs we want to be healthy, e.g.
            [
                u'test-edx-edxapp-v008',
                u'test-edx-worker-v005',
            ]
        timeout(int): The amount of time in seconds to wait for healthy state.

    Returns: Nothing if healthy; raises a TimeoutException if unhealthy.
    """
    if not all_asgs:
        LOG.info("No ASGs to monitor - skipping health check.")
        return

    asgs_left_to_check = list(all_asgs)
    LOG.info("Waiting for ASGs to be healthy: {}".format(asgs_left_to_check))

    end_time = datetime.utcnow() + timedelta(seconds=timeout)
    while end_time > datetime.utcnow():
        asgs = get_all_autoscale_groups(asgs_left_to_check)
        for asg in asgs:
            all_healthy = True
            for instance in asg.instances:
                if (instance.health_status.lower() != 'healthy'
                        or instance.lifecycle_state.lower() != 'inservice'):
                    # Instance is not ready.
                    all_healthy = False
                    break

            if all_healthy:
                # All instances are healthy; stop checking this ASG.
                LOG.debug("All instances healthy in ASG: {}".format(asg.name))
                LOG.debug("ASGs left to check: {}".format(asgs_left_to_check))
                asgs_left_to_check.remove(asg.name)

        if not asgs_left_to_check:
            return

        time.sleep(1)

    raise TimeoutException(
        "Some instances in the following ASGs never became healthy: {}".format(
            asgs_left_to_check))
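A hedged usage sketch: the ASG names repeat the docstring's example, and the ten-minute timeout is illustrative rather than taken from the original code.

# Illustrative only: wait up to ten minutes for newly launched ASGs to
# report 'Healthy' / 'InService' on every instance.
new_asgs = [u'test-edx-edxapp-v008', u'test-edx-worker-v005']
try:
    wait_for_in_service(new_asgs, timeout=600)
except TimeoutException:
    LOG.error("ASGs never became healthy: {}".format(new_asgs))
    raise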
Example #3
def wait_for_healthy_elbs(elbs_to_monitor, timeout):
    """
    Wait for all instances in all ELBs listed to be healthy. Raise a
    timeout exception if they don't become healthy.

    Arguments:
        elbs_to_monitor(list<str>): Names of ELBs that we are monitoring.
        timeout(int): How many seconds to wait for the ELBs to become healthy.

    Returns:
        None: When all ELBs have only healthy instances in them.

    Raises:
        TimeoutException: When we have run out of time.
    """
    @backoff.on_exception(backoff.expo,
                          BotoServerError,
                          max_tries=MAX_ATTEMPTS,
                          giveup=giveup_if_not_throttling,
                          factor=RETRY_FACTOR)
    def _get_elb_health(selected_elb):
        """
        Get the health of an ELB

        Args:
            selected_elb (boto.ec2.elb.loadbalancer.LoadBalancer): The ELB to query for instance health.

        Returns:
            list of InstanceState <boto.ec2.elb.instancestate.InstanceState>

        """
        return selected_elb.get_instance_health()

    if not elbs_to_monitor:
        LOG.info("No ELBs to monitor - skipping health check.")
        return

    elbs_left = set(elbs_to_monitor)
    end_time = datetime.utcnow() + timedelta(seconds=timeout)
    while end_time > datetime.utcnow():
        elbs = get_all_load_balancers(elbs_left)
        for elb in elbs:
            LOG.info("Checking health for ELB: {}".format(elb.name))
            all_healthy = True
            for instance in _get_elb_health(elb):
                if instance.state != 'InService':
                    all_healthy = False
                    break

            if all_healthy:
                LOG.info(
                    "All instances are healthy; removing {} from the list of load balancers: {}".format(
                        elb.name, elbs_left))
                elbs_left.remove(elb.name)

        LOG.info(
            "Number of load balancers remaining with unhealthy instances: {}".format(
                len(elbs_left)))
        if not elbs_left:
            LOG.info("All instances in all ELBs are healthy, returning.")
            return
        time.sleep(WAIT_SLEEP_TIME)

    raise TimeoutException(
        "The following ELBs never became healthy: {}".format(elbs_left))