def wait_for_task_completion(task_url, timeout):
    """
    Wait for a task to finish by polling its status URL.

    Arguments:
        task_url(str): The URL from which to retrieve task status.
        timeout(int): How many seconds to wait for task completion before raising an error.

    Returns:
        dict: Parsed JSON of the task completion or failure status.

    Raises:
        TimeoutException: When we time out waiting for the task to finish.
    """
    if not task_url.endswith('.json'):
        task_url += ".json"

    LOG.debug("Task URL: {}".format(task_url))
    end_time = datetime.utcnow() + timedelta(seconds=timeout)
    while end_time > datetime.utcnow():
        response = requests.get(task_url, params=ASGARD_API_TOKEN, timeout=REQUESTS_TIMEOUT)
        json_response = _parse_json(task_url, response)
        if json_response['status'] in ('completed', 'failed'):
            return json_response
        time.sleep(WAIT_SLEEP_TIME)

    raise TimeoutException("Timed out while waiting for task {}".format(task_url))
def wait_for_in_service(all_asgs, timeout):
    """
    Wait for the ASGs and all instances in them to be healthy according to AWS metrics.

    Arguments:
        all_asgs(list<str>): A list of ASGs we want to be healthy, e.g.::

            [
                u'test-edx-edxapp-v008',
                u'test-edx-worker-v005',
            ]

        timeout: The amount of time in seconds to wait for a healthy state.

    Returns:
        Nothing if all ASGs become healthy.

    Raises:
        TimeoutException: If any instances are still unhealthy when the timeout expires.
    """
    if not all_asgs:
        LOG.info("No ASGs to monitor - skipping health check.")
        return

    asgs_left_to_check = list(all_asgs)
    LOG.info("Waiting for ASGs to be healthy: {}".format(asgs_left_to_check))

    end_time = datetime.utcnow() + timedelta(seconds=timeout)
    while end_time > datetime.utcnow():
        asgs = get_all_autoscale_groups(asgs_left_to_check)
        for asg in asgs:
            all_healthy = True
            for instance in asg.instances:
                if instance.health_status.lower() != 'healthy' or instance.lifecycle_state.lower() != 'inservice':
                    # Instance is not ready.
                    all_healthy = False
                    break

            if all_healthy:
                # All instances are healthy; stop checking this ASG.
                LOG.debug("All instances healthy in ASG: {}".format(asg.name))
                LOG.debug(asgs_left_to_check)
                asgs_left_to_check.remove(asg.name)

        if not asgs_left_to_check:
            return

        time.sleep(1)

    raise TimeoutException(
        "Some instances in the following ASGs never became healthy: {}".format(asgs_left_to_check))
def wait_for_healthy_elbs(elbs_to_monitor, timeout):
    """
    Wait for all instances in all of the listed ELBs to be healthy. Raise a
    timeout exception if they don't become healthy in time.

    Arguments:
        elbs_to_monitor(list<str>): Names of the ELBs that we are monitoring.
        timeout: Timeout in seconds of how long to wait.

    Returns:
        None: When all ELBs have only healthy instances in them.

    Raises:
        TimeoutException: When we have run out of time.
    """
    @backoff.on_exception(backoff.expo,
                          BotoServerError,
                          max_tries=MAX_ATTEMPTS,
                          giveup=giveup_if_not_throttling,
                          factor=RETRY_FACTOR)
    def _get_elb_health(selected_elb):
        """
        Get the health of an ELB's instances, retrying with exponential backoff
        if AWS throttles the request.

        Args:
            selected_elb (boto.ec2.elb.loadbalancer.LoadBalancer):

        Returns:
            list of InstanceState <boto.ec2.elb.instancestate.InstanceState>
        """
        return selected_elb.get_instance_health()

    if not elbs_to_monitor:
        LOG.info("No ELBs to monitor - skipping health check.")
        return

    elbs_left = set(elbs_to_monitor)
    end_time = datetime.utcnow() + timedelta(seconds=timeout)
    while end_time > datetime.utcnow():
        elbs = get_all_load_balancers(elbs_left)
        for elb in elbs:
            LOG.info("Checking health for ELB: {}".format(elb.name))
            all_healthy = True
            for instance in _get_elb_health(elb):
                if instance.state != 'InService':
                    all_healthy = False
                    break

            if all_healthy:
                LOG.info("All instances are healthy, remove {} from list of load balancers {}.".format(
                    elb.name, elbs_left))
                elbs_left.remove(elb.name)

        LOG.info("Number of load balancers remaining with unhealthy instances: {}".format(len(elbs_left)))
        if not elbs_left:
            LOG.info("All instances in all ELBs are healthy, returning.")
            return
        time.sleep(WAIT_SLEEP_TIME)

    raise TimeoutException(
        "The following ELBs never became healthy: {}".format(elbs_left))