def _execute(self):
    """Executes an actor and yields the results when its finished.

    Polls the ELB named in the 'name' option until the desired 'count' of
    instances is healthy, sleeping 3 seconds between checks. In dry mode a
    single check is made and success is faked.

    Raises:
        gen.Return()
    """
    elb = yield self._find_elb(name=self.option('name'))

    repeating_log = utils.create_repeating_log(
        self.log.info,
        'Still waiting for %s to become healthy' % self.option('name'),
        seconds=30)

    try:
        while True:
            healthy = yield self._is_healthy(elb, count=self.option('count'))

            if healthy is True:
                self.log.info('ELB is healthy.')
                break

            # In dry mode, fake it
            if self._dry:
                self.log.info('Pretending that ELB is healthy.')
                break

            # Not healthy :( continue looping
            self.log.debug('Retrying in 3 seconds.')
            yield utils.tornado_sleep(3)
    finally:
        # Always stop the repeating log, even if a health check above
        # raises — otherwise the 'Still waiting' message repeats forever.
        utils.clear_repeating_log(repeating_log)

    raise gen.Return()
def _run_task(self, task_definition_name):
    """Runs a task on ECS given a task definition's family and revision.

    A task can result in multiple running tasks,
    depending on count and sidekick tasks.

    Args:
        task_definition_name: Task Definition string

    Returns:
        list: task ARNs.
    """
    repeating_log = utils.create_repeating_log(
        self.log.info,
        'Waiting for task to be found...',
        seconds=30)

    try:
        while True:
            response = yield self.thread(
                self.ecs_conn.run_task,
                cluster=self.option('cluster'),
                taskDefinition=task_definition_name,
                count=self.option('count'))
            if not response['failures']:
                break

            # Error on non-missing failures; 'missing' failures fall
            # through and are retried after a short sleep.
            self._handle_failures(response['failures'], self.FAILURE_MISSING)
            yield gen.sleep(2)
    finally:
        # Stop the repeating log even if _handle_failures raises above.
        utils.clear_repeating_log(repeating_log)

    self.log.info('Scheduled task {}.'.format(task_definition_name))
    tasks = [t['taskArn'] for t in response['tasks']]
    raise gen.Return(tasks)
def _wait_for_deployment_update(self, service_name, task_definition_name):
    """Waits for a service's primary deployment to be updated.

    Args:
        service_name: Service name to wait for.
        task_definition_name: Expected Task Definition string.
    """
    repeating_log = utils.create_repeating_log(
        self.log.info,
        'Waiting for primary deployment to be updated to %s '
        'for service with name %s...' % (task_definition_name, service_name),
        seconds=30)

    try:
        while True:
            try:
                service = yield self._describe_service(service_name)
            except ServiceNotFound as e:
                # str(e), not e.message: BaseException.message was removed
                # in Python 3.
                self.log.info('Service Not Found: %s' % e)
                yield gen.sleep(2)
                continue

            primary_deployment = self._get_primary_deployment(service)
            if primary_deployment:
                self.log.info('Primary deployment is %s.' % self._arn_to_name(
                    primary_deployment['taskDefinition']))
                if self._is_task_in_deployment(
                        primary_deployment, task_definition_name):
                    self.log.info('Primary deployment updated.')
                    break
            yield gen.sleep(2)
    finally:
        # Always stop the repeating log, even if a describe call raises.
        utils.clear_repeating_log(repeating_log)
def _run_task(self, task_definition_name):
    """Runs a task on ECS given a task definition's family and revision.

    A task can result in multiple running tasks,
    depending on count and sidekick tasks.

    Args:
        task_definition_name: Task Definition string

    Returns:
        list: task ARNs.
    """
    repeating_log = utils.create_repeating_log(
        self.log.info,
        'Waiting for task to be found...',
        seconds=30)

    try:
        while True:
            response = yield self.api_call(
                self.ecs_conn.run_task,
                cluster=self.option('cluster'),
                taskDefinition=task_definition_name,
                count=self.option('count'))
            if not response['failures']:
                break

            # Error on non-missing failures; 'missing' failures fall
            # through and are retried after a short sleep.
            self._handle_failures(response['failures'], self.FAILURE_MISSING)
            yield gen.sleep(2)
    finally:
        # Stop the repeating log even if _handle_failures raises above.
        utils.clear_repeating_log(repeating_log)

    self.log.info('Scheduled task {}.'.format(task_definition_name))
    tasks = [t['taskArn'] for t in response['tasks']]
    raise gen.Return(tasks)
def test_repeating_log(self):
    """The repeating log fires on its interval and stops once cleared."""
    logger = mock.Mock()  # used for tracking

    # Repeat this message 10 times per second
    logid = utils.create_repeating_log(logger.info, 'test', seconds=0.1)
    yield utils.tornado_sleep(0.45)  # Some process takes .4 <> .5 seconds
    utils.clear_repeating_log(logid)
    # assertEqual, not the deprecated assertEquals alias.
    self.assertEqual(logger.info.call_count, 4)

    # Let's make sure that we don't keep looping our log message.
    yield utils.tornado_sleep(0.2)
    self.assertEqual(logger.info.call_count, 4)
def test_repeating_log(self):
    """The repeating log fires once per IO loop turn and stops once cleared."""
    tracker = mock.Mock()  # records each log invocation

    # seconds=0 schedules the log callback on every IO loop iteration.
    # Each `yield gen.moment` below lets the loop turn once; we spin it
    # N+1 times and expect N recorded calls.
    token = utils.create_repeating_log(tracker.info, 'test', seconds=0)
    for _ in range(5):
        yield gen.moment
    utils.clear_repeating_log(token)
    self.assertEqual(tracker.info.call_count, 4)

    # After clearing, further loop iterations must not add any calls.
    for _ in range(2):
        yield gen.moment
    self.assertEqual(tracker.info.call_count, 4)
def test_repeating_log(self):
    """The repeating log fires once per IO loop turn and stops once cleared."""
    logger = mock.Mock()  # used for tracking

    # Repeat this message 10 times per second
    # seconds=0 instructs Tornado to invoke this log on every IO loop
    # Below we yield gen.moment to allow IO loop iterations.
    # We do N+1 loops and check N count.
    logid = utils.create_repeating_log(logger.info, 'test', seconds=0)
    yield gen.moment
    yield gen.moment
    yield gen.moment
    yield gen.moment
    yield gen.moment
    utils.clear_repeating_log(logid)
    # assertEqual, not the deprecated assertEquals alias.
    self.assertEqual(logger.info.call_count, 4)

    # Let's make sure that we don't keep looping our log message.
    yield gen.moment
    yield gen.moment
    self.assertEqual(logger.info.call_count, 4)
def _wait_until_stable(self, delay=3):
    """Poll and wait until an ElastiGroup has stabilized.

    Upon group creation, most of the instances will be in a "biding"
    state. This method watches the list of instances and waits until
    they are all in the 'fulfilled' state.

    Args:
        delay: Seconds to sleep between status checks.
    """
    group_id = self._group['group']['id']

    # We use the repeating_log to let the user know we're still monitoring
    # things, while not flooding them every time we make an API call. We
    # give them a message every 30s, but make an API call every 3 seconds
    # to check the status.
    repeating_log = utils.create_repeating_log(
        self.log.info,
        'Waiting for ElastiGroup to become stable',
        seconds=30)

    try:
        while True:
            response = yield self._get_group_status(group_id)

            # Find any nodes that are waiting for spot instance requests to
            # be fulfilled.
            pending = [
                i for i in response['response']['items']
                if i['status'] == 'pending-evaluation'
            ]
            fulfilled = [
                i['instanceId'] for i in response['response']['items']
                if i['status'] == 'fulfilled' and i['instanceId'] is not None
            ]

            if not pending:
                self.log.info('All instance requests fulfilled: %s'
                              % ', '.join(fulfilled))
                break

            yield gen.sleep(delay)
    finally:
        # Always stop the repeating log, even if the status call raises.
        utils.clear_repeating_log(repeating_log)
def _wait_until_stable(self, delay=3):
    """Poll and wait until an ElastiGroup has stabilized.

    Upon group creation, most of the instances will be in a "biding"
    state. This method watches the list of instances and waits until
    they are all in the 'fulfilled' state.

    Args:
        delay: Seconds to sleep between status checks.
    """
    group_id = self._group['group']['id']

    # We use the repeating_log to let the user know we're still monitoring
    # things, while not flooding them every time we make an API call. We
    # give them a message every 30s, but make an API call every 3 seconds
    # to check the status.
    repeating_log = utils.create_repeating_log(
        self.log.info,
        'Waiting for ElastiGroup to become stable',
        seconds=30)

    try:
        while True:
            response = yield self._get_group_status(group_id)

            # Find any nodes that are waiting for spot instance requests to
            # be fulfilled.
            pending = [i for i in response['response']['items']
                       if i['status'] == 'pending-evaluation']
            fulfilled = [i['instanceId']
                         for i in response['response']['items']
                         if i['status'] == 'fulfilled' and
                         i['instanceId'] is not None]

            if not pending:
                self.log.info('All instance requests fulfilled: %s'
                              % ', '.join(fulfilled))
                break

            yield gen.sleep(delay)
    finally:
        # Always stop the repeating log, even if the status call raises.
        utils.clear_repeating_log(repeating_log)
def wait_for_task(self,
                  task,
                  task_name=None,
                  sleep=5,
                  loc_log=log,
                  instance=None):
    """Monitors a RightScale task for completion.

    RightScale tasks are provided as URLs that we can query for the
    run-status of the task. This method repeatedly queries a task for
    completion (every 5 seconds), and returns when the task has finished.

    TODO: Add a task-timeout option.

    Note: This is a completely retryable operation in the event that an
    intermittent network connection causes any kind of a connection failure.

    Args:
        task: RightScale Task resource object.
        task_name: Human-readable name of the task to be executed.
        sleep: Integer of seconds to wait before the first status check.
        loc_log: logging.getLogger() object to be used to log task status.
            This is useful when this API call is called from a Kingpin
            actor, and you want to use the actor's specific logger.
            If nothing is passed - local `log` object is used.
        instance: RightScale instance object on which the task is executed.

    Returns:
        bool: success status
    """
    if not task:
        # If there is no task to wait on - don't wait!
        raise gen.Return(True)

    timeout_id = None
    if task_name:
        timeout_id = utils.create_repeating_log(
            loc_log.info, 'Still waiting on %s' % task_name, seconds=sleep)

    # Tracking when the tasks start so we can search by date later
    # RightScale expects the time to be a string in UTC
    now = datetime.utcnow()
    tasks_start = now.strftime('%Y/%m/%d %H:%M:%S +0000')

    try:
        while True:
            # Get the task status
            output = yield self._get_task_info(task)
            # Lowercase so capitalized variants ('Completed', 'Failed')
            # are matched too.
            summary = output.soul['summary'].lower()
            stamp = datetime.now()

            if 'success' in summary or 'completed' in summary:
                status = True
                break

            if 'failed' in summary:
                status = False
                break

            loc_log.debug('Task (%s) status: %s (updated at: %s)' %
                          (output.path, output.soul['summary'], stamp))
            yield utils.tornado_sleep(min(sleep, 5))
    finally:
        # Stop the 'Still waiting' log even if a status query raises.
        if timeout_id:
            utils.clear_repeating_log(timeout_id)

    loc_log.debug('Task (%s) status: %s (updated at: %s)' %
                  (output.path, output.soul['summary'], stamp))

    if status is True:
        raise gen.Return(True)

    if not instance:
        raise gen.Return(status)

    # If something failed we want to find out why -- get audit logs
    # Contact RightScale for audit logs of this instance.
    now = datetime.utcnow()
    tasks_finish = now.strftime('%Y/%m/%d %H:%M:%S +0000')

    loc_log.error('Task failed. Instance: "%s".' % instance.soul['name'])

    audit_logs = yield self.get_audit_logs(
        instance=instance,
        start=tasks_start,
        end=tasks_finish,
        match='failed')

    # Print every audit log that was obtained (may be 0). A plain loop,
    # not a list comprehension: this is executed purely for side effects.
    for entry in audit_logs:
        loc_log.error(entry)

    if not audit_logs:
        loc_log.error('No audit logs for %s' % instance)

    loc_log.debug('Task finished, return value: %s, summary: %s' %
                  (status, summary))

    raise gen.Return(status)
def wait_for_task(self,
                  task,
                  task_name=None,
                  sleep=5,
                  loc_log=log,
                  instance=None):
    """Monitors a RightScale task for completion.

    RightScale tasks are provided as URLs that we can query for the
    run-status of the task. This method repeatedly queries a task for
    completion (every 5 seconds), and returns when the task has finished.

    TODO: Add a task-timeout option.

    Note: This is a completely retryable operation in the event that an
    intermittent network connection causes any kind of a connection failure.

    Args:
        task: RightScale Task resource object.
        task_name: Human-readable name of the task to be executed.
        sleep: Integer of seconds to wait before the first status check.
        loc_log: logging.getLogger() object to be used to log task status.
            This is useful when this API call is called from a Kingpin
            actor, and you want to use the actor's specific logger.
            If nothing is passed - local `log` object is used.
        instance: RightScale instance object on which the task is executed.

    Returns:
        bool: success status
    """
    if not task:
        # If there is no task to wait on - don't wait!
        raise gen.Return(True)

    timeout_id = None
    if task_name:
        timeout_id = utils.create_repeating_log(
            loc_log.info, 'Still waiting on %s' % task_name, seconds=sleep)

    # Tracking when the tasks start so we can search by date later
    # RightScale expects the time to be a string in UTC
    now = datetime.utcnow()
    tasks_start = now.strftime('%Y/%m/%d %H:%M:%S +0000')

    try:
        while True:
            # Get the task status
            output = yield self._get_task_info(task)
            summary = output.soul['summary'].lower()
            stamp = datetime.now()

            if 'success' in summary or 'completed' in summary:
                status = True
                break

            if 'failed' in summary:
                status = False
                break

            loc_log.debug('Task (%s) status: %s (updated at: %s)' %
                          (output.path, output.soul['summary'], stamp))
            yield utils.tornado_sleep(min(sleep, 5))
    finally:
        # Stop the 'Still waiting' log even if a status query raises.
        if timeout_id:
            utils.clear_repeating_log(timeout_id)

    loc_log.debug('Task (%s) status: %s (updated at: %s)' %
                  (output.path, output.soul['summary'], stamp))

    if status is True:
        raise gen.Return(True)

    if not instance:
        raise gen.Return(status)

    # If something failed we want to find out why -- get audit logs
    # Contact RightScale for audit logs of this instance.
    now = datetime.utcnow()
    tasks_finish = now.strftime('%Y/%m/%d %H:%M:%S +0000')

    loc_log.error('Task failed. Instance: "%s".' % instance.soul['name'])

    audit_logs = yield self.get_audit_logs(
        instance=instance,
        start=tasks_start,
        end=tasks_finish,
        match='failed')

    # Print every audit log that was obtained (may be 0). A plain loop,
    # not a list comprehension: this is executed purely for side effects.
    for entry in audit_logs:
        loc_log.error(entry)

    if not audit_logs:
        loc_log.error('No audit logs for %s' % instance)

    loc_log.debug('Task finished, return value: %s, summary: %s' %
                  (status, summary))

    raise gen.Return(status)