def statusUpdate(self, driver, update) -> None:
    self._driver = driver

    task_id = update.task_id.value
    task_state = str(update.state)
    log.info(f"Task update {task_state} received for task {task_id}")

    if task_id not in self.task_metadata:
        # We assume that a terminal status update has been
        # received for this task already.
        log.info('Ignoring this status update because a terminal status '
                 'update has been received for this task already.')
        self.call_driver('acknowledgeStatusUpdate', update)
        return

    md = self.task_metadata[task_id]

    # If we attempt to accept an offer that has been invalidated by the
    # master for some reason, such as the offer being rescinded or us
    # exceeding offer_timeout, then we will get a TASK_LOST status
    # update back from the mesos master.
    if task_state == 'TASK_LOST' and str(update.reason) == \
            'REASON_INVALID_OFFERS':
        # This task has not been launched. Therefore, we are going to
        # re-enqueue it. We are not propagating any event up to the
        # application.
        log.warning('Received TASK_LOST from mesos master because we '
                    'attempted to accept an invalid offer. Going to '
                    f're-enqueue this task {task_id}')
        # Re-enqueue task
        self.enqueue_task(md.task_config)
        get_metric(metrics.TASK_LOST_DUE_TO_INVALID_OFFER_COUNT).count(1)
        self.call_driver('acknowledgeStatusUpdate', update)
        return

    # Record state changes, send a new event and emit metrics only if the
    # task state has actually changed.
    if md.task_state != task_state:
        with self._lock:
            self.task_metadata = self.task_metadata.set(
                task_id,
                md.set(
                    task_state=task_state,
                    task_state_history=md.task_state_history.set(
                        task_state, time.time()),
                )
            )

        self.event_queue.put(
            self.callbacks.handle_status_update(update, md.task_config),
        )

        if task_state in self._terminal_task_counts:
            with self._lock:
                self.task_metadata = self.task_metadata.discard(task_id)
            get_metric(self._terminal_task_counts[task_state]).count(1)

    # We have to do this because we are not using implicit
    # acknowledgements.
    self.call_driver('acknowledgeStatusUpdate', update)
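# Sketch (assumption, not part of this section): the explicit
# acknowledgeStatusUpdate calls above only matter when the driver is
# created with implicit acknowledgements disabled. Assuming pymesos,
# such a driver might be constructed roughly like this; the framework
# object and master URI are placeholder assumptions.
def make_driver_sketch(scheduler, framework_info):
    from pymesos import MesosSchedulerDriver
    return MesosSchedulerDriver(
        sched=scheduler,                  # object implementing statusUpdate, resourceOffers, ...
        framework=framework_info,
        master_uri='zk://127.0.0.1:2181/mesos',  # placeholder
        use_addict=True,                  # callbacks receive addict Dicts, as used above
        implicit_acknowledgements=False,  # statusUpdate must acknowledge explicitly
    )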
def blacklist_slave(self, agent_id, timeout):
    with self._lock:
        # A new entry is appended even if the agent is already
        # blacklisted. This is equivalent to restarting the blacklist
        # timer.
        log.info(f'Blacklisting slave: {agent_id} for {timeout} seconds.')
        self.blacklisted_slaves = self.blacklisted_slaves.append(agent_id)
        get_metric(metrics.BLACKLISTED_AGENTS_COUNT).count(1)

    unblacklist_thread = threading.Thread(
        target=self.unblacklist_slave,
        kwargs={'timeout': timeout, 'agent_id': agent_id},
    )
    unblacklist_thread.daemon = True
    unblacklist_thread.start()
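# Sketch (assumption): the unblacklist_slave target spawned above is not
# shown in this section. Assuming blacklisted_slaves is a pyrsistent
# vector, it presumably sleeps out the window and then drops one
# matching entry, so overlapping blacklists keep the agent out until the
# most recent timer expires.
def unblacklist_slave(self, agent_id, timeout):
    time.sleep(timeout)
    log.info(f'Unblacklisting slave: {agent_id}')
    with self._lock:
        # PVector.remove drops the first occurrence only.
        self.blacklisted_slaves = self.blacklisted_slaves.remove(agent_id)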
def _background_check(self):
    while True:
        if self.stopping:
            return

        time_now = time.time()
        with self._lock:
            for task_id in self.task_metadata.keys():
                md = self.task_metadata[task_id]

                if md.task_state not in ('TASK_STAGING', 'UNKNOWN'):
                    continue

                # Task is not eligible for killing or re-enqueuing
                if time_now < (md.task_state_history[md.task_state] +
                               self.task_staging_timeout_s):
                    continue

                if md.task_state == 'UNKNOWN':
                    log.warning(
                        'Task {id} has been in unknown state for '
                        'longer than {timeout}. Re-enqueuing it.'.format(
                            id=task_id,
                            timeout=self.task_staging_timeout_s))
                    # Re-enqueue task
                    self.enqueue_task(md.task_config)
                    get_metric(TASK_FAILED_TO_LAUNCH_COUNT).count(1)
                    continue
                else:
                    log.warning(
                        'Killing stuck task {id}'.format(id=task_id))
                    self.kill_task(task_id)
                    self.blacklist_slave(
                        agent_id=self.task_metadata[task_id].agent_id,
                        timeout=self.slave_blacklist_timeout_s,
                    )
                    get_metric(TASK_STUCK_COUNT).count(1)

            self._reconcile_tasks([
                Dict({'task_id': Dict({'value': task_id})})
                for task_id in self.task_metadata
                if self.task_metadata[task_id].task_state != 'TASK_INITED'
            ])

        time.sleep(10)
def enqueue_task(self, task_config):
    with self._lock:
        # task_state and task_state_history get reset every time
        # a task is enqueued.
        self.task_metadata = self.task_metadata.set(
            task_config.task_id,
            TaskMetadata(
                task_config=task_config,
                task_state='TASK_INITED',
                task_state_history=m(TASK_INITED=time.time()),
            )
        )
        # Need to lock on task_queue to prevent enqueues when getting
        # tasks to launch
        self.task_queue.put(task_config)

        if self.are_offers_suppressed:
            self.driver.reviveOffers()
            self.are_offers_suppressed = False
            log.info('Reviving offers because we have tasks to run.')

    get_metric(TASK_ENQUEUED_COUNT).count(1)
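# Sketch (assumption): TaskMetadata is not defined in this section. The
# immutable .set() updates used throughout imply a pyrsistent PRecord,
# roughly like the following; field types are inferred from usage.
from pyrsistent import PRecord, field, pmap

class TaskMetadata(PRecord):
    task_config = field(mandatory=True)
    task_state = field(type=str, mandatory=True)
    task_state_history = field(factory=pmap, mandatory=True)
    agent_id = field(type=str, initial='')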
def launch_tasks_for_offer(self, offer, tasks_to_launch) -> bool:
    mesos_protobuf_tasks = [
        self.callbacks.make_mesos_protobuf(
            task_config, offer.agent_id.value, self.role)
        for task_config in tasks_to_launch
        if task_config.task_id in self.task_metadata
    ]
    if not mesos_protobuf_tasks:
        return False

    launched = True
    launch_time = time.time()
    if self.call_driver(
            'launchTasks', offer.id, mesos_protobuf_tasks
    ) is self.driver_error:
        tasks = ', '.join(task.task_id for task in tasks_to_launch)
        log.warning(
            f'Failed to launch: {tasks}, moving them to UNKNOWN state')
        get_metric(metrics.TASK_LAUNCH_FAILED_COUNT).count(1)
        launched = False

    # The 'UNKNOWN' state is for internal tracking. It will not be
    # propagated to users.
    current_task_state = 'TASK_STAGING' if launched else 'UNKNOWN'

    for task in tasks_to_launch:
        md = self.task_metadata.get(task.task_id)
        if not md:
            log.warning(
                f'trying to launch task {task.task_id}, but it is not in task metadata. '
                f'current keys in task_metadata: {self.task_metadata.keys()}'
            )
            continue

        self.task_metadata = self.task_metadata.set(
            task.task_id,
            md.set(
                task_state=current_task_state,
                task_state_history=md.task_state_history.set(
                    current_task_state, launch_time),
                agent_id=str(offer.agent_id.value),
            )
        )

        get_metric(metrics.TASK_QUEUED_TIME_TIMER).record(
            launch_time - md.task_state_history['TASK_INITED']
        )

        # Emit the staging event for successful launches
        if launched:
            self.event_queue.put(
                self.callbacks.handle_status_update(
                    Dict(state='TASK_STAGING', offer=offer),
                    md.task_config,
                )
            )
            get_metric(metrics.TASK_LAUNCHED_COUNT).count(1)

    return launched
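# Sketch (assumption): call_driver and the driver_error sentinel it is
# compared against with `is` are not shown here. The pattern is
# presumably a uniform exception wrapper around driver methods that
# returns a unique sentinel on failure, so callers can distinguish
# "driver errored" from any legitimate return value.
def call_driver(self, method, *args, **kwargs):
    try:
        return getattr(self._driver, method)(*args, **kwargs)
    except Exception as e:
        log.warning(f'Driver call {method} failed: {e}')
        return self.driver_error  # e.g. driver_error = object()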
def _background_check(self):
    while True:
        if self.stopping:
            return

        time_now = time.time()

        tasks_to_reconcile = []
        with self._lock:
            for task_id, md in self.task_metadata.items():
                self._background_check_task(
                    time_now,
                    tasks_to_reconcile,
                    task_id,
                    md,
                )

        self._reconcile_tasks(
            [Dict({'task_id': Dict({'value': task_id})})
             for task_id in tasks_to_reconcile]
        )

        elapsed = time.time() - time_now
        log.info(f'background check done in {elapsed}s')
        get_metric(metrics.BGCHECK_TIME_TIMER).record(elapsed)

        time.sleep(10)
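# Sketch (assumption): _reconcile_tasks is not shown in this section.
# Given the {'task_id': {'value': ...}} shape built above, it is
# presumably a thin wrapper over the driver's explicit task
# reconciliation, whose answers come back through statusUpdate.
def _reconcile_tasks(self, tasks_to_reconcile):
    if tasks_to_reconcile:
        log.info(f'Reconciling {len(tasks_to_reconcile)} tasks')
        self.call_driver('reconcileTasks', tasks_to_reconcile)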
def emit_counter_metric(counter_name, service, instance):
    create_counter(counter_name, {'service': service, 'instance': instance})
    get_metric(counter_name).count(1)
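# Illustrative usage (hypothetical counter name and dimensions):
# emit_counter_metric(
#     'taskproc.task_launch_failed',
#     service='my-service',
#     instance='main',
# )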
def resourceOffers(self, driver, offers) -> None:
    self._driver = driver

    current_offer_time = time.time()
    if self._last_offer_time is not None:
        get_metric(metrics.OFFER_DELAY_TIMER).record(
            current_offer_time - self._last_offer_time
        )
    self._last_offer_time = current_offer_time

    # Give user some time to enqueue tasks
    if self.task_queue.empty() and current_offer_time < self.decline_after:
        time.sleep(self.decline_after - current_offer_time)

    declined: dict = defaultdict(list)
    declined_offer_ids = []
    accepted = []

    with self._lock:
        if self.task_queue.empty():
            # Always suppress offers when there is nothing to run
            if self.call_driver('suppressOffers') is not self.driver_error:
                self.are_offers_suppressed = True
                log.info("Suppressing offers, no more tasks to run.")

            for offer in offers:
                declined['no tasks'].append(offer.id.value)
                declined_offer_ids.append(offer.id)

            self.call_driver(
                'declineOffer', declined_offer_ids, self.offer_decline_filter)
            log.info(
                f"Offers declined because of no tasks: "
                f"{','.join(declined['no tasks'])}")
            return

    with_maintenance_window = [
        offer for offer in offers if offer.unavailability
    ]

    for offer in with_maintenance_window:
        start_time = offer.unavailability.start['nanoseconds']
        completion_time = int(
            (start_time + offer.unavailability.duration['nanoseconds'])
            / 1000000000
        )
        now = int(time.time())
        duration = completion_time - now
        if duration > 0:
            self.blacklist_slave(
                agent_id=offer.agent_id.value,
                timeout=duration,
            )

    without_maintenance_window = [
        offer for offer in offers if offer not in with_maintenance_window
    ]
    for offer in without_maintenance_window:
        with self._lock:
            if offer.agent_id.value in self.blacklisted_slaves:
                declined['blacklisted'].append(
                    f'offer {offer.id.value} agent {offer.agent_id.value}'
                )
                declined_offer_ids.append(offer.id)
                continue

        offer_pool_match, offer_pool = self.offer_matches_pool(offer)
        if not offer_pool_match:
            log.info(
                f"Declining offer {offer.id.value}, required pool "
                f"{self.pool} doesn't match offered pool {offer_pool}"
            )
            declined['bad pool'].append(offer.id.value)
            declined_offer_ids.append(offer.id)
            continue

        # Need to lock here even though we are only reading the
        # task_queue, since we are predicating on the queue's emptiness.
        # If not locked, other threads can continue enqueueing, and we
        # never terminate the loop.
        task_configs = []
        with self._lock:
            while not self.task_queue.empty():
                task_configs.append(self.task_queue.get())

            offer_resources = get_offer_resources(offer, self.role)
            offer_attributes = {
                attribute.name: attribute.text.value
                for attribute in offer.attributes
            }
            log.info(
                f'Received offer {offer.id.value} for role {self.role}: '
                f'{offer_resources}')
            tasks_to_launch, tasks_to_defer = \
                self.callbacks.get_tasks_for_offer(
                    task_configs,
                    offer_resources,
                    offer_attributes,
                    self.role,
                )

            for task in tasks_to_defer:
                self.task_queue.put(task)
            get_metric(metrics.TASK_INSUFFICIENT_OFFER_COUNT).count(
                len(tasks_to_defer))

            if len(tasks_to_launch) == 0:
                declined['bad resources'].append(offer.id.value)
                declined_offer_ids.append(offer.id)
                continue

            ignored_tasks = ','.join(
                task_config.task_id
                for task_config in tasks_to_launch
                if task_config.task_id not in self.task_metadata
            )
            if ignored_tasks:
                log.warning(
                    f'ignoring tasks not in metadata: {ignored_tasks}')

            tasks_to_launch = [
                task_config
                for task_config in tasks_to_launch
                if task_config.task_id in self.task_metadata
            ]

            if len(tasks_to_launch) == 0:
                declined['nothing to launch'].append(offer.id.value)
                declined_offer_ids.append(offer.id)
            elif not self.launch_tasks_for_offer(offer, tasks_to_launch):
                declined['launch failed'].append(offer.id.value)
                declined_offer_ids.append(offer.id)
            else:
                accepted.append(
                    f'offer: {offer.id.value} '
                    f'agent: {offer.agent_id.value} '
                    f'tasks: {len(tasks_to_launch)}'
                )

    if len(declined_offer_ids) > 0:
        self.call_driver(
            'declineOffer', declined_offer_ids, self.offer_decline_filter)
    for reason, items in declined.items():
        log.info(f"Offers declined because {reason}: {', '.join(items)}")
    if accepted:
        log.info(f"Offers accepted: {', '.join(accepted)}")
def _background_check_task(self, time_now, tasks_to_reconcile, task_id, md):
    if md.task_state != 'TASK_INITED':
        tasks_to_reconcile.append(task_id)

    if md.task_state == 'TASK_INITED':
        # Give up if the task hasn't launched after offer_timeout.
        inited_at = md.task_state_history['TASK_INITED']
        offer_timeout = md.task_config.offer_timeout
        expires_at = inited_at + offer_timeout
        if time_now >= expires_at:
            log.warning(
                f'Task {task_id} has been waiting for offers '
                'for longer than configured timeout '
                f'{offer_timeout}. Giving up and removing the '
                'task from the task queue.'
            )
            # Killing the task will also remove it from the queue.
            self.kill_task(task_id)
            # We are not expecting mesos to send a terminal update
            # for this task, so clean it up manually.
            self.task_metadata = self.task_metadata.discard(task_id)
            self.event_queue.put(
                task_event(
                    task_id=task_id,
                    terminal=True,
                    timestamp=time_now,
                    success=False,
                    message='stop',
                    task_config=md.task_config,
                    raw='Failed due to offer timeout',
                )
            )
            get_metric(metrics.TASK_OFFER_TIMEOUT).count(1)

    # Task is not eligible for killing or re-enqueuing
    in_current_state_since = md.task_state_history[md.task_state]
    if time_now < in_current_state_since + self.task_staging_timeout_s:
        return

    if md.task_state == 'UNKNOWN':
        log.warning(
            f'Re-enqueuing task {task_id} in unknown state for '
            f'longer than {self.task_staging_timeout_s}'
        )
        # Re-enqueue task
        self.enqueue_task(md.task_config)
        get_metric(metrics.TASK_FAILED_TO_LAUNCH_COUNT).count(1)
    elif md.task_state == 'TASK_STAGING':
        log.warning(f'Killing stuck task {task_id}')
        self.kill_task(task_id)
        self.task_metadata = self.task_metadata.set(
            task_id,
            md.set(
                task_state='TASK_STUCK',
                task_state_history=md.task_state_history.set(
                    'TASK_STUCK', time_now),
            )
        )
        self.blacklist_slave(
            agent_id=self.task_metadata[task_id].agent_id,
            timeout=self.slave_blacklist_timeout_s,
        )
        get_metric(metrics.TASK_STUCK_COUNT).count(1)
    elif md.task_state == 'TASK_STUCK':
        t = time.time()
        # 10s since last iteration + time we spent in the current one
        time_delta = 10 + t - time_now
        # seconds since the task was put in TASK_STUCK state
        time_stuck = t - md.task_state_history['TASK_STUCK']
        # seconds since `time_stuck` last crossed an hour boundary
        hour_rolled = time_stuck % 3600
        # if `time_stuck` crossed an hour boundary since the last
        # background check, re-send the kill request
        if hour_rolled < time_delta:
            hours_stuck = time_stuck // 3600
            log.warning(
                f'Task {task_id} is stuck, waiting for terminal '
                f'state for {hours_stuck}h, sending another kill'
            )
            self.kill_task(task_id)
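# The hour-boundary arithmetic in the TASK_STUCK branch above is subtle,
# so here it is as a standalone function with a quick check (the names
# are illustrative, not from the source):
def _crossed_hour_boundary(now, stuck_since, last_check_duration):
    time_stuck = now - stuck_since
    # Seconds elapsed past the most recent whole hour spent stuck; if
    # smaller than the window covered by the last iteration (10s sleep
    # plus work time), the boundary was crossed during that window.
    return (time_stuck % 3600) < last_check_duration

# Stuck for 3605s with a 12s-wide check window: boundary just crossed.
assert _crossed_hour_boundary(now=10000, stuck_since=6395, last_check_duration=12)
assert not _crossed_hour_boundary(now=10000, stuck_since=6000, last_check_duration=12)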
def resourceOffers(self, driver, offers):
    if self.driver is None:
        self.driver = driver

    current_offer_time = time.time()
    if self._last_offer_time is not None:
        get_metric(OFFER_DELAY_TIMER).record(
            current_offer_time - self._last_offer_time)
    self._last_offer_time = current_offer_time

    # Give user some time to enqueue tasks
    if self.task_queue.empty() and current_offer_time < self.decline_after:
        time.sleep(self.decline_after - current_offer_time)

    declined = {
        'blacklisted': [],
        'bad pool': [],
        'bad resources': [],
        'no tasks': [],
    }
    declined_offer_ids = []
    accepted = []

    if self.task_queue.empty():
        if not self.are_offers_suppressed:
            driver.suppressOffers()
            self.are_offers_suppressed = True
            log.info("Suppressing offers, no more tasks to run.")

        for offer in offers:
            declined['no tasks'].append(offer.id.value)
            declined_offer_ids.append(offer.id)

        driver.declineOffer(declined_offer_ids, self.offer_decline_filter)
        log.info("Offers declined because of no tasks: {}".format(
            ','.join(declined['no tasks'])))
        return

    with_maintenance_window = [
        offer for offer in offers if offer.unavailability
    ]

    for offer in with_maintenance_window:
        start_time = offer.unavailability.start['nanoseconds']
        completion_time = int(
            (start_time + offer.unavailability.duration['nanoseconds'])
            / 1000000000)
        now = int(time.time())
        duration = completion_time - now
        if duration > 0:
            self.blacklist_slave(
                agent_id=offer.agent_id.value,
                timeout=duration,
            )

    without_maintenance_window = [
        offer for offer in offers if offer not in with_maintenance_window
    ]
    for offer in without_maintenance_window:
        with self._lock:
            if offer.agent_id.value in self.blacklisted_slaves:
                declined['blacklisted'].append('offer {} agent {}'.format(
                    offer.id.value, offer.agent_id.value))
                declined_offer_ids.append(offer.id)
                continue

        if not self.offer_matches_pool(offer):
            log.info("Declining offer {id} because it is not for pool "
                     "{pool}.".format(id=offer.id.value, pool=self.pool))
            declined['bad pool'].append(offer.id.value)
            declined_offer_ids.append(offer.id)
            continue

        tasks_to_launch = self.get_tasks_to_launch(offer)
        if len(tasks_to_launch) == 0:
            if self.task_queue.empty():
                if offer.id.value not in declined['no tasks']:
                    declined['no tasks'].append(offer.id.value)
            else:
                declined['bad resources'].append(offer.id.value)
            declined_offer_ids.append(offer.id)
            continue

        accepted.append('offer: {} agent: {} tasks: {}'.format(
            offer.id.value, offer.agent_id.value, len(tasks_to_launch)))

        task_launch_failed = False
        try:
            driver.launchTasks(offer.id, tasks_to_launch)
        except (socket.timeout, Exception):
            log.warning('Failed to launch following tasks {tasks}. '
                        'Thus, moving them to UNKNOWN state.'.format(
                            tasks=', '.join([
                                task.task_id.value
                                for task in tasks_to_launch
                            ]),
                        ))
            task_launch_failed = True
            get_metric(TASK_LAUNCH_FAILED_COUNT).count(1)

        # The 'UNKNOWN' state is for internal tracking. It will not be
        # propagated to users.
        current_task_state = 'UNKNOWN' if task_launch_failed \
            else 'TASK_STAGING'

        with self._lock:
            for task in tasks_to_launch:
                md = self.task_metadata[task.task_id.value]
                self.task_metadata = self.task_metadata.set(
                    task.task_id.value,
                    md.set(
                        task_state=current_task_state,
                        task_state_history=md.task_state_history.set(
                            current_task_state, time.time()),
                    ))
                if not task_launch_failed:
                    get_metric(TASK_LAUNCHED_COUNT).count(1)

    if len(declined_offer_ids) > 0:
        driver.declineOffer(declined_offer_ids, self.offer_decline_filter)
    for reason, items in declined.items():
        if items:
            log.info("Offers declined because of {}: {}".format(
                reason, ', '.join(items)))
    if accepted:
        log.info("Offers accepted: {}".format(', '.join(accepted)))
def get_tasks_to_launch(self, offer):
    tasks_to_launch = []
    remaining_cpus = 0
    remaining_mem = 0
    remaining_disk = 0
    remaining_gpus = 0
    available_ports = []
    for resource in offer.resources:
        if resource.name == "cpus" and resource.role == self.role:
            remaining_cpus += resource.scalar.value
        elif resource.name == "mem" and resource.role == self.role:
            remaining_mem += resource.scalar.value
        elif resource.name == "disk" and resource.role == self.role:
            remaining_disk += resource.scalar.value
        elif resource.name == "gpus" and resource.role == self.role:
            remaining_gpus += resource.scalar.value
        elif resource.name == "ports" and resource.role == self.role:
            # TODO: Validate that the ports available > ports required
            available_ports = self.get_available_ports(resource)

    log.info("Received offer {id} with cpus: {cpu}, mem: {mem}, "
             "disk: {disk} gpus: {gpu} role: {role}".format(
                 id=offer.id.value,
                 cpu=remaining_cpus,
                 mem=remaining_mem,
                 disk=remaining_disk,
                 gpu=remaining_gpus,
                 role=self.role))

    tasks_to_put_back_in_queue = []

    # Need to lock here even though we are working on the task_queue, since
    # we are predicating on the queue's emptiness. If not locked, other
    # threads can continue enqueueing, and we never terminate the loop.
    with self._lock:
        # Get all the tasks off the queue
        while not self.task_queue.empty():
            task = self.task_queue.get()
            if (remaining_cpus >= task.cpus and
                    remaining_mem >= task.mem and
                    remaining_disk >= task.disk and
                    remaining_gpus >= task.gpus and
                    len(available_ports) > 0):
                # This offer is sufficient for us to launch the task
                tasks_to_launch.append(
                    self.create_new_docker_task(
                        offer, task, available_ports))

                # Deduct the resources taken by this task from the total
                # available resources.
                remaining_cpus -= task.cpus
                remaining_mem -= task.mem
                remaining_disk -= task.disk
                remaining_gpus -= task.gpus

                md = self.task_metadata[task.task_id]
                get_metric(TASK_QUEUED_TIME_TIMER).record(
                    time.time() - md.task_state_history['TASK_INITED'])
            else:
                # This offer is insufficient for this task. We need to
                # put it back in the queue.
                tasks_to_put_back_in_queue.append(task)

    for task in tasks_to_put_back_in_queue:
        self.task_queue.put(task)
        get_metric(TASK_INSUFFICIENT_OFFER_COUNT).count(1)

    return tasks_to_launch