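"""Master-side representation of a single CIEL job.

A Job owns the task graph for one computation: it queues runnable tasks,
hands them to workers according to a pluggable scheduling policy and each
worker's per-scheduling-class capacity, journals tasks and published
references to a 'task_journal' file in the job directory, and tracks the
job's lifecycle state (queued, active, completed, failed or cancelled).
"""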
import collections
import datetime
import logging
import os
import time
import Queue
from threading import Condition, Lock

import simplejson

import ciel

# The remaining names used below (TASK_STATES, TASK_QUEUED,
# TASK_QUEUED_STREAMING, TASK_COMMITTED, the JOB_* state constants and
# JOB_STATE_NAMES, RECORD_HEADER_STRUCT, RunningAverage,
# get_scheduling_policy, LocalitySchedulingPolicy, JobTaskGraph,
# TaskGraphUpdate, MasterJobOutput, JobWorkerState,
# build_taskpool_task_from_descriptor and SWReferenceJSONEncoder) are
# assumed to be imported from the surrounding CIEL runtime modules.


class Job:

    def __init__(self, id, root_task, job_dir, state, job_pool, job_options, journal=True):
        self.id = id
        self.root_task = root_task
        self.job_dir = job_dir
        self.job_pool = job_pool
        self.history = []
        self.state = state
        self.runnable_queue = Queue.Queue()
        self.global_queues = {}
        self.result_ref = None
        self.task_journal_fp = None
        self.job_options = job_options

        # An explicit 'journal' job option overrides the keyword argument.
        self.journal = journal
        try:
            self.journal = self.job_options['journal']
        except KeyError:
            pass

        # Start journalling immediately to capture the root task.
        if self.journal and self.task_journal_fp is None and self.job_dir is not None:
            self.task_journal_fp = open(os.path.join(self.job_dir, 'task_journal'), 'ab')

        self._lock = Lock()
        self._condition = Condition(self._lock)

        # Counters for each task state.
        self.task_state_counts = {}
        for state in TASK_STATES.values():
            self.task_state_counts[state] = 0

        # Running averages of task execution times, overall and per task type.
        self.all_tasks = RunningAverage()
        self.all_tasks_by_type = {}

        try:
            self.scheduling_policy = get_scheduling_policy(self.job_options['scheduler'])
        except KeyError:
            self.scheduling_policy = LocalitySchedulingPolicy()

        try:
            self.journal_sync_buffer = self.job_options['journal_sync_buffer']
        except KeyError:
            self.journal_sync_buffer = None
        self.journal_sync_counter = 0

        self.task_graph = JobTaskGraph(self, self.runnable_queue)

        self.workers = {}
        self.job_pool.worker_pool.notify_job_about_current_workers(self)

    def schedule(self):
        self.job_pool.deferred_worker.do_deferred(lambda: self._schedule())

    def _schedule(self):
        ciel.log('Beginning to schedule job %s' % self.id, 'JOB', logging.INFO)
        with self._lock:
            # 1. Assign runnable tasks to worker queues.
            while True:
                try:
                    task = self.runnable_queue.get_nowait()
                    self.assign_scheduling_class_to_task(task)
                    worker = self.select_worker_for_task(task)
                    self.workers[worker].queue_task(task)
                    # Location-constrained tasks must run on their chosen
                    # worker, so keep them off the global queues.
                    if task.get_constrained_location() is None:
                        self.push_task_on_global_queue(task)
                except Queue.Empty:
                    break

            # 2. For each worker, check if we need to assign any tasks.
            total_assigned = 0
            undersubscribed_worker_classes = []
            for worker, wstate in self.workers.items():
                for scheduling_class, capacity in worker.scheduling_classes.items():
                    num_assigned = wstate.tasks_assigned_in_class(scheduling_class)
                    while num_assigned < capacity:
                        task = wstate.pop_task_from_queue(scheduling_class)
                        if task is None:
                            break
                        elif task.state not in (TASK_QUEUED, TASK_QUEUED_STREAMING):
                            # Already assigned or completed elsewhere.
                            continue
                        task.add_worker(worker)
                        wstate.assign_task(task)
                        self.job_pool.worker_pool.execute_task_on_worker(worker, task)
                        num_assigned += 1
                        total_assigned += 1
                    if num_assigned < capacity:
                        undersubscribed_worker_classes.append(
                            (worker, scheduling_class, capacity - num_assigned))

            # 3. Fill any remaining per-class capacity from the global queues.
            for worker, scheduling_class, deficit in undersubscribed_worker_classes:
                num_global_assigned = 0
                while num_global_assigned < deficit:
                    task = self.pop_task_from_global_queue(scheduling_class)
                    if task is None:
                        break
                    elif task.state not in (TASK_QUEUED, TASK_QUEUED_STREAMING):
                        continue
                    task.add_worker(worker)
                    self.workers[worker].assign_task(task)
                    self.job_pool.worker_pool.execute_task_on_worker(worker, task)
                    num_global_assigned += 1

            ciel.log('Finished scheduling job %s. Tasks assigned = %d' % (self.id, total_assigned),
                     'JOB', logging.INFO)
    def pop_task_from_global_queue(self, scheduling_class):
        if scheduling_class == '*':
            # Wildcard: take work from any class queue.
            for queue in self.global_queues.values():
                try:
                    return queue.popleft()
                except IndexError:
                    pass
            return None
        else:
            try:
                return self.global_queues[scheduling_class].popleft()
            except (IndexError, KeyError):
                return None

    def push_task_on_global_queue(self, task):
        try:
            class_queue = self.global_queues[task.scheduling_class]
        except KeyError:
            class_queue = collections.deque()
            self.global_queues[task.scheduling_class] = class_queue
        class_queue.append(task)

    def select_worker_for_task(self, task):
        constrained_location = task.get_constrained_location()
        if constrained_location is not None:
            worker = self.job_pool.worker_pool.get_worker_at_netloc(constrained_location)
        elif task.state in (TASK_QUEUED_STREAMING, TASK_QUEUED):
            worker, _ = self.scheduling_policy.select_worker_for_task(task, self.job_pool.worker_pool)
        else:
            ciel.log.error('Task %s scheduled in bad state %s; ignored' % (task, task.state),
                           'SCHEDULER', logging.ERROR)
            raise RuntimeError('Task %s scheduled in bad state %s' % (task, task.state))
        return worker

    def assign_scheduling_class_to_task(self, task):
        if task.scheduling_class is not None:
            return
        elif task.handler in ('swi', 'init', 'sync'):
            task.scheduling_class = 'cpu'
        elif task.handler in ('grab', 'java'):
            task.scheduling_class = 'disk'
        else:
            task.scheduling_class = 'disk'

    def record_event(self, description):
        self.history.append((datetime.datetime.now(), description))

    def set_state(self, state):
        self.record_event(JOB_STATE_NAMES[state])
        self.state = state
        evt_time = self.history[-1][0]
        ciel.log('%s %s @ %f' % (self.id, JOB_STATE_NAMES[self.state],
                                 time.mktime(evt_time.timetuple()) + evt_time.microsecond / 1e6),
                 'JOB', logging.INFO)

    def failed(self):
        # Done under self._lock (via _report_tasks()).
        self.set_state(JOB_FAILED)
        self.stop_journalling()
        self._condition.notify_all()

    def enqueued(self):
        self.set_state(JOB_QUEUED)

    def completed(self, result_ref):
        # Done under self._lock (via _report_tasks()).
        self.set_state(JOB_COMPLETED)
        self.result_ref = result_ref
        self._condition.notify_all()
        self.stop_journalling()
        self.job_pool.job_completed(self)

    def activated(self):
        self.set_state(JOB_ACTIVE)
        mjo = MasterJobOutput(self.root_task.expected_outputs, self)
        for output in self.root_task.expected_outputs:
            self.task_graph.subscribe(output, mjo)
        self.task_graph.reduce_graph_for_references(self.root_task.expected_outputs)
        self.schedule()

    def cancelled(self):
        self.set_state(JOB_CANCELLED)
        self.stop_journalling()

    def stop_journalling(self):
        # Done under self._lock (via _report_tasks()).
        if self.task_journal_fp is not None:
            self.task_journal_fp.close()
            self.task_journal_fp = None
        if self.job_dir is not None:
            with open(os.path.join(self.job_dir, 'result'), 'w') as result_file:
                simplejson.dump(self.result_ref, result_file, cls=SWReferenceJSONEncoder)

    def flush_journal(self):
        with self._lock:
            if self.task_journal_fp is not None:
                self.task_journal_fp.flush()
                os.fsync(self.task_journal_fp.fileno())
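    # Journal format: each record written below is a RECORD_HEADER_STRUCT
    # header -- a one-character record type ('R' for a published reference,
    # 'T' for a task descriptor) and the payload length -- followed by that
    # many bytes of SWReferenceJSONEncoder-encoded JSON.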
    def add_reference(self, id, ref, should_sync=False):
        # Called under self._lock (from _report_tasks()).
        if self.task_journal_fp is not None:
            ref_details = simplejson.dumps({'id': id, 'ref': ref}, cls=SWReferenceJSONEncoder)
            self.task_journal_fp.write(RECORD_HEADER_STRUCT.pack('R', len(ref_details)))
            self.task_journal_fp.write(ref_details)
            if should_sync:
                self.task_journal_fp.flush()
                os.fsync(self.task_journal_fp.fileno())

    def add_task(self, task, should_sync=False):
        # Called under self._lock (from _report_tasks()).
        self.task_state_counts[task.state] += 1
        if self.task_journal_fp is not None:
            task_details = simplejson.dumps(task.as_descriptor(), cls=SWReferenceJSONEncoder)
            self.task_journal_fp.write(RECORD_HEADER_STRUCT.pack('T', len(task_details)))
            self.task_journal_fp.write(task_details)
            if should_sync:
                self.task_journal_fp.flush()
                os.fsync(self.task_journal_fp.fileno())

    # def steal_task(self, worker, scheduling_class):
    #     ciel.log('In steal_task(%s, %s)' % (worker.id, scheduling_class), 'LOG', logging.INFO)
    #     # Stealing policy: prefer the task with fewest replicas, then lowest cost on this worker.
    #     best_candidate = (sys.maxint, 0, None)
    #     for victim in self.workers.values():
    #         if victim.worker == worker:
    #             continue
    #         task = victim.get_last_task_in_class(scheduling_class)
    #         if task is None:
    #             continue
    #         num_workers = len(task.get_workers())
    #         cost = self.guess_task_cost_on_worker(task, worker)
    #         best_candidate = min(best_candidate, (num_workers, cost, task))
    #     task = best_candidate[2]
    #     if task is not None:
    #         task.add_worker(worker)
    #         self.workers[worker].add_task(task)
    #         self.job_pool.worker_pool.execute_task_on_worker(worker, task)

    def record_state_change(self, prev_state, next_state):
        # Done under self._lock (from _report_tasks()).
        self.task_state_counts[prev_state] -= 1
        self.task_state_counts[next_state] += 1

    def as_descriptor(self):
        counts = {}
        ret = {'job_id': self.id,
               'task_counts': counts,
               'state': JOB_STATE_NAMES[self.state],
               'root_task': self.root_task.task_id if self.root_task is not None else None,
               'expected_outputs': self.root_task.expected_outputs if self.root_task is not None else None,
               'result_ref': self.result_ref,
               'job_options': self.job_options}
        with self._lock:
            for (name, state_index) in TASK_STATES.items():
                counts[name] = self.task_state_counts[state_index]
        return ret

    def report_tasks(self, report, toplevel_task, worker):
        self.job_pool.deferred_worker.do_deferred(lambda: self._report_tasks(report, toplevel_task, worker))
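    # A report from a worker is applied as a single TaskGraphUpdate
    # transaction: the reported tasks are deassigned from this worker (and
    # aborted on any other worker they were speculatively assigned to),
    # successful tasks spawn their children and publish their references,
    # and the first failure short-circuits to the task-failure
    # investigator.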
    def _report_tasks(self, report, toplevel_task, worker):
        with self._lock:
            tx = TaskGraphUpdate()

            root_task = self.task_graph.get_task(report[0][0])
            for assigned_worker in root_task.get_workers():
                if assigned_worker is worker:
                    self.workers[worker].deassign_task(root_task)
                else:
                    self.workers[assigned_worker].deassign_task(root_task)
                    assigned_worker.worker_pool.abort_task_on_worker(root_task, assigned_worker)
                    # XXX: Need to abort the task running on other workers.

            for (parent_id, success, payload) in report:
                parent_task = self.task_graph.get_task(parent_id)
                if success:
                    (spawned, published, profiling) = payload
                    parent_task.set_profiling(profiling)
                    parent_task.set_state(TASK_COMMITTED)
                    self.record_task_stats(parent_task, worker)
                    for child in spawned:
                        child_task = build_taskpool_task_from_descriptor(child, parent_task)
                        tx.spawn(child_task)
                        parent_task.children.append(child_task)
                    for ref in published:
                        tx.publish(ref, parent_task)
                else:
                    # Only one failed task per report, at the moment.
                    self.investigate_task_failure(parent_task, payload)
                    self.schedule()
                    return

            tx.commit(self.task_graph)
            self.task_graph.reduce_graph_for_references(toplevel_task.expected_outputs)

        # XXX: Need to remove assigned task from worker(s).
        self.schedule()

    def record_task_stats(self, task, worker):
        try:
            task_profiling = task.get_profiling()
            task_type = task.get_type()
            task_execution_time = task_profiling['FINISHED'] - task_profiling['STARTED']
            self.all_tasks.update(task_execution_time)
            try:
                self.all_tasks_by_type[task_type].update(task_execution_time)
            except KeyError:
                self.all_tasks_by_type[task_type] = RunningAverage(task_execution_time)
            self.workers[worker].record_task_stats(task)
        except Exception:
            ciel.log('Error recording task statistics for task: %s' % task.task_id,
                     'JOB', logging.WARNING)

    def guess_task_cost_on_worker(self, task, worker):
        return self.workers[worker].load(task.scheduling_class, True)

    def investigate_task_failure(self, task, payload):
        self.job_pool.task_failure_investigator.investigate_task_failure(task, payload)

    def notify_worker_added(self, worker):
        with self._lock:
            try:
                _ = self.workers[worker]
                return
            except KeyError:
                ciel.log('Job %s notified that a worker is being added' % self.id,
                         'JOB', logging.INFO)
                worker_state = JobWorkerState(worker)
                self.workers[worker] = worker_state
                self.schedule()

    def notify_worker_failed(self, worker):
        with self._lock:
            try:
                worker_state = self.workers[worker]
                del self.workers[worker]
                ciel.log('Reassigning tasks from failed worker %s for job %s' % (worker.id, self.id),
                         'JOB', logging.WARNING)
                # Assigned tasks are treated as failures to be investigated;
                # queued-but-unassigned tasks simply become runnable again.
                for assigned in worker_state.assigned_tasks.values():
                    for failed_task in assigned:
                        failed_task.remove_worker(worker)
                        self.investigate_task_failure(failed_task, ('WORKER_FAILED', None, {}))
                for scheduling_class in worker_state.queues:
                    while True:
                        queued_task = worker_state.pop_task_from_queue(scheduling_class)
                        if queued_task is None:
                            break
                        self.runnable_queue.put(queued_task)
                self.schedule()
            except KeyError:
                ciel.log('notify_worker_failed() called for an unknown worker',
                         'JOB', logging.WARNING, True)
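
# A minimal sketch, not part of the original class, of how the
# 'task_journal' stream written by add_reference()/add_task() could be read
# back. The helper name and the assumption that RECORD_HEADER_STRUCT packs
# a record-type character plus a payload length (e.g. struct.Struct('!cI'))
# are illustrative; the record layout is exactly what the methods above
# write.
def read_task_journal(path):
    """Yield (record_type, record) pairs from a task_journal file."""
    with open(path, 'rb') as fp:
        while True:
            header = fp.read(RECORD_HEADER_STRUCT.size)
            if len(header) < RECORD_HEADER_STRUCT.size:
                break  # Clean EOF, or a header truncated by a crash.
            record_type, length = RECORD_HEADER_STRUCT.unpack(header)
            payload = fp.read(length)
            if len(payload) < length:
                break  # Partial trailing record (e.g. crash mid-write).
            # 'R' payloads look like {'id': ..., 'ref': ...}; 'T' payloads
            # are task descriptors as produced by task.as_descriptor().
            yield record_type, simplejson.loads(payload)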