def create_job_for_task(self, task_descriptor, job_options, job_id=None):
    """
    Convert a task descriptor into a job.

    Allocates a new job id, creates a Job, and entrains it to the JobPool.
    This is always called with job_id = None; not sure why there's even an
    argument for it.

    Note: task_descriptor is mutated in place ('task_id' is set, and
    'expected_outputs' is defaulted when absent).
    """
    with self._lock:
        if job_id is None:
            job_id = self.allocate_job_id()
        # The root task of a job is named after the job itself.
        task_id = 'root:%s' % (job_id, )
        task_descriptor['task_id'] = task_id
        # TODO: Here is where we will set up the job journal, etc.
        job_dir = self.make_job_directory(job_id)
        # Default the job's outputs if the descriptor did not name any.
        try:
            expected_outputs = task_descriptor['expected_outputs']
        except KeyError:
            expected_outputs = ['%s:job_output' % job_id]
        task_descriptor['expected_outputs'] = expected_outputs
        # None parent: the root task has no spawning task.
        task = build_taskpool_task_from_descriptor(task_descriptor, None)
        job = Job(job_id, task, job_dir, JOB_CREATED, self, job_options)
        # Back-pointer so the task can reach its owning job.
        task.job = job
        self.add_job(job)
        ciel.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)
        return job
def create_job_for_task(self, task_descriptor, job_options, job_id=None):
    """Wrap a root task descriptor in a new Job and register it with the pool.

    The descriptor is updated in place: its 'task_id' becomes 'root:<job_id>'
    and, when it names no 'expected_outputs', a single default output
    '<job_id>:job_output' is supplied.

    Returns the newly created Job, already added to the pool.
    """
    with self._lock:
        if job_id is None:
            job_id = self.allocate_job_id()
        # A job's root task is identified by the job that owns it.
        task_descriptor['task_id'] = 'root:%s' % (job_id, )
        # TODO: Here is where we will set up the job journal, etc.
        job_dir = self.make_job_directory(job_id)
        # Default the job's outputs when the descriptor names none.
        if 'expected_outputs' not in task_descriptor:
            task_descriptor['expected_outputs'] = ['%s:job_output' % job_id]
        # The root task has no parent task.
        root_task = build_taskpool_task_from_descriptor(task_descriptor, None)
        new_job = Job(job_id, root_task, job_dir, JOB_CREATED, self, job_options)
        root_task.job = new_job
        self.add_job(new_job)
        ciel.log('Added job: %s' % new_job.id, 'JOB_POOL', logging.INFO)
        return new_job
def spawn_and_publish(self, spawns, refs, producer=None, taskset=None):
    """Apply a batch of spawns and reference publications to this graph.

    When a producer descriptor is given, the corresponding task is looked up
    and its taskset overrides the taskset argument; spawned tasks are built
    as children of that producer. All changes are staged in a single
    TaskGraphUpdate and committed atomically against self.
    """
    producer_task = None
    if producer is not None:
        producer_task = self.get_task(producer["task_id"])
        taskset = producer_task.taskset
    update = TaskGraphUpdate()
    for descriptor in spawns:
        update.spawn(build_taskpool_task_from_descriptor(descriptor, producer_task, taskset))
    for published_ref in refs:
        update.publish(published_ref, producer_task)
    update.commit(self)
def _report_tasks(self, report, toplevel_task, worker):
    # XXX SOS22 This does a damn sight more than just reporting the tasks!
    """Process a batched report from a worker and update the task graph.

    report is a list of (task_id, success, payload) records; the first
    record identifies the task the worker was assigned. On success, payload
    is (spawned, published, profiling) and the spawns/publications are
    staged into a TaskGraphUpdate committed at the end. On the first failed
    record, failure investigation is triggered and processing stops (the
    staged update is abandoned). Re-schedules in either case.
    """
    with self._lock:
        tx = TaskGraphUpdate()
        root_task = self.task_graph.get_task(report[0][0])
        ciel.log('Received report from task %s with %d entries' % (root_task.task_id, len(report)), 'SCHED', logging.DEBUG)
        try:
            self.workers[worker].deassign_task(root_task)
        except KeyError:
            # This can happen if we recieve the report after the worker is deemed to have failed. In this case, we should
            # accept the report and ignore the failed worker.
            pass
        for (parent_id, success, payload) in report:
            ciel.log('Processing report record from task %s' % (parent_id), 'SCHED', logging.DEBUG)
            parent_task = self.task_graph.get_task(parent_id)
            if success:
                ciel.log('Task %s was successful' % (parent_id), 'SCHED', logging.DEBUG)
                (spawned, published, profiling) = payload
                parent_task.set_profiling(profiling)
                parent_task.set_state(TASK_COMMITTED)
                self.record_task_stats(parent_task, worker)
                for child in spawned:
                    child_task = build_taskpool_task_from_descriptor(child, parent_task)
                    ciel.log('Task %s spawned task %s' % (parent_id, child_task.task_id), 'SCHED', logging.DEBUG)
                    tx.spawn(child_task)
                    #parent_task.children.append(child_task)
                for ref in published:
                    ciel.log('Task %s published reference %s' % (parent_id, str(ref)), 'SCHED', logging.DEBUG)
                    tx.publish(ref, parent_task)
            else:
                ciel.log('Task %s failed' % (parent_id), 'SCHED', logging.WARN)
                # Only one failed task per-report, at the moment.
                # NOTE(review): early return abandons tx, so spawns/publications
                # from earlier successful records in this report are dropped —
                # presumably the failure investigation re-runs them; confirm.
                self.investigate_task_failure(parent_task, payload)
                self.schedule()
                return
        # All records succeeded: commit the staged spawns/publications and
        # prune the graph down to what the job's outputs still need.
        tx.commit(self.task_graph)
        self.task_graph.reduce_graph_for_references(toplevel_task.expected_outputs)
    # XXX: Need to remove assigned task from worker(s).
    self.schedule()
def recover_job_descriptors(self):
    """Scan the journal root and re-create a Job for each journalled job.

    For every job directory under the journal root this reads the 'result'
    file (if any) and the head of the 'task_journal' to rebuild the root
    task, then re-adds the job to the job pool. Jobs with a recorded result
    are marked completed; unfinished jobs have the rest of their journal
    replayed via load_other_tasks_defer (which takes ownership of the open
    journal file). Recovery is best-effort per job: any error fails just
    that job and moves on.
    """
    root = self.job_pool.journal_root
    if root is None:
        # Journalling disabled: nothing to recover.
        return
    for job_id in os.listdir(root):
        try:
            job_dir = os.path.join(root, job_id)
            result_path = os.path.join(job_dir, 'result')
            if os.path.exists(result_path):
                with open(result_path, 'r') as result_file:
                    result = simplejson.load(result_file, object_hook=json_decode_object_hook)
            else:
                result = None
            # The journal starts with one 'T' record holding the root task
            # descriptor: fixed-size header (type, length), then the JSON body.
            journal_path = os.path.join(job_dir, 'task_journal')
            journal_file = open(journal_path, 'rb')
            record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
            root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
            assert record_type == 'T'
            assert len(root_task_descriptor_string) == root_task_descriptor_length
            root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
            root_task = build_taskpool_task_from_descriptor(root_task_descriptor, None)
            # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
            # FIXME: Store job options somewhere for recovered job.
            job = Job(job_id, root_task, job_dir, JOB_RECOVERED, self.job_pool, {}, journal=False)
            root_task.job = job
            if result is not None:
                with job._lock:
                    job.completed(result)
            self.job_pool.add_job(job)
            # Adding the job to the job pool should add the root task.
            #self.task_pool.add_task(root_task)
            if result is None:
                # Unfinished job: replay the remaining journal records.
                # load_other_tasks_defer takes over journal_file and closes it.
                self.load_other_tasks_defer(job, journal_file)
                ciel.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                ciel.log.error('Recovered task %s for job %s' % (root_task.task_id, job_id), 'RECOVERY', logging.INFO, False)
            else:
                journal_file.close()
                ciel.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)
        except Exception:
            # Was a bare 'except:', which would also have swallowed
            # SystemExit/KeyboardInterrupt and masked interpreter shutdown.
            # We have lost critical data for the job, so we must fail it.
            ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
            self.job_pool.add_failed_job(job_id)
def spawn_and_publish(self, spawns, refs, producer=None, taskset=None):
    """Stage and atomically commit a set of task spawns and ref publications.

    If producer is a task descriptor, the matching task object is resolved
    from this graph and its taskset replaces the taskset argument; otherwise
    both producer task and taskset are used as given.
    """
    if producer is None:
        producer_task = None
    else:
        producer_task = self.get_task(producer["task_id"])
        taskset = producer_task.taskset
    tx = TaskGraphUpdate()
    # Build all child task objects first, then stage them on the update.
    children = [build_taskpool_task_from_descriptor(spawn, producer_task, taskset)
                for spawn in spawns]
    for child in children:
        tx.spawn(child)
    for ref in refs:
        tx.publish(ref, producer_task)
    tx.commit(self)
def add_task(self, task_descriptor, parent_task=None, job=None, may_reduce=True):
    """Build a task from a descriptor and add it to the lazy task pool.

    :param task_descriptor: mutable dict describing the task; when it lacks
        a 'task_id', one is generated and written back into it.
    :param parent_task: task that spawned this one, or None for a root task.
    :param job: Job this task belongs to; stored on the task object.
    :param may_reduce: forwarded to the task pool to control whether adding
        this task may trigger graph reduction.
    :return: the newly built task object.
    """
    try:
        task_id = task_descriptor['task_id']
    except KeyError:
        # Was a bare 'except:', which would also have swallowed unrelated
        # errors (including KeyboardInterrupt); only a missing key should
        # trigger id generation.
        task_id = self.generate_task_id()
        task_descriptor['task_id'] = task_id
    task = build_taskpool_task_from_descriptor(task_descriptor, parent_task)
    task.job = job
    # A task with no parent is a root task.
    self.lazy_task_pool.add_task(task, parent_task is None, may_reduce)
    return task
def load_other_tasks_for_job(self, job, journal_file):
    ''' Replay the task journal for a recovered job.

    Reads fixed-size record headers (type, length) followed by a JSON body:
    'R' records re-publish a reference into the job's task graph; 'T'
    records rebuild and re-spawn a task under its journalled parent.
    Truncated or malformed journals end the replay with a warning rather
    than failing the job. Always closes journal_file and restarts
    journalling on the job.
    '''
    try:
        while True:
            record_header = journal_file.read(RECORD_HEADER_STRUCT.size)
            if len(record_header) != RECORD_HEADER_STRUCT.size:
                # Short read on the header: journal ends mid-record.
                ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                # XXX: Need to truncate the journal file.
                break
            record_type, record_length = RECORD_HEADER_STRUCT.unpack(record_header)
            record_string = journal_file.read(record_length)
            if len(record_string) != record_length:
                # Header was complete but the body is short: also truncated.
                ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                # XXX: Need to truncate the journal file.
                break
            rec = simplejson.loads(record_string, object_hook=json_decode_object_hook)
            if record_type == 'R':
                # Reference publication record.
                job.task_graph.publish(rec['ref'])
            elif record_type == 'T':
                # Task spawn record: rebuild the task under its parent.
                task_id = rec['task_id']
                parent_task = job.task_graph.get_task(rec['parent'])
                task = build_taskpool_task_from_descriptor(rec, parent_task)
                task.job = job
                task.parent.children.append(task)
                ciel.log.error('Recovered task %s for job %s' % (task_id, job.id), 'RECOVERY', logging.INFO, False)
                job.task_graph.spawn(task)
            else:
                ciel.log.error('Got invalid record type in job %s' % job.id, 'RECOVERY', logging.WARNING, False)
    except:
        # Deliberate blanket catch: journal replay is best-effort, and the
        # True flag logs the traceback for diagnosis.
        ciel.log.error('Error recovering task_journal for job %s' % job.id, 'RECOVERY', logging.WARNING, True)
    finally:
        journal_file.close()
    job.restart_journalling()
    if job.state == JOB_ACTIVE:
        ciel.log.error('Restarting recovered job %s' % job.id, 'RECOVERY', logging.INFO)
def _report_tasks(self, report, toplevel_task, worker):
    """Ingest a worker's batched task report and update the task graph.

    Each report record is (task_id, success, payload). Successful records
    have payload (spawned, published, profiling); their spawns and
    publications are staged on a TaskGraphUpdate that is committed once the
    whole report has been processed, after which the graph is reduced to the
    toplevel task's expected outputs. The first failing record triggers
    failure investigation and stops processing. Re-schedules afterwards.
    """
    with self._lock:
        graph_tx = TaskGraphUpdate()
        reporting_task = self.task_graph.get_task(report[0][0])
        ciel.log('Received report from task %s with %d entries' % (reporting_task.task_id, len(report)), 'SCHED', logging.DEBUG)
        try:
            self.workers[worker].deassign_task(reporting_task)
        except KeyError:
            # The report may arrive after the worker was declared failed;
            # accept it anyway and ignore the missing worker entry.
            pass
        for (record_id, succeeded, payload) in report:
            ciel.log('Processing report record from task %s' % (record_id), 'SCHED', logging.DEBUG)
            recorded_task = self.task_graph.get_task(record_id)
            if not succeeded:
                ciel.log('Task %s failed' % (record_id), 'SCHED', logging.WARN)
                # Only one failed task per-report, at the moment.
                self.investigate_task_failure(recorded_task, payload)
                self.schedule()
                return
            ciel.log('Task %s was successful' % (record_id), 'SCHED', logging.DEBUG)
            (spawned, published, profiling) = payload
            recorded_task.set_profiling(profiling)
            recorded_task.set_state(TASK_COMMITTED)
            self.record_task_stats(recorded_task, worker)
            for spawn_descriptor in spawned:
                spawned_task = build_taskpool_task_from_descriptor(spawn_descriptor, recorded_task)
                ciel.log('Task %s spawned task %s' % (record_id, spawned_task.task_id), 'SCHED', logging.DEBUG)
                graph_tx.spawn(spawned_task)
            for published_ref in published:
                ciel.log('Task %s published reference %s' % (record_id, str(published_ref)), 'SCHED', logging.DEBUG)
                graph_tx.publish(published_ref, recorded_task)
        graph_tx.commit(self.task_graph)
        self.task_graph.reduce_graph_for_references(toplevel_task.expected_outputs)
    # XXX: Need to remove assigned task from worker(s).
    self.schedule()