Example #1
    def create_job_for_task(self, task_descriptor, job_options, job_id=None):
        """
        Convert a task descriptor into a job: allocate a new job id,
        create a Job, and add it to the JobPool.

        This is always called with job_id=None; it is unclear why the
        argument exists at all.
        """
        with self._lock:
        
            if job_id is None:
                job_id = self.allocate_job_id()
            task_id = 'root:%s' % (job_id, )
    
            task_descriptor['task_id'] = task_id
    
            # TODO: Here is where we will set up the job journal, etc.
            job_dir = self.make_job_directory(job_id)
            
            try:
                expected_outputs = task_descriptor['expected_outputs']
            except KeyError:
                expected_outputs = ['%s:job_output' % job_id]
                task_descriptor['expected_outputs'] = expected_outputs
                
            task = build_taskpool_task_from_descriptor(task_descriptor, None)
            job = Job(job_id, task, job_dir, JOB_CREATED, self, job_options)
            task.job = job
            
            self.add_job(job)
            
            ciel.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)
    
            return job
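
A minimal usage sketch for create_job_for_task above. The job_pool instance and the descriptor fields other than task_id and expected_outputs (which the method fills in itself) are assumptions for illustration, not values taken from the example.

    # Hypothetical caller; 'job_pool' is an existing JobPool-like object and the
    # descriptor fields below are illustrative assumptions.
    task_descriptor = {'handler': 'swi', 'dependencies': []}
    job = job_pool.create_job_for_task(task_descriptor, job_options={})
    # create_job_for_task allocates a job id, rewrites task_descriptor['task_id']
    # to 'root:<job_id>', and defaults expected_outputs to ['<job_id>:job_output'].
    assert task_descriptor['task_id'] == 'root:%s' % job.id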
Example #2
    def create_job_for_task(self, task_descriptor, job_options, job_id=None):

        with self._lock:

            if job_id is None:
                job_id = self.allocate_job_id()
            task_id = 'root:%s' % (job_id, )

            task_descriptor['task_id'] = task_id

            # TODO: Here is where we will set up the job journal, etc.
            job_dir = self.make_job_directory(job_id)

            try:
                expected_outputs = task_descriptor['expected_outputs']
            except KeyError:
                expected_outputs = ['%s:job_output' % job_id]
                task_descriptor['expected_outputs'] = expected_outputs

            task = build_taskpool_task_from_descriptor(task_descriptor, None)
            job = Job(job_id, task, job_dir, JOB_CREATED, self, job_options)
            task.job = job

            self.add_job(job)

            ciel.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)

            return job
Example #3
 def create_job_for_task(self, task_descriptor, job_options, job_id=None):
     
     with self._lock:
     
         if job_id is None:
             job_id = self.allocate_job_id()
         task_id = 'root:%s' % (job_id, )
 
         task_descriptor['task_id'] = task_id
 
         # TODO: Here is where we will set up the job journal, etc.
         job_dir = self.make_job_directory(job_id)
         
         try:
             expected_outputs = task_descriptor['expected_outputs']
         except KeyError:
             expected_outputs = ['%s:job_output' % job_id]
             task_descriptor['expected_outputs'] = expected_outputs
             
         task = build_taskpool_task_from_descriptor(task_descriptor, None)
         job = Job(job_id, task, job_dir, JOB_CREATED, self, job_options)
         task.job = job
         
         self.add_job(job)
         
         ciel.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)
 
         return job
Example #4
 def spawn_and_publish(self, spawns, refs, producer=None, taskset=None):
     
     producer_task = None
     if producer is not None:
         producer_task = self.get_task(producer["task_id"])
         taskset = producer_task.taskset
     upd = TaskGraphUpdate()
     for spawn in spawns:
         task_object = build_taskpool_task_from_descriptor(spawn, producer_task, taskset)
         upd.spawn(task_object)
     for ref in refs:
         upd.publish(ref, producer_task)
     upd.commit(self)
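
spawn_and_publish batches new task descriptors and published references into a single TaskGraphUpdate and commits it against the graph. A hedged sketch of a call; task_graph, child_descriptor and output_ref are assumed to exist and are not taken from the example.

    # Hypothetical invocation; the producer dict only needs a 'task_id' key,
    # which the method resolves via self.get_task().
    task_graph.spawn_and_publish(
        spawns=[child_descriptor],           # descriptors of newly spawned tasks
        refs=[output_ref],                   # references produced by the parent task
        producer={'task_id': 'root:job-0'})  # assumed id of the producing task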
Example #5
    def _report_tasks(self, report, toplevel_task, worker):
        # XXX SOS22 This does a damn sight more than just reporting the tasks!
        with self._lock:
    
            tx = TaskGraphUpdate()
            
            root_task = self.task_graph.get_task(report[0][0])
            
            ciel.log('Received report from task %s with %d entries' % (root_task.task_id, len(report)), 'SCHED', logging.DEBUG)
            
            try:
                self.workers[worker].deassign_task(root_task)
            except KeyError:
                # This can happen if we receive the report after the worker is deemed to have failed. In this case, we should
                # accept the report and ignore the failed worker.
                pass

            for (parent_id, success, payload) in report:
                
                ciel.log('Processing report record from task %s' % (parent_id), 'SCHED', logging.DEBUG)
                
                parent_task = self.task_graph.get_task(parent_id)
                
                if success:
                    ciel.log('Task %s was successful' % (parent_id), 'SCHED', logging.DEBUG)
                    (spawned, published, profiling) = payload
                    parent_task.set_profiling(profiling)
                    parent_task.set_state(TASK_COMMITTED)
                    self.record_task_stats(parent_task, worker)
                    for child in spawned:
                        child_task = build_taskpool_task_from_descriptor(child, parent_task)
                        ciel.log('Task %s spawned task %s' % (parent_id, child_task.task_id), 'SCHED', logging.DEBUG)
                        tx.spawn(child_task)
                        #parent_task.children.append(child_task)
                    
                    for ref in published:
                        ciel.log('Task %s published reference %s' % (parent_id, str(ref)), 'SCHED', logging.DEBUG)
                        tx.publish(ref, parent_task)
                
                else:
                    ciel.log('Task %s failed' % (parent_id), 'SCHED', logging.WARN)
                    # Only one failed task per report, at the moment.
                    self.investigate_task_failure(parent_task, payload)
                    self.schedule()
                    return
                    
            tx.commit(self.task_graph)
            self.task_graph.reduce_graph_for_references(toplevel_task.expected_outputs)
            
        # XXX: Need to remove assigned task from worker(s).
        self.schedule()
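
The report argument processed by _report_tasks is a list of (task_id, success, payload) tuples, and a successful payload unpacks into (spawned, published, profiling). A sketch of that shape; scheduler, toplevel_task, worker and output_ref are assumed to exist, and every concrete value is illustrative.

    # Illustrative report structure only; the ids, descriptor, reference and
    # profiling dictionary are assumptions rather than values from a worker.
    child_descriptor = {'task_id': 'task-1', 'handler': 'swi'}
    report = [('root:job-0',                 # id of the reporting (parent) task
               True,                         # success flag
               ([child_descriptor],          # spawned task descriptors
                [output_ref],                # published references (assumed)
                {'STARTED': 0.0}))]          # assumed profiling dictionary
    scheduler._report_tasks(report, toplevel_task, worker)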
Example #6
    def recover_job_descriptors(self):
        root = self.job_pool.journal_root
        if root is None:
            return
        
        for job_id in os.listdir(root):

            try:
                job_dir = os.path.join(root, job_id)
                result_path = os.path.join(job_dir, 'result')
                if os.path.exists(result_path):
                    with open(result_path, 'r') as result_file:
                        result = simplejson.load(result_file, object_hook=json_decode_object_hook)
                else:
                    result = None
                    
                journal_path = os.path.join(job_dir, 'task_journal')
                journal_file = open(journal_path, 'rb')
                record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
                root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
                assert record_type == 'T'
                assert len(root_task_descriptor_string) == root_task_descriptor_length
                root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
                root_task = build_taskpool_task_from_descriptor(root_task_descriptor, None)
                
                # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
                # FIXME: Store job options somewhere for recovered job.
                job = Job(job_id, root_task, job_dir, JOB_RECOVERED, self.job_pool, {}, journal=False)
                
                root_task.job = job
                if result is not None:
                    with job._lock:
                        job.completed(result)
                self.job_pool.add_job(job)
                # Adding the job to the job pool should add the root task.
                #self.task_pool.add_task(root_task)
                
                if result is None:
                    self.load_other_tasks_defer(job, journal_file)
                    ciel.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                    ciel.log.error('Recovered task %s for job %s' % (root_task.task_id, job_id), 'RECOVERY', logging.INFO, False)
                else:
                    journal_file.close()
                    ciel.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)
                
                
            except:
                # We have lost critical data for the job, so we must fail it.
                ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
                self.job_pool.add_failed_job(job_id)
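
recover_job_descriptors reads the journal as framed records: a fixed-size header unpacked with RECORD_HEADER_STRUCT (a record-type character plus a body length) followed by a JSON body, with the first record expected to be the root task descriptor of type 'T'. A sketch of a matching writer, assuming Python 2 as in the examples and a header format such as '!cI'; the real format string is not shown above and may differ.

    import struct
    import simplejson

    # Assumed framing: one record-type byte plus an unsigned 32-bit length.
    RECORD_HEADER_STRUCT = struct.Struct('!cI')

    def write_record(journal_file, record_type, record):
        payload = simplejson.dumps(record)
        journal_file.write(RECORD_HEADER_STRUCT.pack(record_type, len(payload)))
        journal_file.write(payload)

    # The recovery loop expects the first record to be the root task descriptor.
    with open('task_journal', 'wb') as journal_file:
        write_record(journal_file, 'T', {'task_id': 'root:job-0', 'handler': 'swi'})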
Example #7
    def spawn_and_publish(self, spawns, refs, producer=None, taskset=None):

        producer_task = None
        if producer is not None:
            producer_task = self.get_task(producer["task_id"])
            taskset = producer_task.taskset
        upd = TaskGraphUpdate()
        for spawn in spawns:
            task_object = build_taskpool_task_from_descriptor(
                spawn, producer_task, taskset)
            upd.spawn(task_object)
        for ref in refs:
            upd.publish(ref, producer_task)
        upd.commit(self)
Example #8
    def recover_job_descriptors(self):
        root = self.job_pool.journal_root
        if root is None:
            return
        
        for job_id in os.listdir(root):

            try:
                job_dir = os.path.join(root, job_id)
                result_path = os.path.join(job_dir, 'result')
                if os.path.exists(result_path):
                    with open(result_path, 'r') as result_file:
                        result = simplejson.load(result_file, object_hook=json_decode_object_hook)
                else:
                    result = None
                    
                journal_path = os.path.join(job_dir, 'task_journal')
                journal_file = open(journal_path, 'rb')
                record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
                root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
                assert record_type == 'T'
                assert len(root_task_descriptor_string) == root_task_descriptor_length
                root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
                root_task = build_taskpool_task_from_descriptor(root_task_descriptor, None)
                
                # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
                # FIXME: Store job options somewhere for recovered job.
                job = Job(job_id, root_task, job_dir, JOB_RECOVERED, self.job_pool, {}, journal=False)
                
                root_task.job = job
                if result is not None:
                    with job._lock:
                        job.completed(result)
                self.job_pool.add_job(job)
                # Adding the job to the job pool should add the root task.
                #self.task_pool.add_task(root_task)
                
                if result is None:
                    self.load_other_tasks_defer(job, journal_file)
                    ciel.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                    ciel.log.error('Recovered task %s for job %s' % (root_task.task_id, job_id), 'RECOVERY', logging.INFO, False)
                else:
                    journal_file.close()
                    ciel.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)
                
                
            except:
                # We have lost critical data for the job, so we must fail it.
                ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
                self.job_pool.add_failed_job(job_id)
Example #9
    def add_task(self, task_descriptor, parent_task=None, job=None, may_reduce=True):
        try:
            task_id = task_descriptor['task_id']
        except:
            task_id = self.generate_task_id()
            task_descriptor['task_id'] = task_id
        
        task = build_taskpool_task_from_descriptor(task_descriptor, parent_task)
        task.job = job
        
        self.lazy_task_pool.add_task(task, parent_task is None, may_reduce)
        
        #add_event = self.new_event(task)
        #add_event["task_descriptor"] = task.as_descriptor(long=True)
        #add_event["action"] = "CREATED"
    
        #self.events.append(add_event)

        return task
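
A hedged sketch of submitting a descriptor through add_task. If the descriptor carries no task_id, the method generates one before building the task; task_pool and the descriptor fields are assumptions for illustration.

    # Hypothetical submission; 'task_pool' and the descriptor contents are assumed.
    descriptor = {'handler': 'swi', 'dependencies': []}   # deliberately no 'task_id'
    task = task_pool.add_task(descriptor, parent_task=None, job=None)
    # add_task has filled in a generated id and handed the task to the lazy pool.
    assert descriptor['task_id'] == task.task_id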
Example #10
    def load_other_tasks_for_job(self, job, journal_file):
        '''
        Process the task journal for a recovered job.
        '''
        try:
            while True:
                record_header = journal_file.read(RECORD_HEADER_STRUCT.size)
                if len(record_header) != RECORD_HEADER_STRUCT.size:
                    ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                    # XXX: Need to truncate the journal file.
                    break
                record_type, record_length = RECORD_HEADER_STRUCT.unpack(record_header)
                record_string = journal_file.read(record_length)
                if len(record_string) != record_length:
                    ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                    # XXX: Need to truncate the journal file.
                    break
                rec = simplejson.loads(record_string, object_hook=json_decode_object_hook)
                if record_type == 'R':
                    job.task_graph.publish(rec['ref'])
                elif record_type == 'T':
                    task_id = rec['task_id']
                    parent_task = job.task_graph.get_task(rec['parent'])
                    task = build_taskpool_task_from_descriptor(rec, parent_task)
                    task.job = job
                    task.parent.children.append(task)
    
                    ciel.log.error('Recovered task %s for job %s' % (task_id, job.id), 'RECOVERY', logging.INFO, False)
                    job.task_graph.spawn(task)
                else:
                    ciel.log.error('Got invalid record type in job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                
        except:
            ciel.log.error('Error recovering task_journal for job %s' % job.id, 'RECOVERY', logging.WARNING, True)

        finally:
            journal_file.close()
            job.restart_journalling()
            if job.state == JOB_ACTIVE:
                ciel.log.error('Restarting recovered job %s' % job.id, 'RECOVERY', logging.INFO)
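
The loop in load_other_tasks_for_job recognises two record types: 'R' records carry a single reference under the key 'ref', published straight into job.task_graph, while 'T' records carry a full task descriptor whose 'parent' field must name a task already in the graph. A sketch of the 'T' record body, with all field values assumed for illustration.

    # Illustrative journal record body; the ids and handler are assumptions.
    task_record = {'task_id': 'task-1',      # consumed when record_type == 'T'
                   'parent': 'root:job-0',   # must already exist in job.task_graph
                   'handler': 'swi'}         # remaining task-descriptor fields
    # A type 'R' record is a dict with a single 'ref' entry, passed directly to
    # job.task_graph.publish(rec['ref']); the serialised reference format is not
    # shown in the example, so it is omitted here.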
Example #11
    def load_other_tasks_for_job(self, job, journal_file):
        '''
        Process the task journal for a recovered job.
        '''
        try:
            while True:
                record_header = journal_file.read(RECORD_HEADER_STRUCT.size)
                if len(record_header) != RECORD_HEADER_STRUCT.size:
                    ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                    # XXX: Need to truncate the journal file.
                    break
                record_type, record_length = RECORD_HEADER_STRUCT.unpack(record_header)
                record_string = journal_file.read(record_length)
                if len(record_string) != record_length:
                    ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                    # XXX: Need to truncate the journal file.
                    break
                rec = simplejson.loads(record_string, object_hook=json_decode_object_hook)
                if record_type == 'R':
                    job.task_graph.publish(rec['ref'])
                elif record_type == 'T':
                    task_id = rec['task_id']
                    parent_task = job.task_graph.get_task(rec['parent'])
                    task = build_taskpool_task_from_descriptor(rec, parent_task)
                    task.job = job
                    task.parent.children.append(task)
    
                    ciel.log.error('Recovered task %s for job %s' % (task_id, job.id), 'RECOVERY', logging.INFO, False)
                    job.task_graph.spawn(task)
                else:
                    ciel.log.error('Got invalid record type in job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                
        except:
            ciel.log.error('Error recovering task_journal for job %s' % job.id, 'RECOVERY', logging.WARNING, True)

        finally:
            journal_file.close()
            job.restart_journalling()
            if job.state == JOB_ACTIVE:
                ciel.log.error('Restarting recovered job %s' % job.id, 'RECOVERY', logging.INFO)
Example #12
    def add_task(self,
                 task_descriptor,
                 parent_task=None,
                 job=None,
                 may_reduce=True):
        try:
            task_id = task_descriptor['task_id']
        except:
            task_id = self.generate_task_id()
            task_descriptor['task_id'] = task_id

        task = build_taskpool_task_from_descriptor(task_descriptor,
                                                   parent_task)
        task.job = job

        self.lazy_task_pool.add_task(task, parent_task is None, may_reduce)

        #add_event = self.new_event(task)
        #add_event["task_descriptor"] = task.as_descriptor(long=True)
        #add_event["action"] = "CREATED"

        #self.events.append(add_event)

        return task
Example #13
    def _report_tasks(self, report, toplevel_task, worker):
        with self._lock:

            tx = TaskGraphUpdate()

            root_task = self.task_graph.get_task(report[0][0])

            ciel.log(
                'Received report from task %s with %d entries' %
                (root_task.task_id, len(report)), 'SCHED', logging.DEBUG)

            try:
                self.workers[worker].deassign_task(root_task)
            except KeyError:
                # This can happen if we receive the report after the worker is deemed to have failed. In this case, we should
                # accept the report and ignore the failed worker.
                pass

            for (parent_id, success, payload) in report:

                ciel.log('Processing report record from task %s' % (parent_id),
                         'SCHED', logging.DEBUG)

                parent_task = self.task_graph.get_task(parent_id)

                if success:
                    ciel.log('Task %s was successful' % (parent_id), 'SCHED',
                             logging.DEBUG)
                    (spawned, published, profiling) = payload
                    parent_task.set_profiling(profiling)
                    parent_task.set_state(TASK_COMMITTED)
                    self.record_task_stats(parent_task, worker)
                    for child in spawned:
                        child_task = build_taskpool_task_from_descriptor(
                            child, parent_task)
                        ciel.log(
                            'Task %s spawned task %s' %
                            (parent_id, child_task.task_id), 'SCHED',
                            logging.DEBUG)
                        tx.spawn(child_task)
                        #parent_task.children.append(child_task)

                    for ref in published:
                        ciel.log(
                            'Task %s published reference %s' %
                            (parent_id, str(ref)), 'SCHED', logging.DEBUG)
                        tx.publish(ref, parent_task)

                else:
                    ciel.log('Task %s failed' % (parent_id), 'SCHED',
                             logging.WARN)
                    # Only one failed task per report, at the moment.
                    self.investigate_task_failure(parent_task, payload)
                    self.schedule()
                    return

            tx.commit(self.task_graph)
            self.task_graph.reduce_graph_for_references(
                toplevel_task.expected_outputs)

        # XXX: Need to remove assigned task from worker(s).
        self.schedule()