Example #1
    def report_tasks(self, report, toplevel_task_id):

        task = self.task_graph.get_task(toplevel_task_id)

        tx = TaskGraphUpdate()
        
        for (parent_id, success, payload) in report:
            parent_task = self.task_graph.get_task(parent_id)
            if success:
                (spawned, published) = payload
                
                for child in spawned:
                    child_task = build_taskpool_task_from_descriptor(child, parent_task)
                    tx.spawn(child_task)
                    parent_task.children.append(child_task)
                
                for ref in published:
                    tx.publish(ref, parent_task)
            
            else:
                # Only one failed task per-report, at the moment.
                self.investigate_task_failure(parent_task, payload)
                self.lazy_task_pool.worker_pool.worker_idle(toplevel_task_id.worker)
                ciel.engine.publish('schedule')
                return
                
        tx.commit(self.task_graph)
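
For reference, the report consumed by report_tasks above is a list of (parent_id, success, payload) tuples, where a successful payload is a pair of spawned child descriptors and published references. A minimal sketch of that shape, using made-up identifiers and plain dicts in place of real CIEL descriptor and reference objects:

# Hypothetical report for one successfully-committed task; the IDs, the
# child descriptor and the reference are placeholders, not real CIEL objects.
child_descriptor = {'task_id': 'task:child-0', 'dependencies': []}
published_ref = {'id': 'ref:output-0'}

report = [
    ('task:parent-0',                          # parent_id
     True,                                     # success
     ([child_descriptor], [published_ref])),   # payload = (spawned, published)
]

# A failed task reports success=False with an error payload instead; the
# method above investigates the first failure and returns immediately.
failure_report = [('task:parent-0', False, {'reason': 'WORKER_FAILED'})]
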
Example #2
    def create_job_for_task(self, task_descriptor, job_id=None):
        
        if job_id is None:
            job_id = self.allocate_job_id()
        task_id = 'root:%s' % (job_id, )

        # TODO: Here is where we will set up the job journal, etc.
        job_dir = self.make_job_directory(job_id)
        
        # TODO: Remove the global name directory dependency.
        try:
            expected_outputs = task_descriptor['expected_outputs']
            for output in expected_outputs:
                self.global_name_directory.create_global_id(task_id, output)
        except KeyError:
            try:
                num_outputs = task_descriptor['num_outputs']
                expected_outputs = map(lambda x: self.global_name_directory.create_global_id(task_id), range(0, num_outputs))
            except:
                expected_outputs = [self.global_name_directory.create_global_id()]
            task_descriptor['expected_outputs'] = expected_outputs
            
        task = build_taskpool_task_from_descriptor(task_id, task_descriptor, self, None)
        job = Job(job_id, task, job_dir)
        task.job = job
        
        self.add_job(job)
        
        cherrypy.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)

        return job
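
The only descriptor keys this method reads directly are expected_outputs and, failing that, num_outputs; everything else is handed to build_taskpool_task_from_descriptor unchanged. A hedged sketch of the two descriptor shapes it accepts (field values are illustrative, not taken from CIEL):

# Caller supplies the output names itself ...
descriptor_with_outputs = {
    'expected_outputs': ['root:job-0:output'],
}

# ... or just a count, and the global name directory allocates the IDs.
descriptor_with_count = {
    'num_outputs': 1,
}

# job = job_pool.create_job_for_task(descriptor_with_outputs)
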
Example #3
    def add_task(self, task_descriptor, parent_task_id=None):
        with self._lock:
            try:
                task_id = task_descriptor['task_id']
            except KeyError:
                task_id = self.generate_task_id()
            
            task = build_taskpool_task_from_descriptor(task_id, task_descriptor, self, parent_task_id)
            self.tasks[task_id] = task
            add_event = self.new_event(task)
            add_event["task_descriptor"] = task.as_descriptor(long=True)
            add_event["action"] = "CREATED"
        
            task.check_dependencies(self.global_name_directory)

            if task.is_blocked():
                for global_id in task.blocked_on():
                    try:
                        self.references_blocking_tasks[global_id].add(task_id)
                    except KeyError:
                        self.references_blocking_tasks[global_id] = set([task_id])
            else:
                task.state = TASK_RUNNABLE
                self.add_task_to_queues(task)

            self.events.append(add_event)
                
        self.bus.publish('schedule')
        return task
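
The try/except KeyError above maintains a plain dict of sets mapping each blocking reference ID to the tasks waiting on it. The same bookkeeping can be written with collections.defaultdict; a standalone sketch with made-up IDs:

from collections import defaultdict

# global reference ID -> set of task IDs blocked on that reference
references_blocking_tasks = defaultdict(set)

references_blocking_tasks['ref:a'].add('task:1')
references_blocking_tasks['ref:a'].add('task:2')
references_blocking_tasks['ref:b'].add('task:1')

# When 'ref:a' is published, the waiting tasks can be fetched and re-checked.
unblocked = references_blocking_tasks.pop('ref:a', set())
print(sorted(unblocked))   # ['task:1', 'task:2']
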
Example #4
 def create_job_for_task(self, task_descriptor, job_options, job_id=None):
     
     with self._lock:
     
         if job_id is None:
             job_id = self.allocate_job_id()
         task_id = 'root:%s' % (job_id, )
 
         task_descriptor['task_id'] = task_id
 
         # TODO: Here is where we will set up the job journal, etc.
         job_dir = self.make_job_directory(job_id)
         
         try:
             expected_outputs = task_descriptor['expected_outputs']
         except KeyError:
             expected_outputs = ['%s:job_output' % job_id]
             task_descriptor['expected_outputs'] = expected_outputs
             
         task = build_taskpool_task_from_descriptor(task_descriptor, None)
         job = Job(job_id, task, job_dir, JOB_CREATED, self, job_options)
         task.job = job
         
         self.add_job(job)
         
         ciel.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)
 
         return job
Example #5
    def create_job_for_task(self, task_descriptor, job_options, job_id=None):

        with self._lock:

            if job_id is None:
                job_id = self.allocate_job_id()
            task_id = 'root:%s' % (job_id, )

            task_descriptor['task_id'] = task_id

            # TODO: Here is where we will set up the job journal, etc.
            job_dir = self.make_job_directory(job_id)

            try:
                expected_outputs = task_descriptor['expected_outputs']
            except KeyError:
                expected_outputs = ['%s:job_output' % job_id]
                task_descriptor['expected_outputs'] = expected_outputs

            task = build_taskpool_task_from_descriptor(task_descriptor, None)
            job = Job(job_id, task, job_dir, JOB_CREATED, self, job_options)
            task.job = job

            self.add_job(job)

            ciel.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)

            return job
Example #6
def allinone_main(options, args):
    
    ciel.log = CielLogger()
    
    script_filename = args[0]
    run_id = args[1] if len(args) > 1 else 'allinone'
    
    if options.blockstore is not None:
        base_dir = options.blockstore
    else:
        base_dir = tempfile.mkdtemp(prefix=os.getenv('TEMP', default='/tmp/sw-files-'))
        options.blockstore = base_dir
    ciel.log('Writing block store files to %s' % base_dir, 'ALLINONE', logging.INFO)
        
    block_store = BlockStore(ciel.engine, 'localhost', 8000, base_dir, True)
    
    initial_task_descriptor, cont_ref = build_initial_task_descriptor(script_filename, block_store, 'root', 'root_cont', 'root_output')
        
    initial_task_object = build_taskpool_task_from_descriptor(initial_task_descriptor, None)
    
    task_runner = TaskRunner(initial_task_object, cont_ref, block_store, options)
    
    try:
        print run_id, 'SUBMITTED_JOB', now_as_timestamp()
        result = task_runner.run()
        print run_id, 'GOT_RESULT', now_as_timestamp()
        print block_store.retrieve_object_for_ref(result, 'json')
        
    except:
        pass
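
A minimal sketch of driving allinone_main from a command line, assuming an optparse-style options object; only the blockstore option is implied by the code above, and the script filename is hypothetical:

from optparse import OptionParser

parser = OptionParser(usage='%prog SCRIPT_FILENAME [RUN_ID]')
parser.add_option('--blockstore', dest='blockstore', default=None,
                  help='reuse an existing block store directory')

# Hypothetical invocation: a script to run plus an optional run label.
options, args = parser.parse_args(['example_script.sw', 'bench-1'])
allinone_main(options, args)
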
Example #7
    def report_tasks(self, report, toplevel_task_id):

        task = self.task_graph.get_task(toplevel_task_id)

        tx = TaskGraphUpdate()

        for (parent_id, success, payload) in report:
            parent_task = self.task_graph.get_task(parent_id)
            if success:
                (spawned, published) = payload

                for child in spawned:
                    child_task = build_taskpool_task_from_descriptor(
                        child, parent_task)
                    tx.spawn(child_task)
                    parent_task.children.append(child_task)

                for ref in published:
                    tx.publish(ref, parent_task)

            else:
                # Only one failed task per-report, at the moment.
                self.investigate_task_failure(parent_task, payload)
                self.lazy_task_pool.worker_pool.worker_idle(
                    toplevel_task_id.worker)
                ciel.engine.publish('schedule')
                return

        tx.commit(self.task_graph)
Example #8
def allinone_main(options, args):

    ciel.log = CielLogger()

    script_filename = args[0]
    run_id = args[1] if len(args) > 1 else "allinone"

    base_dir = tempfile.mkdtemp(prefix=os.getenv("TEMP", default="/tmp/sw-files-"))
    ciel.log("Writing block store files to %s" % base_dir, "ALLINONE", logging.INFO)

    if options.blockstore is not None:
        base_dir = options.blockstore
    else:
        base_dir = tempfile.mkdtemp(prefix=os.getenv("TEMP", default="/tmp/sw-files-"))
        options.blockstore = base_dir

    block_store = BlockStore(ciel.engine, "localhost", 8000, base_dir, True)

    initial_task_descriptor, cont_ref = build_initial_task_descriptor(
        script_filename, block_store, "root", "root_cont", "root_output"
    )

    initial_task_object = build_taskpool_task_from_descriptor(initial_task_descriptor, None)

    task_runner = TaskRunner(initial_task_object, cont_ref, block_store, options)

    try:
        print run_id, "SUBMITTED_JOB", now_as_timestamp()
        result = task_runner.run()
        print run_id, "GOT_RESULT", now_as_timestamp()
        print block_store.retrieve_object_for_ref(result, "json", None)

    except:
        pass
Example #9
 def spawn_tasks(self, parent_task_id, tasks):
     parent_task = self.task_graph.get_task(parent_task_id)
     
     tx = TaskGraphUpdate()
     
     for task_descriptor in tasks:
         task_object = build_taskpool_task_from_descriptor(task_descriptor, None, parent_task)
         tx.spawn(task_object)
     
     tx.commit(self.task_graph)
Example #10
    def spawn_tasks(self, parent_task_id, tasks):
        parent_task = self.task_graph.get_task(parent_task_id)

        tx = TaskGraphUpdate()

        for task_descriptor in tasks:
            task_object = build_taskpool_task_from_descriptor(
                task_descriptor, None, parent_task)
            tx.spawn(task_object)

        tx.commit(self.task_graph)
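
Both spawn_tasks variants follow the same transactional pattern: operations are buffered on a TaskGraphUpdate and only applied to the graph on commit. A simplified stand-in (not the real CIEL class) that illustrates the shape of that pattern:

class ToyGraphUpdate(object):
    """Toy illustration of the buffer-then-commit update pattern."""
    def __init__(self):
        self._spawned = []
        self._published = []

    def spawn(self, task):
        self._spawned.append(task)

    def publish(self, ref, producer=None):
        self._published.append((ref, producer))

    def commit(self, graph):
        # Apply every buffered operation to the graph in one pass.
        graph.setdefault('tasks', []).extend(self._spawned)
        graph.setdefault('refs', []).extend(r for r, _ in self._published)

graph = {}
tx = ToyGraphUpdate()
tx.spawn({'task_id': 'task:child-0'})   # hypothetical descriptor
tx.publish({'id': 'ref:output-0'})      # hypothetical reference
tx.commit(graph)
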
Example #11
    def load_other_tasks_for_job(self, job, journal_file):
        '''
        Process the task journal for a recovered job.
        '''
        try:
            while True:
                record_header = journal_file.read(RECORD_HEADER_STRUCT.size)
                if len(record_header) != RECORD_HEADER_STRUCT.size:
                    ciel.log.error(
                        'Journal entry truncated for job %s' % job.id,
                        'RECOVERY', logging.WARNING, False)
                    break
                record_type, record_length = RECORD_HEADER_STRUCT.unpack(
                    record_header)
                record_string = journal_file.read(record_length)
                if len(record_string) != record_length:
                    ciel.log.error(
                        'Journal entry truncated for job %s' % job.id,
                        'RECOVERY', logging.WARNING, False)
                    break
                rec = simplejson.loads(record_string,
                                       object_hook=json_decode_object_hook)
                if record_type == 'R':
                    self.task_pool.publish_single_ref(rec['id'],
                                                      rec['ref'],
                                                      job,
                                                      should_journal=False)
                elif record_type == 'T':
                    task_id = rec['task_id']
                    parent_task = self.task_pool.get_task_by_id(rec['parent'])
                    task = build_taskpool_task_from_descriptor(
                        rec, parent_task)
                    task.job = job
                    task.parent.children.append(task)

                    ciel.log.error(
                        'Recovered task %s for job %s' % (task_id, job.id),
                        'RECOVERY', logging.INFO, False)
                    self.task_pool.add_task(task)
                else:
                    ciel.log.error(
                        'Got invalid record type in job %s' % job.id,
                        'RECOVERY', logging.WARNING, False)

        except:
            ciel.log.error('Error recovering task_journal for job %s' % job.id,
                           'RECOVERY', logging.WARNING, True)

        finally:
            journal_file.close()
            if job.state == JOB_ACTIVE:
                ciel.log.error('Restarting recovered job %s' % job.id,
                               'RECOVERY', logging.INFO)
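
The journal framing implied above is a fixed-size header of (record_type, record_length) followed by a JSON body, with 'T' records for tasks and 'R' records for published references. A sketch of writing and reading one record; the exact header layout (one type byte plus an unsigned length) is an assumption, not taken from the code above:

import json
import struct
from io import BytesIO

# Assumed header layout: 1-byte record type + unsigned 32-bit body length.
RECORD_HEADER = struct.Struct('cI')

def write_record(fp, record_type, payload):
    body = json.dumps(payload).encode('utf-8')
    fp.write(RECORD_HEADER.pack(record_type, len(body)))
    fp.write(body)

def read_record(fp):
    header = fp.read(RECORD_HEADER.size)
    if len(header) != RECORD_HEADER.size:
        return None, None                      # truncated journal
    record_type, length = RECORD_HEADER.unpack(header)
    body = fp.read(length)
    return record_type, json.loads(body.decode('utf-8'))

journal = BytesIO()
write_record(journal, b'T', {'task_id': 'root:job-0', 'parent': None})
journal.seek(0)
print(read_record(journal))
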
Example #12
 def spawn_and_publish(self, spawns, refs, producer=None, taskset=None):
     
     producer_task = None
     if producer is not None:
         producer_task = self.get_task(producer["task_id"])
         taskset = producer_task.taskset
     upd = TaskGraphUpdate()
     for spawn in spawns:
         task_object = build_taskpool_task_from_descriptor(spawn, producer_task, taskset)
         upd.spawn(task_object)
     for ref in refs:
         upd.publish(ref, producer_task)
     upd.commit(self)
Example #13
    def recover_job_descriptors(self):
        root = self.job_pool.journal_root
        if root is None:
            return
        
        for job_id in os.listdir(root):

            try:
                job_dir = os.path.join(root, job_id)
                result_path = os.path.join(job_dir, 'result')
                if os.path.exists(result_path):
                    with open(result_path, 'r') as result_file:
                        result = simplejson.load(result_file, object_hook=json_decode_object_hook)
                else:
                    result = None
                    
                journal_path = os.path.join(job_dir, 'task_journal')
                journal_file = open(journal_path, 'rb')
                record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
                root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
                assert record_type == 'T'
                assert len(root_task_descriptor_string) == root_task_descriptor_length
                root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
                root_task = build_taskpool_task_from_descriptor(root_task_descriptor, None)
                
                # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
                # FIXME: Store job options somewhere for recovered job.
                job = Job(job_id, root_task, job_dir, JOB_RECOVERED, self.job_pool, {})
                
                root_task.job = job
                if result is not None:
                    with job._lock:
                        job.completed(result)
                self.job_pool.add_job(job)
                # Adding the job to the job pool should add the root task.
                #self.task_pool.add_task(root_task)
                
                if result is None:
                    self.load_other_tasks_defer(job, journal_file)
                    ciel.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                    ciel.log.error('Recovered task %s for job %s' % (root_task_descriptor['task_id'], job_id), 'RECOVERY', logging.INFO, False)
                else:
                    journal_file.close()
                    ciel.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)
                
                
            except:
                # We have lost critical data for the job, so we must fail it.
                ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
                self.job_pool.add_failed_job(job_id)
Example #14
    def spawn_and_publish(self, spawns, refs, producer=None, taskset=None):

        producer_task = None
        if producer is not None:
            producer_task = self.get_task(producer["task_id"])
            taskset = producer_task.taskset
        upd = TaskGraphUpdate()
        for spawn in spawns:
            task_object = build_taskpool_task_from_descriptor(
                spawn, producer_task, taskset)
            upd.spawn(task_object)
        for ref in refs:
            upd.publish(ref, producer_task)
        upd.commit(self)
Example #15
    def _report_tasks(self, report, toplevel_task, worker):
        with self._lock:

            tx = TaskGraphUpdate()

            root_task = self.task_graph.get_task(report[0][0])
            for assigned_worker in root_task.get_workers():
                if assigned_worker is worker:
                    self.workers[worker].deassign_task(root_task)
                else:
                    self.workers[assigned_worker].deassign_task(root_task)
                    assigned_worker.worker_pool.abort_task_on_worker(
                        root_task, assigned_worker)

                    # XXX: Need to abort the task running on other workers.
                    pass

            for (parent_id, success, payload) in report:

                parent_task = self.task_graph.get_task(parent_id)

                if success:
                    (spawned, published, profiling) = payload
                    parent_task.set_profiling(profiling)
                    parent_task.set_state(TASK_COMMITTED)
                    self.record_task_stats(parent_task, worker)
                    for child in spawned:
                        child_task = build_taskpool_task_from_descriptor(
                            child, parent_task)
                        tx.spawn(child_task)
                        parent_task.children.append(child_task)

                    for ref in published:
                        tx.publish(ref, parent_task)

                else:
                    # Only one failed task per-report, at the moment.
                    self.investigate_task_failure(parent_task, payload)
                    self.schedule()
                    return

            tx.commit(self.task_graph)
            self.task_graph.reduce_graph_for_references(
                toplevel_task.expected_outputs)

        # XXX: Need to remove assigned task from worker(s).
        self.schedule()
Example #16
    def recover_job_descriptors(self):
        root = self.job_pool.journal_root
        if root is None:
            return
        
        for job_id in os.listdir(root):

            try:
                job_dir = os.path.join(root, job_id)
                result_path = os.path.join(job_dir, 'result')
                if os.path.exists(result_path):
                    with open(result_path, 'r') as result_file:
                        result = simplejson.load(result_file, object_hook=json_decode_object_hook)
                else:
                    result = None
                    
                journal_path = os.path.join(job_dir, 'task_journal')
                journal_file = open(journal_path, 'rb')
                record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
                root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
                assert record_type == 'T'
                assert len(root_task_descriptor_string) == root_task_descriptor_length
                root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
                root_task_id = root_task_descriptor['task_id']
                root_task = build_taskpool_task_from_descriptor(root_task_id, root_task_descriptor, self.task_pool, None)
                job = Job(job_id, root_task, job_dir)
                root_task.job = job
                if result is not None:
                    job.completed(result)
                self.job_pool.add_job(job)
                self.task_pool.add_task(root_task)
                
                if result is None:
                    self.load_other_tasks_defer(job, journal_file)
                    cherrypy.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                    cherrypy.log.error('Recovered task %s for job %s' % (root_task_id, job_id), 'RECOVERY', logging.INFO, False)
                else:
                    journal_file.close()
                    cherrypy.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)
                
                
            except:
                # We have lost critical data for the job, so we must fail it.
                cherrypy.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
                self.job_pool.add_failed_job(job_id)
Example #17
    def add_task(self, task_descriptor, parent_task=None, job=None):
        try:
            task_id = task_descriptor['task_id']
        except KeyError:
            task_id = self.generate_task_id()
        
        task = build_taskpool_task_from_descriptor(task_id, task_descriptor, self, parent_task)
        task.job = job
        
        self.lazy_task_pool.add_task(task, parent_task is None)
        
        #add_event = self.new_event(task)
        #add_event["task_descriptor"] = task.as_descriptor(long=True)
        #add_event["action"] = "CREATED"
    
        #self.events.append(add_event)

        return task
Example #18
 def _report_tasks(self, report, toplevel_task, worker):
     with self._lock:
 
         tx = TaskGraphUpdate()
         
         root_task = self.task_graph.get_task(report[0][0])
         for assigned_worker in root_task.get_workers():
             if assigned_worker is worker:
                 self.workers[worker].deassign_task(root_task)
             else:
                 self.workers[assigned_worker].deassign_task(root_task)
                 assigned_worker.worker_pool.abort_task_on_worker(root_task, assigned_worker)
                 
                 # XXX: Need to abort the task running on other workers.
                 pass
         
         for (parent_id, success, payload) in report:
             
             parent_task = self.task_graph.get_task(parent_id)
             
             if success:
                 (spawned, published, profiling) = payload
                 parent_task.set_profiling(profiling)
                 parent_task.set_state(TASK_COMMITTED)
                 self.record_task_stats(parent_task, worker)
                 for child in spawned:
                     child_task = build_taskpool_task_from_descriptor(child, parent_task)
                     tx.spawn(child_task)
                     parent_task.children.append(child_task)
                 
                 for ref in published:
                     tx.publish(ref, parent_task)
             
             else:
                 # Only one failed task per-report, at the moment.
                 self.investigate_task_failure(parent_task, payload)
                 self.schedule()
                 return
                 
         tx.commit(self.task_graph)
         self.task_graph.reduce_graph_for_references(toplevel_task.expected_outputs)
         
     # XXX: Need to remove assigned task from worker(s).
     self.schedule()
Example #19
    def add_task(self, task_descriptor, parent_task=None, job=None):
        try:
            task_id = task_descriptor['task_id']
        except KeyError:
            task_id = self.generate_task_id()

        task = build_taskpool_task_from_descriptor(task_id, task_descriptor,
                                                   self, parent_task)
        task.job = job

        self.lazy_task_pool.add_task(task, parent_task is None)

        #add_event = self.new_event(task)
        #add_event["task_descriptor"] = task.as_descriptor(long=True)
        #add_event["action"] = "CREATED"

        #self.events.append(add_event)

        return task
Example #20
    def load_other_tasks_for_job(self, job, journal_file):
        '''
        Process the task journal for a recovered job.
        '''
        try:
            while True:
                record_header = journal_file.read(RECORD_HEADER_STRUCT.size)
                if len(record_header) != RECORD_HEADER_STRUCT.size:
                    ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                    # XXX: Need to truncate the journal file.
                    break
                record_type, record_length = RECORD_HEADER_STRUCT.unpack(record_header)
                record_string = journal_file.read(record_length)
                if len(record_string) != record_length:
                    ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                    # XXX: Need to truncate the journal file.
                    break
                rec = simplejson.loads(record_string, object_hook=json_decode_object_hook)
                if record_type == 'R':
                    job.task_graph.publish(rec['ref'])
                elif record_type == 'T':
                    task_id = rec['task_id']
                    parent_task = job.task_graph.get_task(rec['parent'])
                    task = build_taskpool_task_from_descriptor(rec, parent_task)
                    task.job = job
                    task.parent.children.append(task)
    
                    ciel.log.error('Recovered task %s for job %s' % (task_id, job.id), 'RECOVERY', logging.INFO, False)
                    job.task_graph.spawn(task)
                else:
                    ciel.log.error('Got invalid record type in job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                
        except:
            ciel.log.error('Error recovering task_journal for job %s' % job.id, 'RECOVERY', logging.WARNING, True)

        finally:
            journal_file.close()
            job.restart_journalling()
            if job.state == JOB_ACTIVE:
                ciel.log.error('Restarting recovered job %s' % job.id, 'RECOVERY', logging.INFO)
Example #21
    def create_job_for_task(self, task_descriptor, job_id=None):

        if job_id is None:
            job_id = self.allocate_job_id()
        task_id = 'root:%s' % (job_id, )

        # TODO: Here is where we will set up the job journal, etc.
        job_dir = self.make_job_directory(job_id)

        # TODO: Remove the global name directory dependency.
        try:
            expected_outputs = task_descriptor['expected_outputs']
            for output in expected_outputs:
                self.global_name_directory.create_global_id(task_id, output)
        except KeyError:
            try:
                num_outputs = task_descriptor['num_outputs']
                expected_outputs = map(
                    lambda x: self.global_name_directory.create_global_id(
                        task_id), range(0, num_outputs))
            except:
                expected_outputs = [
                    self.global_name_directory.create_global_id()
                ]
            task_descriptor['expected_outputs'] = expected_outputs

        task = build_taskpool_task_from_descriptor(task_id, task_descriptor,
                                                   self, None)
        job = Job(job_id, task, job_dir)
        task.job = job

        self.add_job(job)

        cherrypy.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)

        return job
Example #22
    def recover_job_descriptors(self):
        root = self.job_pool.journal_root
        if root is None:
            return

        for job_id in os.listdir(root):

            try:
                job_dir = os.path.join(root, job_id)
                result_path = os.path.join(job_dir, 'result')
                if os.path.exists(result_path):
                    with open(result_path, 'r') as result_file:
                        result = simplejson.load(
                            result_file, object_hook=json_decode_object_hook)
                else:
                    result = None

                journal_path = os.path.join(job_dir, 'task_journal')
                journal_file = open(journal_path, 'rb')
                record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(
                    journal_file.read(RECORD_HEADER_STRUCT.size))
                root_task_descriptor_string = journal_file.read(
                    root_task_descriptor_length)
                assert record_type == 'T'
                assert len(
                    root_task_descriptor_string) == root_task_descriptor_length
                root_task_descriptor = simplejson.loads(
                    root_task_descriptor_string,
                    object_hook=json_decode_object_hook)
                root_task = build_taskpool_task_from_descriptor(
                    root_task_descriptor, None)

                # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
                job = Job(job_id, root_task, job_dir, JOB_RECOVERED,
                          self.job_pool)

                root_task.job = job
                if result is not None:
                    job.completed(result)
                self.job_pool.add_job(job)
                # Adding the job to the job pool should add the root task.
                #self.task_pool.add_task(root_task)

                if result is None:
                    self.load_other_tasks_defer(job, journal_file)
                    ciel.log.error('Recovered job %s' % job_id, 'RECOVERY',
                                   logging.INFO, False)
                    ciel.log.error(
                        'Recovered task %s for job %s' %
                        (root_task_descriptor['task_id'], job_id), 'RECOVERY',
                        logging.INFO, False)
                else:
                    journal_file.close()
                    ciel.log.error('Found information about job %s' % job_id,
                                   'RECOVERY', logging.INFO, False)

            except:
                # We have lost critical data for the job, so we must fail it.
                ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY',
                               logging.ERROR, True)
                self.job_pool.add_failed_job(job_id)