Example #1
0
 def handle_completed_jobs(self):
     """Reconcile every tracked job that has left the queue.

     A job counts as finished when it is a key of ``self.running_jobs`` but
     is not reported by ``self.queued_jobs()``.  The status files returned
     by ``gridutils.get_statusfile`` for the command's first output decide
     what happens to it:

     * ok file exists: the command is recorded as successfully completed.
     * fail file exists: the command is relaunched with its rerun counter
       incremented by one.
     * neither exists: the command is still recorded as successfully
       completed, after a message that depends on whether the done file
       is missing (reported as a node failure) or present.

     Raises:
         ValueError: In the fail-file and node-failure cases, when the
             job's rerun counter has already reached ``self.max_retry``.
     """
     tracked_ids = set(self.running_jobs.keys())
     finished_ids = tracked_ids.difference(self.queued_jobs())
     print('Processing {} finished jobs...'.format(len(finished_ids)))

     for job_id in finished_ids:
         cmd_id, attempts = self.running_jobs[job_id]
         _cmdline, _dependencies, outputs = self.commands[cmd_id]
         okfile, failfile, donefile = gridutils.get_statusfile(outputs[0])

         if os.path.exists(okfile):
             print('Command {}({}) successfully completed'.format(cmd_id, job_id))
         elif os.path.exists(failfile):
             print('Fatal error running {}({}). Retrying.'.format(cmd_id, job_id))
             if attempts >= self.max_retry:
                 raise ValueError('Command {} exceeded the maximum number of retries'.format(cmd_id))
             # Stop tracking the old job before submitting its replacement.
             del self.running_jobs[job_id]
             self.launch(cmd_id, 1 + attempts)
             continue
         elif not os.path.exists(donefile):
             print('Node failure running {}({}). This is a known issue. Assuming everything is OK...'.format(cmd_id, job_id))
             if attempts >= self.max_retry:
                 raise ValueError('Command {} exceeded the maximum number of retries'.format(cmd_id))
             # TODO/FIXME: relaunching is disabled here; the command falls
             # through and is treated as successful instead.
         else:
             print('OK file does not exist for {}({}). This file will not be checkpointed.'.format(cmd_id, job_id))

         # Every path that reaches this point treats the command as done.
         del self.running_jobs[job_id]
         self.successfully_completed.add(cmd_id)
 def handle_completed_jobs(self):
     """Process jobs that are tracked as running but no longer queued.

     For each such job the status files of the command's first output
     (as returned by ``gridutils.get_statusfile``) select one outcome:
     a message to print, whether the retry limit is enforced, and whether
     the command is relaunched or recorded in
     ``self.successfully_completed``.  The job is removed from
     ``self.running_jobs`` in every outcome that does not raise.

     Raises:
         ValueError: When the fail file exists, or when the ok, fail and
             done files are all missing, and the job's rerun counter is
             already at or above ``self.max_retry``.
     """
     retry_limit_msg = 'Command {} exceeded the maximum number of retries'

     no_longer_queued = set(self.running_jobs.keys()) - set(self.queued_jobs())
     print('Processing {} finished jobs...'.format(len(no_longer_queued)))

     for job in no_longer_queued:
         cmd_id, num_rerun = self.running_jobs[job]
         _, _, outputs = self.commands[cmd_id]
         okfile, failfile, donefile = gridutils.get_statusfile(outputs[0])

         # Each branch yields (message, enforce retry limit, relaunch).
         if os.path.exists(okfile):
             outcome = ('Command {}({}) successfully completed', False, False)
         elif os.path.exists(failfile):
             outcome = ('Fatal error running {}({}). Retrying.', True, True)
         elif os.path.exists(donefile):
             outcome = (
                 'OK file does not exist for {}({}). This file will not be checkpointed.',
                 False, False)
         else:
             # TODO/FIXME: the relaunch for node failures is disabled, so
             # the command ends up recorded as successful.
             outcome = (
                 'Node failure running {}({}). This is a known issue. Assuming everything is OK...',
                 True, False)
         message, enforce_limit, relaunch = outcome

         print(message.format(cmd_id, job))
         if enforce_limit and num_rerun >= self.max_retry:
             raise ValueError(retry_limit_msg.format(cmd_id))

         del self.running_jobs[job]
         if relaunch:
             self.launch(cmd_id, 1 + num_rerun)
         else:
             self.successfully_completed.add(cmd_id)
Example #3
0
 def completed(self, cmd_id):
     """Return whether the command ``cmd_id`` counts as completed.

     A command is completed when the ok status file of its first output
     exists and ``self.force_rerun`` is falsy, or when ``cmd_id`` is in
     ``self.successfully_completed``.
     """
     _cmdline, _dependencies, outputs = self.commands[cmd_id]
     okfile, _failfile, _donefile = gridutils.get_statusfile(outputs[0])
     if os.path.exists(okfile) and not self.force_rerun:
         return True
     return cmd_id in self.successfully_completed
 def completed(self, cmd_id):
     """Tell whether ``cmd_id`` should be treated as already done.

     Returns True if the ok file for the command's first output exists
     while ``self.force_rerun`` is falsy, or if the command has been
     recorded in ``self.successfully_completed``; otherwise False.
     """
     _, _, outputs = self.commands[cmd_id]
     status_files = gridutils.get_statusfile(outputs[0])
     okfile, _, _ = status_files
     # An ok file on disk only counts when a rerun is not being forced.
     ok_on_disk = os.path.exists(okfile) and not self.force_rerun
     return ok_on_disk or cmd_id in self.successfully_completed