def handle_completed_jobs(self):
    """Reconcile jobs that have left the queue with their status files.

    A job is treated as finished when it is a key of ``self.running_jobs``
    but is no longer reported by ``self.queued_jobs()``.  For each such job
    the status files of the command's first output (as returned by
    ``gridutils.get_statusfile``) are inspected, in this order:

    * ok file exists: the command is recorded as successfully completed.
    * fail file exists: the command is relaunched through ``self.launch``
      with its rerun counter incremented.
    * no done file either: treated as a node failure; the command is
      recorded as successfully completed (relaunching is disabled).
    * only the done file exists: the command is recorded as successfully
      completed.

    Raises:
        ValueError: in the fail-file and node-failure cases, when the job's
            rerun counter has reached ``self.max_retry``.  The exception is
            raised before the job is removed from ``self.running_jobs``.
    """
    # NOTE(review): a second definition of this same function follows this
    # one; if both live in the same scope the later one takes effect and
    # this copy is dead code -- confirm and remove one of them.
    finished_jobs = set(self.running_jobs) - set(self.queued_jobs())
    print('Processing {} finished jobs...'.format(len(finished_jobs)))
    for job in finished_jobs:
        cmd_id, num_rerun = self.running_jobs[job]
        _, _, outputs = self.commands[cmd_id]
        okfile, failfile, donefile = gridutils.get_statusfile(outputs[0])

        if os.path.exists(okfile):
            print('Command {}({}) successfully completed'.format(cmd_id, job))
        elif os.path.exists(failfile):
            print('Fatal error running {}({}). Retrying.'.format(cmd_id, job))
            if num_rerun >= self.max_retry:
                raise ValueError(
                    'Command {} exceeded the maximum number of retries'
                    .format(cmd_id))
            # The only outcome that is not recorded as a success: drop the
            # finished job and hand the command back to the launcher.
            del self.running_jobs[job]
            self.launch(cmd_id, 1 + num_rerun)
            continue
        elif not os.path.exists(donefile):
            print(('Node failure running {}({}). This is a known issue. '
                   'Assuming everything is OK...').format(cmd_id, job))
            # The retry limit is still enforced here even though nothing is
            # relaunched in this branch.
            if num_rerun >= self.max_retry:
                raise ValueError(
                    'Command {} exceeded the maximum number of retries'
                    .format(cmd_id))
            # TODO: relaunching on node failure is currently disabled; the
            # intended call was self.launch(cmd_id, 1 + num_rerun).
        else:
            print(('OK file does not exist for {}({}). '
                   'This file will not be checkpointed.').format(cmd_id, job))

        # Shared bookkeeping for every outcome that counts as a success.
        del self.running_jobs[job]
        self.successfully_completed.add(cmd_id)
def handle_completed_jobs(self):
    """Reconcile jobs that have left the queue with their status files.

    A job is treated as finished when it is a key of ``self.running_jobs``
    but is no longer reported by ``self.queued_jobs()``.  For each such job
    the status files of the command's first output (as returned by
    ``gridutils.get_statusfile``) are inspected, in this order:

    * ok file exists: the command is recorded as successfully completed.
    * fail file exists: the command is relaunched through ``self.launch``
      with its rerun counter incremented.
    * no done file either: treated as a node failure; the command is
      recorded as successfully completed (relaunching is disabled).
    * only the done file exists: the command is recorded as successfully
      completed.

    Raises:
        ValueError: in the fail-file and node-failure cases, when the job's
            rerun counter has reached ``self.max_retry``.  The exception is
            raised before the job is removed from ``self.running_jobs``.
    """
    # NOTE(review): an earlier definition with the same name and logic
    # precedes this one; if both live in the same scope this later copy is
    # the effective one and the earlier can be removed -- confirm.
    finished_jobs = set(self.running_jobs) - set(self.queued_jobs())
    print('Processing {} finished jobs...'.format(len(finished_jobs)))
    for job in finished_jobs:
        cmd_id, num_rerun = self.running_jobs[job]
        _, _, outputs = self.commands[cmd_id]
        okfile, failfile, donefile = gridutils.get_statusfile(outputs[0])

        if os.path.exists(okfile):
            print('Command {}({}) successfully completed'.format(cmd_id, job))
        elif os.path.exists(failfile):
            print('Fatal error running {}({}). Retrying.'.format(cmd_id, job))
            if num_rerun >= self.max_retry:
                raise ValueError(
                    'Command {} exceeded the maximum number of retries'
                    .format(cmd_id))
            # The only outcome that is not recorded as a success: drop the
            # finished job and hand the command back to the launcher.
            del self.running_jobs[job]
            self.launch(cmd_id, 1 + num_rerun)
            continue
        elif not os.path.exists(donefile):
            print(('Node failure running {}({}). This is a known issue. '
                   'Assuming everything is OK...').format(cmd_id, job))
            # The retry limit is still enforced here even though nothing is
            # relaunched in this branch.
            if num_rerun >= self.max_retry:
                raise ValueError(
                    'Command {} exceeded the maximum number of retries'
                    .format(cmd_id))
            # TODO: relaunching on node failure is currently disabled; the
            # intended call was self.launch(cmd_id, 1 + num_rerun).
        else:
            print(('OK file does not exist for {}({}). '
                   'This file will not be checkpointed.').format(cmd_id, job))

        # Shared bookkeeping for every outcome that counts as a success.
        del self.running_jobs[job]
        self.successfully_completed.add(cmd_id)
def completed(self, cmd_id):
    """Return whether the command ``cmd_id`` counts as completed.

    A command is completed when it is in ``self.successfully_completed``,
    or when ``self.force_rerun`` is falsy and the ok status file of the
    command's first output exists on disk.

    Args:
        cmd_id: key into ``self.commands``.

    Returns:
        bool: True if the command is considered completed.

    Raises:
        KeyError: if ``cmd_id`` is neither in ``self.successfully_completed``
            nor a key of ``self.commands``.
    """
    # Cheapest test first: an in-memory hit needs no status-file lookup and
    # no filesystem access.
    if cmd_id in self.successfully_completed:
        return True
    _, _, outputs = self.commands[cmd_id]
    # With a forced rerun an existing ok file is ignored, so there is no
    # point in resolving or stat-ing it.
    if self.force_rerun:
        return False
    okfile, _, _ = gridutils.get_statusfile(outputs[0])
    return os.path.exists(okfile)