Ejemplo n.º 1
0
    def update(self, timeout=False):
        """Poll every tracked MPI run and act on its current state.

        Each run in ``self.mpi_runs`` is classified with ``self.check_state``:

        * ``RUN_DONE`` jobs are batch-updated to ``RUN_DONE`` and their pks
          are passed to ``self.jobsource.release``.
        * ``RUN_ERROR`` jobs are refreshed from the database and marked
          ``RUN_ERROR`` with the run's ``err_msg`` inside a single
          transaction; their pks are then released.
        * ``RUNNING`` jobs are handled according to ``timeout`` (see below).

        Afterwards ``self.mpi_runs`` holds only the runs that are still
        running and were not killed; runs in any other state are dropped.

        Args:
            timeout: When true, every running job is killed through
                ``self.timeout_kill``, marked ``RUN_TIMEOUT`` and released,
                and ``self.mpi_runs`` is left empty.  When false, only the
                running jobs that ``self.jobsource`` reports in state
                ``USER_KILLED`` are killed and released.
        """
        by_states = defaultdict(list)
        for run in self.mpi_runs:
            by_states[self.check_state(run)].append(run)

        done_pks = [r.job.pk for r in by_states['RUN_DONE']]
        BalsamJob.batch_update_state(done_pks, 'RUN_DONE')
        self.jobsource.release(done_pks)

        failed_runs = by_states['RUN_ERROR']
        error_pks = [r.job.pk for r in failed_runs]
        with db.transaction.atomic():
            # NOTE(review): presumably takes row locks on the failed jobs
            # for the duration of the transaction -- confirm in safe_select.
            models.safe_select(BalsamJob.objects.filter(pk__in=error_pks))
            for run in failed_runs:
                run.job.refresh_from_db()
                run.job.update_state('RUN_ERROR', run.err_msg)
        self.jobsource.release(error_pks)

        running = by_states['RUNNING']
        active_pks = [r.job.pk for r in running]

        if timeout:
            self.timeout_kill(running)
            BalsamJob.batch_update_state(active_pks, 'RUN_TIMEOUT')
            self.jobsource.release(active_pks)
            self.mpi_runs = []
            return

        killquery = self.jobsource.filter(job_id__in=active_pks, state='USER_KILLED')
        # Evaluate the query exactly once into a set.  A lazy QuerySet would
        # be scanned linearly for every membership test, and handing it on to
        # release() could re-run the query and pick up a different set of
        # jobs than the ones actually killed below.
        kill_pks = set(killquery.values_list('job_id', flat=True))
        to_kill = [run for run in running if run.job.pk in kill_pks]
        self.timeout_kill(to_kill)
        # Release exactly the jobs whose runs were just killed.
        self.jobsource.release([run.job.pk for run in to_kill])
        self.mpi_runs = [run for run in running if run.job.pk not in kill_pks]
Ejemplo n.º 2
0
 def _handle_errors(self, error_jobs):
     """Mark each failed job as ``RUN_ERROR`` and release it.

     ``error_jobs`` is iterated twice and must yield
     ``(pk, retcode, tail)`` triples.  For every triple the rank stored in
     ``self.running_locations`` is passed to ``self.revert_assign``, and
     the job is re-fetched and moved to ``RUN_ERROR`` with a message built
     from the return code and output tail.  Finally all failed pks are
     handed to ``self.job_source.release``.
     """
     failed_pks = [entry[0] for entry in error_jobs]
     # NOTE(review): the return value is discarded; presumably this call is
     # made for its locking side effect -- confirm in safe_select.
     safe_select(BalsamJob.objects.filter(pk__in=failed_pks))

     for job_pk, retcode, tail in error_jobs:
         self.revert_assign(self.running_locations[job_pk], job_pk)
         BalsamJob.objects.get(pk=job_pk).update_state(
             'RUN_ERROR',
             f"nonzero return {retcode}: {tail}",
         )

     self.job_source.release(failed_pks)
Ejemplo n.º 3
0
 def _handle_errors(self, error_msgs):
     """Move every job named in ``error_msgs`` to ``RUN_ERROR``.

     ``error_msgs`` is iterated twice and must yield
     ``(pk, retcode, tail)`` triples whose ``pk`` is accepted by
     ``uuid.UUID``.  The matching jobs are fetched in one ``safe_select``
     call, indexed by primary key, and each is updated with a message
     built from its return code and output tail, passing ``release=True``.

     Raises:
         KeyError: if a pk in ``error_msgs`` is absent from the fetched jobs.
     """
     failed_ids = [uuid.UUID(entry[0]) for entry in error_msgs]

     jobs_by_pk = {}
     for record in safe_select(BalsamJob.objects.filter(pk__in=failed_ids)):
         jobs_by_pk[record.pk] = record

     for raw_pk, retcode, tail in error_msgs:
         jobs_by_pk[uuid.UUID(raw_pk)].update_state(
             'RUN_ERROR',
             f"nonzero return {retcode}: {tail}",
             release=True,
         )