コード例 #1
0
ファイル: serial_mode_timed.py プロジェクト: felker/balsam
 def _on_exit(self):
     """Time out every still-RUNNING job and release ownership on shutdown."""
     # Collect the primary keys of all jobs this manager still shows as RUNNING.
     timeout_pks = list(
         self._manager.filter(state="RUNNING").values_list("pk", flat=True))
     logger.info(f"Timing out {len(timeout_pks)} running jobs.")
     # Batch-mark them RUN_TIMEOUT; release=True drops the DB lock in the same call.
     BalsamJob.batch_update_state(timeout_pks, "RUN_TIMEOUT", release=True)
     self._manager.release_all_owned()
     # Fixed: message had a needless f-string prefix with no placeholders (F541).
     logger.info("BalsamJobSource thread finished.")
コード例 #2
0
    def _handle_dones(self, done_pks):
        """Return the ranks that ran these finished jobs, then record RUN_DONE."""
        # Hand each finished job's rank back to the free pool before updating state.
        for done_pk in done_pks:
            occupied_rank = self.running_locations[done_pk]
            self.revert_assign(occupied_rank, done_pk)

        # One batched DB write, then release ownership of the completed jobs.
        BalsamJob.batch_update_state(done_pks, 'RUN_DONE')
        self.job_source.release(done_pks)
        logger.info(f"RUN_DONE: {len(done_pks)} jobs")
コード例 #3
0
    def allocate_next_jobs(self):
        '''Pre-assign cached jobs to free ranks, lock them in the DB, and send
        the acquired ones to their workers; return True if any job was sent.'''
        # Phase 1: tentatively map cached jobs onto ranks with spare occupancy.
        self.refresh_job_cache()
        send_requests = []
        pre_assignments = defaultdict(list)
        min_packing_count = 1

        for job in self.job_cache:
            # Once a job at some packing count found no free rank, skip every
            # later job whose packing count is at or below it (it cannot fit either).
            if job.node_packing_count < min_packing_count: continue
            job_occ = 1.0 / job.node_packing_count

            # First rank (rank 0 excluded — presumably the master; confirm) whose
            # occupancy still has room; the 1.0001 bound absorbs float round-off.
            free_ranks = (i for i in range(1, comm.size)
                          if self.node_occupancy[i] + job_occ < 1.0001)
            rank = next(free_ranks, None)

            if rank is None:
                logger.debug(f'no free ranks to assign {job.cute_id}')
                min_packing_count = job.node_packing_count + 1
            else:
                pre_assignments[rank].append(job)
                self.pre_assign(rank, job)

        if len(pre_assignments) == 0: return False

        # Phase 2: try to lock all pre-assigned jobs in the job source; only
        # jobs we actually acquired may run.
        to_acquire = [
            job.pk for rank in pre_assignments for job in pre_assignments[rank]
        ]
        acquired_pks = self.job_source.acquire(to_acquire)
        logger.info(
            f'Acquired lock on {len(acquired_pks)} out of {len(to_acquire)} jobs marked for running'
        )

        # Make actual assignment:
        for (rank, pre_jobs) in pre_assignments.items():
            runjobs = []
            for j in pre_jobs:
                if j.pk in acquired_pks:
                    runjobs.append(j)
                    self.job_cache.remove(j)
                else:
                    # Lock not obtained: undo the tentative occupancy bump.
                    self.revert_assign(rank, j.pk)

            if runjobs:
                # Non-blocking send of this rank's job batch.
                mpiReq = self._send_jobs(runjobs, rank)
                logger.info(
                    f"Sent {len(runjobs)} jobs to rank {rank}: occupancy is now {self.node_occupancy[rank]}"
                )
                send_requests.append(mpiReq)

        BalsamJob.batch_update_state(acquired_pks, 'RUNNING', self.RUN_MESSAGE)
        logger.debug("allocate_next_jobs: waiting on all isends...")
        # Block until every isend has completed before returning.
        MPI.Request.waitall(send_requests)
        logger.debug("allocate_next_jobs: all isends completed.")
        return len(acquired_pks) > 0
コード例 #4
0
ファイル: transitions.py プロジェクト: coreyjadams/balsam
def update_states_from_cache(job_cache):
    """Push state changes of cached (fast-forwarded) jobs to the DB in batches."""
    failed_jobs = []
    update_jobs = defaultdict(list)

    for job in job_cache:
        # Skip jobs whose state has not moved since the last sync.
        if job.state == job.__old_state:
            continue
        job.__old_state = job.state
        if job.state == 'FAILED':
            failed_jobs.append(job)
        else:
            update_jobs[job.state].append(job.pk)

    # Failures take a dedicated path; everything else is batched per new state.
    if failed_jobs:
        fail_update(failed_jobs)
    for newstate, joblist in update_jobs.items():
        BalsamJob.batch_update_state(joblist, newstate)
コード例 #5
0
 def exit(self):
     """Time out outstanding jobs, tell workers to quit, and terminate.

     Never returns: finalizes MPI and exits the process with status 0.
     """
     # Any job still assigned to a rank did not finish: record a timeout.
     outstanding_job_pks = list(self.manager.running_locations.keys())
     num_timeout = len(outstanding_job_pks)
     logger.info(
         f"Shutting down with {num_timeout} jobs still running..timing out")
     BalsamJob.batch_update_state(outstanding_job_pks, 'RUN_TIMEOUT',
                                  'timed out in MPI Ensemble')
     self.manager.job_source.release_all_owned()
     self.manager.send_exit()
     logger.debug("Send_exit: master done")
     # Fixed: the next two messages carried needless f-string prefixes (F541).
     logger.info("master calling MPI Finalize")
     MPI.Finalize()
     logger.info("ensemble master exit gracefully")
     sys.exit(0)
コード例 #6
0
    def perform_updates(self, update_msgs):
        """Apply batched start/done/error updates reported by worker ranks."""
        start_pks, done_pks, error_msgs = [], [], []

        for msg in update_msgs:
            # The sentinel 'exit' message carries no job updates.
            if msg == 'exit':
                continue
            # Messages carry pk strings; convert to UUIDs up front.
            start_pks += [uuid.UUID(pk) for pk in msg['started']]
            done_pks += [uuid.UUID(pk) for pk in msg['done']]
            error_msgs += msg['error']  # list of (pk, retcode, tail) tuples

        if start_pks:
            BalsamJob.batch_update_state(start_pks, 'RUNNING')
            logger.info(f"StatusUpdater marked {len(start_pks)} RUNNING")
        if done_pks:
            BalsamJob.batch_update_state(done_pks, 'RUN_DONE', release=True)
            logger.info(f"StatusUpdater marked {len(done_pks)} DONE")
        if error_msgs:
            self._handle_errors(error_msgs)