def _on_exit(self):
    # Mark every still-RUNNING job as timed out and release all DB locks
    timeout_pks = list(
        self._manager.filter(state="RUNNING").values_list("pk", flat=True))
    logger.info(f"Timing out {len(timeout_pks)} running jobs.")
    BalsamJob.batch_update_state(timeout_pks, "RUN_TIMEOUT", release=True)
    self._manager.release_all_owned()
    logger.info("BalsamJobSource thread finished.")

def _handle_dones(self, done_pks):
    # Free the ranks that were running these jobs, then mark them done
    for pk in done_pks:
        rank = self.running_locations[pk]
        self.revert_assign(rank, pk)
    BalsamJob.batch_update_state(done_pks, 'RUN_DONE')
    self.job_source.release(done_pks)
    logger.info(f"RUN_DONE: {len(done_pks)} jobs")

def allocate_next_jobs(self):
    '''Assign cached jobs to free ranks and send them out.
    Returns True if at least one job was acquired and dispatched.'''
    self.refresh_job_cache()
    send_requests = []
    pre_assignments = defaultdict(list)
    min_packing_count = 1
    for job in self.job_cache:
        if job.node_packing_count < min_packing_count:
            continue
        job_occ = 1.0 / job.node_packing_count
        # rank 0 is the master; workers are ranks 1..comm.size-1.
        # The small epsilon tolerates float rounding in occupancy sums.
        free_ranks = (i for i in range(1, comm.size)
                      if self.node_occupancy[i] + job_occ < 1.0001)
        rank = next(free_ranks, None)
        if rank is None:
            logger.debug(f'no free ranks to assign {job.cute_id}')
            min_packing_count = job.node_packing_count + 1
        else:
            pre_assignments[rank].append(job)
            self.pre_assign(rank, job)

    if len(pre_assignments) == 0:
        return False

    to_acquire = [
        job.pk for rank in pre_assignments
        for job in pre_assignments[rank]
    ]
    acquired_pks = self.job_source.acquire(to_acquire)
    logger.info(f'Acquired lock on {len(acquired_pks)} out of '
                f'{len(to_acquire)} jobs marked for running')

    # Make actual assignment:
    for (rank, pre_jobs) in pre_assignments.items():
        runjobs = []
        for j in pre_jobs:
            if j.pk in acquired_pks:
                runjobs.append(j)
                self.job_cache.remove(j)
            else:
                self.revert_assign(rank, j.pk)
        if runjobs:
            mpiReq = self._send_jobs(runjobs, rank)
            logger.info(f"Sent {len(runjobs)} jobs to rank {rank}: "
                        f"occupancy is now {self.node_occupancy[rank]}")
            send_requests.append(mpiReq)

    BalsamJob.batch_update_state(acquired_pks, 'RUNNING', self.RUN_MESSAGE)
    logger.debug("allocate_next_jobs: waiting on all isends...")
    MPI.Request.waitall(send_requests)
    logger.debug("allocate_next_jobs: all isends completed.")
    return len(acquired_pks) > 0

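# The pre_assign / revert_assign helpers used above are not shown in this
# section. A minimal sketch of the occupancy bookkeeping they imply, assuming
# a hypothetical per-job record (self.job_occupancies) to undo reservations;
# the real methods may track more than this:
def pre_assign(self, rank, job):
    job_occ = 1.0 / job.node_packing_count
    self.node_occupancy[rank] += job_occ      # reserve a slice of the node
    self.job_occupancies[job.pk] = job_occ    # hypothetical per-job record
    self.running_locations[job.pk] = rank

def revert_assign(self, rank, job_pk):
    self.node_occupancy[rank] -= self.job_occupancies.pop(job_pk)
    if self.node_occupancy[rank] < 0.0:       # guard against float drift
        self.node_occupancy[rank] = 0.0
    self.running_locations.pop(job_pk, None)
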
def create_jobs(N):
    """Create N tasks, each squaring one integer, under the demo-square workflow"""
    for i in range(N):
        job = BalsamJob(
            name=f"square{i}",
            workflow="demo-square",
            application="square",
        )
        job.data["x"] = i
        job.save()
    print(f"Created {N} jobs")

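# A minimal sketch of the consumer side of create_jobs, assuming the Balsam
# 0.x in-run handle balsam.launcher.dag.current_job; the "square" app body is
# not shown in the source, so the result key "y" is purely illustrative.
from balsam.launcher.dag import current_job

def main():
    x = current_job.data["x"]
    current_job.data["y"] = x ** 2   # hypothetical result field
    current_job.save()

if __name__ == "__main__":
    main()
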
def update_states_from_cache(job_cache):
    # Update states of fast-forwarded jobs in batches, grouping by new state.
    # __old_state is a plain attribute stamped on each cached job (no name
    # mangling applies here, since this is not inside a class body).
    update_jobs = defaultdict(list)
    failed_jobs = []
    for job in job_cache:
        if job.state != job.__old_state:
            job.__old_state = job.state
            if job.state != 'FAILED':
                update_jobs[job.state].append(job.pk)
            else:
                failed_jobs.append(job)
    if failed_jobs:
        fail_update(failed_jobs)
    for newstate, joblist in update_jobs.items():
        BalsamJob.batch_update_state(joblist, newstate)

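# update_states_from_cache assumes each cached job carries an __old_state
# baseline. A sketch of how a cache-building step might stamp it; the real
# cache-refresh code is not shown in the source:
def seed_job_cache(jobs):
    for job in jobs:
        job.__old_state = job.state   # baseline for later change detection
    return list(jobs)
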
def exit(self):
    outstanding_job_pks = list(self.manager.running_locations.keys())
    num_timeout = len(outstanding_job_pks)
    logger.info(f"Shutting down with {num_timeout} jobs still running... timing out")
    BalsamJob.batch_update_state(outstanding_job_pks, 'RUN_TIMEOUT',
                                 'timed out in MPI Ensemble')
    self.manager.job_source.release_all_owned()
    self.manager.send_exit()
    logger.debug("Send_exit: master done")
    logger.info("master calling MPI Finalize")
    MPI.Finalize()
    logger.info("ensemble master exited gracefully")
    sys.exit(0)

def _create_balsam_task(self, x):
    args = f"'{self.encode(x)}'"
    envs = f"KERAS_BACKEND={self.KERAS_BACKEND}:KMP_BLOCKTIME=0"
    ranks_per_node = self.num_ranks_per_node
    threads_per_rank = self.num_threads_per_rank

    # Override the CLI values with x's hyperparameters, if given
    if "hyperparameters" in x:
        if "ranks_per_node" in x["hyperparameters"]:
            ranks_per_node = x["hyperparameters"]["ranks_per_node"]
            threads_per_rank = self.num_threads_per_node // ranks_per_node

    resources = {
        "num_nodes": self.num_nodes_per_eval,
        "ranks_per_node": ranks_per_node,
        "threads_per_rank": threads_per_rank,
        "threads_per_core": 2,
        "node_packing_count": self.num_evals_per_node,
        "cpu_affinity": "depth",
    }
    # Any resource key present at the top level of x also takes precedence
    for key in resources:
        if key in x:
            resources[key] = x[key]

    task = BalsamJob(application=self.appName,
                     args=args,
                     environ_vars=envs,
                     **resources)
    return task

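# A usage sketch showing the shape of x implied by _create_balsam_task above:
# top-level resource keys and the nested "hyperparameters" dict both override
# the CLI defaults. The values (and the evaluator instance) are illustrative.
x = {
    "num_nodes": 2,                            # overrides num_nodes_per_eval
    "hyperparameters": {"ranks_per_node": 4},  # also rescales threads_per_rank
}
# task = evaluator._create_balsam_task(x)
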
def add_task(point):
    # Pass the point to the app as a shell-quoted JSON string argument
    job = BalsamJob(
        application=app_name,
        args=shlex.quote(json.dumps(point, cls=JSONEncoder)),
        num_nodes=1,
        ranks_per_node=1,
    )
    return job

def add_task(point):
    # Store the point in the job's JSON data field instead of the command line
    job = BalsamJob(
        application=app_name,
        data={'point': to_encodable(point)},
        num_nodes=1,
        ranks_per_node=1,
    )
    return job

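# The two add_task variants above trade off where the payload lives: the first
# encodes the point into the command line (subject to quoting and argv length
# limits), the second keeps it in the job's JSON data field. Either way the
# job is returned unsaved; a usage sketch with illustrative points:
for point in [{'lr': 0.01}, {'lr': 0.1}]:
    job = add_task(point)
    job.save()   # BalsamJob is a Django model; nothing hits the DB until save()
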
def perform_updates(self, update_msgs):
    start_pks = []
    done_pks = []
    error_msgs = []
    for msg in update_msgs:
        if msg == 'exit':
            continue
        start_pks.extend(uuid.UUID(pk) for pk in msg['started'])  # pk list
        done_pks.extend(uuid.UUID(pk) for pk in msg['done'])      # pk list
        error_msgs.extend(msg['error'])  # list: (pk, retcode, tail)
    if start_pks:
        BalsamJob.batch_update_state(start_pks, 'RUNNING')
        logger.info(f"StatusUpdater marked {len(start_pks)} RUNNING")
    if done_pks:
        BalsamJob.batch_update_state(done_pks, 'RUN_DONE', release=True)
        logger.info(f"StatusUpdater marked {len(done_pks)} DONE")
    if error_msgs:
        self._handle_errors(error_msgs)

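# The message format consumed by perform_updates, reconstructed from the key
# accesses above; the UUIDs and stderr tail are placeholder values.
example_msgs = [
    {
        'started': ['00000000-0000-0000-0000-000000000001'],  # pks now RUNNING
        'done': ['00000000-0000-0000-0000-000000000002'],     # pks finished cleanly
        'error': [('00000000-0000-0000-0000-000000000003', 1, 'tail of stderr')],
    },
    'exit',  # sentinel message: skipped by the loop
]
# self.perform_updates(example_msgs)
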
def clone(job, **kwargs):
    assert isinstance(job, BalsamJob)
    new_job = BalsamJob()
    exclude_fields = '''_state objects source state tick user_workdir
                        lock state_history job_id'''.split()
    fields = [f for f in job.__dict__ if f not in exclude_fields]
    for f in fields:
        new_job.__dict__[f] = job.__dict__[f]
    assert new_job.pk != job.pk
    for k, v in kwargs.items():
        try:
            job._meta.get_field(k)  # raises if k is not a model field
        except Exception:
            raise ValueError(f"Invalid field name: {k}")
        else:
            new_job.__dict__[k] = v
    return new_job

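# Usage sketch: copy an existing job under a new name with different args,
# then persist the copy. The lookup name is illustrative.
original = BalsamJob.objects.get(name="square0")
copy = clone(original, name="square0-retry", args="--verbose")
copy.save()
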
def pre_submit(problem, run, workflow):
    """Validate command line; prepare apps"""
    from balsam.core.models import BalsamJob
    validate(problem, run, workflow)
    print("Bootstrapping apps...", end="", flush=True)
    bootstrap_apps()
    print("OK")
    job = BalsamJob(name=workflow, workflow=workflow)
    return job

def new_job(name, workdir, workflow_tag):
    '''Create a new BalsamJob object *without* saving it to the DB'''
    return BalsamJob(
        name=name,
        user_workdir=workdir,  # the job will run inside this directory
        workflow=workflow_tag,
        application=APPNAME,
        num_nodes=NNODES,
        ranks_per_node=RPN,
        threads_per_rank=TPR,
        cpu_affinity='depth',
    )

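# Because new_job returns an unsaved instance, large batches can be persisted
# in one DB round trip via Django's bulk_create. The names and workdirs here
# are illustrative.
jobs = [new_job(f"run{i}", f"/projects/demo/run{i}", "demo-wf") for i in range(100)]
BalsamJob.objects.bulk_create(jobs)
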
def _create_balsam_task(self, x):
    args = f"'{self.encode(x)}'"
    envs = f"KERAS_BACKEND={self.KERAS_BACKEND}"
    # envs = ":".join(f'KERAS_BACKEND={self.KERAS_BACKEND} OMP_NUM_THREADS=62 KMP_BLOCKTIME=0 KMP_AFFINITY=\"granularity=fine,compact,1,0\"'.split())
    resources = {
        'num_nodes': 1,
        'ranks_per_node': 1,
        'threads_per_rank': 64,
        'node_packing_count': self.WORKERS_PER_NODE,
    }
    # Resource keys present in x override these defaults
    for key in resources:
        if key in x:
            resources[key] = x[key]
    task = BalsamJob(application=self.appName,
                     args=args,
                     environ_vars=envs,
                     **resources)
    return task

def _create_balsam_task(self, x):
    args = f"'{self.encode(x)}'"
    envs = f"KERAS_BACKEND={self.KERAS_BACKEND}"
    resources = {
        "num_nodes": self.num_nodes_per_eval,
        "ranks_per_node": self.num_ranks_per_node,
        "threads_per_rank": self.num_threads_per_rank,
        "node_packing_count": self.num_evals_per_node,
    }
    for key in resources:
        if key in x:
            resources[key] = x[key]
    task = BalsamJob(application=self.appName,
                     args=args,
                     environ_vars=envs,
                     **resources)
    return task

def run_migrations():
    from django.core.management import call_command
    from balsam.django_config.db_index import refresh_db_index
    setup()
    print("DB settings:", settings.DATABASES['default'])
    call_command('makemigrations', interactive=True, verbosity=2)
    call_command('migrate', interactive=True, verbosity=2)
    refresh_db_index()

    # Smoke-test the new table with a throwaway row
    try:
        from balsam.core.models import BalsamJob
        j = BalsamJob()
        j.save()
        j.delete()
    except Exception:
        print("BalsamJob table not properly created")
        raise
    else:
        print("BalsamJob table created successfully")

def add_job(name, workflow, application, description='', args='',
            num_nodes=1, ranks_per_node=1, cpu_affinity='depth',
            threads_per_rank=1, threads_per_core=1, environ_vars=None,
            data=None, save=True, **kwargs):
    '''Add a new job to the BalsamJob DB

    Creates a new job and saves it to the database in CREATED state.
    Fields not covered by the named parameters are initialized to
    blank/default values; these must be configured by the user or
    provided via ``kwargs``.

    Args:
        - ``kwargs`` (*dict*): contains BalsamJob fields (keys) and their
          values to be set on BalsamJob instantiation.

    Returns:
        - ``job`` (*BalsamJob*): the newly-created BalsamJob instance

    Raises:
        - ``ValueError``: if an invalid field name is provided to *kwargs*
    '''
    job = BalsamJob()
    job.name = name
    job.workflow = workflow
    job.application = application
    job.description = description
    job.args = args
    job.num_nodes = num_nodes
    job.ranks_per_node = ranks_per_node
    job.threads_per_rank = threads_per_rank
    job.threads_per_core = threads_per_core
    job.cpu_affinity = cpu_affinity
    # Avoid a mutable default argument: fall back to a fresh dict per call
    job.environ_vars = environ_vars if environ_vars is not None else {}
    job.data = data if data else dict()
    job.get_application()
    for k, v in kwargs.items():
        setattr(job, k, v)
    if current_job:
        job.queued_launch = current_job.queued_launch
    if save:
        job.save()
    return job

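# Usage sketch for add_job with a few explicit fields; any remaining
# BalsamJob fields can be passed through kwargs. The names are illustrative.
job = add_job(
    name="sim0",
    workflow="demo",
    application="square",
    args="--input in.dat",
)
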
    RPN, TRIALS, COMMON_PARAMS, BENCHMARK_SCRIPTS,
)
from balsam.core.models import BalsamJob, ApplicationDefinition

RELEASE_PATH = os.environ['RELEASE_PATH']
PYTHON = os.path.join(RELEASE_PATH, 'env', 'bin', 'python')

for script_path in BENCHMARK_SCRIPTS:
    executable = ' '.join((PYTHON, script_path))
    # Derive the app name from the filename: ".../osu_latency.py" -> "latency"
    app_name = script_path[script_path.find('osu_') + 4:-3]
    app, created = ApplicationDefinition.objects.get_or_create(
        name=app_name,
        defaults=dict(
            executable=executable,
        )
    )
    for (num_nodes, rpn, trial) in product(NUM_NODES, RPN, TRIALS):
        job = BalsamJob(
            name=f"{num_nodes}nodes.{rpn}rpn.{trial}",
            workflow=app_name,
            application=app_name,
            num_nodes=num_nodes,
            ranks_per_node=rpn,
            **COMMON_PARAMS,
        )
        job.save()