Example 1
    def _on_exit(self):
        timeout_pks = list(
            self._manager.filter(state="RUNNING").values_list("pk", flat=True))
        logger.info(f"Timing out {len(timeout_pks)} running jobs.")
        BalsamJob.batch_update_state(timeout_pks, "RUN_TIMEOUT", release=True)
        self._manager.release_all_owned()
        logger.info("BalsamJobSource thread finished.")
Example 2
    def _handle_dones(self, done_pks):
        for pk in done_pks:
            rank = self.running_locations[pk]
            self.revert_assign(rank, pk)

        BalsamJob.batch_update_state(done_pks, 'RUN_DONE')
        self.job_source.release(done_pks)
        logger.info(f"RUN_DONE: {len(done_pks)} jobs")
Example 3
    def allocate_next_jobs(self):
        '''Assign cached jobs to free ranks, send them out, and return True if any were launched'''
        self.refresh_job_cache()
        send_requests = []
        pre_assignments = defaultdict(list)
        min_packing_count = 1

        for job in self.job_cache:
            if job.node_packing_count < min_packing_count: continue
            job_occ = 1.0 / job.node_packing_count

            free_ranks = (i for i in range(1, comm.size)
                          if self.node_occupancy[i] + job_occ < 1.0001)
            rank = next(free_ranks, None)

            if rank is None:
                logger.debug(f'no free ranks to assign {job.cute_id}')
                min_packing_count = job.node_packing_count + 1
            else:
                pre_assignments[rank].append(job)
                self.pre_assign(rank, job)

        if len(pre_assignments) == 0: return False

        to_acquire = [
            job.pk for rank in pre_assignments for job in pre_assignments[rank]
        ]
        acquired_pks = self.job_source.acquire(to_acquire)
        logger.info(
            f'Acquired lock on {len(acquired_pks)} out of {len(to_acquire)} jobs marked for running'
        )

        # Make actual assignment:
        for (rank, pre_jobs) in pre_assignments.items():
            runjobs = []
            for j in pre_jobs:
                if j.pk in acquired_pks:
                    runjobs.append(j)
                    self.job_cache.remove(j)
                else:
                    self.revert_assign(rank, j.pk)

            if runjobs:
                mpiReq = self._send_jobs(runjobs, rank)
                logger.info(
                    f"Sent {len(runjobs)} jobs to rank {rank}: occupancy is now {self.node_occupancy[rank]}"
                )
                send_requests.append(mpiReq)

        BalsamJob.batch_update_state(acquired_pks, 'RUNNING', self.RUN_MESSAGE)
        logger.debug("allocate_next_jobs: waiting on all isends...")
        MPI.Request.waitall(send_requests)
        logger.debug("allocate_next_jobs: all isends completed.")
        return len(acquired_pks) > 0
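The occupancy bookkeeping above is fractional: a job with node_packing_count = k claims 1/k of a node, and the 1.0001 bound lets a node fill to exactly 1.0 despite floating-point error. A worked instance of the arithmetic (values illustrative, not part of the scheduler):

# Three jobs with node_packing_count=4 already occupy 0.75 of a node.
node_occupancy = 3 * (1.0 / 4)
job_occ = 1.0 / 4
# A fourth such job still fits: 0.75 + 0.25 = 1.0 < 1.0001
assert node_occupancy + job_occ < 1.0001
# A fifth would not: 1.0 + 0.25 = 1.25
assert not (node_occupancy + 2 * job_occ < 1.0001)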
Example 4
def create_jobs(N):
    """If we're on a command line, create N tasks to square a number"""
    for i in range(N):
        job = BalsamJob(
            name=f"square{i}",
            workflow="demo-square",
            application="square",
        )
        job.data["x"] = i
        job.save()
    print(f"Created {N} jobs")
Example 5
def update_states_from_cache(job_cache):
    # Update states of fast-forwarded jobs
    update_jobs = defaultdict(list)
    failed_jobs = []
    for job in job_cache:
        if job.state != job.__old_state:
            job.__old_state = job.state
            if job.state != 'FAILED': update_jobs[job.state].append(job.pk)
            else: failed_jobs.append(job)

    if failed_jobs: fail_update(failed_jobs)
    for newstate, joblist in update_jobs.items():
        BalsamJob.batch_update_state(joblist, newstate)
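The function assumes every cached job carries an __old_state attribute recording the last state written to the DB. A minimal sketch of how the cache might be primed before fast-forwarding (the helper is an assumption; only the attribute name comes from the snippet):

def prime_cache(jobs):
    # Remember each job's last-persisted state so that
    # update_states_from_cache can detect changes later.
    for job in jobs:
        job.__old_state = job.state
    return jobs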
Example 6
    def exit(self):
        outstanding_job_pks = list(self.manager.running_locations.keys())
        num_timeout = len(outstanding_job_pks)
        logger.info(
            f"Shutting down with {num_timeout} jobs still running...timing out")
        BalsamJob.batch_update_state(outstanding_job_pks, 'RUN_TIMEOUT',
                                     'timed out in MPI Ensemble')
        self.manager.job_source.release_all_owned()
        self.manager.send_exit()
        logger.debug("Send_exit: master done")
        logger.info("master calling MPI Finalize")
        MPI.Finalize()
        logger.info("ensemble master exiting gracefully")
        sys.exit(0)
Example 7
    def _create_balsam_task(self, x):
        args = f"'{self.encode(x)}'"
        envs = f"KERAS_BACKEND={self.KERAS_BACKEND}:KMP_BLOCK_TIME=0"

        ranks_per_node = self.num_ranks_per_node
        threads_per_rank = self.num_threads_per_rank

        # override the CLI-provided values with per-point values from x
        if "hyperparameters" in x:
            if "ranks_per_node" in x["hyperparameters"]:
                ranks_per_node = x["hyperparameters"]["ranks_per_node"]
                threads_per_rank = self.num_threads_per_node // ranks_per_node

        resources = {
            "num_nodes": self.num_nodes_per_eval,
            "ranks_per_node": ranks_per_node,
            "threads_per_rank": threads_per_rank,
            "threads_per_core": 2,
            "node_packing_count": self.num_evals_per_node,
            "cpu_affinity": "depth",
        }

        for key in resources:
            if key in x:
                resources[key] = x[key]

        task = BalsamJob(application=self.appName,
                         args=args,
                         environ_vars=envs,
                         **resources)
        return task
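For illustration, an input point that exercises the override logic might look like this (key names taken from the snippet above; values invented):

x = {
    "hyperparameters": {"ranks_per_node": 4},  # replaces the CLI ranks_per_node
    "num_nodes": 2,                            # replaces resources["num_nodes"]
}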
Example 8
def add_task(point):
    job = BalsamJob(
        application=app_name,
        args=shlex.quote(json.dumps(point, cls=JSONEncoder)),
        num_nodes=1,
        ranks_per_node=1,
    )
    return job
Example 9
def add_task(point):
    job = BalsamJob(
        application=app_name,
        data={'point': to_encodable(point)},
        num_nodes=1,
        ranks_per_node=1,
    )
    return job
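Both variants return an unsaved BalsamJob, leaving persistence to the caller. A hypothetical driver:

# Build one task per point, then persist each to the Balsam DB.
points = [{"x": 0.5}, {"x": 1.5}]
for point in points:
    add_task(point).save()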
Example 10
    def perform_updates(self, update_msgs):
        start_pks = []
        done_pks = []
        error_msgs = []

        for msg in update_msgs:
            if msg == 'exit': continue
            start_pks.extend(uuid.UUID(pk) for pk in msg['started'])  # pk list
            done_pks.extend(uuid.UUID(pk) for pk in msg['done'])  # pk list
            error_msgs.extend(msg['error'])  # list: (pk, retcode, tail)

        if start_pks:
            BalsamJob.batch_update_state(start_pks, 'RUNNING')
            logger.info(f"StatusUpdater marked {len(start_pks)} RUNNING")
        if done_pks:
            BalsamJob.batch_update_state(done_pks, 'RUN_DONE', release=True)
            logger.info(f"StatusUpdater marked {len(done_pks)} DONE")
        if error_msgs:
            self._handle_errors(error_msgs)
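For reference, the shape of one update message, as implied by the field accesses above (values illustrative):

example_msg = {
    "started": ["f47ac10b-58cc-4372-a567-0e02b2c3d479"],  # pk strings of jobs that began
    "done": [],                                           # pk strings of finished jobs
    "error": [],                                          # (pk, retcode, tail) tuples
}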
Example 11
def clone(job, **kwargs):
    assert isinstance(job, BalsamJob)
    new_job = BalsamJob()

    exclude_fields = '''_state objects source state tick user_workdir
    lock state_history job_id'''.split()
    fields = [f for f in job.__dict__ if f not in exclude_fields]

    for f in fields:
        new_job.__dict__[f] = job.__dict__[f]
    assert new_job.pk != job.pk

    for k, v in kwargs.items():
        try:
            job._meta.get_field(k)  # raises FieldDoesNotExist for unknown names
        except Exception:
            raise ValueError(f"Invalid field name: {k}")
        else:
            new_job.__dict__[k] = v
    return new_job
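Usage is then a matter of passing field overrides as keyword arguments and saving the copy; a sketch with illustrative values:

original = BalsamJob.objects.get(name="square0")   # standard Django lookup
retry = clone(original, name="square0-retry")
retry.save()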
Example 12
def pre_submit(problem, run, workflow):
    """Validate command line; prepare apps"""
    from balsam.core.models import BalsamJob

    validate(problem, run, workflow)
    print("Bootstrapping apps...", end="", flush=True)
    bootstrap_apps()
    print("OK")

    job = BalsamJob(name=workflow, workflow=workflow)
    return job
Example 13
def new_job(name, workdir, workflow_tag):
    '''Create a new BalsamJob object *without* saving it to DB'''
    return BalsamJob(
        name=name,
        user_workdir=workdir,  # the job will run inside this directory
        workflow=workflow_tag,
        application=APPNAME,
        num_nodes=NNODES,
        ranks_per_node=RPN,
        threads_per_rank=TPR,
        cpu_affinity='depth',
    )
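Because the job comes back unsaved, callers can build many and persist them in one round trip. A sketch using Django's standard bulk_create (assuming no custom save() logic needs to run):

jobs = [new_job(f"run{i}", f"/projects/demo/run{i}", "demo") for i in range(10)]
BalsamJob.objects.bulk_create(jobs)  # one INSERT for all ten jobs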
Example 14
    def _create_balsam_task(self, x):
        args = f"'{self.encode(x)}'"
        envs = f"KERAS_BACKEND={self.KERAS_BACKEND}"
        # envs = ":".join(f'KERAS_BACKEND={self.KERAS_BACKEND} OMP_NUM_THREADS=62 KMP_BLOCKTIME=0 KMP_AFFINITY=\"granularity=fine,compact,1,0\"'.split())
        resources = {
            'num_nodes': 1,
            'ranks_per_node': 1,
            'threads_per_rank': 64,
            'node_packing_count': self.WORKERS_PER_NODE,
        }
        for key in resources:
            if key in x:
                resources[key] = x[key]

        task = BalsamJob(application=self.appName,
                         args=args,
                         environ_vars=envs,
                         **resources)
        return task
Example 15
    def _create_balsam_task(self, x):
        args = f"'{self.encode(x)}'"
        envs = f"KERAS_BACKEND={self.KERAS_BACKEND}"
        resources = {
            "num_nodes": self.num_nodes_per_eval,
            "ranks_per_node": self.num_ranks_per_node,
            "threads_per_rank": self.num_threads_per_rank,
            "node_packing_count": self.num_evals_per_node
        }

        for key in resources:
            if key in x:
                resources[key] = x[key]

        task = BalsamJob(application=self.appName,
                         args=args,
                         environ_vars=envs,
                         **resources)
        return task
Example 16
def run_migrations():
    from django.core.management import call_command
    from balsam.django_config.db_index import refresh_db_index
    setup()
    print(f"DB settings:", settings.DATABASES['default'])
    call_command('makemigrations', interactive=True, verbosity=2)
    call_command('migrate', interactive=True, verbosity=2)
    refresh_db_index()
    try:
        from balsam.core.models import BalsamJob
        j = BalsamJob()
        j.save()
        j.delete()
    except Exception:
        print("BalsamJob table not properly created")
        raise
    else:
        print("BalsamJob table created successfully")
Example 17
def add_job(name,
            workflow,
            application,
            description='',
            args='',
            num_nodes=1,
            ranks_per_node=1,
            cpu_affinity='depth',
            threads_per_rank=1,
            threads_per_core=1,
            environ_vars=None,
            data=None,
            save=True,
            **kwargs):
    '''Add a new job to the BalsamJob DB

    Creates a new job and, when ``save`` is True, stores it in the database
    in the CREATED state. Any BalsamJob field not covered by the named
    arguments can be supplied through ``kwargs``.

    Args:
        - ``kwargs`` (*dict*): additional BalsamJob fields (keys) and the
          values to set on the new instance.

    Returns:
        - ``job`` (*BalsamJob*): the newly-created BalsamJob instance

    Raises:
        - ``ValueError``: if an invalid field name is provided in *kwargs*
    '''
    job = BalsamJob()
    job.name = name
    job.workflow = workflow
    job.application = application
    job.description = description
    job.args = args
    job.num_nodes = num_nodes
    job.ranks_per_node = ranks_per_node
    job.threads_per_rank = threads_per_rank
    job.threads_per_core = threads_per_core
    job.cpu_affinity = cpu_affinity
    job.environ_vars = environ_vars if environ_vars is not None else {}
    job.data = data if data else dict()
    job.get_application()

    for k, v in kwargs.items():
        setattr(job, k, v)

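    # current_job is module-level state, not shown in this snippet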
    if current_job:
        job.queued_launch = current_job.queued_launch
    if save:
        job.save()
    return job
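A minimal call, assuming an application named "square" is already registered (all values illustrative):

job = add_job(
    name="square7",
    workflow="demo-square",
    application="square",
    data={"x": 7},
)
print(job.state)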
Example 18
import os
from itertools import product

# The top of this import was truncated in the original snippet; the module
# name below is a placeholder for wherever these benchmark constants live.
from benchmark_config import (  # hypothetical module name
    NUM_NODES,
    RPN,
    TRIALS,
    COMMON_PARAMS,
    BENCHMARK_SCRIPTS,
)
from balsam.core.models import BalsamJob, ApplicationDefinition


RELEASE_PATH = os.environ['RELEASE_PATH']
PYTHON = os.path.join(RELEASE_PATH, 'env', 'bin', 'python')

for script_path in BENCHMARK_SCRIPTS:
    executable = ' '.join((PYTHON, script_path))
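    # derive the app name from the filename: ".../osu_latency.py" -> "latency"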
    app_name = script_path[script_path.find('osu_') + 4:-3]
    app, created = ApplicationDefinition.objects.get_or_create(
        name=app_name,
        defaults=dict(
            executable=executable,
        )
    )
    for (num_nodes, rpn, trial) in product(NUM_NODES, RPN, TRIALS):
        job = BalsamJob(
            name=f"{num_nodes}nodes.{rpn}rpn.{trial}",
            workflow=f"{app_name}",
            application=app_name,
            num_nodes=num_nodes,
            ranks_per_node=rpn,
            **COMMON_PARAMS,
        )
        job.save()