Example #1
def spawn_inference_job(num_nodes, wall_time_minutes, name, workflow, dimension, args=None, **kwargs):

    # There are two optional inputs here.
    # First, args can be passed in fully built form, which is useful for re-spawning a job
    # that continues a previous one.

    # Second, args can be built from kwargs. Any argument not supplied falls back to the
    # default values in the FLAGS class, so results may vary.

    # TODO: verify kwargs work

    if args is None:
        args = build_arg_list(kwargs)

    if dimension == '2D':
        app = 'event-ID-2D-inference'
    else:
        app = 'event-ID-3D-inference'

    job = dag.add_job(
            name                = name,
            workflow            = workflow,
            description         = 'Inference job for resnet {}'.format(dimension),
            num_nodes           = num_nodes,
            ranks_per_node      = 2,
            threads_per_rank    = 1,
            environ_vars        = "PYTHONPATH:\"\"",
            wall_time_minutes   = wall_time_minutes,
            args                = args,
            application         = app
        )

    return job
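
The kwargs path relies on a build_arg_list helper that is not shown in this snippet. A minimal sketch of what such a helper might look like, assuming it simply renders each supplied override as a "--flag value" pair and leaves everything else to the FLAGS defaults (the real helper lives elsewhere in this project, so treat the names below as placeholders):

def build_arg_list(kwargs):
    # Hypothetical sketch: turn keyword overrides into command-line flags.
    # Anything not supplied here is left to the application's FLAGS defaults.
    return " ".join(f"--{key} {value}" for key, value in kwargs.items())

# Illustrative call (argument names and values are placeholders):
# job = spawn_inference_job(num_nodes=2, wall_time_minutes=60,
#                           name="resnet-infer-0", workflow="eventID",
#                           dimension="3D", batch_size=8)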
Example #2
    def _eval_exec(self, x):
        jobname = f"task{self.counter}"
        args = f"'{self.encode(x)}'"
        envs = f"KERAS_BACKEND={self.KERAS_BACKEND}"
        #envs = ":".join(f'KERAS_BACKEND={self.KERAS_BACKEND} OMP_NUM_THREADS=62 KMP_BLOCKTIME=0 KMP_AFFINITY=\"granularity=fine,compact,1,0\"'.split())
        resources = {
            'num_nodes': 1,
            'ranks_per_node': 1,
            'threads_per_rank': 64,
            'node_packing_count': self.WORKERS_PER_NODE,
        }
        for key in resources:
            if key in x:
                resources[key] = x[key]

        if dag.current_job is not None:
            wf = dag.current_job.workflow
        else:
            wf = self.appName
        task = dag.add_job(name=jobname,
                           workflow=wf,
                           application=self.appName,
                           args=args,
                           environ_vars=envs,
                           **resources)
        logger.debug(f"Created job {jobname}")
        logger.debug(f"Args: {args}")

        future = FutureTask(task, self._on_done, fail_callback=self._on_fail)
        future.task_args = args
        return future
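
FutureTask comes from the surrounding optimizer code rather than from balsam.launcher.dag. A minimal sketch of such a wrapper, assuming the Balsam job object is a Django model exposing refresh_from_db() and a state field as the other examples suggest; the real implementation may differ:

import time

class FutureTask:
    """Hypothetical future that resolves when the wrapped Balsam job finishes."""

    def __init__(self, task, on_done, fail_callback=None):
        self.task = task
        self.on_done = on_done
        self.fail_callback = fail_callback
        self.task_args = None

    def result(self, poll_seconds=5):
        # Poll the job state until Balsam marks it finished or failed.
        while True:
            self.task.refresh_from_db()
            if self.task.state == 'JOB_FINISHED':
                return self.on_done(self.task)
            if self.task.state == 'FAILED' and self.fail_callback is not None:
                return self.fail_callback(self.task)
            time.sleep(poll_seconds)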
Example #3
    def _eval_exec(self, x):
        jobname = f"task{self.counter}"
        # args = f"'{self.encode(x)}'"
        args = self.problem.args_format(x.values())
        pb_res = self.problem.resources
        envs = x.get('env') or pb_res.get('env') or ''
        resources = {
            'num_nodes': x.get('num_nodes') \
                or pb_res.get('num_nodes') \
                or 1,
            'ranks_per_node': x.get('ranks_per_node') \
                or pb_res.get('ranks_per_node') \
                or 1,
            'threads_per_rank': x.get('threads_per_rank') \
                or pb_res.get('threads_per_rank') \
                or 64,
            'threads_per_core': x.get('threads_per_core') \
                or pb_res.get('threads_per_core') \
                or 1,
            'cpu_affinity': x.get('cpu_affinity') \
                or pb_res.get('cpu_affinity') \
                or 'none',
            'node_packing_count': self.WORKERS_PER_NODE,
        }
        for key in resources:
            if key in x:
                resources[key] = x[key]

        if dag.current_job is not None:
            wf = dag.current_job.workflow
        else:
            wf = self.appName
        task = dag.add_job(name=jobname,
                           workflow=wf,
                           application=self.appName,
                           args=args,
                           environ_vars=envs,
                           **resources)
        logger.debug(f"Created job {jobname}")
        logger.debug(f"Args: {args}")

        future = FutureTask(task, self._on_done, fail_callback=self._on_fail)
        future.task_args = args
        return future
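
The repeated x.get(...) or pb_res.get(...) or default chains can be collapsed with a small helper. Note that the or idiom silently replaces legitimate falsy values such as 0 with the default; the sketch below treats only None as missing, which is a slight behaviour change:

def resolve(key, default, *sources):
    """Return the first non-None value of key from the given dicts, else default."""
    for src in sources:
        value = src.get(key)
        if value is not None:
            return value
    return default

# Usage, mirroring the resources dictionary above (x and pb_res come from the caller):
# resources = {
#     'num_nodes': resolve('num_nodes', 1, x, pb_res),
#     'ranks_per_node': resolve('ranks_per_node', 1, x, pb_res),
#     'threads_per_rank': resolve('threads_per_rank', 64, x, pb_res),
# }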
Example #4
        except Exception as e:
            print(e)
            raise ("Cannot make simulation directory %s" % sim_path)
MPI.COMM_WORLD.Barrier()  # Ensure output dir created

print("Host job rank is %d Output dir is %s" % (myrank, sim_input_dir))

start = time.time()
for sim_id in range(steps):
    jobname = 'outfile_t1_' + 'for_sim_id_' + str(sim_id) + '_ranks_' + str(
        myrank) + '.txt'

    current_job = dag.add_job(name=jobname,
                              workflow="libe_workflow",
                              application="helloworld",
                              application_args=str(sleep_time),
                              num_nodes=1,
                              ranks_per_node=8,
                              stage_out_url="local:" + sim_path,
                              stage_out_files=jobname + ".out")

    success = poll_until_state(current_job, 'JOB_FINISHED')  # OR job killed
    if success:
        print("Completed job: %s rank=%d time=%f" %
              (jobname, myrank, time.time() - start))
    else:
        print(
            "Task not completed: %s rank=%d time=%f Status" %
            (jobname, myrank, time.time() - start), current_job.state)

end = time.time()
print("Done: rank=%d  time=%f" % (myrank, end - start))
Example #5
    # Get the number of events based on an input file
    nevents = getNumberEvents(join_args_full, input_event_list)

    # Increment the total num events
    tot_events = tot_events + nevents

    print(join_args_full, "  ", nevents)

    workflow = f"uboone_beamoff_run1_midscale"

    mergeFinal_job = dag.add_job(name=f"joinedFinal_{i}",
                                 workflow=workflow,
                                 description="joining final outputfiles",
                                 num_nodes=1,
                                 ranks_per_node=1,
                                 node_packing_count=node_pack_count,
                                 args=join_args,
                                 wall_time_minutes=50,
                                 application="join_art_rootfiles")

    for ievent in range(nevents):
        reco1_args = f"-c {reco1_fcl} -s {_file} -n1 --nskip {ievent} -o %ifb_event{ievent}_reco1.root"
        celltree_args = f"-c {celltree_fcl} -s *reco1.root"
        larcv_args = f"-c {larcv_fcl} -s *postwcct.root"
        reco1a_args = f"-c {reco1a_fcl} -s *postdl.root"
        reco2_args = f"-c {reco2_fcl} -s *r1a.root"
        reco2_post_args = f"-c {reco2_post_fcl} -s *reco2.root"

        reco1_job = dag.add_job(name=f"reco1_{i}_{ievent}",
                                workflow=workflow,
Example #6
application = APPLICATIONNAME
workflow = APPLICATIONNAME + "_Runsof_Total_{}_jobs".format(len(Num_nodes))

for i, node in enumerate(Num_nodes):
    model_name = 'BNN_Nodes_{}_Run_ID_{}'.format(node, i)

    args = generic_params.format(PathPythonCode, Data_Dir, model_name)

    print(args)
    job = dag.add_job(name=f'{application}_node{node}_BNNRun_ID_{i}',
                      workflow=workflow,
                      description=f'Run for different {node}',
                      num_nodes=node,
                      ranks_per_node=1,
                      threads_per_rank=128,
                      threads_per_core=2,
                      cpu_affinity='depth',
                      args=args,
                      application=application)

    job.data['node'] = node
    job.data['ID'] = i
    job.save()

# Finally, print a suggested balsam submission command
print("Example of a balsam submission command to run all of these jobs: ")
print(
    "balsam submit-launch -n <num_nodes> -q <queue> -t <time> -A <account> --wf-filter {} --job-mode mpi"
    .format(APPLICATIONNAME))
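
Because each job stores its node count and run ID in job.data before saving, that metadata can be read back later through the same Django-style query interface used elsewhere in these examples. A sketch (field names assume Balsam 0.x, where BalsamJob is exposed on the dag module):

from balsam.launcher import dag

# Collect the per-job metadata saved above for every run that has finished.
for job in dag.BalsamJob.objects.filter(workflow=workflow, state='JOB_FINISHED'):
    print(job.name, job.data.get('node'), job.data.get('ID'))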
Example #7
    nevents = getNumberEvents(join_args_full, input_event_list)

    # Increment the total num events 
    tot_events = tot_events + nevents

    print(join_args_full,"  ", nevents)
    #print(join_args)

    workflow  = f"beamoff_chain_run1"

    mergeFinal_job = dag.add_job(
        name = f"joinedFinal_{i}",
        workflow = workflow,
        description = "joining final outputfiles",
        num_nodes = 1,
        ranks_per_node = 1,
        node_packing_count = node_pack_count,
        args = join_args,
        wall_time_minutes = 1,
        application= "join_art_rootfiles_preproc"
    )
    
    for ievent in range(nevents):
        beamoff_args  = f"{_file} {ievent} {ts_string}"

        beamoff_job = dag.add_job(
            name = f"beamoff_{i}_{ievent}",
            workflow = workflow,
            description = "uboone full beam off chain",
            num_nodes = 1,
            ranks_per_node = 1,
Example #8
    def submit(self, calc_type, num_procs=None, num_nodes=None,
               ranks_per_node=None, machinefile=None, app_args=None,
               stdout=None, stderr=None, stage_inout=None,
               hyperthreads=False, dry_run=False, wait_on_run=False,
               extra_args=None):
        """Creates a new task, and either executes or schedules to execute
        in the executor

        The created task object is returned.
        """
        app = self.default_app(calc_type)

        # Specific to this class
        if machinefile is not None:
            logger.warning("machinefile arg ignored - not supported in Balsam")
            jassert(num_procs or num_nodes or ranks_per_node,
                    "No procs/nodes provided - aborting")

        # Extra_args analysis not done here - could pick up self.mpi_runner but possible
        # that Balsam finds a different runner.
        if self.auto_resources:
            num_procs, num_nodes, ranks_per_node = \
                self.resources.get_resources(
                    num_procs=num_procs,
                    num_nodes=num_nodes, ranks_per_node=ranks_per_node,
                    hyperthreads=hyperthreads)
        else:
            num_procs, num_nodes, ranks_per_node = \
                MPIResources.task_partition(num_procs, num_nodes, ranks_per_node)

        if stdout is not None or stderr is not None:
            logger.warning("Balsam does not currently accept a stdout "
                           "or stderr name - ignoring")
            stdout = None
            stderr = None

        # Will be possible to override with arg when implemented
        # (or can have option to let Balsam assign)
        default_workdir = os.getcwd()
        task = BalsamTask(app, app_args, default_workdir,
                          stdout, stderr, self.workerID)

        add_task_args = {'name': task.name,
                         'workflow': self.workflow_name,
                         'user_workdir': default_workdir,
                         'application': app.name,
                         'args': task.app_args,
                         'num_nodes': num_nodes,
                         'ranks_per_node': ranks_per_node,
                         'mpi_flags': extra_args}

        if stage_inout is not None:
            # For now hardcode staging - for testing
            add_task_args['stage_in_url'] = "local:" + stage_inout + "/*"
            add_task_args['stage_out_url'] = "local:" + stage_inout
            add_task_args['stage_out_files'] = "*.out"

        if dry_run:
            task.dry_run = True
            logger.info('Test (No submit) Runline: {}'.format(' '.join(add_task_args)))
            task.set_as_complete()
        else:
            task.process = dag.add_job(**add_task_args)

            if (wait_on_run):
                self._wait_on_run(task)

            if not task.timer.timing:
                task.timer.start()
                task.submit_time = task.timer.tstart  # Time not date - may not need if using timer.

            logger.info("Added task to Balsam database {}: "
                        "nodes {} ppn {}".
                        format(task.name, num_nodes, ranks_per_node))

            # task.workdir = task.process.working_directory  # Might not be set yet!
        self.list_of_tasks.append(task)
        return task
Example #9
    # Get the number of events based on an input file
    nevents = getNumberEvents(join_args_full, input_event_list)

    # Increment the total num events
    tot_events = tot_events + nevents

    print(join_args_full, "  ", nevents)

    workflow = f"uboone_beamoff_run1_combined_container"

    mergeFinal_job = dag.add_job(name=f"joinedFinal_{i}",
                                 workflow=workflow,
                                 description="joining final outputfiles",
                                 num_nodes=1,
                                 ranks_per_node=1,
                                 node_packing_count=node_pack_count,
                                 args=join_args,
                                 wall_time_minutes=50,
                                 application="join_art_rootfiles")

    for ievent in range(nevents):
        v01b_args = f"{_file} {ievent}"
        v27_args = f" "

        v01b_job = dag.add_job(
            name=f"v01b_{i}_{ievent}",
            workflow=workflow,
            description="uboone testing v08_00_00_01b chain",
            num_nodes=1,
            ranks_per_node=1,
Example #10
    def launch(self,
               calc_type,
               num_procs=None,
               num_nodes=None,
               ranks_per_node=None,
               machinefile=None,
               app_args=None,
               stdout=None,
               stderr=None,
               stage_inout=None,
               hyperthreads=False,
               test=False,
               wait_on_run=False):
        """Creates a new job, and either launches or schedules to launch
        in the job controller

        The created job object is returned.
        """
        app = self.default_app(calc_type)

        # Need a test somewhere for the case where no breakdown is supplied,
        # or only a machinefile.

        # Specific to this class
        if machinefile is not None:
            logger.warning("machinefile arg ignored - not supported in Balsam")
            jassert(num_procs or num_nodes or ranks_per_node,
                    "No procs/nodes provided - aborting")

        # Set num_procs, num_nodes and ranks_per_node for this job

        # Without resource detection
        # num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node)  # Note: not included machinefile option

        # With resource detection (perhaps only when under-specified; this will not flag requests
        # larger than a static allocation allows, but Balsam does allow dynamic allocation if too large).
        # For now, allow the user to specify - the default is True.
        if self.auto_resources:
            num_procs, num_nodes, ranks_per_node = \
                self.resources.get_resources(
                    num_procs=num_procs,
                    num_nodes=num_nodes, ranks_per_node=ranks_per_node,
                    hyperthreads=hyperthreads)
        else:
            # Without resource detection (note: not included machinefile option)
            num_procs, num_nodes, ranks_per_node = \
                MPIResources.job_partition(num_procs, num_nodes, ranks_per_node)

        # temp - while balsam does not accept a standard out name
        if stdout is not None or stderr is not None:
            logger.warning("Balsam does not currently accept a stdout "
                           "or stderr name - ignoring")
            stdout = None
            stderr = None

        # Will be possible to override with arg when implemented
        # (or can have option to let Balsam assign)
        default_workdir = os.getcwd()
        job = BalsamJob(app, app_args, default_workdir, stdout, stderr,
                        self.workerID)

        # This is not used with Balsam for run time, as it would include queue wait time.
        # (Considering renaming launch to submit.)
        # job.launch_time = time.time()  # Not good for timing the job - the finish time is only known from poll/kill estimates.

        add_job_args = {
            'name': job.name,
            'workflow': "libe_workflow",  # add arg for this
            'user_workdir': default_workdir,  # add arg for this
            'application': app.name,
            'args': job.app_args,
            'num_nodes': num_nodes,
            'ranks_per_node': ranks_per_node
        }

        if stage_inout is not None:
            # For now hardcode staging - for testing
            add_job_args['stage_in_url'] = "local:" + stage_inout + "/*"
            add_job_args['stage_out_url'] = "local:" + stage_inout
            add_job_args['stage_out_files'] = "*.out"

        job.process = dag.add_job(**add_job_args)

        if (wait_on_run):
            self._wait_on_run(job)

        if not job.timer.timing:
            job.timer.start()
            job.launch_time = job.timer.tstart  # Time not date - may not need if using timer.

        logger.info("Added job to Balsam database {}: "
                    "nodes {} ppn {}".format(job.name, num_nodes,
                                             ranks_per_node))

        # job.workdir = job.process.working_directory  # Might not be set yet!!!!
        self.list_of_jobs.append(job)
        return job
Example #11
    script = sys.argv[1]
else:
    script = default_script

#print("script is", script)

script_basename = os.path.splitext(script)[0]  # strip the .py extension
app_name = script_basename + '.app'

# Add the app if it's not already there
AppDef = models.ApplicationDefinition
app_exists = AppDef.objects.filter(name__contains=app_name)
if not app_exists:
    app_path = sys.executable + ' ' + script
    app_desc = 'Test ' + script
    add_app(app_name, app_path, app_desc)

#Delete existing jobs
del_jobs()

#Add the job
job = dag.add_job(name = 'job_' + script_basename,
                  workflow = "libe_workflow", #add arg for this
                  application = app_name,
                  #application_args = job.app_args,
                  num_nodes = 1,
                  ranks_per_node = 1,
                  stage_in_url="local:/" + stage_in,
                  stage_out_url = "local:/" + stage_in, #same as in
                  stage_out_files = "*.out")
Example #12
def mock_addjobs():
    job1 = dag.add_job(name="added1")
    job2 = dag.add_job(name="added2")
    job3 = dag.add_job(name="added3")
    dag.add_dependency(parent=job2, child=job3)
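
The same two calls are enough to serialise a longer pipeline. A small sketch that builds a linear chain of jobs from a list of names, so each job waits for the previous one (the workflow and application names are placeholders):

from balsam.launcher import dag

def add_chain(names, workflow="chain_demo", application="hello"):
    """Create one job per name and make each job depend on the previous one."""
    previous = None
    for name in names:
        job = dag.add_job(name=name, workflow=workflow, application=application,
                          num_nodes=1, ranks_per_node=1)
        if previous is not None:
            dag.add_dependency(parent=previous, child=job)
        previous = job
    return previous

# add_chain(["stage1", "stage2", "stage3"])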
Example #13
from balsam.launcher import dag
import os
import subprocess
import glob

node_pack_count = 64
 
# ---------------------------------------------------------------------------------------------
workflow  = f"curl_testing"

mergeFinal_job = dag.add_job(
    name = f"curl1",
    workflow = workflow,
    description = "curl test",
    num_nodes = 1,
    ranks_per_node = 1,
    node_packing_count = node_pack_count,
    args = "",
    wall_time_minutes = 50,
    application= "curl_test")
Example #14
from mpi4py import MPI
import balsam.launcher.dag as dag

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

job_name = f"hello{rank}"
dag.add_job(name=job_name,
            workflow="test",
            application="hello",
            num_nodes=1,
            ranks_per_node=1)
print(f"Rank {rank} added job: success")
Example #15
# Add test apps and jobs - and set them to run one at a time
prev_job_name = None

for job in job_list:

    app_name = os.path.splitext(job)[0]
    app_path = os.path.join(work_dir, job)
    app_desc = 'Run ' + app_name
    run_line = sys.executable + ' ' + app_path
    add_app(app_name, run_line, app_desc)

    job_name = 'job_' + app_name
    dag.add_job(name=job_name,
                workflow="libe_workflow",
                application=app_name,
                num_nodes=num_nodes,
                ranks_per_node=ranks_per_node,
                stage_out_url="local:" + work_dir,
                stage_out_files=job_name + ".out")

    # Add dependency between jobs so run one at a time.
    if prev_job_name:
        BalsamJob = dag.BalsamJob
        parent = BalsamJob.objects.get(name=prev_job_name)
        child = BalsamJob.objects.get(name=job_name)
        dag.add_dependency(parent, child)

    prev_job_name = job_name

# Check how to do this via the API - until then, use the CLI
run_cmd("balsam ls apps", True)
Example #16
# number of files to generate and number of events per file

# populate database
# don't make more jobs than necessary:
n_jobs = int(100 * n_nodes * node_pack_count)

# This is the workflow name
workflow = f"array_add_{n_nodes}_node_core_{node_pack_count}"

# loop over the jobs to create
for i_job in range(n_jobs):

    empty_job = dag.add_job(
        name=f"array_add_{i_job}_{n_nodes}_{node_pack_count}",  # This will be the name of the job in the database
        workflow=workflow,
        description="empty application for serial testing",  # A description of what this job is
        num_nodes=1,  # Number of nodes each job needs
        ranks_per_node=1,  # The number of ranks per node
        node_packing_count=node_pack_count,  # This is set to 64
        wall_time_minutes=2,  # Wall time of the job
        application="array_add"  # The name of the application
    )

print(f"Loaded {n_jobs} into the database under workflow {workflow}")
print("To launch these jobs, run:")
print(
    f"balsam submit-launch -n {n_nodes} -t 30 --job-mode serial --wf-filter {workflow} -A datascience -q default"
)
Example #17
    tot_events = tot_events + nevents

    print(join_args_full, "  ", nevents)
    #print(join_args)

    workflow_timestamp = f"beamon_chain_run1_timestamp"
    workflow_main = f"beamon_chain_run1"
    workflow_join = f"beamon_chain_run1_join"

    timestamp_args = f"{_file}"

    timestamp_job = dag.add_job(
        name=f"timestamp_{i}",
        workflow=workflow_timestamp,
        description="Container that gets the timestamps for the event",
        num_nodes=1,
        ranks_per_node=1,
        node_packing_count=node_pack_count,
        args=timestamp_args,
        wall_time_minutes=15,
        application="GetTimestampFile")

    mergeFinal_job = dag.add_job(name=f"joinedFinal_{i}",
                                 workflow=workflow_join,
                                 description="joining final outputfiles",
                                 num_nodes=1,
                                 ranks_per_node=1,
                                 node_packing_count=node_pack_count,
                                 args=join_args,
                                 wall_time_minutes=10,
                                 application="join_art_rootfiles")
   
    # print the file
    print("File: ", ifile)


    # loop over events for the file; the index is used for the event number so it must count from 1
    for ievent in range(1, n_events + 1):

        # offset run number by 1 million to avoid overlap with fermigrid production
        irun = ifile + 1000000

        MCP2_0_args = f"{ifile} {irun} {ievent}"

        MCP2_0_job = dag.add_job(
            name = f"gen_long_{ifile}_{ievent}",                # This will be the name of the job in the database
            workflow = workflow, 
            description = "cosmics generation stage only",  # A description of what this job is
            num_nodes = 1,                                     # Number of nodes each job needs
            ranks_per_node = 1,                                # The number of ranks per node
            node_packing_count = node_pack_count,              # This is set to 64
            args = MCP2_0_args,                                # The arguments to the application (the bash script being run)
            wall_time_minutes = 2,                            # Wall time of job
            application= "cosmics_gen_stage"         # The name of the application
        )

print("Total number of events to be generated: ", tot_events)

print("To launch these jobs, run:")
print(f"balsam submit-launch -n {n_nodes} -t 30 --job-mode serial --wf-filter {workflow} -A datascience -q default")