def spawn_inference_job(num_nodes, wall_time_minutes, name, workflow, dimension,
                        args=None, **kwargs):
    # There are two optional inputs here.
    # First, args can be passed in completed form, which is useful for re-spawning
    # an inference job that continues a previous job.
    # Second, args can be built from kwargs. Any arg not supplied falls back to the
    # default values in the FLAGS class, so it's a YMMV kind of situation.
    # TODO: verify kwargs work
    if args is None:
        args = build_arg_list(kwargs)

    if dimension == '2D':
        app = 'event-ID-2D-inference'
    else:
        app = 'event-ID-3D-inference'

    job = dag.add_job(name=name,
                      workflow=workflow,
                      description='Inference job for resnet {}'.format(dimension),
                      num_nodes=num_nodes,
                      ranks_per_node=2,
                      threads_per_rank=1,
                      environ_vars="PYTHONPATH:\"\"",
                      wall_time_minutes=wall_time_minutes,
                      args=args,
                      application=app)
    return job
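# A usage sketch for spawn_inference_job (a minimal example; the workflow name,
# node counts, and the batch_size kwarg are hypothetical - valid kwargs depend
# on the FLAGS defaults consumed by build_arg_list):
job = spawn_inference_job(num_nodes=4,
                          wall_time_minutes=60,
                          name='resnet_inference_0',
                          workflow='event_id_inference',
                          dimension='3D',
                          batch_size=64)  # example kwarg forwarded to build_arg_list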
def _eval_exec(self, x):
    jobname = f"task{self.counter}"
    args = f"'{self.encode(x)}'"
    envs = f"KERAS_BACKEND={self.KERAS_BACKEND}"
    # envs = ":".join(f'KERAS_BACKEND={self.KERAS_BACKEND} OMP_NUM_THREADS=62 KMP_BLOCKTIME=0 KMP_AFFINITY=\"granularity=fine,compact,1,0\"'.split())

    # Default Balsam resources; any matching key in the point x overrides these
    resources = {
        'num_nodes': 1,
        'ranks_per_node': 1,
        'threads_per_rank': 64,
        'node_packing_count': self.WORKERS_PER_NODE,
    }
    for key in resources:
        if key in x:
            resources[key] = x[key]

    # Inherit the workflow name from the parent job when running inside Balsam
    if dag.current_job is not None:
        wf = dag.current_job.workflow
    else:
        wf = self.appName

    task = dag.add_job(name=jobname, workflow=wf,
                       application=self.appName,
                       args=args, environ_vars=envs,
                       **resources)
    logger.debug(f"Created job {jobname}")
    logger.debug(f"Args: {args}")
    future = FutureTask(task, self._on_done, fail_callback=self._on_fail)
    future.task_args = args
    return future
def _eval_exec(self, x):
    jobname = f"task{self.counter}"
    # args = f"'{self.encode(x)}'"
    args = self.problem.args_format(x.values())
    pb_res = self.problem.resources
    envs = x.get('env') or pb_res.get('env') or ''

    # Resolve each resource in priority order: the evaluated point x,
    # then the problem-level defaults, then a hard-coded fallback
    resources = {
        'num_nodes': x.get('num_nodes') or pb_res.get('num_nodes') or 1,
        'ranks_per_node': x.get('ranks_per_node') or pb_res.get('ranks_per_node') or 1,
        'threads_per_rank': x.get('threads_per_rank') or pb_res.get('threads_per_rank') or 64,
        'threads_per_core': x.get('threads_per_core') or pb_res.get('threads_per_core') or 1,
        'cpu_affinity': x.get('cpu_affinity') or pb_res.get('cpu_affinity') or 'none',
        'node_packing_count': self.WORKERS_PER_NODE,
    }
    # Re-apply any resource keys given explicitly in x
    for key in resources:
        if key in x:
            resources[key] = x[key]

    # Inherit the workflow name from the parent job when running inside Balsam
    if dag.current_job is not None:
        wf = dag.current_job.workflow
    else:
        wf = self.appName

    task = dag.add_job(name=jobname, workflow=wf,
                       application=self.appName,
                       args=args, environ_vars=envs,
                       **resources)
    logger.debug(f"Created job {jobname}")
    logger.debug(f"Args: {args}")
    future = FutureTask(task, self._on_done, fail_callback=self._on_fail)
    future.task_args = args
    return future
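# The or-chains above resolve each resource with precedence: the point x, then
# the problem defaults, then a hard-coded fallback. A minimal standalone
# illustration (plain dicts, no Balsam needed); note that falsy values such as
# 0 fall through to the next level, so 0 is not a usable override here:
x = {'num_nodes': 2}
pb_res = {'num_nodes': 4, 'ranks_per_node': 8}
num_nodes = x.get('num_nodes') or pb_res.get('num_nodes') or 1                  # -> 2 (from x)
ranks_per_node = x.get('ranks_per_node') or pb_res.get('ranks_per_node') or 1   # -> 8 (from pb_res)
threads_per_rank = x.get('threads_per_rank') or pb_res.get('threads_per_rank') or 64  # -> 64 (fallback)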
except Exception as e:
    print(e)
    # Raising a bare string is a TypeError in Python 3; raise a real exception
    raise RuntimeError("Cannot make simulation directory %s" % sim_path)

MPI.COMM_WORLD.Barrier()  # Ensure the output dir is created

print("Host job rank is %d Output dir is %s" % (myrank, sim_input_dir))

start = time.time()
for sim_id in range(steps):
    jobname = 'outfile_t1_for_sim_id_' + str(sim_id) + '_ranks_' + str(myrank) + '.txt'

    current_job = dag.add_job(name=jobname,
                              workflow="libe_workflow",
                              application="helloworld",
                              application_args=str(sleep_time),
                              num_nodes=1,
                              ranks_per_node=8,
                              stage_out_url="local:" + sim_path,
                              stage_out_files=jobname + ".out")

    success = poll_until_state(current_job, 'JOB_FINISHED')  # OR job killed
    if success:
        print("Completed job: %s rank=%d time=%f"
              % (jobname, myrank, time.time() - start))
    else:
        print("Task not completed: %s rank=%d time=%f Status"
              % (jobname, myrank, time.time() - start), current_job.state)

end = time.time()
print("Done: rank=%d time=%f" % (myrank, end - start))
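# poll_until_state is defined elsewhere; a minimal sketch of what such a helper
# might look like, assuming Balsam's Django-model job objects (refresh_from_db()
# and .state come from the model API; the timeout and delay values are arbitrary):
import time

def poll_until_state(job, state, timeout_sec=300, delay=3):
    """Return True once `job` reaches `state`; False on failure/kill or timeout."""
    start = time.time()
    while time.time() - start < timeout_sec:
        job.refresh_from_db()        # Re-read the job record from the database
        if job.state == state:
            return True
        if job.state in ('USER_KILLED', 'FAILED'):
            return False
        time.sleep(delay)
    return False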
# Get the number of events based on an input file
nevents = getNumberEvents(join_args_full, input_event_list)
# Increment the total event count
tot_events = tot_events + nevents
print(join_args_full, " ", nevents)

workflow = "uboone_beamoff_run1_midscale"

mergeFinal_job = dag.add_job(name=f"joinedFinal_{i}",
                             workflow=workflow,
                             description="joining final output files",
                             num_nodes=1,
                             ranks_per_node=1,
                             node_packing_count=node_pack_count,
                             args=join_args,
                             wall_time_minutes=50,
                             application="join_art_rootfiles")

for ievent in range(nevents):
    reco1_args = f"-c {reco1_fcl} -s {_file} -n1 --nskip {ievent} -o %ifb_event{ievent}_reco1.root"
    celltree_args = f"-c {celltree_fcl} -s *reco1.root"
    larcv_args = f"-c {larcv_fcl} -s *postwcct.root"
    reco1a_args = f"-c {reco1a_fcl} -s *postdl.root"
    reco2_args = f"-c {reco2_fcl} -s *r1a.root"
    reco2_post_args = f"-c {reco2_post_fcl} -s *reco2.root"

    reco1_job = dag.add_job(name=f"reco1_{i}_{ievent}",
                            workflow=workflow,
application = APPLICATIONNAME
workflow = APPLICATIONNAME + "_Runsof_Total_{}_jobs".format(len(Num_nodes))

for i, node in enumerate(Num_nodes):
    model_name = 'BNN_Nodes_{}_Run_ID_{}'.format(node, i)
    args = generic_params.format(PathPythonCode, Data_Dir, model_name)
    print(args)
    job = dag.add_job(name=f'{application}_node{node}_BNNRun_ID_{i}',
                      workflow=workflow,
                      description=f'Run with {node} nodes',
                      num_nodes=node,
                      ranks_per_node=1,
                      threads_per_rank=128,
                      threads_per_core=2,
                      cpu_affinity='depth',
                      args=args,
                      application=application)
    # Record the node count and run ID on the job so results can be grouped later
    job.data['node'] = node
    job.data['ID'] = i
    job.save()

# Print a suggested balsam submission command to run all of these jobs
print("Example of a balsam submission command to run all of these jobs:")
print("balsam submit-launch -n <num_nodes> -q <queue> -t <time> -A <account> "
      "--wf-filter {} --job-mode mpi".format(APPLICATIONNAME))
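# The node/ID metadata saved via job.data above can later be used to group
# results. A sketch of such a query (assumes the Balsam 0.x Django model
# exposed as dag.BalsamJob, as used elsewhere in these scripts):
for j in dag.BalsamJob.objects.filter(workflow=workflow):
    print(j.name, j.data['node'], j.data['ID'], j.state)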
nevents = getNumberEvents(join_args_full, input_event_list)
# Increment the total event count
tot_events = tot_events + nevents
print(join_args_full, " ", nevents)
# print(join_args)

workflow = "beamoff_chain_run1"

mergeFinal_job = dag.add_job(name=f"joinedFinal_{i}",
                             workflow=workflow,
                             description="joining final output files",
                             num_nodes=1,
                             ranks_per_node=1,
                             node_packing_count=node_pack_count,
                             args=join_args,
                             wall_time_minutes=1,
                             application="join_art_rootfiles_preproc")

for ievent in range(nevents):
    beamoff_args = f"{_file} {ievent} {ts_string}"
    beamoff_job = dag.add_job(name=f"beamoff_{i}_{ievent}",
                              workflow=workflow,
                              description="uboone full beam off chain",
                              num_nodes=1,
                              ranks_per_node=1,
def submit(self, calc_type, num_procs=None, num_nodes=None,
           ranks_per_node=None, machinefile=None, app_args=None,
           stdout=None, stderr=None, stage_inout=None,
           hyperthreads=False, dry_run=False, wait_on_run=False,
           extra_args=None):
    """Create a new task and either execute it or schedule it for
    execution in the executor.

    The created task object is returned.
    """
    app = self.default_app(calc_type)

    # Specific to this class
    if machinefile is not None:
        logger.warning("machinefile arg ignored - not supported in Balsam")

    jassert(num_procs or num_nodes or ranks_per_node,
            "No procs/nodes provided - aborting")

    # extra_args analysis is not done here - could pick up self.mpi_runner,
    # but it is possible that Balsam finds a different runner.
    if self.auto_resources:
        num_procs, num_nodes, ranks_per_node = \
            self.resources.get_resources(
                num_procs=num_procs,
                num_nodes=num_nodes,
                ranks_per_node=ranks_per_node,
                hyperthreads=hyperthreads)
    else:
        num_procs, num_nodes, ranks_per_node = \
            MPIResources.task_partition(num_procs, num_nodes, ranks_per_node)

    if stdout is not None or stderr is not None:
        logger.warning("Balsam does not currently accept a stdout "
                       "or stderr name - ignoring")
        stdout = None
        stderr = None

    # Will be possible to override with an arg when implemented
    # (or can have an option to let Balsam assign)
    default_workdir = os.getcwd()

    task = BalsamTask(app, app_args, default_workdir,
                      stdout, stderr, self.workerID)

    add_task_args = {'name': task.name,
                     'workflow': self.workflow_name,
                     'user_workdir': default_workdir,
                     'application': app.name,
                     'args': task.app_args,
                     'num_nodes': num_nodes,
                     'ranks_per_node': ranks_per_node,
                     'mpi_flags': extra_args}

    if stage_inout is not None:
        # For now hardcode staging - for testing
        add_task_args['stage_in_url'] = "local:" + stage_inout + "/*"
        add_task_args['stage_out_url'] = "local:" + stage_inout
        add_task_args['stage_out_files'] = "*.out"

    if dry_run:
        task.dry_run = True
        # Log key=value pairs (joining the dict directly would show only its keys)
        logger.info('Test (No submit) Runline: {}'.format(
            ' '.join('{}={}'.format(k, v) for k, v in add_task_args.items())))
        task.set_as_complete()
    else:
        task.process = dag.add_job(**add_task_args)

        if wait_on_run:
            self._wait_on_run(task)

        if not task.timer.timing:
            task.timer.start()
            task.submit_time = task.timer.tstart  # Time not date - may not need if using timer

        logger.info("Added task to Balsam database {}: "
                    "nodes {} ppn {}".format(task.name, num_nodes, ranks_per_node))

        # task.workdir = task.process.working_directory  # Might not be set yet!
        self.list_of_tasks.append(task)

    return task
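# A usage sketch for submit() from a libEnsemble user function (the executor
# handle `exctr` and the registered 'sim' app are assumptions for illustration):
#
#   exctr = Executor.executor   # the executor instance registered at startup
#   task = exctr.submit(calc_type='sim', num_procs=8,
#                       app_args='input.txt', wait_on_run=True)
#   # task.process is the Balsam job created by dag.add_job above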
# Get the number of events based on an input file
nevents = getNumberEvents(join_args_full, input_event_list)
# Increment the total event count
tot_events = tot_events + nevents
print(join_args_full, " ", nevents)

workflow = "uboone_beamoff_run1_combined_container"

mergeFinal_job = dag.add_job(name=f"joinedFinal_{i}",
                             workflow=workflow,
                             description="joining final output files",
                             num_nodes=1,
                             ranks_per_node=1,
                             node_packing_count=node_pack_count,
                             args=join_args,
                             wall_time_minutes=50,
                             application="join_art_rootfiles")

for ievent in range(nevents):
    v01b_args = f"{_file} {ievent}"
    v27_args = " "
    v01b_job = dag.add_job(name=f"v01b_{i}_{ievent}",
                           workflow=workflow,
                           description="uboone testing v08_00_00_01b chain",
                           num_nodes=1,
                           ranks_per_node=1,
def launch(self, calc_type, num_procs=None, num_nodes=None,
           ranks_per_node=None, machinefile=None, app_args=None,
           stdout=None, stderr=None, stage_inout=None,
           hyperthreads=False, test=False, wait_on_run=False):
    """Create a new job and either launch it or schedule it to launch
    in the job controller.

    The created job object is returned.
    """
    app = self.default_app(calc_type)

    # Need a test somewhere for when no breakdown is supplied, or only a machinefile

    # Specific to this class
    if machinefile is not None:
        logger.warning("machinefile arg ignored - not supported in Balsam")

    jassert(num_procs or num_nodes or ranks_per_node,
            "No procs/nodes provided - aborting")

    # Set num_procs, num_nodes and ranks_per_node for this job.
    # Without resource detection:
    # num_procs, num_nodes, ranks_per_node = JobController.job_partition(
    #     num_procs, num_nodes, ranks_per_node)  # Note: machinefile option not included

    # With resource detection (may do only if under-specified? Though that will not
    # tell if larger than possible for static allocation - but Balsam does allow
    # dynamic allocation if too large). For now allow the user to specify - but
    # the default is True.
    if self.auto_resources:
        num_procs, num_nodes, ranks_per_node = \
            self.resources.get_resources(
                num_procs=num_procs,
                num_nodes=num_nodes,
                ranks_per_node=ranks_per_node,
                hyperthreads=hyperthreads)
    else:
        # Without resource detection (note: machinefile option not included)
        num_procs, num_nodes, ranks_per_node = \
            MPIResources.job_partition(num_procs, num_nodes, ranks_per_node)

    # Temporary - while Balsam does not accept a standard output name
    if stdout is not None or stderr is not None:
        logger.warning("Balsam does not currently accept a stdout "
                       "or stderr name - ignoring")
        stdout = None
        stderr = None

    # Will be possible to override with an arg when implemented
    # (or can have an option to let Balsam assign)
    default_workdir = os.getcwd()

    job = BalsamJob(app, app_args, default_workdir,
                    stdout, stderr, self.workerID)

    # Not used with Balsam for run-time, as that would include wait time.
    # Again considering changing launch to submit.
    # job.launch_time = time.time()  # Not good for timing the job - we don't
    # know when it finishes - only poll/kill estimates.

    add_job_args = {'name': job.name,
                    'workflow': "libe_workflow",      # add arg for this
                    'user_workdir': default_workdir,  # add arg for this
                    'application': app.name,
                    'args': job.app_args,
                    'num_nodes': num_nodes,
                    'ranks_per_node': ranks_per_node}

    if stage_inout is not None:
        # For now hardcode staging - for testing
        add_job_args['stage_in_url'] = "local:" + stage_inout + "/*"
        add_job_args['stage_out_url'] = "local:" + stage_inout
        add_job_args['stage_out_files'] = "*.out"

    job.process = dag.add_job(**add_job_args)

    if wait_on_run:
        self._wait_on_run(job)

    if not job.timer.timing:
        job.timer.start()
        job.launch_time = job.timer.tstart  # Time not date - may not need if using timer

    logger.info("Added job to Balsam database {}: "
                "nodes {} ppn {}".format(job.name, num_nodes, ranks_per_node))

    # job.workdir = job.process.working_directory  # Might not be set yet!
    self.list_of_jobs.append(job)
    return job
    script = sys.argv[1]
else:
    script = default_script

# print("script is", script)
script_basename = os.path.splitext(script)[0]  # Strip the .py extension
app_name = script_basename + '.app'

# Add the app if it's not already registered
AppDef = models.ApplicationDefinition
app_exists = AppDef.objects.filter(name__contains=app_name)
if not app_exists:
    app_path = sys.executable + ' ' + script
    app_desc = 'Test ' + script
    add_app(app_name, app_path, app_desc)

# Delete existing jobs
del_jobs()

# Add the job
job = dag.add_job(name='job_' + script_basename,
                  workflow="libe_workflow",  # add arg for this
                  application=app_name,
                  # application_args=job.app_args,
                  num_nodes=1,
                  ranks_per_node=1,
                  stage_in_url="local:/" + stage_in,
                  stage_out_url="local:/" + stage_in,  # same as stage-in
                  stage_out_files="*.out")
def mock_addjobs():
    job1 = dag.add_job(name="added1")
    job2 = dag.add_job(name="added2")
    job3 = dag.add_job(name="added3")
    dag.add_dependency(parent=job2, child=job3)
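# add_dependency can also express a fan-in, where one child waits on several
# parents. A sketch using the same dag API as above (job names are arbitrary):
def mock_fan_in():
    parents = [dag.add_job(name=f"gen{i}") for i in range(3)]
    join_job = dag.add_job(name="join")
    for parent in parents:
        # join_job will not become ready until every parent finishes
        dag.add_dependency(parent=parent, child=join_job)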
from balsam.launcher import dag
import os
import subprocess
import glob

node_pack_count = 64

# ---------------------------------------------------------------------------
workflow = "curl_testing"

curl_job = dag.add_job(name="curl1",
                       workflow=workflow,
                       description="curl test",
                       num_nodes=1,
                       ranks_per_node=1,
                       node_packing_count=node_pack_count,
                       args="",
                       wall_time_minutes=50,
                       application="curl_test")
from mpi4py import MPI
import balsam.launcher.dag as dag

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

job_name = f"hello{rank}"
dag.add_job(name=job_name,
            workflow="test",
            application="hello",
            num_nodes=1,
            ranks_per_node=1)
print(f"Rank {rank} added job: success")
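# Each rank registers its own job, so the script itself is meant to run under
# MPI, e.g. (launcher name and rank count are site-specific assumptions):
#   mpiexec -n 4 python this_script.py
# which adds hello0 ... hello3 to the Balsam database.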
# Add test apps and jobs, setting them up to run one at a time
prev_job_name = None
for job in job_list:
    app_name = os.path.splitext(job)[0]
    app_path = os.path.join(work_dir, job)
    app_desc = 'Run ' + app_name
    run_line = sys.executable + ' ' + app_path
    add_app(app_name, run_line, app_desc)

    job_name = 'job_' + app_name
    dag.add_job(name=job_name,
                workflow="libe_workflow",
                application=app_name,
                num_nodes=num_nodes,
                ranks_per_node=ranks_per_node,
                stage_out_url="local:" + work_dir,
                stage_out_files=job_name + ".out")

    # Add a dependency between consecutive jobs so they run one at a time
    if prev_job_name:
        BalsamJob = dag.BalsamJob
        parent = BalsamJob.objects.get(name=prev_job_name)
        child = BalsamJob.objects.get(name=job_name)
        dag.add_dependency(parent, child)

    prev_job_name = job_name

# Check how to do this in the API - until then use the CLI
run_cmd("balsam ls apps", True)
# Number of files to generate and number of events per file
# Populate the database
# Don't make more jobs than necessary:
n_jobs = int(100 * n_nodes * node_pack_count)

# This is the workflow name
workflow = f"array_add_{n_nodes}_node_core_{node_pack_count}"

# Loop over jobs, adding one database entry per job
for i_job in range(n_jobs):
    empty_job = dag.add_job(
        name=f"array_add_{i_job}_{n_nodes}_{node_pack_count}",  # The name of the job in the database
        workflow=workflow,
        description="empty application for serial testing",     # A description of what this job is
        num_nodes=1,                         # Number of nodes each job needs
        ranks_per_node=1,                    # The number of ranks per node
        node_packing_count=node_pack_count,  # This is set to 64
        wall_time_minutes=2,                 # Wall time of job
        application="array_add"              # The name of the application
    )

print(f"Loaded {n_jobs} jobs into the database under workflow {workflow}")
print("To launch these jobs, run:")
print(f"balsam submit-launch -n {n_nodes} -t 30 --job-mode serial --wf-filter {workflow} -A datascience -q default")
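# A worked example of the sizing above (assuming n_nodes = 2):
#   node_pack_count = 64 jobs share each node, so 2 * 64 = 128 run at once,
#   and n_jobs = 100 * 2 * 64 = 12800 keeps the launcher busy for ~100 "waves"
#   of concurrent jobs.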
tot_events = tot_events + nevents
print(join_args_full, " ", nevents)
# print(join_args)

workflow_timestamp = "beamon_chain_run1_timestamp"
workflow_main = "beamon_chain_run1"
workflow_join = "beamon_chain_run1_join"

timestamp_args = f"{_file}"

timestamp_job = dag.add_job(name=f"timestamp_{i}",
                            workflow=workflow_timestamp,
                            description="Container that gets the timestamps for the event",
                            num_nodes=1,
                            ranks_per_node=1,
                            node_packing_count=node_pack_count,
                            args=timestamp_args,
                            wall_time_minutes=15,
                            application="GetTimestampFile")

mergeFinal_job = dag.add_job(name=f"joinedFinal_{i}",
                             workflow=workflow_join,
                             description="joining final output files",
                             num_nodes=1,
                             ranks_per_node=1,
                             node_packing_count=node_pack_count,
                             args=join_args,
                             wall_time_minutes=10,
                             application="join_art_rootfiles")
# Print the file
print("File: ", ifile)

# Loop over the events for this file; the index is used for the event number,
# so it must count from 1
for ievent in range(1, n_events + 1):
    # Offset the run number by 1 million to avoid overlap with FermiGrid production
    irun = ifile + 1000000
    MCP2_0_args = f"{ifile} {irun} {ievent}"
    MCP2_0_job = dag.add_job(
        name=f"gen_long_{ifile}_{ievent}",   # The name of the job in the database
        workflow=workflow,
        description="cosmics generation stage only",  # A description of what this job is
        num_nodes=1,                         # Number of nodes each job needs
        ranks_per_node=1,                    # The number of ranks per node
        node_packing_count=node_pack_count,  # This is set to 64
        args=MCP2_0_args,                    # The arguments to the application (the bash script being run)
        wall_time_minutes=2,                 # Wall time of job
        application="cosmics_gen_stage"      # The name of the application
    )

print("Total number of events to be generated: ", tot_events)
print("To launch these jobs, run:")
print(f"balsam submit-launch -n {n_nodes} -t 30 --job-mode serial --wf-filter {workflow} -A datascience -q default")