def __init__(self, app=None, app_args=None, workdir=None, stdout=None, stderr=None, workerid=None):
    """Instantiate a new BalsamJob instance.

    A new BalsamJob object is created with an id, status and configuration
    attributes.  This will normally be created by the job_controller on a
    launch.
    """
    # The Balsam service may later provide its own working directory; until
    # then the supplied workdir is passed straight through to the base class.
    Job.__init__(self, app, app_args, workdir, stdout, stderr, workerid)
def test_poll_job_with_no_launch():
    """Polling an invalid object, or a job never launched, must raise.

    NOTE(review): this module defines test_poll_job_with_no_launch twice;
    this earlier Register-based version is shadowed by the later definition
    and never collected by pytest - confirm which copy should be kept.
    """
    from libensemble.controller import Job
    print("\nTest: {}\n".format(sys._getframe().f_code.co_name))
    setup_job_controller()
    jobctl = JobController.controller

    # Polling something that is not a Job object must fail.
    # Fixed: bare 'except:' narrowed to 'except Exception' so that
    # KeyboardInterrupt/SystemExit are not silently swallowed.
    try:
        jobctl.poll('myjob')
    except Exception:
        assert 1
    else:
        assert 0

    # Create a job directly with no launch (not supported for users);
    # polling it must also fail.
    registry = Register.default_registry
    myapp = registry.sim_default_app
    job1 = Job(app=myapp, stdout='stdout.txt')
    try:
        jobctl.poll(job1)
    except Exception:
        assert 1
    else:
        assert 0
def test_poll_job_with_no_launch():
    """A job constructed directly (never launched) cannot be polled."""
    from libensemble.controller import Job
    print("\nTest: {}\n".format(sys._getframe().f_code.co_name))
    setup_job_controller()
    jobctl = JobController.controller

    # Build a job by hand - bypassing launch is unsupported for users.
    job1 = Job(app=jobctl.sim_default_app, stdout='stdout.txt')
    try:
        job1.poll()
    except JobControllerException as e:
        # Characters 35-36 of the message hold the per-job counter,
        # so the assertion matches the text on either side of it.
        msg = e.args[0]
        assert msg[:35] == 'Polled job job_my_simjob.x.simfunc_'
        assert msg[37:] == ' has no process ID - check jobs been launched'
    else:
        assert 0
def test_kill_job_with_no_launch():
    """Killing an invalid object, or a job never launched, must raise."""
    from libensemble.controller import Job
    print("\nTest: {}\n".format(sys._getframe().f_code.co_name))
    setup_job_controller()
    jobctl = JobController.controller

    # A non-Job argument is rejected outright.
    try:
        jobctl.kill('myjob')
    except JobControllerException as e:
        assert e.args[0] == 'Invalid job has been provided'
    else:
        assert 0

    # A hand-built job (not supported for users) has no process ID,
    # so kill must refuse it. Characters 47-48 of the message hold the
    # per-job counter, so the assertion matches around it.
    job1 = Job(app=jobctl.sim_default_app, stdout='stdout.txt')
    try:
        jobctl.kill(job1)
    except JobControllerException as e:
        msg = e.args[0]
        assert msg[:47] == 'Attempting to kill job job_my_simjob.x.simfunc_'
        assert msg[49:] == ' that has no process ID - check jobs been launched'
    else:
        assert 0
def test_job_funcs():
    """Exercise Job helper methods: workdir/stdout existence checks and file reads.

    Runs in a scratch directory which is always removed afterwards.
    """
    dummyappname = os.getcwd() + '/myapp.x'
    registry = Register()
    jobctrl = JobController(registry=registry, auto_resources=False)
    registry.register_calc(full_path=dummyappname, calc_type='gen', desc='A dummy calc')
    registry.register_calc(full_path=dummyappname, calc_type='sim', desc='A dummy calc')

    dirname = 'dir_jobc_tests'
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    os.chdir(dirname)
    myworkdir = os.getcwd()

    # Fixed: teardown now runs in a finally block so a failing assertion no
    # longer leaves the cwd changed and the scratch directory behind.
    try:
        # A job must be given an app - creating one without must raise.
        jc_triggered = False
        try:
            Job(workdir=myworkdir, stdout='stdout.txt')
        except JobControllerException:
            jc_triggered = True
        assert jc_triggered, "Failed to raise exception if create job with no app"

        # No workdir specified: every existence check returns False.
        dummyapp = registry.gen_default_app
        job1 = Job(app=dummyapp, stdout='stdout.txt')
        assert not job1.workdir_exists()
        assert not job1.stdout_exists()
        assert not job1.file_exists_in_workdir('running_output.txt')

        # Properly specified job: the workdir exists, the files do not yet.
        job2 = Job(app=dummyapp, workdir=myworkdir, stdout='stdout.txt')
        assert job2.workdir_exists()
        assert not job2.stdout_exists()
        assert not job2.file_exists_in_workdir('running_output.txt')

        # Reading files that do not exist raises ValueError.
        valerr_triggered = False
        try:
            job2.read_stdout()
        except ValueError:
            valerr_triggered = True
        assert valerr_triggered

        valerr_triggered = False
        try:
            job2.read_file_in_workdir('running_output.txt')
        except ValueError:
            valerr_triggered = True
        assert valerr_triggered

        # Now create the files and check the positive paths.
        with open("stdout.txt", "w") as f:
            f.write('This is stdout')
        with open("running_output.txt", "w") as f:
            f.write('This is running output')

        assert job2.stdout_exists()
        assert job2.file_exists_in_workdir('running_output.txt')
        assert 'This is stdout' in job2.read_stdout()
        assert 'This is running output' in job2.read_file_in_workdir('running_output.txt')

        # A workdir path that does not exist is reported as such.
        job2.workdir = job2.workdir + '/bubbles'
        assert not job2.workdir_exists()
    finally:
        os.chdir('../')
        shutil.rmtree(dirname)
def launch(self, calc_type, num_procs=None, num_nodes=None,
           ranks_per_node=None, machinefile=None, app_args=None,
           stdout=None, stderr=None, stage_inout=None,
           hyperthreads=False, test=False, wait_on_run=False):
    """Creates a new job, and either launches or schedules launch.

    The created job object is returned.

    Parameters
    ----------
    calc_type: String
        The calculation type: 'sim' or 'gen'
    num_procs: int, optional
        The total number of MPI tasks on which to launch the job.
    num_nodes: int, optional
        The number of nodes on which to launch the job.
    ranks_per_node: int, optional
        The ranks per node for this job.
    machinefile: string, optional
        Name of a machinefile for this job to use.
    app_args: string, optional
        A string of the application arguments to be added to job
        launch command line.
    stdout: string, optional
        A standard output filename.
    stderr: string, optional
        A standard error filename.
    stage_inout: string, optional
        A directory to copy files from. Default will take from
        current directory.
    hyperthreads: boolean, optional
        Whether to launch MPI tasks to hyperthreads
    test: boolean, optional
        Whether this is a test - No job will be launched. Instead
        runline is printed to logger (At INFO level).
    wait_on_run: boolean, optional
        Whether to wait for job to be polled as RUNNING (or other
        active/end state) before continuing.

    Returns
    -------
    job: obj: Job
        The lauched job object.

    Note that if some combination of num_procs, num_nodes and
    ranks_per_node are provided, these will be honored if possible. If
    resource detection is on and these are omitted, then the available
    resources will be divided amongst workers.
    """
    app = self.default_app(calc_type)
    default_workdir = os.getcwd()
    job = Job(app, app_args, default_workdir, stdout, stderr, self.workerID)

    if stage_inout is not None:
        logger.warning("stage_inout option ignored in this "
                       "job_controller - runs in-place")

    # Build the run line: MPI command + resource specs + app + app args.
    mpi_specs = self._get_mpi_specs(num_procs, num_nodes, ranks_per_node,
                                    machinefile, hyperthreads)
    runline = launcher.form_command(self.mpi_command, mpi_specs)
    runline.append(job.app.full_path)
    if job.app_args is not None:
        runline.extend(job.app_args.split())

    if test:
        logger.info('Test (No launch) Runline: {}'.format(' '.join(runline)))
    else:
        # aprun/srun manage their own process handling; other launchers are
        # started in a new session so the whole group can be signalled.
        subgroup_launch = True
        if self.mpi_launch_type in ['aprun', 'srun']:
            subgroup_launch = False

        retry_count = 0
        while retry_count < self.max_launch_attempts:
            retry = False
            try:
                retry_string = " (Retry {})".format(retry_count) if retry_count > 0 else ""
                logger.info("Launching job {}{}: {}".format(
                    job.name, retry_string, " ".join(runline)))
                # NOTE(review): the file objects opened for stdout/stderr are
                # never closed by this parent process - confirm whether
                # launcher.launch takes ownership of them (ResourceWarning
                # risk otherwise).
                job.process = launcher.launch(
                    runline, cwd='./',
                    stdout=open(job.stdout, 'w'),
                    stderr=open(job.stderr, 'w'),
                    start_new_session=subgroup_launch)
            except Exception as e:
                logger.warning(
                    'job {} launch command failed on try {} with error {}'.
                    format(job.name, retry_count, e))
                retry = True
                retry_count += 1
            else:
                if (wait_on_run):
                    self._wait_on_run(job, self.fail_time)

                if job.state == 'FAILED':
                    logger.warning(
                        'job {} failed within fail_time on try {} with err code {}'
                        .format(job.name, retry_count, job.errcode))
                    retry = True
                    retry_count += 1

            if retry and retry_count < self.max_launch_attempts:
                # Bug fix: this message previously logged the literal '{}'
                # placeholders because .format() was never called.
                logger.debug('Retry number {} for job {}'.format(retry_count, job.name))
                time.sleep(retry_count * 5)
                # Note: Some cases may require user cleanup - currently
                # not supported (could use callback). Do not reset the job
                # unless a retry will actually happen.
                job.reset()
            else:
                break

        if not job.timer.timing:
            job.timer.start()
            job.launch_time = job.timer.tstart  # Time not date - may not need if using timer.

        self.list_of_jobs.append(job)

    return job