Beispiel #1
0
    def __init__(self, app=None, app_args=None, workdir=None,
                 stdout=None, stderr=None, workerid=None):
        """Create a BalsamJob by delegating to the base ``Job`` initializer.

        Every argument is forwarded unchanged, in positional order, to
        ``Job.__init__``.  Instances are normally created by the
        job_controller as part of a launch rather than by user code.
        """
        # The workdir may eventually be overridden with a Balsam-provided
        # value when one exists; for now the caller's value is passed through.
        Job.__init__(
            self,
            app,
            app_args,
            workdir,
            stdout,
            stderr,
            workerid,
        )
Beispiel #2
0
def test_poll_job_with_no_launch():
    """Check that polling an invalid or never-launched job raises an error.

    Two cases are exercised: polling an object that is not a Job at all,
    and polling a Job that was constructed directly without a launch.
    """
    from libensemble.controller import Job
    print("\nTest: {}\n".format(sys._getframe().f_code.co_name))
    setup_job_controller()
    jobctl = JobController.controller

    # Try poll invalid job.
    # Catch Exception (not a bare except) so that KeyboardInterrupt and
    # SystemExit are never mistaken for the expected failure.
    invalid_rejected = False
    try:
        jobctl.poll('myjob')
    except Exception:
        invalid_rejected = True
    assert invalid_rejected, "Polling a non-Job object did not raise"

    # Create a job directly with no launch (Not supported for users)
    registry = Register.default_registry
    myapp = registry.sim_default_app
    job1 = Job(app=myapp, stdout='stdout.txt')
    unlaunched_rejected = False
    try:
        jobctl.poll(job1)
    except Exception:
        unlaunched_rejected = True
    assert unlaunched_rejected, "Polling an unlaunched job did not raise"
def test_poll_job_with_no_launch():
    """Polling a job that was never launched must raise JobControllerException.

    The job is built directly rather than through a launch (not supported
    for users), and the exception message is checked piecewise.
    """
    from libensemble.controller import Job
    test_name = sys._getframe().f_code.co_name
    print("\nTest: {}\n".format(test_name))
    setup_job_controller()
    controller = JobController.controller

    # Create a job directly with no launch (Not supported for users)
    unlaunched = Job(app=controller.sim_default_app, stdout='stdout.txt')
    try:
        unlaunched.poll()
    except JobControllerException as err:
        message = err.args[0]
        # Characters 35-36 are deliberately not compared; presumably they
        # hold a per-job identifier that varies between runs - confirm.
        assert message[:35] == 'Polled job job_my_simjob.x.simfunc_'
        assert message[37:] == ' has no process ID - check jobs been launched'
    else:
        # No exception at all means the check is missing.
        assert 0
def test_kill_job_with_no_launch():
    """Killing an invalid or never-launched job must raise JobControllerException.

    First a plain string is passed in place of a job, then a Job that was
    built directly without a launch (not supported for users).
    """
    from libensemble.controller import Job
    test_name = sys._getframe().f_code.co_name
    print("\nTest: {}\n".format(test_name))
    setup_job_controller()
    controller = JobController.controller

    # Try kill invalid job
    try:
        controller.kill('myjob')
    except JobControllerException as err:
        assert err.args[0] == 'Invalid job has been provided'
    else:
        assert 0

    # Create a job directly with no launch (Not supported for users)
    unlaunched = Job(app=controller.sim_default_app, stdout='stdout.txt')
    try:
        controller.kill(unlaunched)
    except JobControllerException as err:
        message = err.args[0]
        # Characters 47-48 are deliberately not compared; presumably they
        # hold a per-job identifier that varies between runs - confirm.
        assert message[:47] == 'Attempting to kill job job_my_simjob.x.simfunc_'
        assert message[49:] == ' that has no process ID - check jobs been launched'
    else:
        assert 0
Beispiel #5
0
def test_job_funcs():
    """Exercise the Job helper methods for workdir, stdout and file access.

    The test runs inside a scratch directory ``dir_jobc_tests`` created
    under the current working directory.  The original working directory
    is restored and the scratch directory removed even when an assertion
    fails, so a failure here cannot leave later tests running in the
    wrong directory.
    """
    start_dir = os.getcwd()
    dummyappname = os.path.join(start_dir, 'myapp.x')
    registry = Register()
    # The controller object is not used directly below; presumably it must
    # exist before Job objects can be created - confirm against Job.
    jobctrl = JobController(registry=registry, auto_resources=False)
    registry.register_calc(full_path=dummyappname, calc_type='gen', desc='A dummy calc')
    registry.register_calc(full_path=dummyappname, calc_type='sim', desc='A dummy calc')

    dirname = 'dir_jobc_tests'
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    os.chdir(dirname)
    try:
        myworkdir = os.getcwd()

        # First try no app - an exception must be raised.
        jc_triggered = False
        try:
            Job(workdir=myworkdir, stdout='stdout.txt')
        except JobControllerException:
            jc_triggered = True
        assert jc_triggered, "Failed to raise exception if create job with no app"

        # Now with no workdir specified
        dummyapp = registry.gen_default_app
        job1 = Job(app=dummyapp, stdout='stdout.txt')
        assert not job1.workdir_exists()
        assert not job1.stdout_exists()
        assert not job1.file_exists_in_workdir('running_output.txt')

        # Create job properly specified
        job2 = Job(app=dummyapp, workdir=myworkdir, stdout='stdout.txt')

        # Workdir does exist
        assert job2.workdir_exists()

        # Files do not exist yet
        assert not job2.stdout_exists()
        assert not job2.file_exists_in_workdir('running_output.txt')

        # Reading a missing stdout file must raise ValueError.
        valerr_triggered = False
        try:
            job2.read_stdout()
        except ValueError:
            valerr_triggered = True
        assert valerr_triggered

        # Reading a missing file in the workdir must raise ValueError.
        valerr_triggered = False
        try:
            job2.read_file_in_workdir('running_output.txt')
        except ValueError:
            valerr_triggered = True
        assert valerr_triggered

        # Now create files and check positive results
        with open("stdout.txt", "w") as f:
            f.write('This is stdout')
        with open("running_output.txt", "w") as f:
            f.write('This is running output')

        assert job2.stdout_exists()
        assert job2.file_exists_in_workdir('running_output.txt')
        assert 'This is stdout' in job2.read_stdout()
        assert 'This is running output' in job2.read_file_in_workdir('running_output.txt')

        # Check if workdir does not exist
        job2.workdir = os.path.join(job2.workdir, 'bubbles')
        assert not job2.workdir_exists()
    finally:
        # Always leave the scratch directory and delete it, pass or fail.
        os.chdir(start_dir)
        shutil.rmtree(dirname)
Beispiel #6
0
    def launch(self,
               calc_type,
               num_procs=None,
               num_nodes=None,
               ranks_per_node=None,
               machinefile=None,
               app_args=None,
               stdout=None,
               stderr=None,
               stage_inout=None,
               hyperthreads=False,
               test=False,
               wait_on_run=False):
        """Creates a new job, and either launches or schedules launch.

        The created job object is returned.

        Parameters
        ----------

        calc_type: String
            The calculation type: 'sim' or 'gen'

        num_procs: int, optional
            The total number of MPI tasks on which to launch the job.

        num_nodes: int, optional
            The number of nodes on which to launch the job.

        ranks_per_node: int, optional
            The ranks per node for this job.

        machinefile: string, optional
            Name of a machinefile for this job to use.

        app_args: string, optional
            A string of the application arguments to be added to job
            launch command line.

        stdout: string, optional
            A standard output filename.

        stderr: string, optional
            A standard error filename.

        stage_inout: string, optional
            A directory to copy files from. Ignored by this
            job_controller (a warning is logged) as jobs run in-place.

        hyperthreads: boolean, optional
            Whether to launch MPI tasks to hyperthreads

        test: boolean, optional
            Whether this is a test - No job will be launched. Instead
            runline is printed to logger (At INFO level).

        wait_on_run: boolean, optional
            Whether to wait for job to be polled as RUNNING (or other
            active/end state) before continuing.


        Returns
        -------

        job: obj: Job
            The launched job object.


        Note that if some combination of num_procs, num_nodes and
        ranks_per_node are provided, these will be honored if
        possible. If resource detection is on and these are omitted,
        then the available resources will be divided amongst workers.
        """

        app = self.default_app(calc_type)
        default_workdir = os.getcwd()
        job = Job(app, app_args, default_workdir, stdout, stderr,
                  self.workerID)

        if stage_inout is not None:
            logger.warning("stage_inout option ignored in this "
                           "job_controller - runs in-place")

        # Build the full command line: MPI launcher + app path + app args.
        mpi_specs = self._get_mpi_specs(num_procs, num_nodes, ranks_per_node,
                                        machinefile, hyperthreads)
        runline = launcher.form_command(self.mpi_command, mpi_specs)
        runline.append(job.app.full_path)
        if job.app_args is not None:
            runline.extend(job.app_args.split())

        if test:
            # Test mode: report the runline only; the job is neither
            # launched, timed, nor added to list_of_jobs.
            logger.info('Test (No launch) Runline: {}'.format(
                ' '.join(runline)))
            return job

        # aprun and srun are launched without a new session; every other
        # launch type gets start_new_session=True.
        subgroup_launch = self.mpi_launch_type not in ['aprun', 'srun']

        retry_count = 0
        while retry_count < self.max_launch_attempts:
            retry = False
            try:
                retry_string = " (Retry {})".format(
                    retry_count) if retry_count > 0 else ""
                logger.info("Launching job {}{}: {}".format(
                    job.name, retry_string, " ".join(runline)))

                # Open the output files in a with-block so the parent's
                # handles are closed after the launch call returns instead
                # of leaking one pair per attempt.
                with open(job.stdout, 'w') as out_file, \
                        open(job.stderr, 'w') as err_file:
                    job.process = launcher.launch(
                        runline,
                        cwd='./',
                        stdout=out_file,
                        stderr=err_file,
                        start_new_session=subgroup_launch)
            except Exception as e:
                logger.warning(
                    'job {} launch command failed on try {} with error {}'.
                    format(job.name, retry_count, e))
                retry = True
                retry_count += 1
            else:
                if wait_on_run:
                    self._wait_on_run(job, self.fail_time)

                if job.state == 'FAILED':
                    logger.warning(
                        'job {} failed within fail_time on try {} with err code {}'
                        .format(job.name, retry_count, job.errcode))
                    retry = True
                    retry_count += 1

            if retry and retry_count < self.max_launch_attempts:
                # Only reset the job when another attempt will follow.
                logger.debug('Retry number {} for job {}'.format(
                    retry_count, job.name))
                # Back off for longer after each failed attempt.
                time.sleep(retry_count * 5)
                # Note: Some cases may require user cleanup - currently
                # not supported (could use callback)
                job.reset()
            else:
                break

        if not job.timer.timing:
            job.timer.start()
            # Time not date - may not need if using timer.
            job.launch_time = job.timer.tstart

        self.list_of_jobs.append(job)

        return job