def resub(directory='in place'):  # Takes a directory, resubmits errors, scf failures, and spin contaminated cases
    configure_dict = manager_io.read_configure(directory, None)
    max_resub = configure_dict['max_resub']
    max_jobs = configure_dict['max_jobs']
    hard_job_limit = configure_dict['hard_job_limit']
    hit_queue_limit = False  # Describes if this run has limited the number of jobs submitted to work well with the queue

    # Get the state of all jobs being managed by this instance of the job manager
    completeness = moltools.check_completeness(directory, max_resub, configure_dict=configure_dict)
    print("completeness: ", completeness)
    errors = completeness['Error']  # These are calculations which failed to complete
    scf_errors = completeness['SCF_Error']  # These are calculations which failed to complete, appear to have an scf error, and hit wall time
    oscillating_scf_errors = completeness['oscillating_scf_errors']  # These are calculations which failed to complete and appear to have an oscillating scf error
    need_resub = completeness['Needs_resub']  # These are calculations with level shifts changed or hfx exchange changed
    spin_contaminated = completeness['Spin_contaminated']  # These are finished jobs with spin contaminated solutions
    active = completeness['Active']  # These are jobs which are currently running
    thermo_grad_error = completeness['Thermo_grad_error']  # These are thermo jobs encountering the thermo grad error
    waiting = completeness['Waiting']  # These are jobs which are or were waiting for another job to finish before continuing.
    bad_geos = completeness['Bad_geos']  # These are jobs which finished, but converged to a bad geometry.
    finished = completeness['Finished']
    molscontrol_kills = completeness['molscontrol_kills']
    nactive = tools.get_number_active()  # number of active jobs, counting bundled jobs as a single job

    # Kill SCF errors in progress, which are wasting computational resources
    all_scf_errors = completeness['SCF_Errors_Including_Active']  # These are all jobs which appear to have an scf error, including active ones
    scf_errors_to_kill = [scf_err for scf_err in all_scf_errors if scf_err not in scf_errors]
    names_to_kill = [os.path.split(scf_err)[-1].rsplit('.', 1)[0] for scf_err in scf_errors_to_kill]
    kill_jobs(names_to_kill,
              message1='Job: ',
              message2=' appears to have an scf error. Killing this job early')

    # Prep derivative jobs such as thermo single points, vertical IP, and ligand dissociation energies
    needs_derivative_jobs = list(filter(tools.check_original, finished))
    print("needs_derivative_jobs: ", needs_derivative_jobs)
    prep_derivative_jobs(directory, needs_derivative_jobs)

    resubmitted = []  # Resubmitted list gets True if the job is submitted or False if not. Contains booleans, not job identifiers.
    for job in molscontrol_kills:
        print("killed by molscontrol: ", job)

    # Resub unidentified errors
    for error in errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        resub_tmp = recovery.simple_resub(error)
        if resub_tmp:
            print(('Unidentified error in job: ' + os.path.split(error)[-1] + ' -Resubmitting'))
            print('')
        resubmitted.append(resub_tmp)

    # Resub oscillating_scf convergence errors
    for error in oscillating_scf_errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'scf' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_oscillating_scf(error)
            if resub_tmp:
                print(('Oscillating SCF error identified in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting with adjusted precision and grid.'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub scf convergence errors
    for error in scf_errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'scf' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_scf(error)
            if resub_tmp:
                print(('SCF error identified in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting with adjusted levelshifts'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub jobs which converged to bad geometries with additional constraints
    for error in bad_geos:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'bad_geo' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_bad_geo(error, directory)
            if resub_tmp:
                print(('Bad final geometry in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting from initial structure with additional constraints'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub spin contaminated cases
    for error in spin_contaminated:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'spin_contaminated' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_spin(error)
            if resub_tmp:
                print(('Spin contamination identified in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting with adjusted HFX'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub jobs with atypical parameters used to aid convergence
    for error in need_resub:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        resub_tmp = recovery.clean_resub(error)
        if resub_tmp:
            print(('Job ' + os.path.split(error)[-1] + ' needs to be rerun with typical parameters. -Resubmitting'))
            print('')
        resubmitted.append(resub_tmp)

    # Create a job with a tighter convergence threshold for failed thermo jobs
    for error in thermo_grad_error:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'thermo_grad_error' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_tighter(error)
            if resub_tmp:
                print(('Job ' + os.path.split(error)[-1]
                       + ' needs a better initial geo. Creating a geometry run with tighter convergence criteria'))
                print('')
            resubmitted.append(resub_tmp)

    # Look at jobs in "waiting," resume them if the job they were waiting for is finished
    # Currently, this should only ever be thermo jobs waiting for an ultratight job
    for waiting_dict in waiting:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        if len(list(waiting_dict.keys())) > 1:
            raise Exception('Waiting job list improperly constructed')
        job = list(waiting_dict.keys())[0]
        waiting_for = waiting_dict[job]
        if waiting_for in finished:
            history = recovery.load_history(job)
            history.waiting = None
            history.save()
            results_for_this_job = manager_io.read_outfile(job)
            if results_for_this_job['thermo_grad_error']:
                resubmitted.append(recovery.resub_thermo(job))
            else:
                raise Exception('A method for resuming job: ' + job + ' is not defined')
        else:
            resubmitted.append(False)

    # Submit jobs which haven't yet been submitted
    if not (((nactive + np.sum(resubmitted)) >= max_jobs) or (
            (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit)):
        to_submit = []
        jobscripts = tools.find('*_jobscript')
        active_jobs = tools.list_active_jobs(home_directory=directory, parse_bundles=True)
        for job in jobscripts:
            if not os.path.isfile(job.rsplit('_', 1)[0] + '.out') and not os.path.split(
                    job.rsplit('_', 1)[0])[-1] in active_jobs:
                to_submit.append(job)

        short_jobs_to_submit = [i for i in to_submit if tools.check_short_single_point(i)]
        long_jobs_to_submit = [i for i in to_submit if i not in short_jobs_to_submit]
        if len(short_jobs_to_submit) > 0:
            bundled_jobscripts = tools.bundle_jobscripts(os.getcwd(), short_jobs_to_submit)
        else:
            bundled_jobscripts = []
        to_submit = bundled_jobscripts + long_jobs_to_submit

        submitted = []
        for job in to_submit:
            if ((len(submitted) + nactive + np.sum(resubmitted)) >= max_jobs) or (
                    (tools.get_total_queue_usage() + len(submitted) + np.sum(resubmitted)) >= hard_job_limit):
                hit_queue_limit = True
                continue
            print(('Initial submission for job: ' + os.path.split(job)[-1]))
            tools.qsub(job)
            submitted.append(True)
    else:
        hit_queue_limit = True
        submitted = []

    number_resubmitted = np.sum(np.array(resubmitted + submitted))
    # ~ print str(number_resubmitted)+' Jobs submitted'
    return int(number_resubmitted), int(len(completeness['Active'])), hit_queue_limit
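
# Illustrative driver loop (a minimal sketch, not part of this module): resub() returns
# the number of jobs (re)submitted, the number still active, and whether the queue limit
# was hit, so a caller can poll it until nothing is running or waiting. The loop
# condition and sleep interval below are assumptions for illustration only.
#
#     while True:
#         number_resubmitted, number_active, hit_queue_limit = resub()
#         if number_resubmitted == 0 and number_active == 0 and not hit_queue_limit:
#             break
#         time.sleep(3600)  # hypothetical polling interval (seconds)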
def resume_run(args):
    print('**********************************************')
    print(args)
    ## change to run directory
    with switch_to_rundir(args.resume):
        print((os.getcwd()))
        if args.reps:
            reps = args.reps
        else:
            reps = 1
        its = 0
        while its < reps:
            GA_run = GA_run_definition()
            GA_run.deserialize(os.getcwd() + '/.madconfig')
            if args.post_all:
                GA_run.config['post_all'] = True
                print('NB: ALL post on')
                GA_run.serialize()
            else:
                GA_run.config['post_all'] = False
                print('NB: ALL post OFF')
                GA_run.serialize()
            path_dictionary = setup_paths()
            print((GA_run.config))
            if isKeyword('DFT'):
                print('DFT ON')
                ## update which jobs are live
                logger(path_dictionary['state_path'], '************************************************')
                logger(path_dictionary['state_path'], str(datetime.datetime.now()) + ' waking up...yawn')
                if isKeyword('job_manager'):
                    live_job_count = tools.get_total_queue_usage()
                else:
                    live_job_count = check_queue_for_live_jobs()
                logger(path_dictionary['state_path'],
                       str(datetime.datetime.now()) + ' monitoring, number of live jobs ' + str(live_job_count))
            ## wake the run
            logger(path_dictionary['state_path'], str(datetime.datetime.now()) + ' resuming MAD')
            wake_up_routine()
            if isKeyword('DFT'):
                ## send off outstanding jobs
                if isKeyword('job_manager'):
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' Enlisted job manager for help.')
                    os.chdir(path_dictionary['job_manager'])
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' Changed to the path of all job manager jobs.')
                    resub.main()
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' Job manager has terminated.')
                    os.chdir(isKeyword('rundir'))
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' Switching back to the base rundir.')
                else:
                    joblist = submit_outstanding_jobs()
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' tracking a total of ' + str(len(joblist)) + ' jobs')
                    live_job_count = check_queue_for_live_jobs()  # final check on live jobs
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' going back to sleep, number of live jobs ' + str(live_job_count))
                logger(path_dictionary['state_path'], str(datetime.datetime.now()) + ' going back to sleep')
                logger(path_dictionary['state_path'], '************************************************')
            else:
                print('------- DONE NOW (ANN VERSION)-----')
                if not isKeyword('runtype') == "redox":
                    try:
                        print('-----format_frequencies and format_distances being carried out')
                        format_frequencies()
                        format_distances()
                        print('Done with distances now.')
                    except:
                        print('Passed in ga_resume_run.')
                        pass
            its += 1
            if args.sleep:
                print(('sleeping for ' + str(args.sleep)))
                time.sleep(args.sleep)
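
# Illustrative invocation (a minimal sketch): resume_run() expects an argparse-style
# namespace. The flags below are assumptions that simply mirror the attributes read
# above (args.resume, args.reps, args.post_all, args.sleep); the package's real CLI
# parser is defined elsewhere.
#
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--resume', help='run directory to resume')
#     parser.add_argument('--reps', type=int, default=None, help='number of wake/resubmit cycles')
#     parser.add_argument('--post_all', action='store_true', help='force post-processing of all jobs')
#     parser.add_argument('--sleep', type=int, default=None, help='seconds to sleep after each cycle')
#     resume_run(parser.parse_args())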