Example #1
def local_setup():
    threads_config = Config(
        executors=[
            HighThroughputExecutor(
                label="theta_htex",
                # worker_debug=True,
                cores_per_worker=4,
                address=address_by_hostname(),
                provider=CobaltProvider(
                    queue='debug-flat-quad',
                    account='CSC249ADCD01',
                    launcher=AprunLauncher(overrides="-d 64"),
                    worker_init='source activate parsl-issues',
                    init_blocks=1,
                    max_blocks=1,
                    min_blocks=1,
                    nodes_per_block=4,
                    cmd_timeout=60,
                    walltime='00:10:00',
                ),
            )
        ],
        monitoring=MonitoringHub(hub_address=address_by_hostname(),
                                 hub_port=55055,
                                 logging_level=logging.DEBUG,
                                 resource_monitoring_interval=10),
        strategy=None)
    parsl.load(threads_config)
def theta_nwchem_config(log_dir: str,
                        nodes_per_nwchem: int = 2,
                        total_nodes: int = int(
                            os.environ.get("COBALT_JOBSIZE", 1)),
                        ml_prefetch: int = 0) -> Config:
    """Theta configuration where QC workers sit on the launch node (to be able to aprun)
    and ML workers are placed on compute nodes

    Args:
        nodes_per_nwchem: Number of nodes per NWChem computation
        log_dir: Path to store monitoring DB and parsl logs
        total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE
        ml_prefetch: Number of tasks for ML workers to prefetch for inference
    Returns:
        (Config) Parsl configuration
    """
    assert total_nodes % nodes_per_nwchem == 0, "NWChem node count not a multiple of nodes per task"
    nwc_workers = total_nodes // nodes_per_nwchem

    return Config(
        executors=[
            ThreadPoolExecutor(label='qc', max_threads=nwc_workers),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml",
                max_workers=1,
                prefetch_capacity=ml_prefetch,
                provider=LocalProvider(
                    nodes_per_block=nodes_per_nwchem,  # Minimum increment in blocks
                    init_blocks=0,
                    max_blocks=total_nodes // nodes_per_nwchem,  # Limits the number of manager processes
                    launcher=AprunLauncher(
                        overrides='-d 256 --cc depth -j 4'
                    ),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
    ''',
                ),
            )
        ],
        monitoring=MonitoringHub(
            hub_address=address_by_hostname(),
            monitoring_debug=False,
            resource_monitoring_interval=10,
            logdir=log_dir,
            logging_endpoint=f'sqlite:///{os.path.join(log_dir, "monitoring.db")}'),
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.)
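A minimal usage sketch for a two-executor configuration like the one above (a hypothetical driver, not part of the original example): apps are pinned to the 'qc' or 'ml' executor by label, and the task bodies are placeholders.

import parsl
from parsl import python_app

parsl.load(theta_nwchem_config(log_dir='./logs', nodes_per_nwchem=2, total_nodes=4))

@python_app(executors=['qc'])
def run_nwchem(molecule):
    return molecule  # placeholder for an aprun/NWChem invocation

@python_app(executors=['ml'])
def run_inference(batch):
    return batch  # placeholder for an ML inference call

print(run_nwchem('H2O').result(), run_inference([1, 2, 3]).result())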
Example #3
def fresh_config():
    return Config(
        executors=[
            HighThroughputExecutor(
                label='theta_local_htex_multinode',
                max_workers=1,
                address=address_by_hostname(),
                provider=CobaltProvider(
                    queue=user_opts['theta']['queue'],
                    account=user_opts['theta']['account'],
                    launcher=AprunLauncher(overrides="-d 64"),
                    walltime='00:10:00',
                    nodes_per_block=2,
                    init_blocks=1,
                    max_blocks=1,
                    # string to prepend to #COBALT blocks in the submit
                    # script to the scheduler eg: '#COBALT -t 50'
                    scheduler_options='',
                    # Command to be run before starting a worker, such as:
                    # 'module load Anaconda; source activate parsl_env'.
                    worker_init=user_opts['theta']['worker_init'],
                    cmd_timeout=120,
                ),
            )
        ], )
Example #4
def test_parsl_htex_executor():
    parsl = pytest.importorskip("parsl", minversion="0.7.2")

    from parsl.providers import LocalProvider
    from parsl.channels import LocalChannel
    from parsl.executors import HighThroughputExecutor
    from parsl.addresses import address_by_hostname
    from parsl.config import Config
    parsl_config = Config(
        executors=[
            HighThroughputExecutor(
                label="coffea_parsl_default",
                address=address_by_hostname(),
                cores_per_worker=max(multiprocessing.cpu_count() // 2, 1),
                max_workers=1,
                provider=LocalProvider(channel=LocalChannel(),
                                       init_blocks=1,
                                       max_blocks=1,
                                       nodes_per_block=1),
            )
        ],
        strategy=None,
    )

    do_parsl_job(parsl_config)
Example #5
def configure_parsl(n_threads, monitoring, **kwargs):
    from parsl.config import Config
    from parsl.executors.threads import ThreadPoolExecutor
    from parsl.addresses import address_by_hostname

    if monitoring:
        from parsl.monitoring import MonitoringHub
        monitoring = MonitoringHub(
            hub_address=address_by_hostname(),
            hub_port=55055,
            logging_level=logging.INFO,
            resource_monitoring_interval=10,
        )
    else:
        monitoring = None

    local_threads = ThreadPoolExecutor(max_threads=n_threads,
                                       label='local_threads')
    config = Config(
        executors=[local_threads],
        monitoring=monitoring,
        strategy=None,
        app_cache=True,
    )
    return config
Example #6
def fresh_config():
    config = Config(
        executors=[
            HighThroughputExecutor(
                label="bw_htex",
                cores_per_worker=1,
                worker_debug=False,
                max_workers=1,
                address=address_by_hostname(),
                provider=TorqueProvider(
                    queue='normal',
                    launcher=AprunLauncher(overrides="-b -- bwpy-environ --"),
                    # string to prepend to #SBATCH blocks in the submit
                    # script to the scheduler eg: '#SBATCH --constraint=knl,quad,cache'
                    scheduler_options='',
                    # Command to be run before starting a worker, such as:
                    # 'module load Anaconda; source activate parsl_env'.
                    worker_init=user_opts['bluewaters']['worker_init'],
                    init_blocks=1,
                    max_blocks=1,
                    min_blocks=1,
                    nodes_per_block=2,
                    walltime='00:30:00',
                    cmd_timeout=120,
                ),
            )
        ], )
    return config
Example #7
def fresh_config():
    return Config(
        executors=[
            HighThroughputExecutor(
                label="frontera_htex",
                address=address_by_hostname(),
                max_workers=1,
                provider=SlurmProvider(
                    cmd_timeout=60,  # Add extra time for slow scheduler responses
                    channel=LocalChannel(),
                    nodes_per_block=2,
                    init_blocks=1,
                    min_blocks=1,
                    max_blocks=1,
                    partition='development',  # Replace with partition name
                    scheduler_options=user_opts['frontera']['scheduler_options'],

                    # Command to be run before starting a worker, such as:
                    # 'module load Anaconda; source activate parsl_env'.
                    worker_init=user_opts['frontera']['worker_init'],

                    # Ideally we set the walltime to the longest supported walltime.
                    walltime='00:10:00',
                    launcher=SrunLauncher(),
                ),
            )
        ], )
Example #8
def fresh_config():
    config = Config(
        executors=[
            HighThroughputExecutor(
                label='Midway_HTEX_multinode',
                worker_debug=False,
                address=address_by_hostname(),
                max_workers=1,
                provider=SlurmProvider(
                    'broadwl',  # Partition name, e.g 'broadwl'
                    launcher=SrunLauncher(),
                    nodes_per_block=2,
                    init_blocks=1,
                    min_blocks=1,
                    max_blocks=1,
                    # string to prepend to #SBATCH blocks in the submit
                    # script to the scheduler eg: '#SBATCH --constraint=knl,quad,cache'
                    scheduler_options='',
                    # Command to be run before starting a worker, such as:
                    # 'module load Anaconda; source activate parsl_env'.
                    worker_init=user_opts['midway']['worker_init'],
                    walltime='00:30:00',
                    cmd_timeout=120,
                ),
            )
        ], )
    return config
Example #9
def test_parsl_executor():
    parsl = pytest.importorskip("parsl", minversion="0.7.2")

    from coffea.processor import run_parsl_job

    from coffea.processor.parsl.detail import (_parsl_initialize, _parsl_stop)

    from parsl.providers import LocalProvider
    from parsl.channels import LocalChannel
    from parsl.executors import HighThroughputExecutor
    from parsl.addresses import address_by_hostname
    from parsl.config import Config
    parsl_config = Config(
        executors=[
            HighThroughputExecutor(
                label="coffea_parsl_default",
                address=address_by_hostname(),
                cores_per_worker=max(multiprocessing.cpu_count() // 2, 1),
                max_workers=1,
                provider=LocalProvider(channel=LocalChannel(),
                                       init_blocks=1,
                                       max_blocks=1,
                                       nodes_per_block=1),
            )
        ],
        strategy=None,
    )

    import os
    import os.path as osp

    filelist = {
        'ZJets': [osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
        'Data': [osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')]
    }
    treename = 'Events'

    from coffea.processor.test_items import NanoTestProcessor
    from coffea.processor.parsl.parsl_executor import parsl_executor

    dfk = _parsl_initialize(parsl_config)

    proc = NanoTestProcessor()

    hists = run_parsl_job(filelist,
                          treename,
                          processor_instance=proc,
                          executor=parsl_executor,
                          data_flow=dfk)

    _parsl_stop(dfk)

    assert (hists['cutflow']['ZJets_pt'] == 4)
    assert (hists['cutflow']['ZJets_mass'] == 1)
    assert (hists['cutflow']['Data_pt'] == 15)
    assert (hists['cutflow']['Data_mass'] == 5)
Example #10
def local_config(log_dir: str, max_workers: int, prefetch: int = 0) -> Config:
    """Single node with a single task per worker

    Args:
        log_dir: Path to store monitoring DB and parsl logs
        max_workers: Maximum number of concurrent tasks
        prefetch: Number of tasks for ML workers to prefetch for inference
    Returns:
        (Config) Parsl configuration
    """

    return Config(
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc-worker",
                max_workers=max_workers,
                prefetch_capacity=prefetch,
                cpu_affinity='block',
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-worker",
                max_workers=1,
                prefetch_capacity=prefetch,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
               )
           )
        ],
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.
    )
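A hedged sketch of driving the single-node configuration above; the app body is a placeholder, and the executor label matches the 'qc-worker' executor defined in the config.

import parsl
from parsl import python_app

parsl.load(local_config(log_dir='./logs', max_workers=4))

@python_app(executors=['qc-worker'])
def simulate(i):
    return i * i  # placeholder workload

futures = [simulate(i) for i in range(16)]  # at most max_workers run concurrently
print([f.result() for f in futures])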
Example #11
def condor_config(cores_per_job=4, mem_per_core=2048,
                  total_workers=24, max_workers=200,
                  pyenv_dir='%s/.local' % (os.environ['HOME'], ),
                  grid_proxy_dir='/tmp',
                  htex_label='coffea_parsl_condor_htex',
                  wrk_init=None,
                  condor_cfg=None):
    pyenv_relpath = pyenv_dir.split('/')[-1]

    if wrk_init is None:
        wrk_init = '''
        source /cvmfs/sft.cern.ch/lcg/views/LCG_95apython3/x86_64-centos7-gcc7-opt/setup.sh
        export PATH=`pwd`/%s:$PATH
        export PYTHONPATH=`pwd`/%s:$PYTHONPATH

        export X509_USER_PROXY=`pwd`/%s
        mkdir -p ./%s
        ''' % ('%s/bin' % pyenv_relpath,
               '%s/lib/python3.6/site-packages' % pyenv_relpath,
               x509_proxy,
               htex_label)

    if condor_cfg is None:
        condor_cfg = '''
        transfer_output_files = %s
        RequestMemory = %d
        RequestCpus = %d
        ''' % (htex_label, mem_per_core * cores_per_job, cores_per_job)

    xfer_files = [pyenv_dir, osp.join(grid_proxy_dir, x509_proxy)]

    condor_htex = Config(
        executors=[
            HighThroughputExecutor(
                label=htex_label,
                address=address_by_hostname(),
                prefetch_capacity=0,
                cores_per_worker=1,
                max_workers=cores_per_job,
                worker_logdir_root='./',
                provider=CondorProvider(
                    channel=LocalChannel(),
                    init_blocks=total_workers,
                    max_blocks=max_workers,
                    nodes_per_block=1,
                    worker_init=wrk_init,
                    transfer_input_files=xfer_files,
                    scheduler_options=condor_cfg
                ),
            )
        ],
        strategy=None,
    )

    return condor_htex
Example #12
def slurm_config(
    cores_per_job=16,
    mem_per_core=2048,
    jobs_per_worker=1,
    initial_workers=4,
    max_workers=8,
    work_dir="./",
    grid_proxy_dir="/tmp",
    partition="",
    walltime="02:00:00",
    htex_label="coffea_parsl_slurm_htex",
):

    shutil.copy2(osp.join(grid_proxy_dir, x509_proxy), osp.join(work_dir, x509_proxy))

    wrk_init = """
    export XRD_RUNFORKHANDLER=1
    export X509_USER_PROXY=%s
    """ % (
        osp.join(work_dir, x509_proxy)
    )

    sched_opts = """
    #SBATCH --cpus-per-task=%d
    #SBATCH --mem-per-cpu=%d
    """ % (
        cores_per_job,
        mem_per_core,
    )

    slurm_htex = Config(
        executors=[
            HighThroughputExecutor(
                label=htex_label,
                address=address_by_hostname(),
                prefetch_capacity=0,
                max_workers=cores_per_job,
                provider=SlurmProvider(
                    channel=LocalChannel(),
                    launcher=SrunLauncher(),
                    init_blocks=initial_workers,
                    max_blocks=max_workers,
                    nodes_per_block=jobs_per_worker,
                    partition=partition,
                    scheduler_options=sched_opts,  # Enter scheduler_options if needed
                    worker_init=wrk_init,  # Enter worker_init if needed
                    walltime=walltime,
                ),
            )
        ],
        strategy=None,
    )

    return slurm_htex
Example #13
def test_parsl_htex_executor():
    parsl = pytest.importorskip("parsl", minversion="0.7.2")
    import os
    import os.path as osp

    from parsl.providers import LocalProvider
    from parsl.channels import LocalChannel
    from parsl.executors import HighThroughputExecutor
    from parsl.addresses import address_by_hostname
    from parsl.config import Config
    parsl_config = Config(
        executors=[
            HighThroughputExecutor(
                label="coffea_parsl_default",
                address=address_by_hostname(),
                cores_per_worker=max(multiprocessing.cpu_count() // 2, 1),
                max_workers=1,
                provider=LocalProvider(channel=LocalChannel(),
                                       init_blocks=1,
                                       max_blocks=1,
                                       nodes_per_block=1),
            )
        ],
        strategy=None,
    )
    parsl.load(parsl_config)

    filelist = {
        'ZJets': [osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
        'Data': [osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')]
    }

    do_parsl_job(filelist)
    do_parsl_job(filelist, compression=1)

    filelist = {
        'ZJets': {
            'treename': 'Events',
            'files': [osp.join(os.getcwd(), 'tests/samples/nano_dy.root')]
        },
        'Data': {
            'treename': 'Events',
            'files': [osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')]
        }
    }

    do_parsl_job(filelist)
    do_parsl_job(filelist, flatten=True)
Example #14
def theta_nwchem_config(
    choice: str,
    log_dir: str,
    nodes_per_nwchem: int = 2,
    total_nodes: int = int(os.environ.get("COBALT_JOBSIZE", 1))
) -> Config:
    """Theta configuration to run NWChem

    Args:
        choice: Choice of the runtime configuration
        nodes_per_nwchem: Number of nodes per NWChem computation
        log_dir: Path to store monitoring DB and parsl logs
        total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE
    Returns:
        (Config) Parsl configuration
    """
    assert total_nodes % nodes_per_nwchem == 0, "NWChem node count not a multiple of nodes per task"
    nwc_workers = total_nodes // nodes_per_nwchem

    if choice == "htex":
        qc_exec = HighThroughputExecutor(
            address=address_by_hostname(),
            label="qc",
            max_workers=nwc_workers,
            cores_per_worker=1e-6,
            provider=LocalProvider(
                nodes_per_block=1,
                init_blocks=0,
                max_blocks=1,
                launcher=SimpleLauncher(),  # Places worker on the launch node
                worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
            ),
        )
    elif choice == 'thread':
        qc_exec = ThreadPoolExecutor(label='qc', max_threads=nwc_workers)
    else:
        raise ValueError(f'Choice "{choice}" not recognized ')

    return Config(executors=[qc_exec],
                  run_dir=log_dir,
                  strategy='simple',
                  max_idletime=15.)
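For reference, a hedged driver showing both runtime choices; the argument values are illustrative only.

import parsl

# Thread-pool runtime (single node, no pilot jobs)
parsl.load(theta_nwchem_config('thread', log_dir='./logs', nodes_per_nwchem=1, total_nodes=1))

# The HTEX runtime on a Theta launch node would instead be:
# parsl.load(theta_nwchem_config('htex', log_dir='./logs', nodes_per_nwchem=2, total_nodes=8))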
Example #15
def configure(memory=2048, nprocs=8, nodes=15):
    '''Configure the Parsl scheduler.

    Arguments:
      * memory: memory per CPU in MB (default: 2048)
      * nprocs: number of cores per node (default: 8)
      * nodes: number of nodes (default: 15)
    '''
    wrk_init = f'''
    export XRD_RUNFORKHANDLER=1
    export X509_USER_PROXY={os.environ['X509_USER_PROXY']}
    '''

    sched_opts = f'''
    #SBATCH --cpus-per-task={nprocs}
    #SBATCH --mem-per-cpu={memory}
    '''

    slurm_htex = Config(
        executors=[
            HighThroughputExecutor(
                label="coffea_parsl_slurm",
                address=address_by_hostname(),
                prefetch_capacity=0,
                max_workers=nprocs,
                provider=SlurmProvider(
                    channel=LocalChannel(),
                    launcher=SrunLauncher(),
                    init_blocks=nodes,
                    max_blocks=nodes * 2,
                    nodes_per_block=1,
                    partition='all',
                    scheduler_options=sched_opts,  # Enter scheduler_options if needed
                    worker_init=wrk_init,  # Enter worker_init if needed
                    walltime='00:30:00'),
            )
        ],
        #retries=3,
        strategy=None,
    )

    # parsl.set_stream_logger() # <-- log everything to stdout, WAAAAY too much

    return parsl.load(slurm_htex)
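A minimal driver sketch for the function above, which loads the configuration and returns the DataFlowKernel; the app body is a placeholder and it assumes X509_USER_PROXY is already set in the environment.

from parsl import python_app

dfk = configure(memory=2048, nprocs=8, nodes=15)

@python_app
def hello():
    return 'hello from a Slurm worker'  # placeholder task

print(hello().result())
dfk.cleanup()  # shut the workers down when finished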
Example #16
def parsl_config(name: str) -> Tuple[Config, int]:
    """Make the compute resource configuration

    Args:
        name: Name of the desired configuration
    Returns:
        - Parsl compute configuration
        - Number of compute slots: Includes execution slots and pre-fetch buffers
    """

    if name == 'local':
        return Config(
            executors=[
                HighThroughputExecutor(max_workers=16, prefetch_capacity=1)
            ]
        ), 64
    elif name == 'theta-debug':
        return Config(
            retries=16,
            executors=[HighThroughputExecutor(
                    address=address_by_hostname(),
                    label="debug",
                    max_workers=64,
                    prefetch_capacity=64,
                    cpu_affinity='block',
                    provider=CobaltProvider(
                        account='redox_adsp',
                        queue='debug-flat-quad',
                        nodes_per_block=8,
                        scheduler_options='#COBALT --attrs enable_ssh=1',
                        walltime='00:60:00',
                        init_blocks=0,
                        max_blocks=1,
                        cmd_timeout=360,
                        launcher=AprunLauncher(overrides='-d 64 --cc depth -j 1'),
                        worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env''',
                    ),
                )]
            ), 64 * 8 * 4
    else:
        raise ValueError(f'Configuration not defined: {name}')
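A hedged sketch of consuming the (config, slots) pair returned above; how the slot count is used downstream is an assumption.

import parsl

config, n_slots = parsl_config('local')
parsl.load(config)
# n_slots counts execution slots plus prefetch buffers; it can be used, for example,
# to cap how many tasks are kept in flight so every worker and buffer stays busy.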
Example #17
def multisite_nwchem_config() -> Config:
    """Experimental multi-site configuration"""
    return Config(
        retries=1,
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc",
                max_workers=8,  # One task per node
                provider=CobaltProvider(
                    cmd_timeout=120,
                    nodes_per_block=8,
                    account='CSC249ADCD08',
                    queue='debug-cache-quad',
                    walltime="1:00:00",
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                    scheduler_options='#COBALT --attrs enable_ssh=1',
                    worker_init='''
module load miniconda-3
export PATH=~/software/psi4/bin:$PATH
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env

# NWChem settings
export PATH="/home/lward/software/nwchem-6.8.1/bin/LINUX64:$PATH"
module load atp
export MPICH_GNI_MAX_EAGER_MSG_SIZE=16384
export MPICH_GNI_MAX_VSHORT_MSG_SIZE=10000
export MPICH_GNI_MAX_EAGER_MSG_SIZE=131072
export MPICH_GNI_NUM_BUFS=300
export MPICH_GNI_NDREG_MAXSIZE=16777216
export MPICH_GNI_MBOX_PLACEMENT=nic
export MPICH_GNI_LMT_PATH=disabled
export COMEX_MAX_NB_OUTSTANDING=6
export LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2018.0.128/linux/compiler/lib/intel64_lin:$LD_LIBRARY_PATH
''',
                ),
            ),
            HighThroughputExecutor(
                address='localhost',  # Using an SSH tunnel
                worker_ports=(54382, 54008),
                label="ml",
                max_workers=1,
                working_dir='/homes/lward/parsl',
                worker_logdir_root='/homes/lward/parsl',
                provider=LocalProvider(
                    channel=SSHChannel('lambda5.cels.anl.gov', script_dir='/home/lward/parsl'),
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),
                    worker_init='''
source /homes/lward/miniconda3/etc/profile.d/conda.sh
conda activate colmena_full
export CUDA_VISIBLE_DEVICES=17  # Pins to a GPU worker
''',
                ),
            )
        ],
        strategy=None,
    )
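The 'ml' executor above uses address='localhost' with fixed worker ports, which implies the remote workers reach the interchange through an SSH tunnel; the command sketched below is an assumption built from the host and ports in the config, not something stated in the original source.

# Assumed tunnel, run from the machine where this Config is loaded, so that workers
# started on lambda5.cels.anl.gov can reach localhost:54382/54008 on this host:
#   ssh -R 54382:localhost:54382 -R 54008:localhost:54008 lambda5.cels.anl.gov
import parsl
parsl.load(multisite_nwchem_config())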
Example #18
def theta_xtb_config(log_dir: str,
                     xtb_per_node: int = 1,
                     ml_tasks_per_node: int = 1,
                     total_nodes: int = int(os.environ.get(
                         "COBALT_JOBSIZE", 1))):
    """Theta configuration where QC tasks and ML tasks run on single nodes.

    There are no MPI tasks in this configuration.

    Args:
        xtb_per_node: Number of XTB calculations to run per node
        ml_tasks_per_node: Number of ML tasks to place on each node
        log_dir: Path to store monitoring DB and parsl logs
        total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE
    Returns:
        (Config) Parsl configuration
    """

    return Config(
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc",
                max_workers=xtb_per_node,
                cpu_affinity='block',
                provider=LocalProvider(
                    nodes_per_block=total_nodes,
                    init_blocks=0,
                    max_blocks=1,
                    launcher=AprunLauncher(
                        overrides='-d 64 --cc depth'
                    ),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml",
                max_workers=ml_tasks_per_node,
                cpu_affinity='block',
                provider=LocalProvider(
                    nodes_per_block=total_nodes,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=AprunLauncher(
                        overrides='-d 64 --cc depth'
                    ),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
                ),
            )
        ],
        monitoring=MonitoringHub(
            hub_address=address_by_hostname(),
            hub_port=55055,
            monitoring_debug=False,
            resource_monitoring_interval=10,
            logdir=log_dir,
            logging_endpoint=f'sqlite:///{os.path.join(log_dir, "monitoring.db")}'),
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.)
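A hedged invocation sketch; because total_nodes defaults from COBALT_JOBSIZE at definition time, passing it explicitly is the safer pattern when experimenting outside a Cobalt job. The values below are illustrative only.

import parsl

# Hypothetical driver: 4 nodes, 8 XTB calculations per node, 1 ML task per node
parsl.load(theta_xtb_config(log_dir='./logs', xtb_per_node=8,
                            ml_tasks_per_node=1, total_nodes=4))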
from parsl.channels import LocalChannel
from parsl.launchers import SingleNodeLauncher
from parsl.launchers import SrunLauncher
from parsl.launchers import SimpleLauncher
from parsl.executors import HighThroughputExecutor
from parsl.executors import ThreadPoolExecutor
from parsl.config import Config

## Parsl monitoring
from parsl.monitoring.monitoring import MonitoringHub
from parsl.addresses import address_by_hostname

##
##   Configure Parsl
##
hostName = address_by_hostname()

##
##  Define Parsl executors
##

## The following Executors are all based on the "High Throughput
## Executor" (or "HTEX") as recommended by the Parsl team.  They can
## operate on configurations ranging from a single login (or batch)
## node to many batch nodes.  These executors have been tuned for the
## makeBrighterFatterKernel.py DM tool.
#####################

## This executor is intended for large-scale KNL batch work with
## *multiple* nodes & workers/node and employing significant
## parallelism within the DM code ("-j")
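The executor definition that followed this comment appears to have been truncated; a representative sketch is given below, with every partition, sizing, and environment value a placeholder rather than a value from the original source.

from parsl.providers import SlurmProvider

knlExecutor = HighThroughputExecutor(
    label='knl-multinode',
    address=hostName,
    max_workers=8,               # worker processes per node (placeholder)
    cores_per_worker=8,          # leaves cores free for the DM code's own "-j" parallelism
    provider=SlurmProvider(
        'knl-partition',         # placeholder partition name
        nodes_per_block=16,      # placeholder node count
        init_blocks=1,
        max_blocks=1,
        walltime='03:00:00',
        launcher=SrunLauncher(),
        worker_init='source activate <your-env>',  # placeholder environment setup
    ),
)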
Example #20
def test_parsl_executor():
    try:
        import parsl
    except ImportError:
        warnings.warn('parsl not installed, skipping tests')
        return
    except Exception as e:
        warnings.warn('other error when trying to import parsl!')
        raise e

    from fnal_column_analysis_tools.processor import run_parsl_job

    from fnal_column_analysis_tools.processor.parsl.detail import (
        _parsl_initialize, _parsl_stop)

    from parsl.providers import LocalProvider
    from parsl.channels import LocalChannel
    from parsl.executors import HighThroughputExecutor
    from parsl.addresses import address_by_hostname
    from parsl.config import Config
    parsl_config = Config(
        executors=[
            HighThroughputExecutor(
                label="coffea_parsl_default",
                address=address_by_hostname(),
                cores_per_worker=max(multiprocessing.cpu_count() // 2, 1),
                max_workers=1,
                provider=LocalProvider(channel=LocalChannel(),
                                       init_blocks=1,
                                       max_blocks=1,
                                       nodes_per_block=1),
            )
        ],
        strategy=None,
    )

    import os
    import os.path as osp

    filelist = {
        'ZJets': [osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
        'Data': [osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')]
    }
    treename = 'Events'

    from fnal_column_analysis_tools.processor.test_items import NanoTestProcessor
    from fnal_column_analysis_tools.processor.parsl.parsl_executor import parsl_executor

    dfk = _parsl_initialize(parsl_config)

    proc = NanoTestProcessor()

    hists = run_parsl_job(filelist,
                          treename,
                          processor_instance=proc,
                          executor=parsl_executor,
                          data_flow=dfk)

    _parsl_stop(dfk)

    assert (hists['cutflow']['ZJets_pt'] == 4)
    assert (hists['cutflow']['ZJets_mass'] == 1)
    assert (hists['cutflow']['Data_pt'] == 15)
    assert (hists['cutflow']['Data_mass'] == 5)
def main(args=None):

    # Grab CLI args if not present
    if args is None:
        args = parse_args()
    exit_callbacks = []

    # Construct object
    settings = ManagerSettings(**args)

    logger_map = {
        AdapterEnum.pool: "",
        AdapterEnum.dask: "dask_jobqueue.core",
        AdapterEnum.parsl: "parsl"
    }
    if settings.common.verbose:
        adapter_logger = logging.getLogger(logger_map[settings.common.adapter])
        adapter_logger.setLevel("DEBUG")

    if settings.manager.log_file_prefix is not None:
        tornado.options.options[
            'log_file_prefix'] = settings.manager.log_file_prefix
        # Clones the log to the output
        tornado.options.options['log_to_stderr'] = True
    tornado.log.enable_pretty_logging()

    if settings.manager.test:
        # Test this manager, no client needed
        client = None
    else:
        # Connect to a specified fractal server
        client = qcfractal.interface.FractalClient(
            address=settings.server.fractal_uri,
            **settings.server.dict(skip_defaults=True,
                                   exclude={"fractal_uri"}))

    # Figure out per-task data
    cores_per_task = settings.common.ncores // settings.common.ntasks
    memory_per_task = settings.common.memory / settings.common.ntasks
    if cores_per_task < 1:
        raise ValueError("Cores per task must be larger than one!")

    if settings.common.adapter == "pool":
        from concurrent.futures import ProcessPoolExecutor

        queue_client = ProcessPoolExecutor(max_workers=settings.common.ntasks)

    elif settings.common.adapter == "dask":

        dask_settings = settings.dask.dict(skip_defaults=True)
        # Checks
        if "extra" not in dask_settings:
            dask_settings["extra"] = []
        if QCA_RESOURCE_STRING not in dask_settings["extra"]:
            dask_settings["extra"].append(QCA_RESOURCE_STRING)
        # Scheduler opts
        scheduler_opts = settings.cluster.scheduler_options.copy()
        if settings.cluster.node_exclusivity and "--exclusive" not in scheduler_opts:
            scheduler_opts.append("--exclusive")

        _cluster_loaders = {
            "slurm": "SLURMCluster",
            "pbs": "PBSCluster",
            "moab": "MoabCluster",
            "sge": "SGECluster",
            "lsf": "LSFCluster"
        }

        # Create one construct to quickly merge dicts with a final check
        dask_construct = {
            "name": "QCFractal_Dask_Compute_Executor",
            "cores": settings.common.ncores,
            "memory": str(settings.common.memory) + "GB",
            "processes":
            settings.common.ntasks,  # Number of workers to generate == tasks
            "walltime": settings.cluster.walltime,
            "job_extra": scheduler_opts,
            "env_extra": settings.cluster.task_startup_commands,
            **dask_settings
        }

        # Import the dask things we need
        from dask.distributed import Client
        cluster_module = cli_utils.import_module(
            "dask_jobqueue",
            package=_cluster_loaders[settings.cluster.scheduler])
        cluster_class = getattr(cluster_module,
                                _cluster_loaders[settings.cluster.scheduler])

        from dask_jobqueue import SGECluster

        class SGEClusterWithJobQueue(SGECluster):
            """Helper class until Dask Jobqueue fixes #256"""
            def __init__(self, job_extra=None, **kwargs):
                super().__init__(**kwargs)
                if job_extra is not None:
                    more_header = ["#$ %s" % arg for arg in job_extra]
                    self.job_header += "\n" + "\n".join(more_header)

        # Temporary fix until Dask Jobqueue fixes #256
        if cluster_class is SGECluster and 'job_extra' not in inspect.getfullargspec(
                SGECluster.__init__).args:
            # Should the SGECluster ever get fixed, this if statement should automatically ensure we stop
            # using the custom class
            cluster_class = SGEClusterWithJobQueue

        cluster = cluster_class(**dask_construct)

        # Setup up adaption
        # Workers are distributed down to the cores through the sub-divided processes
        # Optimization may be needed
        workers = settings.common.ntasks * settings.cluster.max_nodes
        if settings.cluster.adaptive == AdaptiveCluster.adaptive:
            cluster.adapt(minimum=0, maximum=workers, interval="10s")
        else:
            cluster.scale(workers)

        queue_client = Client(cluster)

        # Make sure tempdir gets assigned correctly

        # Dragonstooth has the low priority queue

    elif settings.common.adapter == "parsl":

        scheduler_opts = settings.cluster.scheduler_options

        if not settings.cluster.node_exclusivity:
            raise ValueError(
                "For now, QCFractal can only be run with Parsl in node exclusivity. This will be relaxed "
                "in a future release of Parsl and QCFractal")

        # Import helpers
        _provider_loaders = {
            "slurm": "SlurmProvider",
            "pbs": "TorqueProvider",
            "moab": "TorqueProvider",
            "sge": "GridEngineProvider",
            "lsf": None
        }

        if _provider_loaders[settings.cluster.scheduler] is None:
            raise ValueError(
                f"Parsl does not know how to handle cluster of type {settings.cluster.scheduler}."
            )

        # Headers
        _provider_headers = {
            "slurm": "#SBATCH",
            "pbs": "#PBS",
            "moab": "#PBS",
            "sge": "#$$",
            "lsf": None
        }

        # Import the parsl things we need
        from parsl.config import Config
        from parsl.executors import HighThroughputExecutor
        from parsl.addresses import address_by_hostname
        provider_module = cli_utils.import_module(
            "parsl.providers",
            package=_provider_loaders[settings.cluster.scheduler])
        provider_class = getattr(provider_module,
                                 _provider_loaders[settings.cluster.scheduler])
        provider_header = _provider_headers[settings.cluster.scheduler]

        if _provider_loaders[settings.cluster.scheduler] == "moab":
            logger.warning(
                "Parsl uses its TorqueProvider for Moab clusters due to the scheduler similarities. "
                "However, if you find a bug with it, please report to the Parsl and QCFractal developers so "
                "it can be fixed on each respective end.")

        # Setup the providers

        # Create one construct to quickly merge dicts with a final check
        common_parsl_provider_construct = {
            "init_blocks":
            0,  # Update this at a later time of Parsl
            "max_blocks":
            settings.cluster.max_nodes,
            "walltime":
            settings.cluster.walltime,
            "scheduler_options":
            f'{provider_header} ' +
            f'\n{provider_header} '.join(scheduler_opts) + '\n',
            "nodes_per_block":
            1,
            "worker_init":
            '\n'.join(settings.cluster.task_startup_commands),
            **settings.parsl.provider.dict(skip_defaults=True,
                                           exclude={"partition"})
        }
        if settings.cluster.scheduler == "slurm":
            # The Parsl SLURM constructor has a strange set of arguments
            provider = provider_class(
                settings.parsl.provider.partition,
                exclusive=settings.cluster.node_exclusivity,
                **common_parsl_provider_construct)
        else:
            provider = provider_class(**common_parsl_provider_construct)

        parsl_executor_construct = {
            "label":
            "QCFractal_Parsl_{}_Executor".format(
                settings.cluster.scheduler.title()),
            "cores_per_worker":
            cores_per_task,
            "max_workers":
            settings.common.ntasks * settings.cluster.max_nodes,
            "provider":
            provider,
            "address":
            address_by_hostname(),
            **settings.parsl.executor.dict(skip_defaults=True)
        }

        queue_client = Config(
            executors=[HighThroughputExecutor(**parsl_executor_construct)])

    else:
        raise KeyError(
            "Unknown adapter type '{}', available options: {}.\n"
            "This code should also be unreachable with pydantic Validation, so if "
            "you see this message, please report it to the QCFractal GitHub".
            format(settings.common.adapter,
                   [getattr(AdapterEnum, v).value for v in AdapterEnum]))

    # Build out the manager itself
    manager = qcfractal.queue.QueueManager(
        client,
        queue_client,
        max_tasks=settings.manager.max_tasks,
        queue_tag=settings.manager.queue_tag,
        manager_name=settings.manager.manager_name,
        update_frequency=settings.manager.update_frequency,
        cores_per_task=cores_per_task,
        memory_per_task=memory_per_task,
        scratch_directory=settings.common.scratch_directory,
        verbose=settings.common.verbose)

    # Add exit callbacks
    for cb in exit_callbacks:
        manager.add_exit_callback(cb[0], *cb[1], **cb[2])

    # Either startup the manager or run until complete
    if settings.manager.test:
        success = manager.test(settings.manager.ntests)
        if success is False:
            raise ValueError("Testing was not successful, failing.")
    else:

        for signame in {"SIGHUP", "SIGINT", "SIGTERM"}:

            def stop(*args, **kwargs):
                manager.stop(signame)
                raise KeyboardInterrupt()

            signal.signal(getattr(signal, signame), stop)

        # Blocks until signal
        try:
            manager.start()
        except KeyboardInterrupt:
            pass
Example #22
def parsl_condor_config(workers=1):

    x509_proxy = f'x509up_u{UID}'
    grid_proxy_dir = '/tmp'

    cores_per_job = 1
    mem_per_core = 2000
    mem_request = mem_per_core * cores_per_job
    init_blocks = 1
    min_blocks = 1
    max_blocks = workers
    htex_label='coffea_parsl_condor_htex'
    log_dir = 'parsl_logs'
    log_dir_full = os.path.join('/nfs_scratch/dntaylor',log_dir)

    worker_init = f'''
echo "Setting up environment"
tar -zxf columnar.tar.gz
source columnar/bin/activate
export PATH=columnar/bin:$PATH
export PYTHONPATH=columnar/lib/python3.6/site-packages:$PYTHONPATH
export X509_USER_PROXY={x509_proxy}
mkdir -p {log_dir}/{htex_label}
echo "Environment ready"
'''

    # requirements for T2_US_Wisconsin (HAS_CMS_HDFS forces jobs onto a T2 node, not CHTC)
    # Removing for now:
    scheduler_options = f'''
transfer_output_files   = {log_dir}/{htex_label}
RequestMemory           = {mem_request}
RequestCpus             = {cores_per_job}
+RequiresCVMFS          = True
Requirements            = TARGET.HAS_CMS_HDFS && TARGET.Arch == "X86_64"
priority                = 10
'''

    transfer_input_files = [os.path.join(os.path.dirname(os.path.abspath(__file__)),'columnar.tar.gz'), os.path.join(grid_proxy_dir, x509_proxy)]

    htex = Config(
        executors=[
            HighThroughputExecutor(
                label=htex_label,
                address=address_by_hostname(),
                prefetch_capacity=0,
                cores_per_worker=1,
                max_workers=cores_per_job,
                worker_logdir_root=log_dir,
                provider=CondorProvider(
                    channel=LocalChannel(
                        userhome='/nfs_scratch/dntaylor',
                    ),
                    init_blocks=init_blocks,
                    min_blocks=min_blocks,
                    max_blocks=max_blocks,
                    nodes_per_block=1,
                    worker_init=worker_init,
                    transfer_input_files=transfer_input_files,
                    scheduler_options=scheduler_options,
                ),
            ),
            # TODO: works, but really isn't helpful since half of the tasks get shipped to the condor
            # executor and don't flock back when the local executor is empty
            # an alternative could be to preprocess locally and process on the grid
            # add a local executor so stuff starts fast
            #HighThroughputExecutor(
            #    label="coffea_parsl_default",
            #    cores_per_worker=1,
            #    max_workers=1, # TODO: multicore local?
            #    worker_logdir_root=log_dir,
            #    provider=LocalProvider(
            #        channel=LocalChannel(),
            #        init_blocks=1,
            #        max_blocks=1,
            #    ),
            #),
        ],
        strategy='simple',
        run_dir=os.path.join(log_dir_full,'runinfo'),
        retries = 2, # retry all failures, xrootd failures are retried then skipped via coffea executor itself
    )

    return htex
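A brief hedged driver for the Condor configuration above; it assumes the columnar.tar.gz environment tarball and the x509 proxy referenced in worker_init already exist.

import parsl

parsl.load(parsl_condor_config(workers=10))  # up to 10 pilot jobs, one worker slot each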
Example #23
import getpass
from parsl.addresses import address_by_hostname

global_options = {
    'username': getpass.getuser(),
    'email': '*****@*****.**',
    'broker_address': '127.0.0.1',
    'broker_port': 8088,
    'endpoint_address': address_by_hostname(),
}
def main(args=None):

    # Grab CLI args if not present
    if args is None:
        args = parse_args()
    exit_callbacks = []

    try:
        if args["debug"]["schema"]:
            print(ManagerSettings.schema_json(indent=2))
            return  # We're done, exit normally
    except KeyError:
        pass  # Don't worry if schema isn't in the list
    finally:
        args.pop("debug", None)  # Ensure the debug key is not present

    # Construct object
    settings = ManagerSettings(**args)

    logger_map = {
        AdapterEnum.pool: "",
        AdapterEnum.dask: "dask_jobqueue.core",
        AdapterEnum.parsl: "parsl"
    }
    if settings.common.verbose:
        adapter_logger = logging.getLogger(logger_map[settings.common.adapter])
        adapter_logger.setLevel("DEBUG")
        logger.setLevel("DEBUG")

    if settings.manager.log_file_prefix is not None:
        tornado.options.options[
            'log_file_prefix'] = settings.manager.log_file_prefix
        # Clones the log to the output
        tornado.options.options['log_to_stderr'] = True
    tornado.log.enable_pretty_logging()

    if settings.manager.test:
        # Test this manager, no client needed
        client = None
    else:
        # Connect to a specified fractal server
        client = qcfractal.interface.FractalClient(
            address=settings.server.fractal_uri,
            **settings.server.dict(skip_defaults=True,
                                   exclude={"fractal_uri"}))

    # Figure out per-task data
    cores_per_task = settings.common.cores_per_worker // settings.common.tasks_per_worker
    memory_per_task = settings.common.memory_per_worker / settings.common.tasks_per_worker
    if cores_per_task < 1:
        raise ValueError("Cores per task must be larger than one!")

    if settings.common.adapter == "pool":
        from concurrent.futures import ProcessPoolExecutor

        queue_client = ProcessPoolExecutor(
            max_workers=settings.common.tasks_per_worker)

    elif settings.common.adapter == "dask":

        dask_settings = settings.dask.dict(skip_defaults=True)
        # Checks
        if "extra" not in dask_settings:
            dask_settings["extra"] = []
        if QCA_RESOURCE_STRING not in dask_settings["extra"]:
            dask_settings["extra"].append(QCA_RESOURCE_STRING)
        # Scheduler opts
        scheduler_opts = settings.cluster.scheduler_options.copy()

        _cluster_loaders = {
            "slurm": "SLURMCluster",
            "pbs": "PBSCluster",
            "moab": "MoabCluster",
            "sge": "SGECluster",
            "lsf": "LSFCluster"
        }
        dask_exclusivity_map = {
            "slurm": "--exclusive",
            "pbs": "-n",
            "moab": "-n",  # Less sure about this one
            "sge": "-l exclusive=true",
            "lsf": "-x",
        }
        if settings.cluster.node_exclusivity and dask_exclusivity_map[
                settings.cluster.scheduler] not in scheduler_opts:
            scheduler_opts.append(
                dask_exclusivity_map[settings.cluster.scheduler])

        # Create one construct to quickly merge dicts with a final check
        dask_construct = {
            "name": "QCFractal_Dask_Compute_Executor",
            "cores": settings.common.cores_per_worker,
            "memory": str(settings.common.memory_per_worker) + "GB",
            "processes": settings.common.
            tasks_per_worker,  # Number of workers to generate == tasks in this construct
            "walltime": settings.cluster.walltime,
            "job_extra": scheduler_opts,
            "env_extra": settings.cluster.task_startup_commands,
            **dask_settings
        }

        try:
            # Import the dask things we need
            import dask_jobqueue
            from dask.distributed import Client
            cluster_module = cli_utils.import_module(
                "dask_jobqueue",
                package=_cluster_loaders[settings.cluster.scheduler])
            cluster_class = getattr(
                cluster_module, _cluster_loaders[settings.cluster.scheduler])
            if dask_jobqueue.__version__ < "0.5.0":
                raise ImportError
        except ImportError:
            raise ImportError(
                "You need`dask-jobqueue >= 0.5.0` to use the `dask` adapter")

        cluster = cluster_class(**dask_construct)

        # Setup up adaption
        # Workers are distributed down to the cores through the sub-divided processes
        # Optimization may be needed
        workers = settings.common.tasks_per_worker * settings.common.max_workers
        if settings.cluster.adaptive == AdaptiveCluster.adaptive:
            cluster.adapt(minimum=0, maximum=workers, interval="10s")
        else:
            cluster.scale(workers)

        queue_client = Client(cluster)

    elif settings.common.adapter == "parsl":

        scheduler_opts = settings.cluster.scheduler_options

        if not settings.cluster.node_exclusivity:
            raise ValueError(
                "For now, QCFractal can only be run with Parsl in node exclusivity. This will be relaxed "
                "in a future release of Parsl and QCFractal")

        # Import helpers
        _provider_loaders = {
            "slurm": "SlurmProvider",
            "pbs": "TorqueProvider",
            "moab": "TorqueProvider",
            "sge": "GridEngineProvider",
            "lsf": None
        }

        if _provider_loaders[settings.cluster.scheduler] is None:
            raise ValueError(
                f"Parsl does not know how to handle cluster of type {settings.cluster.scheduler}."
            )

        # Headers
        _provider_headers = {
            "slurm": "#SBATCH",
            "pbs": "#PBS",
            "moab": "#PBS",
            "sge": "#$$",
            "lsf": None
        }

        # Import the parsl things we need
        try:
            import parsl
            from parsl.config import Config
            from parsl.executors import HighThroughputExecutor
            from parsl.addresses import address_by_hostname
            provider_module = cli_utils.import_module(
                "parsl.providers",
                package=_provider_loaders[settings.cluster.scheduler])
            provider_class = getattr(
                provider_module, _provider_loaders[settings.cluster.scheduler])
            provider_header = _provider_headers[settings.cluster.scheduler]
            if parsl.__version__ < '0.8.0':
                raise ImportError
        except ImportError:
            raise ImportError(
                "You need `parsl >=0.8.0` to use the `parsl` adapter")

        if _provider_loaders[settings.cluster.scheduler] == "moab":
            logger.warning(
                "Parsl uses its TorqueProvider for Moab clusters due to the scheduler similarities. "
                "However, if you find a bug with it, please report to the Parsl and QCFractal developers so "
                "it can be fixed on each respective end.")

        # Setup the providers

        # Create one construct to quickly merge dicts with a final check
        common_parsl_provider_construct = {
            "init_blocks":
            0,  # Update this at a later time of Parsl
            "max_blocks":
            settings.common.max_workers,
            "walltime":
            settings.cluster.walltime,
            "scheduler_options":
            f'{provider_header} ' +
            f'\n{provider_header} '.join(scheduler_opts) + '\n',
            "nodes_per_block":
            1,
            "worker_init":
            '\n'.join(settings.cluster.task_startup_commands),
            **settings.parsl.provider.dict(skip_defaults=True,
                                           exclude={"partition", "launcher"})
        }
        if settings.parsl.provider.launcher:
            common_parsl_provider_construct[
                "launcher"] = settings.parsl.provider.launcher.build_launcher(
                )
        if settings.cluster.scheduler == "slurm":
            # The Parsl SLURM constructor has a strange set of arguments
            provider = provider_class(
                settings.parsl.provider.partition,
                exclusive=settings.cluster.node_exclusivity,
                **common_parsl_provider_construct)
        else:
            provider = provider_class(**common_parsl_provider_construct)

        parsl_executor_construct = {
            "label":
            "QCFractal_Parsl_{}_Executor".format(
                settings.cluster.scheduler.title()),
            "cores_per_worker":
            cores_per_task,
            "max_workers":
            settings.common.tasks_per_worker * settings.common.max_workers,
            "provider":
            provider,
            "address":
            address_by_hostname(),
            **settings.parsl.executor.dict(skip_defaults=True)
        }

        queue_client = Config(
            executors=[HighThroughputExecutor(**parsl_executor_construct)])

    else:
        raise KeyError(
            "Unknown adapter type '{}', available options: {}.\n"
            "This code should also be unreachable with pydantic Validation, so if "
            "you see this message, please report it to the QCFractal GitHub".
            format(settings.common.adapter,
                   [getattr(AdapterEnum, v).value for v in AdapterEnum]))

    # Build out the manager itself
    # Compute max tasks
    max_concurrent_tasks = settings.common.tasks_per_worker * settings.common.max_workers
    if settings.manager.max_queued_tasks is None:
        # Tasks * jobs * buffer + 1
        max_queued_tasks = ceil(max_concurrent_tasks * 2.00) + 1
    else:
        max_queued_tasks = settings.manager.max_queued_tasks

    manager = qcfractal.queue.QueueManager(
        client,
        queue_client,
        max_tasks=max_queued_tasks,
        queue_tag=settings.manager.queue_tag,
        manager_name=settings.manager.manager_name,
        update_frequency=settings.manager.update_frequency,
        cores_per_task=cores_per_task,
        memory_per_task=memory_per_task,
        scratch_directory=settings.common.scratch_directory,
        verbose=settings.common.verbose)

    # Set stats correctly since we buffer the max tasks a bit
    manager.statistics.max_concurrent_tasks = max_concurrent_tasks

    # Add exit callbacks
    for cb in exit_callbacks:
        manager.add_exit_callback(cb[0], *cb[1], **cb[2])

    # Either startup the manager or run until complete
    if settings.manager.test:
        success = manager.test(settings.manager.ntests)
        if success is False:
            raise ValueError("Testing was not successful, failing.")
    else:

        for signame in {"SIGHUP", "SIGINT", "SIGTERM"}:

            def stop(*args, **kwargs):
                manager.stop(signame)
                raise KeyboardInterrupt()

            signal.signal(getattr(signal, signame), stop)

        # Blocks until signal
        try:
            manager.start()
        except KeyboardInterrupt:
            pass
Example #25
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.launchers import SingleNodeLauncher
from parsl.providers import SlurmProvider
from parsl.addresses import address_by_hostname
from parsl.monitoring.monitoring import MonitoringHub
import os

config = Config(
    executors=[
        HighThroughputExecutor(
            cores_per_worker=4,
            mem_per_worker=40,
            max_workers=4,
            worker_debug=True,
            address=address_by_hostname(),
            provider=SlurmProvider(
                'daenerys',
                worker_init=("source activate /cephfs/users/jbreynier/conda/parsl_env2 ; "
                            "export PYTHONPATH='{}:{{PYTHONPATH}}'").format(os.getcwd()),
                init_blocks=1,
                max_blocks=10,
                min_blocks=0,
                nodes_per_block=1,
                walltime='99:00:00',
                scheduler_options='#SBATCH --exclude=kg15-11 --cpus-per-task=16 --mem=160gb --time=99:00:00',
            ),
        ),
    ],
    monitoring=MonitoringHub(
       hub_address=address_by_hostname(),
Example #26
def theta_nwchem_config(ml_workers: int, log_dir: str, nodes_per_nwchem: int = 2,
                        total_nodes: int = int(os.environ.get("COBALT_JOBSIZE", 1))) -> Config:
    """Theta configuration where QC workers sit on the launch node (to be able to aprun)
    and ML workers are placed on compute nodes

    Args:
        ml_workers: Number of nodes dedicated to ML tasks
        nodes_per_nwchem: Number of nodes per NWChem computation
        log_dir: Path to store monitoring DB and parsl logs
        total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE
    Returns:
        (Config) Parsl configuration
    """
    nwc_nodes = total_nodes - ml_workers
    assert nwc_nodes % nodes_per_nwchem == 0, "NWChem node count not a multiple of nodes per task"
    nwc_workers = nwc_nodes // nodes_per_nwchem

    return Config(
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc",
                max_workers=nwc_workers,
                cores_per_worker=1e-6,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml",
                max_workers=1,
                provider=LocalProvider(
                    nodes_per_block=ml_workers,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=AprunLauncher(overrides='-d 64 --cc depth'),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
    ''',
                ),
            )
        ],
        monitoring=MonitoringHub(
            hub_address=address_by_hostname(),
            monitoring_debug=False,
            resource_monitoring_interval=10,
            logdir=log_dir,
            logging_endpoint=f'sqlite:///{os.path.join(log_dir, "monitoring.db")}'
        ),
        run_dir=log_dir,
        strategy=None,
    )
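A hedged usage sketch for theta_nwchem_config; the argument values below are illustrative assumptions (they merely satisfy the assertion that the non-ML nodes divide evenly by nodes_per_nwchem), and the function's own imports (os, the Parsl classes) are assumed to be present in the full module.

import parsl

# e.g., an 8-node job with 2 nodes reserved for ML leaves 6 NWChem nodes -> 3 QC workers
config = theta_nwchem_config(ml_workers=2, log_dir='parsl-logs',
                             nodes_per_nwchem=2, total_nodes=8)
parsl.load(config)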
Exemple #27
from parsl.addresses import address_by_hostname
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.launchers import SrunLauncher
from parsl.providers import SlurmProvider

twoGB = 2048
nproc = 16

sched_opts = '''
#SBATCH --cpus-per-task=%d
#SBATCH --mem-per-cpu=%d
''' % (
    nproc,
    twoGB,
)

wrk_init = ''  # Enter worker_init commands (e.g. environment activation) if needed

slurm_htex = Config(
    executors=[
        HighThroughputExecutor(
            label="coffea_parsl_slurm",
            address=address_by_hostname(),
            prefetch_capacity=0,
            max_workers=nproc,
            provider=SlurmProvider(
                launcher=SrunLauncher(),
                init_blocks=4,
                max_blocks=4,
                nodes_per_block=1,
                partition='batch,guest,gpu',
                scheduler_options=sched_opts,  # Enter scheduler_options if needed
                worker_init=wrk_init,  # Enter worker_init if needed
                walltime='02:00:00'),
        )
    ],
    strategy=None,
)
Exemple #28
def local_config(log_dir: str, max_workers: int, prefetch: int = 0) -> Config:
    """Single node with a single task per worker

    Args:
        log_dir: Path to store monitoring DB and parsl logs
        max_workers: Maximum number of concurrent tasks
        prefetch: Number of tasks for ML workers to prefetch for inference
    Returns:
        (Config) Parsl configuration
    """

    return Config(
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc-worker",
                max_workers=max_workers,
                prefetch_capacity=prefetch,
                cpu_affinity='block',
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-worker-tensorflow",
                max_workers=1,
                prefetch_capacity=prefetch,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    worker_init='sleep 30',  # Give enough time for other workers to exit (memory!)
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                )),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-worker-tensorflow-infer",  # Something about training and then running a model causes issues?
                max_workers=1,
                prefetch_capacity=prefetch,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    worker_init='sleep 30',  # Give enough time for other workers to exit (memory!)
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                )),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-worker-torch",
                max_workers=1,
                prefetch_capacity=prefetch,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    worker_init='sleep 30',  # Give enough time for other workers to exit (memory!)
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                ))
        ],
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.)
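A hedged usage sketch for local_config; the argument values and the app bodies are illustrative assumptions. The executor labels come from the configuration itself, and tasks are routed to them with Parsl's standard executors= argument to python_app.

import parsl
from parsl import python_app

parsl.load(local_config(log_dir='runinfo', max_workers=8, prefetch=1))

@python_app(executors=['qc-worker'])
def run_qc(molecule):  # hypothetical QC task
    return molecule

@python_app(executors=['ml-worker-torch'])
def run_inference(batch):  # hypothetical ML inference task
    return batch

print(run_qc('H2O').result(), run_inference([1, 2, 3]).result())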
Exemple #29
def main(args=None):

    # Grab CLI args if not present
    if args is None:
        args = parse_args()
    exit_callbacks = []

    try:
        if args["debug"]["schema"]:
            print(ManagerSettings.schema_json(indent=2))
            return  # We're done, exit normally
    except KeyError:
        pass  # Don't worry if schema isn't in the list
    finally:
        debug_args = args.pop("debug", {})  # Remove the debug key (if present) before building settings

    # Construct object
    settings = ManagerSettings(**args)

    # Handle Skeleton Generation
    if debug_args.get("skeleton", None):

        class IndentListDumper(yaml.Dumper):
            """
            Internal yaml Dumper to make lists indent in the output YAML

            Defined inside this function since it is only used once, for "skeleton" generation, before the
            program exits. It does not need to be imported or accessed anywhere else.

            Based on response:
            https://stackoverflow.com/questions/25108581/python-yaml-dump-bad-indentation/39681672#39681672
            """
            def increase_indent(self, flow=False, indentless=False):
                return super(IndentListDumper,
                             self).increase_indent(flow, False)

        skel_path = os.path.expanduser(debug_args["skeleton"])
        with open(skel_path, "w") as skel:
            # Cast the settings through JSON into plain Python types so yaml can dump them cleanly
            data = yaml.dump(json.loads(settings.json()),
                             Dumper=IndentListDumper,
                             default_flow_style=False)
            skel.write(data)
            print(
                f"Skeleton Queue Manager YAML file written to {skel_path}\n"
                f"Run: `qcfractal-manager --config-file={skel_path}` to start a manager with this configuration."
            )
            return

    logger_map = {
        AdapterEnum.pool: "",
        AdapterEnum.dask: "dask_jobqueue.core",
        AdapterEnum.parsl: "parsl"
    }
    if settings.common.verbose:
        adapter_logger = logging.getLogger(logger_map[settings.common.adapter])
        adapter_logger.setLevel("DEBUG")
        logger.setLevel("DEBUG")

    if settings.manager.log_file_prefix is not None:
        tornado.options.options["log_file_prefix"] = settings.manager.log_file_prefix
        # Clones the log to the output
        tornado.options.options["log_to_stderr"] = True
    tornado.log.enable_pretty_logging()

    if settings.manager.test:
        # Test this manager, no client needed
        client = None
    else:
        # Connect to a specified fractal server
        client = qcfractal.interface.FractalClient(
            address=settings.server.fractal_uri,
            **settings.server.dict(skip_defaults=True,
                                   exclude={"fractal_uri"}))

    # Figure out per-task data
    node_parallel_tasks = settings.common.nodes_per_task > 1  # Whether tasks are node-parallel
    if node_parallel_tasks:
        supported_adapters = ["parsl"]
        if settings.common.adapter not in supported_adapters:
            raise ValueError(
                "Node-parallel jobs are only supported with {} adapters".
                format(supported_adapters))
        # Node-parallel tasks use all cores on a worker
        cores_per_task = settings.common.cores_per_worker
        memory_per_task = settings.common.memory_per_worker
        if settings.common.tasks_per_worker > 1:
            raise ValueError(
                ">1 task per node and >1 node per tasks are mutually-exclusive"
            )
    else:
        cores_per_task = settings.common.cores_per_worker // settings.common.tasks_per_worker
        memory_per_task = settings.common.memory_per_worker / settings.common.tasks_per_worker
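        # Example with hypothetical numbers: cores_per_worker=16, memory_per_worker=64 (GB)
        #  and tasks_per_worker=4 give cores_per_task=4 and memory_per_task=16.0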
    if cores_per_task < 1:
        raise ValueError("Cores per task must be larger than one!")

    if settings.common.adapter == "pool":
        from concurrent.futures import ProcessPoolExecutor

        # Error if the number of nodes per job is more than 1
        if settings.common.nodes_per_job > 1:
            raise ValueError("Pool adapters only run on a single local node")
        queue_client = ProcessPoolExecutor(
            max_workers=settings.common.tasks_per_worker)
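        # With the pool adapter everything runs on the local node, so the concurrency is simply
        #  the number of processes in the pool (i.e., tasks_per_worker)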

    elif settings.common.adapter == "dask":

        dask_settings = settings.dask.dict(skip_defaults=True)
        # Checks
        if "extra" not in dask_settings:
            dask_settings["extra"] = []
        if QCA_RESOURCE_STRING not in dask_settings["extra"]:
            dask_settings["extra"].append(QCA_RESOURCE_STRING)
        # Scheduler opts
        scheduler_opts = settings.cluster.scheduler_options.copy()

        # Error if the number of nodes per job is more than 1
        if settings.common.nodes_per_job > 1:
            raise NotImplementedError(
                "Support for >1 node per job is not yet supported by QCFractal + Dask"
            )
            # TODO (wardlt): Implement multinode jobs in Dask

        _cluster_loaders = {
            "slurm": "SLURMCluster",
            "pbs": "PBSCluster",
            "moab": "MoabCluster",
            "sge": "SGECluster",
            "lsf": "LSFCluster",
        }
        dask_exclusivity_map = {
            "slurm": "--exclusive",
            "pbs": "-n",
            "moab": "-n",  # Less sure about this one
            "sge": "-l exclusive=true",
            "lsf": "-x",
        }
        if settings.cluster.node_exclusivity and dask_exclusivity_map[
                settings.cluster.scheduler] not in scheduler_opts:
            scheduler_opts.append(
                dask_exclusivity_map[settings.cluster.scheduler])

        # Create one construct to quickly merge dicts with a final check
        dask_construct = {
            "name": "QCFractal_Dask_Compute_Executor",
            "cores": settings.common.cores_per_worker,
            "memory": str(settings.common.memory_per_worker) + "GB",
            "processes": settings.common.
            tasks_per_worker,  # Number of workers to generate == tasks in this construct
            "walltime": settings.cluster.walltime,
            "job_extra": scheduler_opts,
            "env_extra": settings.cluster.task_startup_commands,
            **dask_settings,
        }

        try:
            # Import the dask things we need
            import dask_jobqueue
            from dask.distributed import Client

            cluster_module = cli_utils.import_module(
                "dask_jobqueue",
                package=_cluster_loaders[settings.cluster.scheduler])
            cluster_class = getattr(
                cluster_module, _cluster_loaders[settings.cluster.scheduler])
            if dask_jobqueue.__version__ < "0.5.0":
                raise ImportError
        except ImportError:
            raise ImportError(
                "You need`dask-jobqueue >= 0.5.0` to use the `dask` adapter")

        cluster = cluster_class(**dask_construct)

        # Set up adaptive (or fixed) scaling of the Dask cluster
        # Workers are distributed down to the cores through the sub-divided processes
        # Optimization may be needed
        workers = settings.common.tasks_per_worker * settings.common.max_workers
        if settings.cluster.adaptive == AdaptiveCluster.adaptive:
            cluster.adapt(minimum=0, maximum=workers, interval="10s")
        else:
            cluster.scale(workers)

        queue_client = Client(cluster)

    elif settings.common.adapter == "parsl":

        scheduler_opts = settings.cluster.scheduler_options

        if not settings.cluster.node_exclusivity:
            raise ValueError(
                "For now, QCFractal can only be run with Parsl in node exclusivity. This will be relaxed "
                "in a future release of Parsl and QCFractal")

        # Import helpers
        _provider_loaders = {
            "slurm": "SlurmProvider",
            "pbs": "TorqueProvider",
            "moab": "TorqueProvider",
            "sge": "GridEngineProvider",
            "cobalt": "CobaltProvider",
            "lsf": None,
        }

        if _provider_loaders[settings.cluster.scheduler] is None:
            raise ValueError(
                f"Parsl does not know how to handle cluster of type {settings.cluster.scheduler}."
            )

        # Headers
        _provider_headers = {
            "slurm": "#SBATCH",
            "pbs": "#PBS",
            "moab": "#PBS",
            "sge": "#$$",
            "lsf": None,
            "cobalt": "#COBALT",
        }

        # Import the parsl things we need
        try:
            import parsl
            from parsl.config import Config
            from parsl.executors import HighThroughputExecutor
            from parsl.addresses import address_by_hostname

            provider_module = cli_utils.import_module(
                "parsl.providers",
                package=_provider_loaders[settings.cluster.scheduler])
            provider_class = getattr(
                provider_module, _provider_loaders[settings.cluster.scheduler])
            provider_header = _provider_headers[settings.cluster.scheduler]
            if parsl.__version__ < "0.9.0":
                raise ImportError
        except ImportError:
            raise ImportError(
                "You need `parsl >=0.9.0` to use the `parsl` adapter")

        if settings.cluster.scheduler == "moab":
            logger.warning(
                "Parsl uses its TorqueProvider for Moab clusters due to the scheduler similarities. "
                "However, if you find a bug with it, please report to the Parsl and QCFractal developers so "
                "it can be fixed on each respective end.")

        # Set up the provider

        # Determine the maximum number of blocks
        # TODO (wardlt): Math assumes that user does not set aside a compute node for the adapter
        max_nodes = settings.common.max_workers * settings.common.nodes_per_task
        if settings.common.nodes_per_job > max_nodes:
            raise ValueError(
                "Number of nodes per job is more than the maximum number of nodes used by manager"
            )
        if max_nodes % settings.common.nodes_per_job != 0:
            raise ValueError(
                "Maximum number of nodes (maximum number of workers times nodes per task) "
                "needs to be a multiple of the number of nodes per job")
        if settings.common.nodes_per_job % settings.common.nodes_per_task != 0:
            raise ValueError(
                "Number of nodes per job needs to be a multiple of the number of nodes per task"
            )
        max_blocks = max_nodes // settings.common.nodes_per_job
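        # Example with hypothetical numbers: max_workers=8 and nodes_per_task=2 give max_nodes=16;
        #  with nodes_per_job=4 this yields max_blocks=4 (each block is one 4-node scheduler job)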

        # Create one construct to quickly merge dicts with a final check
        common_parsl_provider_construct = {
            "init_blocks": 0,  # Update this in a later version of Parsl
            "max_blocks": max_blocks,
            "walltime": settings.cluster.walltime,
            "scheduler_options": f"{provider_header} " + f"\n{provider_header} ".join(scheduler_opts) + "\n",
            "nodes_per_block": settings.common.nodes_per_job,
            "worker_init": "\n".join(settings.cluster.task_startup_commands),
            **settings.parsl.provider.dict(skip_defaults=True, exclude={"partition", "launcher"}),
        }
        if (settings.cluster.scheduler.lower() == "slurm"
                and "cores_per_node" not in common_parsl_provider_construct):
            common_parsl_provider_construct["cores_per_node"] = settings.common.cores_per_worker
        # TODO: uncomment after Parsl#1416 is resolved
        # if settings.cluster.scheduler.lower() == "slurm" and "mem_per_node" not in common_parsl_provider_construct:
        #    common_parsl_provider_construct["mem_per_node"] = settings.common.memory_per_worker

        if settings.parsl.provider.launcher:
            common_parsl_provider_construct["launcher"] = settings.parsl.provider.launcher.build_launcher()
        if settings.cluster.scheduler == "slurm":
            # The Parsl SLURM constructor has a strange set of arguments
            provider = provider_class(
                settings.parsl.provider.partition,
                exclusive=settings.cluster.node_exclusivity,
                **common_parsl_provider_construct,
            )
        else:
            provider = provider_class(**common_parsl_provider_construct)

        # The executor for Parsl is different for node parallel tasks and shared-memory tasks
        if node_parallel_tasks:
            # Tasks are launched from a single worker on the login node
            # TODO (wardlt): Remove assumption that there is only one Parsl worker running all tasks
            tasks_per_job = settings.common.nodes_per_job // settings.common.nodes_per_task
            logger.info(
                f"Preparing a HTEx to use node-parallel tasks with {tasks_per_job} workers"
            )
            parsl_executor_construct = {
                "label": "QCFractal_Parsl_{}_Executor".format(settings.cluster.scheduler.title()),
                # Parsl will create one worker process per MPI task. Normally, Parsl prevents having
                #  more processes than cores. However, as each worker will spend most of its time
                #  waiting for the MPI task to complete, we can safely oversubscribe (e.g., more worker
                #  processes than cores), which requires setting "cores_per_worker" to <1
                "cores_per_worker": 1e-6,
                "max_workers": tasks_per_job,
                "provider": provider,
                "address": address_by_hostname(),
                **settings.parsl.executor.dict(skip_defaults=True),
            }
        else:

            parsl_executor_construct = {
                "label": "QCFractal_Parsl_{}_Executor".format(settings.cluster.scheduler.title()),
                "cores_per_worker": cores_per_task,
                "max_workers": settings.common.tasks_per_worker,
                "provider": provider,
                "address": address_by_hostname(),
                **settings.parsl.executor.dict(skip_defaults=True),
            }

        queue_client = Config(
            retries=settings.common.retries,
            executors=[HighThroughputExecutor(**parsl_executor_construct)])
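        # Note (assumption): this Parsl Config is passed below as the queue_client; the
        #  QueueManager is expected to wrap it in its Parsl adapter internally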

    else:
        raise KeyError(
            "Unknown adapter type '{}', available options: {}.\n"
            "This code should also be unreachable with pydantic Validation, so if "
            "you see this message, please report it to the QCFractal GitHub".
            format(settings.common.adapter,
                   [getattr(AdapterEnum, v).value for v in AdapterEnum]))

    # Build out the manager itself
    # Compute max tasks
    max_concurrent_tasks = settings.common.tasks_per_worker * settings.common.max_workers
    if settings.manager.max_queued_tasks is None:
        # Buffer the queue: concurrent tasks * 2 + 1
        #  (e.g., with hypothetical values tasks_per_worker=2 and max_workers=10,
        #   max_concurrent_tasks = 20 and max_queued_tasks = ceil(20 * 2.00) + 1 = 41)
        max_queued_tasks = ceil(max_concurrent_tasks * 2.00) + 1
    else:
        max_queued_tasks = settings.manager.max_queued_tasks

    # The queue manager is configured differently for node-parallel and single-node tasks
    manager = qcfractal.queue.QueueManager(
        client,
        queue_client,
        max_tasks=max_queued_tasks,
        queue_tag=settings.manager.queue_tag,
        manager_name=settings.manager.manager_name,
        update_frequency=settings.manager.update_frequency,
        cores_per_task=cores_per_task,
        memory_per_task=memory_per_task,
        nodes_per_task=settings.common.nodes_per_task,
        scratch_directory=settings.common.scratch_directory,
        retries=settings.common.retries,
        verbose=settings.common.verbose,
        cores_per_rank=settings.common.cores_per_rank,
        configuration=settings,
    )

    # Set stats correctly since we buffer the max tasks a bit
    manager.statistics.max_concurrent_tasks = max_concurrent_tasks

    # Add exit callbacks
    for cb in exit_callbacks:
        manager.add_exit_callback(cb[0], *cb[1], **cb[2])

    # Either startup the manager or run until complete
    if settings.manager.test:
        success = manager.test(settings.manager.ntests)
        if success is False:
            raise ValueError("Testing was not successful, failing.")
    else:

        for signame in {"SIGHUP", "SIGINT", "SIGTERM"}:

            def stop(*args, signame=signame, **kwargs):
                # Bind the signal name as a default argument so each handler reports
                #  the signal it was registered for (avoids a late-binding closure bug)
                manager.stop(signame)
                raise KeyboardInterrupt()

            signal.signal(getattr(signal, signame), stop)

        # Blocks until signal
        try:
            manager.start()
        except KeyboardInterrupt:
            pass
Exemple #30
logger.info("Done with imports")

# TODO: proper boolean switch here to switch between checkpointing and
# monitoring, as they do not work together at the moment.
#  - see https://github.com/Parsl/parsl/issues/1014

config = parsl.config.Config(
    executors=[
        parsl.executors.ThreadPoolExecutor(label="management", max_threads=20),
        parsl.executors.ThreadPoolExecutor(label="heavy", max_threads=3),
    ],

    # monitoring config from
    # https://parsl.readthedocs.io/en/latest/userguide/monitoring.html
    # modified to add hub_port - see https://github.com/Parsl/parsl/issues/1010
    monitoring=MonitoringHub(hub_address=address_by_hostname(),
                             logging_level=logging.INFO,
                             resource_monitoring_interval=10,
                             hub_port=30733))

# config.checkpoint_mode = 'task_exit'

REPO_BASE = "REPO"

logger.info("Getting checkpoint files")
config.checkpoint_files = parsl.utils.get_all_checkpoints()
logger.info("Checkpoint files: {}".format(config.checkpoint_files))


class RepoInfo:
    def __init__(self, repo_base, rerun=None):