def local_setup():
    threads_config = Config(
        executors=[
            HighThroughputExecutor(
                label="theta_htex",
                # worker_debug=True,
                cores_per_worker=4,
                address=address_by_hostname(),
                provider=CobaltProvider(
                    queue='debug-flat-quad',
                    account='CSC249ADCD01',
                    launcher=AprunLauncher(overrides="-d 64"),
                    worker_init='source activate parsl-issues',
                    init_blocks=1,
                    max_blocks=1,
                    min_blocks=1,
                    nodes_per_block=4,
                    cmd_timeout=60,
                    walltime='00:10:00',
                ),
            )
        ],
        monitoring=MonitoringHub(hub_address=address_by_hostname(),
                                 hub_port=55055,
                                 logging_level=logging.DEBUG,
                                 resource_monitoring_interval=10),
        strategy=None)
    parsl.load(threads_config)
def theta_nwchem_config(log_dir: str,
                        nodes_per_nwchem: int = 2,
                        total_nodes: int = int(os.environ.get("COBALT_JOBSIZE", 1)),
                        ml_prefetch: int = 0) -> Config:
    """Theta configuration where QC workers sit on the launch node (to be able to aprun)
    and ML workers are placed on compute nodes

    Args:
        nodes_per_nwchem: Number of nodes per NWChem computation
        log_dir: Path to store monitoring DB and parsl logs
        total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE
        ml_prefetch: Number of tasks for ML workers to prefetch for inference
    Returns:
        (Config) Parsl configuration
    """
    assert total_nodes % nodes_per_nwchem == 0, "NWChem node count not a multiple of nodes per task"
    nwc_workers = total_nodes // nodes_per_nwchem

    return Config(
        executors=[
            ThreadPoolExecutor(label='qc', max_threads=nwc_workers),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml",
                max_workers=1,
                prefetch_capacity=ml_prefetch,
                provider=LocalProvider(
                    nodes_per_block=nodes_per_nwchem,  # Minimum increment in blocks
                    init_blocks=0,
                    max_blocks=total_nodes // nodes_per_nwchem,  # Limits the number of manager processes
                    launcher=AprunLauncher(overrides='-d 256 --cc depth -j 4'),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
                ),
            )
        ],
        monitoring=MonitoringHub(
            hub_address=address_by_hostname(),
            monitoring_debug=False,
            resource_monitoring_interval=10,
            logdir=log_dir,
            logging_endpoint=f'sqlite:///{os.path.join(log_dir, "monitoring.db")}'),
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.)
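# A minimal usage sketch for the configuration above. Assumptions (not from the original
# code): it runs inside a Cobalt job so COBALT_JOBSIZE is set, and the apps `run_qc` and
# `run_inference` are illustrative placeholders.
import parsl
from parsl import python_app

parsl.load(theta_nwchem_config(log_dir='./parsl-logs', nodes_per_nwchem=2, ml_prefetch=1))


@python_app(executors=['qc'])   # routed to the ThreadPoolExecutor on the launch node
def run_qc(molecule):
    ...


@python_app(executors=['ml'])   # routed to the HTEX workers on the compute nodes
def run_inference(batch):
    ...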
def fresh_config(): return Config( executors=[ HighThroughputExecutor( label='theta_local_htex_multinode', max_workers=1, address=address_by_hostname(), provider=CobaltProvider( queue=user_opts['theta']['queue'], account=user_opts['theta']['account'], launcher=AprunLauncher(overrides="-d 64"), walltime='00:10:00', nodes_per_block=2, init_blocks=1, max_blocks=1, # string to prepend to #COBALT blocks in the submit # script to the scheduler eg: '#COBALT -t 50' scheduler_options='', # Command to be run before starting a worker, such as: # 'module load Anaconda; source activate parsl_env'. worker_init=user_opts['theta']['worker_init'], cmd_timeout=120, ), ) ], )
def test_parsl_htex_executor(): parsl = pytest.importorskip("parsl", minversion="0.7.2") from parsl.providers import LocalProvider from parsl.channels import LocalChannel from parsl.executors import HighThroughputExecutor from parsl.addresses import address_by_hostname from parsl.config import Config parsl_config = Config( executors=[ HighThroughputExecutor( label="coffea_parsl_default", address=address_by_hostname(), cores_per_worker=max(multiprocessing.cpu_count() // 2, 1), max_workers=1, provider=LocalProvider(channel=LocalChannel(), init_blocks=1, max_blocks=1, nodes_per_block=1), ) ], strategy=None, ) do_parsl_job(parsl_config)
def configure_parsl(n_threads, monitoring, **kwargs):
    from parsl.config import Config
    from parsl.executors.threads import ThreadPoolExecutor
    from parsl.addresses import address_by_hostname

    if monitoring:
        from parsl.monitoring import MonitoringHub
        monitoring = MonitoringHub(
            hub_address=address_by_hostname(),
            hub_port=55055,
            logging_level=logging.INFO,
            resource_monitoring_interval=10,
        )
    else:
        monitoring = None

    local_threads = ThreadPoolExecutor(max_threads=n_threads, label='local_threads')
    config = Config(
        executors=[local_threads],
        monitoring=monitoring,
        strategy=None,
        app_cache=True,
    )
    return config
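# A brief usage sketch for the helper above, assuming `logging` is imported in the
# enclosing module; the `square` app is a placeholder for illustration.
import parsl
from parsl import python_app

parsl.load(configure_parsl(n_threads=4, monitoring=False))


@python_app
def square(x):
    return x * x


print(square(3).result())  # prints 9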
def fresh_config(): config = Config( executors=[ HighThroughputExecutor( label="bw_htex", cores_per_worker=1, worker_debug=False, max_workers=1, address=address_by_hostname(), provider=TorqueProvider( queue='normal', launcher=AprunLauncher(overrides="-b -- bwpy-environ --"), # string to prepend to #SBATCH blocks in the submit # script to the scheduler eg: '#SBATCH --constraint=knl,quad,cache' scheduler_options='', # Command to be run before starting a worker, such as: # 'module load Anaconda; source activate parsl_env'. worker_init=user_opts['bluewaters']['worker_init'], init_blocks=1, max_blocks=1, min_blocks=1, nodes_per_block=2, walltime='00:30:00', cmd_timeout=120, ), ) ], ) return config
def fresh_config(): return Config( executors=[ HighThroughputExecutor( label="frontera_htex", address=address_by_hostname(), max_workers=1, provider=SlurmProvider( cmd_timeout= 60, # Add extra time for slow scheduler responses channel=LocalChannel(), nodes_per_block=2, init_blocks=1, min_blocks=1, max_blocks=1, partition='development', # Replace with partition name scheduler_options=user_opts['frontera'] ['scheduler_options'], # Command to be run before starting a worker, such as: # 'module load Anaconda; source activate parsl_env'. worker_init=user_opts['frontera']['worker_init'], # Ideally we set the walltime to the longest supported walltime. walltime='00:10:00', launcher=SrunLauncher(), ), ) ], )
def fresh_config(): config = Config( executors=[ HighThroughputExecutor( label='Midway_HTEX_multinode', worker_debug=False, address=address_by_hostname(), max_workers=1, provider=SlurmProvider( 'broadwl', # Partition name, e.g 'broadwl' launcher=SrunLauncher(), nodes_per_block=2, init_blocks=1, min_blocks=1, max_blocks=1, # string to prepend to #SBATCH blocks in the submit # script to the scheduler eg: '#SBATCH --constraint=knl,quad,cache' scheduler_options='', # Command to be run before starting a worker, such as: # 'module load Anaconda; source activate parsl_env'. worker_init=user_opts['midway']['worker_init'], walltime='00:30:00', cmd_timeout=120, ), ) ], ) return config
def test_parsl_executor(): parsl = pytest.importorskip("parsl", minversion="0.7.2") from coffea.processor import run_parsl_job from coffea.processor.parsl.detail import (_parsl_initialize, _parsl_stop) from parsl.providers import LocalProvider from parsl.channels import LocalChannel from parsl.executors import HighThroughputExecutor from parsl.addresses import address_by_hostname from parsl.config import Config parsl_config = Config( executors=[ HighThroughputExecutor( label="coffea_parsl_default", address=address_by_hostname(), cores_per_worker=max(multiprocessing.cpu_count() // 2, 1), max_workers=1, provider=LocalProvider(channel=LocalChannel(), init_blocks=1, max_blocks=1, nodes_per_block=1), ) ], strategy=None, ) import os import os.path as osp filelist = { 'ZJets': [osp.join(os.getcwd(), 'tests/samples/nano_dy.root')], 'Data': [osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')] } treename = 'Events' from coffea.processor.test_items import NanoTestProcessor from coffea.processor.parsl.parsl_executor import parsl_executor dfk = _parsl_initialize(parsl_config) proc = NanoTestProcessor() hists = run_parsl_job(filelist, treename, processor_instance=proc, executor=parsl_executor, data_flow=dfk) _parsl_stop(dfk) assert (hists['cutflow']['ZJets_pt'] == 4) assert (hists['cutflow']['ZJets_mass'] == 1) assert (hists['cutflow']['Data_pt'] == 15) assert (hists['cutflow']['Data_mass'] == 5)
def local_config(log_dir: str, max_workers: int, prefetch: int = 0) -> Config: """Single node with a single task per worker Args: log_dir: Path to store monitoring DB and parsl logs max_workers: Maximum number of concurrent tasks prefetch: Number of tasks for ML workers to prefetch for inference Returns: (Config) Parsl configuration """ return Config( executors=[ HighThroughputExecutor( address=address_by_hostname(), label="qc-worker", max_workers=max_workers, prefetch_capacity=prefetch, cpu_affinity='block', provider=LocalProvider( nodes_per_block=1, init_blocks=1, max_blocks=1, launcher=SimpleLauncher(), # Places worker on the launch node ), ), HighThroughputExecutor( address=address_by_hostname(), label="ml-worker", max_workers=1, prefetch_capacity=prefetch, provider=LocalProvider( nodes_per_block=1, init_blocks=1, max_blocks=1, launcher=SimpleLauncher(), # Places worker on the launch node ) ) ], run_dir=log_dir, strategy='simple', max_idletime=15. )
def condor_config(cores_per_job=4,
                  mem_per_core=2048,
                  total_workers=24,
                  max_workers=200,
                  pyenv_dir='%s/.local' % (os.environ['HOME'], ),
                  grid_proxy_dir='/tmp',
                  htex_label='coffea_parsl_condor_htex',
                  wrk_init=None,
                  condor_cfg=None):
    pyenv_relpath = pyenv_dir.split('/')[-1]

    if wrk_init is None:
        wrk_init = '''
source /cvmfs/sft.cern.ch/lcg/views/LCG_95apython3/x86_64-centos7-gcc7-opt/setup.sh
export PATH=`pwd`/%s:$PATH
export PYTHONPATH=`pwd`/%s:$PYTHONPATH
export X509_USER_PROXY=`pwd`/%s
mkdir -p ./%s
''' % ('%s/bin' % pyenv_relpath,
       '%s/lib/python3.6/site-packages' % pyenv_relpath,
       x509_proxy,
       htex_label)

    if condor_cfg is None:
        condor_cfg = '''
transfer_output_files = %s
RequestMemory = %d
RequestCpus = %d
''' % (htex_label, mem_per_core * cores_per_job, cores_per_job)

    xfer_files = [pyenv_dir, osp.join(grid_proxy_dir, x509_proxy)]

    condor_htex = Config(
        executors=[
            HighThroughputExecutor(
                label=htex_label,
                address=address_by_hostname(),
                prefetch_capacity=0,
                cores_per_worker=1,
                max_workers=cores_per_job,
                worker_logdir_root='./',
                provider=CondorProvider(
                    channel=LocalChannel(),
                    init_blocks=total_workers,
                    max_blocks=max_workers,
                    nodes_per_block=1,
                    worker_init=wrk_init,
                    transfer_input_files=xfer_files,
                    scheduler_options=condor_cfg
                ),
            )
        ],
        strategy=None,
    )
    return condor_htex
def slurm_config( cores_per_job=16, mem_per_core=2048, jobs_per_worker=1, initial_workers=4, max_workers=8, work_dir="./", grid_proxy_dir="/tmp", partition="", walltime="02:00:00", htex_label="coffea_parsl_slurm_htex", ): shutil.copy2(osp.join(grid_proxy_dir, x509_proxy), osp.join(work_dir, x509_proxy)) wrk_init = """ export XRD_RUNFORKHANDLER=1 export X509_USER_PROXY=%s """ % ( osp.join(work_dir, x509_proxy) ) sched_opts = """ #SBATCH --cpus-per-task=%d #SBATCH --mem-per-cpu=%d """ % ( cores_per_job, mem_per_core, ) slurm_htex = Config( executors=[ HighThroughputExecutor( label=htex_label, address=address_by_hostname(), prefetch_capacity=0, max_workers=cores_per_job, provider=SlurmProvider( channel=LocalChannel(), launcher=SrunLauncher(), init_blocks=initial_workers, max_blocks=max_workers, nodes_per_block=jobs_per_worker, partition=partition, scheduler_options=sched_opts, # Enter scheduler_options if needed worker_init=wrk_init, # Enter worker_init if needed walltime=walltime, ), ) ], strategy=None, ) return slurm_htex
def test_parsl_htex_executor(): parsl = pytest.importorskip("parsl", minversion="0.7.2") import os import os.path as osp from parsl.providers import LocalProvider from parsl.channels import LocalChannel from parsl.executors import HighThroughputExecutor from parsl.addresses import address_by_hostname from parsl.config import Config parsl_config = Config( executors=[ HighThroughputExecutor( label="coffea_parsl_default", address=address_by_hostname(), cores_per_worker=max(multiprocessing.cpu_count() // 2, 1), max_workers=1, provider=LocalProvider(channel=LocalChannel(), init_blocks=1, max_blocks=1, nodes_per_block=1), ) ], strategy=None, ) parsl.load(parsl_config) filelist = { 'ZJets': [osp.join(os.getcwd(), 'tests/samples/nano_dy.root')], 'Data': [osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')] } do_parsl_job(filelist) do_parsl_job(filelist, compression=1) filelist = { 'ZJets': { 'treename': 'Events', 'files': [osp.join(os.getcwd(), 'tests/samples/nano_dy.root')] }, 'Data': { 'treename': 'Events', 'files': [osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')] } } do_parsl_job(filelist) do_parsl_job(filelist, flatten=True)
def theta_nwchem_config( choice: str, log_dir: str, nodes_per_nwchem: int = 2, total_nodes: int = int(os.environ.get("COBALT_JOBSIZE", 1)) ) -> Config: """Theta configuration to run NWChem Args: choice: Choice of the runtime configuration nodes_per_nwchem: Number of nodes per NWChem computation log_dir: Path to store monitoring DB and parsl logs total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE Returns: (Config) Parsl configuration """ assert total_nodes % nodes_per_nwchem == 0, "NWChem node count not a multiple of nodes per task" nwc_workers = total_nodes // nodes_per_nwchem if choice == "htex": qc_exec = HighThroughputExecutor( address=address_by_hostname(), label="qc", max_workers=nwc_workers, cores_per_worker=1e-6, provider=LocalProvider( nodes_per_block=1, init_blocks=0, max_blocks=1, launcher=SimpleLauncher(), # Places worker on the launch node worker_init=''' module load miniconda-3 conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env ''', ), ) elif choice == 'thread': qc_exec = ThreadPoolExecutor(label='qc', max_threads=nwc_workers) else: raise ValueError(f'Choice "{choice}" not recognized ') return Config(executors=[qc_exec], run_dir=log_dir, strategy='simple', max_idletime=15.)
def configure(memory=2048, nprocs=8, nodes=15):
    '''Configure and load the Parsl configuration (a Slurm-backed HighThroughputExecutor)

    arguments:
    * memory: amount of memory per core in MB (default: 2048)
    * nprocs: number of cores per node (default: 8)
    * nodes: number of nodes (default: 15)
    '''
    wrk_init = f'''
    export XRD_RUNFORKHANDLER=1
    export X509_USER_PROXY={os.environ['X509_USER_PROXY']}
    '''

    sched_opts = f'''
    #SBATCH --cpus-per-task={nprocs}
    #SBATCH --mem-per-cpu={memory}
    '''

    slurm_htex = Config(
        executors=[
            HighThroughputExecutor(
                label="coffea_parsl_slurm",
                address=address_by_hostname(),
                prefetch_capacity=0,
                max_workers=nprocs,
                provider=SlurmProvider(
                    channel=LocalChannel(),
                    launcher=SrunLauncher(),
                    init_blocks=nodes,
                    max_blocks=nodes * 2,
                    nodes_per_block=1,
                    partition='all',
                    scheduler_options=sched_opts,  # Enter scheduler_options if needed
                    worker_init=wrk_init,  # Enter worker_init if needed
                    walltime='00:30:00'),
            )
        ],
        # retries=3,
        strategy=None,
    )
    # parsl.set_stream_logger()  # <-- log everything to stdout, WAAAAY too much
    return parsl.load(slurm_htex)
def parsl_config(name: str) -> Tuple[Config, int]:
    """Make the compute resource configuration

    Args:
        name: Name of the desired configuration
    Returns:
        - Parsl compute configuration
        - Number of compute slots: Includes execution slots and pre-fetch buffers
    """
    if name == 'local':
        return Config(
            executors=[
                HighThroughputExecutor(max_workers=16, prefetch_capacity=1)
            ]
        ), 64
    elif name == 'theta-debug':
        return Config(
            retries=16,
            executors=[HighThroughputExecutor(
                address=address_by_hostname(),
                label="debug",
                max_workers=64,
                prefetch_capacity=64,
                cpu_affinity='block',
                provider=CobaltProvider(
                    account='redox_adsp',
                    queue='debug-flat-quad',
                    nodes_per_block=8,
                    scheduler_options='#COBALT --attrs enable_ssh=1',
                    walltime='00:60:00',
                    init_blocks=0,
                    max_blocks=1,
                    cmd_timeout=360,
                    launcher=AprunLauncher(overrides='-d 64 --cc depth -j 1'),
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env''',
                ),
            )]
        ), 64 * 8 * 4
    else:
        raise ValueError(f'Configuration not defined: {name}')
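# A short sketch of how the (config, slot count) pair above might be consumed; the
# variable names and the idea of sizing a submission queue from `n_slots` are
# illustrative assumptions, not taken from the original project.
import parsl

config, n_slots = parsl_config('local')
parsl.load(config)

# Keep roughly `n_slots` tasks in flight so execution slots and prefetch buffers stay full
print(f'Will keep up to {n_slots} tasks in flight')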
def multisite_nwchem_config() -> Config: """Experimental multi-site configuration""" return Config( retries=1, executors=[ HighThroughputExecutor( address=address_by_hostname(), label="qc", max_workers=8, # One task per node provider=CobaltProvider( cmd_timeout=120, nodes_per_block=8, account='CSC249ADCD08', queue='debug-cache-quad', walltime="1:00:00", init_blocks=1, max_blocks=1, launcher=SimpleLauncher(), # Places worker on the launch node scheduler_options='#COBALT --attrs enable_ssh=1', worker_init=''' module load miniconda-3 export PATH=~/software/psi4/bin:$PATH conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env # NWChem settings export PATH="/home/lward/software/nwchem-6.8.1/bin/LINUX64:$PATH" module load atp export MPICH_GNI_MAX_EAGER_MSG_SIZE=16384 export MPICH_GNI_MAX_VSHORT_MSG_SIZE=10000 export MPICH_GNI_MAX_EAGER_MSG_SIZE=131072 export MPICH_GNI_NUM_BUFS=300 export MPICH_GNI_NDREG_MAXSIZE=16777216 export MPICH_GNI_MBOX_PLACEMENT=nic export MPICH_GNI_LMT_PATH=disabled export COMEX_MAX_NB_OUTSTANDING=6 export LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2018.0.128/linux/compiler/lib/intel64_lin:$LD_LIBRARY_PATH ''', ), ), HighThroughputExecutor( address='localhost', # Using an SSH tunnel worker_ports=(54382, 54008), label="ml", max_workers=1, working_dir='/homes/lward/parsl', worker_logdir_root='/homes/lward/parsl', provider=LocalProvider( channel=SSHChannel('lambda5.cels.anl.gov', script_dir='/home/lward/parsl'), nodes_per_block=1, init_blocks=1, max_blocks=1, launcher=SimpleLauncher(), worker_init=''' source /homes/lward/miniconda3/etc/profile.d/conda.sh conda activate colmena_full export CUDA_VISIBLE_DEVICES=17 # Pins to a GPU worker ''', ), ) ], strategy=None, )
def theta_xtb_config(log_dir: str,
                     xtb_per_node: int = 1,
                     ml_tasks_per_node: int = 1,
                     total_nodes: int = int(os.environ.get("COBALT_JOBSIZE", 1))):
    """Theta configuration where QC tasks and ML tasks run on single nodes.

    There are no MPI tasks in this configuration.

    Args:
        xtb_per_node: Number of XTB calculations to run per node
        ml_tasks_per_node: Number of ML tasks to place on each node
        log_dir: Path to store monitoring DB and parsl logs
        total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE
    Returns:
        (Config) Parsl configuration
    """
    return Config(
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc",
                max_workers=xtb_per_node,
                cpu_affinity='block',
                provider=LocalProvider(
                    nodes_per_block=total_nodes,
                    init_blocks=0,
                    max_blocks=1,
                    launcher=AprunLauncher(overrides='-d 64 --cc depth'),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml",
                max_workers=ml_tasks_per_node,
                cpu_affinity='block',
                provider=LocalProvider(
                    nodes_per_block=total_nodes,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=AprunLauncher(overrides='-d 64 --cc depth'),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
                ),
            )
        ],
        monitoring=MonitoringHub(
            hub_address=address_by_hostname(),
            hub_port=55055,
            monitoring_debug=False,
            resource_monitoring_interval=10,
            logdir=log_dir,
            logging_endpoint=f'sqlite:///{os.path.join(log_dir, "monitoring.db")}'),
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.)
from parsl.channels import LocalChannel
from parsl.launchers import SingleNodeLauncher
from parsl.launchers import SrunLauncher
from parsl.launchers import SimpleLauncher
from parsl.executors import HighThroughputExecutor
from parsl.executors import ThreadPoolExecutor
from parsl.config import Config

## Parsl monitoring
from parsl.monitoring.monitoring import MonitoringHub
from parsl.addresses import address_by_hostname

##
## Configure Parsl
##
hostName = address_by_hostname()

##
## Define Parsl executors
##

## The following Executors are all based on the "High Throughput
## Executor" (or "HTEX") as recommended by the Parsl team.  They can
## operate on configurations ranging from a single login (or batch)
## node to many batch nodes.  These executors have been tuned for the
## makeBrighterFatterKernel.py DM tool.

#####################
## This executor is intended for large-scale KNL batch work with
## *multiple* nodes & workers/node and employing significant
## parallelism within the DM code ("-j")
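## A hedged sketch of the kind of executor the comments above describe; the label,
## partition name, node counts, walltime, and worker_init below are illustrative
## placeholders, not values from the original file (which presumably defines its own
## executors after this comment).
from parsl.providers import SlurmProvider

knl_htex_sketch = HighThroughputExecutor(
    label='knl_htex_sketch',
    address=hostName,
    cores_per_worker=8,        # leave room for the "-j" parallelism inside the DM code
    max_workers=32,
    provider=SlurmProvider(
        partition='knl',       # placeholder queue/partition name
        nodes_per_block=4,
        init_blocks=1,
        max_blocks=1,
        launcher=SrunLauncher(),
        walltime='02:00:00',
        worker_init='',        # e.g., DM stack environment setup
    ),
)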
def test_parsl_executor(): try: import parsl except ImportError: warnings.warn('parsl not installed, skipping tests') return except Exception as e: warnings.warn('other error when trying to import parsl!') raise e from fnal_column_analysis_tools.processor import run_parsl_job from fnal_column_analysis_tools.processor.parsl.detail import ( _parsl_initialize, _parsl_stop) from parsl.providers import LocalProvider from parsl.channels import LocalChannel from parsl.executors import HighThroughputExecutor from parsl.addresses import address_by_hostname from parsl.config import Config parsl_config = Config( executors=[ HighThroughputExecutor( label="coffea_parsl_default", address=address_by_hostname(), cores_per_worker=max(multiprocessing.cpu_count() // 2, 1), max_workers=1, provider=LocalProvider(channel=LocalChannel(), init_blocks=1, max_blocks=1, nodes_per_block=1), ) ], strategy=None, ) import os import os.path as osp filelist = { 'ZJets': [osp.join(os.getcwd(), 'tests/samples/nano_dy.root')], 'Data': [osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')] } treename = 'Events' from fnal_column_analysis_tools.processor.test_items import NanoTestProcessor from fnal_column_analysis_tools.processor.parsl.parsl_executor import parsl_executor dfk = _parsl_initialize(parsl_config) proc = NanoTestProcessor() hists = run_parsl_job(filelist, treename, processor_instance=proc, executor=parsl_executor, data_flow=dfk) _parsl_stop(dfk) assert (hists['cutflow']['ZJets_pt'] == 4) assert (hists['cutflow']['ZJets_mass'] == 1) assert (hists['cutflow']['Data_pt'] == 15) assert (hists['cutflow']['Data_mass'] == 5)
def main(args=None): # Grab CLI args if not present if args is None: args = parse_args() exit_callbacks = [] # Construct object settings = ManagerSettings(**args) logger_map = { AdapterEnum.pool: "", AdapterEnum.dask: "dask_jobqueue.core", AdapterEnum.parsl: "parsl" } if settings.common.verbose: adapter_logger = logging.getLogger(logger_map[settings.common.adapter]) adapter_logger.setLevel("DEBUG") if settings.manager.log_file_prefix is not None: tornado.options.options[ 'log_file_prefix'] = settings.manager.log_file_prefix # Clones the log to the output tornado.options.options['log_to_stderr'] = True tornado.log.enable_pretty_logging() if settings.manager.test: # Test this manager, no client needed client = None else: # Connect to a specified fractal server client = qcfractal.interface.FractalClient( address=settings.server.fractal_uri, **settings.server.dict(skip_defaults=True, exclude={"fractal_uri"})) # Figure out per-task data cores_per_task = settings.common.ncores // settings.common.ntasks memory_per_task = settings.common.memory / settings.common.ntasks if cores_per_task < 1: raise ValueError("Cores per task must be larger than one!") if settings.common.adapter == "pool": from concurrent.futures import ProcessPoolExecutor queue_client = ProcessPoolExecutor(max_workers=settings.common.ntasks) elif settings.common.adapter == "dask": dask_settings = settings.dask.dict(skip_defaults=True) # Checks if "extra" not in dask_settings: dask_settings["extra"] = [] if QCA_RESOURCE_STRING not in dask_settings["extra"]: dask_settings["extra"].append(QCA_RESOURCE_STRING) # Scheduler opts scheduler_opts = settings.cluster.scheduler_options.copy() if settings.cluster.node_exclusivity and "--exclusive" not in scheduler_opts: scheduler_opts.append("--exclusive") _cluster_loaders = { "slurm": "SLURMCluster", "pbs": "PBSCluster", "moab": "MoabCluster", "sge": "SGECluster", "lsf": "LSFCluster" } # Create one construct to quickly merge dicts with a final check dask_construct = { "name": "QCFractal_Dask_Compute_Executor", "cores": settings.common.ncores, "memory": str(settings.common.memory) + "GB", "processes": settings.common.ntasks, # Number of workers to generate == tasks "walltime": settings.cluster.walltime, "job_extra": scheduler_opts, "env_extra": settings.cluster.task_startup_commands, **dask_settings } # Import the dask things we need from dask.distributed import Client cluster_module = cli_utils.import_module( "dask_jobqueue", package=_cluster_loaders[settings.cluster.scheduler]) cluster_class = getattr(cluster_module, _cluster_loaders[settings.cluster.scheduler]) from dask_jobqueue import SGECluster class SGEClusterWithJobQueue(SGECluster): """Helper class until Dask Jobqueue fixes #256""" def __init__(self, job_extra=None, **kwargs): super().__init__(**kwargs) if job_extra is not None: more_header = ["#$ %s" % arg for arg in job_extra] self.job_header += "\n" + "\n".join(more_header) # Temporary fix until Dask Jobqueue fixes #256 if cluster_class is SGECluster and 'job_extra' not in inspect.getfullargspec( SGECluster.__init__).args: # Should the SGECluster ever get fixed, this if statement should automatically ensure we stop # using the custom class cluster_class = SGEClusterWithJobQueue cluster = cluster_class(**dask_construct) # Setup up adaption # Workers are distributed down to the cores through the sub-divided processes # Optimization may be needed workers = settings.common.ntasks * settings.cluster.max_nodes if settings.cluster.adaptive == AdaptiveCluster.adaptive: 
cluster.adapt(minimum=0, maximum=workers, interval="10s") else: cluster.scale(workers) queue_client = Client(cluster) # Make sure tempdir gets assigned correctly # Dragonstooth has the low priority queue elif settings.common.adapter == "parsl": scheduler_opts = settings.cluster.scheduler_options if not settings.cluster.node_exclusivity: raise ValueError( "For now, QCFractal can only be run with Parsl in node exclusivity. This will be relaxed " "in a future release of Parsl and QCFractal") # Import helpers _provider_loaders = { "slurm": "SlurmProvider", "pbs": "TorqueProvider", "moab": "TorqueProvider", "sge": "GridEngineProvider", "lsf": None } if _provider_loaders[settings.cluster.scheduler] is None: raise ValueError( f"Parsl does not know how to handle cluster of type {settings.cluster.scheduler}." ) # Headers _provider_headers = { "slurm": "#SBATCH", "pbs": "#PBS", "moab": "#PBS", "sge": "#$$", "lsf": None } # Import the parsl things we need from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.addresses import address_by_hostname provider_module = cli_utils.import_module( "parsl.providers", package=_provider_loaders[settings.cluster.scheduler]) provider_class = getattr(provider_module, _provider_loaders[settings.cluster.scheduler]) provider_header = _provider_headers[settings.cluster.scheduler] if _provider_loaders[settings.cluster.scheduler] == "moab": logger.warning( "Parsl uses its TorqueProvider for Moab clusters due to the scheduler similarities. " "However, if you find a bug with it, please report to the Parsl and QCFractal developers so " "it can be fixed on each respective end.") # Setup the providers # Create one construct to quickly merge dicts with a final check common_parsl_provider_construct = { "init_blocks": 0, # Update this at a later time of Parsl "max_blocks": settings.cluster.max_nodes, "walltime": settings.cluster.walltime, "scheduler_options": f'{provider_header} ' + f'\n{provider_header} '.join(scheduler_opts) + '\n', "nodes_per_block": 1, "worker_init": '\n'.join(settings.cluster.task_startup_commands), **settings.parsl.provider.dict(skip_defaults=True, exclude={"partition"}) } if settings.cluster.scheduler == "slurm": # The Parsl SLURM constructor has a strange set of arguments provider = provider_class( settings.parsl.provider.partition, exclusive=settings.cluster.node_exclusivity, **common_parsl_provider_construct) else: provider = provider_class(**common_parsl_provider_construct) parsl_executor_construct = { "label": "QCFractal_Parsl_{}_Executor".format( settings.cluster.scheduler.title()), "cores_per_worker": cores_per_task, "max_workers": settings.common.ntasks * settings.cluster.max_nodes, "provider": provider, "address": address_by_hostname(), **settings.parsl.executor.dict(skip_defaults=True) } queue_client = Config( executors=[HighThroughputExecutor(**parsl_executor_construct)]) else: raise KeyError( "Unknown adapter type '{}', available options: {}.\n" "This code should also be unreachable with pydantic Validation, so if " "you see this message, please report it to the QCFractal GitHub". 
format(settings.common.adapter, [getattr(AdapterEnum, v).value for v in AdapterEnum])) # Build out the manager itself manager = qcfractal.queue.QueueManager( client, queue_client, max_tasks=settings.manager.max_tasks, queue_tag=settings.manager.queue_tag, manager_name=settings.manager.manager_name, update_frequency=settings.manager.update_frequency, cores_per_task=cores_per_task, memory_per_task=memory_per_task, scratch_directory=settings.common.scratch_directory, verbose=settings.common.verbose) # Add exit callbacks for cb in exit_callbacks: manager.add_exit_callback(cb[0], *cb[1], **cb[2]) # Either startup the manager or run until complete if settings.manager.test: success = manager.test(settings.manager.ntests) if success is False: raise ValueError("Testing was not successful, failing.") else: for signame in {"SIGHUP", "SIGINT", "SIGTERM"}: def stop(*args, **kwargs): manager.stop(signame) raise KeyboardInterrupt() signal.signal(getattr(signal, signame), stop) # Blocks until signal try: manager.start() except KeyboardInterrupt: pass
def parsl_condor_config(workers=1): x509_proxy = f'x509up_u{UID}' grid_proxy_dir = '/tmp' cores_per_job = 1 mem_per_core = 2000 mem_request = mem_per_core * cores_per_job init_blocks = 1 min_blocks = 1 max_blocks = workers htex_label='coffea_parsl_condor_htex' log_dir = 'parsl_logs' log_dir_full = os.path.join('/nfs_scratch/dntaylor',log_dir) worker_init = f''' echo "Setting up environment" tar -zxf columnar.tar.gz source columnar/bin/activate export PATH=columnar/bin:$PATH export PYTHONPATH=columnar/lib/python3.6/site-packages:$PYTHONPATH export X509_USER_PROXY={x509_proxy} mkdir -p {log_dir}/{htex_label} echo "Environment ready" ''' # requirements for T2_US_Wisconsin (HAS_CMS_HDFS forces to run a T2 node not CHTC) # Removing for now: scheduler_options = f''' transfer_output_files = {log_dir}/{htex_label} RequestMemory = {mem_request} RequestCpus = {cores_per_job} +RequiresCVMFS = True Requirements = TARGET.HAS_CMS_HDFS && TARGET.Arch == "X86_64" priority = 10 ''' transfer_input_files = [os.path.join(os.path.dirname(os.path.abspath(__file__)),'columnar.tar.gz'), os.path.join(grid_proxy_dir, x509_proxy)] htex = Config( executors=[ HighThroughputExecutor( label=htex_label, address=address_by_hostname(), prefetch_capacity=0, cores_per_worker=1, max_workers=cores_per_job, worker_logdir_root=log_dir, provider=CondorProvider( channel=LocalChannel( userhome='/nfs_scratch/dntaylor', ), init_blocks=init_blocks, min_blocks=min_blocks, max_blocks=max_blocks, nodes_per_block=1, worker_init=worker_init, transfer_input_files=transfer_input_files, scheduler_options=scheduler_options, ), ), # TODO: works, but really isn't helpful since half of the tasks get shipped to the condor # executor and don't flock back when the local executor is empty # an alternative could be to preprocess locally and process on the grid # add a local executor so stuff starts fast #HighThroughputExecutor( # label="coffea_parsl_default", # cores_per_worker=1, # max_workers=1, # TODO: multicore local? # worker_logdir_root=log_dir, # provider=LocalProvider( # channel=LocalChannel(), # init_blocks=1, # max_blocks=1, # ), #), ], strategy='simple', run_dir=os.path.join(log_dir_full,'runinfo'), retries = 2, # retry all failures, xrootd failures are retried then skipped via coffea executor itself ) return htex
import getpass

from parsl.addresses import address_by_hostname

global_options = {
    'username': getpass.getuser(),
    'email': '*****@*****.**',
    'broker_address': '127.0.0.1',
    'broker_port': 8088,
    'endpoint_address': address_by_hostname(),
}
def main(args=None): # Grab CLI args if not present if args is None: args = parse_args() exit_callbacks = [] try: if args["debug"]["schema"]: print(ManagerSettings.schema_json(indent=2)) return # We're done, exit normally except KeyError: pass # Don't worry if schema isn't in the list finally: args.pop("debug", None) # Ensure the debug key is not present # Construct object settings = ManagerSettings(**args) logger_map = { AdapterEnum.pool: "", AdapterEnum.dask: "dask_jobqueue.core", AdapterEnum.parsl: "parsl" } if settings.common.verbose: adapter_logger = logging.getLogger(logger_map[settings.common.adapter]) adapter_logger.setLevel("DEBUG") logger.setLevel("DEBUG") if settings.manager.log_file_prefix is not None: tornado.options.options[ 'log_file_prefix'] = settings.manager.log_file_prefix # Clones the log to the output tornado.options.options['log_to_stderr'] = True tornado.log.enable_pretty_logging() if settings.manager.test: # Test this manager, no client needed client = None else: # Connect to a specified fractal server client = qcfractal.interface.FractalClient( address=settings.server.fractal_uri, **settings.server.dict(skip_defaults=True, exclude={"fractal_uri"})) # Figure out per-task data cores_per_task = settings.common.cores_per_worker // settings.common.tasks_per_worker memory_per_task = settings.common.memory_per_worker / settings.common.tasks_per_worker if cores_per_task < 1: raise ValueError("Cores per task must be larger than one!") if settings.common.adapter == "pool": from concurrent.futures import ProcessPoolExecutor queue_client = ProcessPoolExecutor( max_workers=settings.common.tasks_per_worker) elif settings.common.adapter == "dask": dask_settings = settings.dask.dict(skip_defaults=True) # Checks if "extra" not in dask_settings: dask_settings["extra"] = [] if QCA_RESOURCE_STRING not in dask_settings["extra"]: dask_settings["extra"].append(QCA_RESOURCE_STRING) # Scheduler opts scheduler_opts = settings.cluster.scheduler_options.copy() _cluster_loaders = { "slurm": "SLURMCluster", "pbs": "PBSCluster", "moab": "MoabCluster", "sge": "SGECluster", "lsf": "LSFCluster" } dask_exclusivity_map = { "slurm": "--exclusive", "pbs": "-n", "moab": "-n", # Less sure about this one "sge": "-l exclusive=true", "lsf": "-x", } if settings.cluster.node_exclusivity and dask_exclusivity_map[ settings.cluster.scheduler] not in scheduler_opts: scheduler_opts.append( dask_exclusivity_map[settings.cluster.scheduler]) # Create one construct to quickly merge dicts with a final check dask_construct = { "name": "QCFractal_Dask_Compute_Executor", "cores": settings.common.cores_per_worker, "memory": str(settings.common.memory_per_worker) + "GB", "processes": settings.common. 
tasks_per_worker, # Number of workers to generate == tasks in this construct "walltime": settings.cluster.walltime, "job_extra": scheduler_opts, "env_extra": settings.cluster.task_startup_commands, **dask_settings } try: # Import the dask things we need import dask_jobqueue from dask.distributed import Client cluster_module = cli_utils.import_module( "dask_jobqueue", package=_cluster_loaders[settings.cluster.scheduler]) cluster_class = getattr( cluster_module, _cluster_loaders[settings.cluster.scheduler]) if dask_jobqueue.__version__ < "0.5.0": raise ImportError except ImportError: raise ImportError( "You need`dask-jobqueue >= 0.5.0` to use the `dask` adapter") cluster = cluster_class(**dask_construct) # Setup up adaption # Workers are distributed down to the cores through the sub-divided processes # Optimization may be needed workers = settings.common.tasks_per_worker * settings.common.max_workers if settings.cluster.adaptive == AdaptiveCluster.adaptive: cluster.adapt(minimum=0, maximum=workers, interval="10s") else: cluster.scale(workers) queue_client = Client(cluster) elif settings.common.adapter == "parsl": scheduler_opts = settings.cluster.scheduler_options if not settings.cluster.node_exclusivity: raise ValueError( "For now, QCFractal can only be run with Parsl in node exclusivity. This will be relaxed " "in a future release of Parsl and QCFractal") # Import helpers _provider_loaders = { "slurm": "SlurmProvider", "pbs": "TorqueProvider", "moab": "TorqueProvider", "sge": "GridEngineProvider", "lsf": None } if _provider_loaders[settings.cluster.scheduler] is None: raise ValueError( f"Parsl does not know how to handle cluster of type {settings.cluster.scheduler}." ) # Headers _provider_headers = { "slurm": "#SBATCH", "pbs": "#PBS", "moab": "#PBS", "sge": "#$$", "lsf": None } # Import the parsl things we need try: import parsl from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.addresses import address_by_hostname provider_module = cli_utils.import_module( "parsl.providers", package=_provider_loaders[settings.cluster.scheduler]) provider_class = getattr( provider_module, _provider_loaders[settings.cluster.scheduler]) provider_header = _provider_headers[settings.cluster.scheduler] if parsl.__version__ < '0.8.0': raise ImportError except ImportError: raise ImportError( "You need `parsl >=0.8.0` to use the `parsl` adapter") if _provider_loaders[settings.cluster.scheduler] == "moab": logger.warning( "Parsl uses its TorqueProvider for Moab clusters due to the scheduler similarities. 
" "However, if you find a bug with it, please report to the Parsl and QCFractal developers so " "it can be fixed on each respective end.") # Setup the providers # Create one construct to quickly merge dicts with a final check common_parsl_provider_construct = { "init_blocks": 0, # Update this at a later time of Parsl "max_blocks": settings.common.max_workers, "walltime": settings.cluster.walltime, "scheduler_options": f'{provider_header} ' + f'\n{provider_header} '.join(scheduler_opts) + '\n', "nodes_per_block": 1, "worker_init": '\n'.join(settings.cluster.task_startup_commands), **settings.parsl.provider.dict(skip_defaults=True, exclude={"partition", "launcher"}) } if settings.parsl.provider.launcher: common_parsl_provider_construct[ "launcher"] = settings.parsl.provider.launcher.build_launcher( ) if settings.cluster.scheduler == "slurm": # The Parsl SLURM constructor has a strange set of arguments provider = provider_class( settings.parsl.provider.partition, exclusive=settings.cluster.node_exclusivity, **common_parsl_provider_construct) else: provider = provider_class(**common_parsl_provider_construct) parsl_executor_construct = { "label": "QCFractal_Parsl_{}_Executor".format( settings.cluster.scheduler.title()), "cores_per_worker": cores_per_task, "max_workers": settings.common.tasks_per_worker * settings.common.max_workers, "provider": provider, "address": address_by_hostname(), **settings.parsl.executor.dict(skip_defaults=True) } queue_client = Config( executors=[HighThroughputExecutor(**parsl_executor_construct)]) else: raise KeyError( "Unknown adapter type '{}', available options: {}.\n" "This code should also be unreachable with pydantic Validation, so if " "you see this message, please report it to the QCFractal GitHub". format(settings.common.adapter, [getattr(AdapterEnum, v).value for v in AdapterEnum])) # Build out the manager itself # Compute max tasks max_concurrent_tasks = settings.common.tasks_per_worker * settings.common.max_workers if settings.manager.max_queued_tasks is None: # Tasks * jobs * buffer + 1 max_queued_tasks = ceil(max_concurrent_tasks * 2.00) + 1 else: max_queued_tasks = settings.manager.max_queued_tasks manager = qcfractal.queue.QueueManager( client, queue_client, max_tasks=max_queued_tasks, queue_tag=settings.manager.queue_tag, manager_name=settings.manager.manager_name, update_frequency=settings.manager.update_frequency, cores_per_task=cores_per_task, memory_per_task=memory_per_task, scratch_directory=settings.common.scratch_directory, verbose=settings.common.verbose) # Set stats correctly since we buffer the max tasks a bit manager.statistics.max_concurrent_tasks = max_concurrent_tasks # Add exit callbacks for cb in exit_callbacks: manager.add_exit_callback(cb[0], *cb[1], **cb[2]) # Either startup the manager or run until complete if settings.manager.test: success = manager.test(settings.manager.ntests) if success is False: raise ValueError("Testing was not successful, failing.") else: for signame in {"SIGHUP", "SIGINT", "SIGTERM"}: def stop(*args, **kwargs): manager.stop(signame) raise KeyboardInterrupt() signal.signal(getattr(signal, signame), stop) # Blocks until signal try: manager.start() except KeyboardInterrupt: pass
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.launchers import SingleNodeLauncher
from parsl.providers import SlurmProvider
from parsl.addresses import address_by_hostname
from parsl.monitoring.monitoring import MonitoringHub
import os

config = Config(
    executors=[
        HighThroughputExecutor(
            cores_per_worker=4,
            mem_per_worker=40,
            max_workers=4,
            worker_debug=True,
            address=address_by_hostname(),
            provider=SlurmProvider(
                'daenerys',
                worker_init=("source activate /cephfs/users/jbreynier/conda/parsl_env2 ; "
                             "export PYTHONPATH='{}:{{PYTHONPATH}}'").format(os.getcwd()),
                init_blocks=1,
                max_blocks=10,
                min_blocks=0,
                nodes_per_block=1,
                walltime='99:00:00',
                scheduler_options='#SBATCH --exclude=kg15-11 --cpus-per-task=16 --mem=160gb --time=99:00:00',
            ),
        ),
    ],
    monitoring=MonitoringHub(
        hub_address=address_by_hostname(),
def theta_nwchem_config(ml_workers: int, log_dir: str, nodes_per_nwchem: int = 2, total_nodes: int = int(os.environ.get("COBALT_JOBSIZE", 1))) -> Config: """Theta configuration where QC workers sit on the launch node (to be able to aprun) and ML workers are placed on compute nodes Args: ml_workers: Number of nodes dedicated to ML tasks nodes_per_nwchem: Number of nodes per NWChem computation log_dir: Path to store monitoring DB and parsl logs total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE Returns: (Config) Parsl configuration """ nwc_nodes = total_nodes - ml_workers assert nwc_nodes % nodes_per_nwchem == 0, "NWChem node count not a multiple of nodes per task" nwc_workers = nwc_nodes // nodes_per_nwchem return Config( executors=[ HighThroughputExecutor( address=address_by_hostname(), label="qc", max_workers=nwc_workers, cores_per_worker=1e-6, provider=LocalProvider( nodes_per_block=1, init_blocks=1, max_blocks=1, launcher=SimpleLauncher(), # Places worker on the launch node worker_init=''' module load miniconda-3 conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env ''', ), ), HighThroughputExecutor( address=address_by_hostname(), label="ml", max_workers=1, provider=LocalProvider( nodes_per_block=ml_workers, init_blocks=1, max_blocks=1, launcher=AprunLauncher(overrides='-d 64 --cc depth'), # Places worker on the compute node worker_init=''' module load miniconda-3 conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env ''', ), ) ], monitoring=MonitoringHub( hub_address=address_by_hostname(), monitoring_debug=False, resource_monitoring_interval=10, logdir=log_dir, logging_endpoint=f'sqlite:///{os.path.join(log_dir, "monitoring.db")}' ), run_dir=log_dir, strategy=None, )
twoGB = 2048
nproc = 16

sched_opts = '''
#SBATCH --cpus-per-task=%d
#SBATCH --mem-per-cpu=%d
''' % (
    nproc,
    twoGB,
)

slurm_htex = Config(
    executors=[
        HighThroughputExecutor(
            label="coffea_parsl_slurm",
            address=address_by_hostname(),
            prefetch_capacity=0,
            max_workers=nproc,
            provider=SlurmProvider(
                launcher=SrunLauncher(),
                init_blocks=4,
                max_blocks=4,
                nodes_per_block=1,
                partition='batch,guest,gpu',
                scheduler_options=sched_opts,  # Enter scheduler_options if needed
                worker_init=wrk_init,  # Enter worker_init if needed
                walltime='02:00:00'),
        )
    ],
    strategy=None,
def local_config(log_dir: str, max_workers: int, prefetch: int = 0) -> Config: """Single node with a single task per worker Args: log_dir: Path to store monitoring DB and parsl logs max_workers: Maximum number of concurrent tasks prefetch: Number of tasks for ML workers to prefetch for inference Returns: (Config) Parsl configuration """ return Config( executors=[ HighThroughputExecutor( address=address_by_hostname(), label="qc-worker", max_workers=max_workers, prefetch_capacity=prefetch, cpu_affinity='block', provider=LocalProvider( nodes_per_block=1, init_blocks=1, max_blocks=1, launcher=SimpleLauncher( ), # Places worker on the launch node ), ), HighThroughputExecutor( address=address_by_hostname(), label="ml-worker-tensorflow", max_workers=1, prefetch_capacity=prefetch, provider=LocalProvider( nodes_per_block=1, init_blocks=1, max_blocks=1, worker_init= 'sleep 30', # Give enough time for other workers to exit (memory!) launcher=SimpleLauncher( ), # Places worker on the launch node )), HighThroughputExecutor( address=address_by_hostname(), label= "ml-worker-tensorflow-infer", # Something about training and then running a model causes issues? max_workers=1, prefetch_capacity=prefetch, provider=LocalProvider( nodes_per_block=1, init_blocks=1, max_blocks=1, worker_init= 'sleep 30', # Give enough time for other workers to exit (memory!) launcher=SimpleLauncher( ), # Places worker on the launch node )), HighThroughputExecutor( address=address_by_hostname(), label="ml-worker-torch", max_workers=1, prefetch_capacity=prefetch, provider=LocalProvider( nodes_per_block=1, init_blocks=1, max_blocks=1, worker_init= 'sleep 30', # Give enough time for other workers to exit (memory!) launcher=SimpleLauncher( ), # Places worker on the launch node )) ], run_dir=log_dir, strategy='simple', max_idletime=15.)
def main(args=None): # Grab CLI args if not present if args is None: args = parse_args() exit_callbacks = [] try: if args["debug"]["schema"]: print(ManagerSettings.schema_json(indent=2)) return # We're done, exit normally except KeyError: pass # Don't worry if schema isn't in the list finally: debug_args = args.pop("debug", {}) # Ensure the debug key is not present # Construct object settings = ManagerSettings(**args) # Handle Skeleton Generation if debug_args.get("skeleton", None): class IndentListDumper(yaml.Dumper): """ Internal yaml Dumper to make lists indent in the output YAML Buried inside this since its only used in "skeleton," once, and then exits. Does not need to be imported anywhere else or accessed somehow Based on response: https://stackoverflow.com/questions/25108581/python-yaml-dump-bad-indentation/39681672#39681672 """ def increase_indent(self, flow=False, indentless=False): return super(IndentListDumper, self).increase_indent(flow, False) skel_path = os.path.expanduser(debug_args["skeleton"]) with open(skel_path, "w") as skel: # cast to data = yaml.dump(json.loads(settings.json()), Dumper=IndentListDumper, default_flow_style=False) skel.write(data) print( f"Skeleton Queue Manager YAML file written to {skel_path}\n" f"Run: `qcfractal-manager --config-file={skel_path}` to start a manager with this configuration." ) return logger_map = { AdapterEnum.pool: "", AdapterEnum.dask: "dask_jobqueue.core", AdapterEnum.parsl: "parsl" } if settings.common.verbose: adapter_logger = logging.getLogger(logger_map[settings.common.adapter]) adapter_logger.setLevel("DEBUG") logger.setLevel("DEBUG") if settings.manager.log_file_prefix is not None: tornado.options.options[ "log_file_prefix"] = settings.manager.log_file_prefix # Clones the log to the output tornado.options.options["log_to_stderr"] = True tornado.log.enable_pretty_logging() if settings.manager.test: # Test this manager, no client needed client = None else: # Connect to a specified fractal server client = qcfractal.interface.FractalClient( address=settings.server.fractal_uri, **settings.server.dict(skip_defaults=True, exclude={"fractal_uri"})) # Figure out per-task data node_parallel_tasks = settings.common.nodes_per_task > 1 # Whether tasks are node-parallel if node_parallel_tasks: supported_adapters = ["parsl"] if settings.common.adapter not in supported_adapters: raise ValueError( "Node-parallel jobs are only supported with {} adapters". 
format(supported_adapters)) # Node-parallel tasks use all cores on a worker cores_per_task = settings.common.cores_per_worker memory_per_task = settings.common.memory_per_worker if settings.common.tasks_per_worker > 1: raise ValueError( ">1 task per node and >1 node per tasks are mutually-exclusive" ) else: cores_per_task = settings.common.cores_per_worker // settings.common.tasks_per_worker memory_per_task = settings.common.memory_per_worker / settings.common.tasks_per_worker if cores_per_task < 1: raise ValueError("Cores per task must be larger than one!") if settings.common.adapter == "pool": from concurrent.futures import ProcessPoolExecutor # Error if the number of nodes per jobs is more than 1 if settings.common.nodes_per_job > 1: raise ValueError("Pool adapters only run on a single local node") queue_client = ProcessPoolExecutor( max_workers=settings.common.tasks_per_worker) elif settings.common.adapter == "dask": dask_settings = settings.dask.dict(skip_defaults=True) # Checks if "extra" not in dask_settings: dask_settings["extra"] = [] if QCA_RESOURCE_STRING not in dask_settings["extra"]: dask_settings["extra"].append(QCA_RESOURCE_STRING) # Scheduler opts scheduler_opts = settings.cluster.scheduler_options.copy() # Error if the number of nodes per jobs is more than 1 if settings.common.nodes_per_job > 1: raise NotImplementedError( "Support for >1 node per job is not yet supported by QCFractal + Dask" ) # TODO (wardlt): Implement multinode jobs in Dask _cluster_loaders = { "slurm": "SLURMCluster", "pbs": "PBSCluster", "moab": "MoabCluster", "sge": "SGECluster", "lsf": "LSFCluster", } dask_exclusivity_map = { "slurm": "--exclusive", "pbs": "-n", "moab": "-n", # Less sure about this one "sge": "-l exclusive=true", "lsf": "-x", } if settings.cluster.node_exclusivity and dask_exclusivity_map[ settings.cluster.scheduler] not in scheduler_opts: scheduler_opts.append( dask_exclusivity_map[settings.cluster.scheduler]) # Create one construct to quickly merge dicts with a final check dask_construct = { "name": "QCFractal_Dask_Compute_Executor", "cores": settings.common.cores_per_worker, "memory": str(settings.common.memory_per_worker) + "GB", "processes": settings.common. tasks_per_worker, # Number of workers to generate == tasks in this construct "walltime": settings.cluster.walltime, "job_extra": scheduler_opts, "env_extra": settings.cluster.task_startup_commands, **dask_settings, } try: # Import the dask things we need import dask_jobqueue from dask.distributed import Client cluster_module = cli_utils.import_module( "dask_jobqueue", package=_cluster_loaders[settings.cluster.scheduler]) cluster_class = getattr( cluster_module, _cluster_loaders[settings.cluster.scheduler]) if dask_jobqueue.__version__ < "0.5.0": raise ImportError except ImportError: raise ImportError( "You need`dask-jobqueue >= 0.5.0` to use the `dask` adapter") cluster = cluster_class(**dask_construct) # Setup up adaption # Workers are distributed down to the cores through the sub-divided processes # Optimization may be needed workers = settings.common.tasks_per_worker * settings.common.max_workers if settings.cluster.adaptive == AdaptiveCluster.adaptive: cluster.adapt(minimum=0, maximum=workers, interval="10s") else: cluster.scale(workers) queue_client = Client(cluster) elif settings.common.adapter == "parsl": scheduler_opts = settings.cluster.scheduler_options if not settings.cluster.node_exclusivity: raise ValueError( "For now, QCFractal can only be run with Parsl in node exclusivity. 
This will be relaxed " "in a future release of Parsl and QCFractal") # Import helpers _provider_loaders = { "slurm": "SlurmProvider", "pbs": "TorqueProvider", "moab": "TorqueProvider", "sge": "GridEngineProvider", "cobalt": "CobaltProvider", "lsf": None, } if _provider_loaders[settings.cluster.scheduler] is None: raise ValueError( f"Parsl does not know how to handle cluster of type {settings.cluster.scheduler}." ) # Headers _provider_headers = { "slurm": "#SBATCH", "pbs": "#PBS", "moab": "#PBS", "sge": "#$$", "lsf": None, "cobalt": "#COBALT", } # Import the parsl things we need try: import parsl from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.addresses import address_by_hostname provider_module = cli_utils.import_module( "parsl.providers", package=_provider_loaders[settings.cluster.scheduler]) provider_class = getattr( provider_module, _provider_loaders[settings.cluster.scheduler]) provider_header = _provider_headers[settings.cluster.scheduler] if parsl.__version__ < "0.9.0": raise ImportError except ImportError: raise ImportError( "You need `parsl >=0.9.0` to use the `parsl` adapter") if _provider_loaders[settings.cluster.scheduler] == "moab": logger.warning( "Parsl uses its TorqueProvider for Moab clusters due to the scheduler similarities. " "However, if you find a bug with it, please report to the Parsl and QCFractal developers so " "it can be fixed on each respective end.") # Setup the providers # Determine the maximum number of blocks # TODO (wardlt): Math assumes that user does not set aside a compute node for the adapter max_nodes = settings.common.max_workers * settings.common.nodes_per_task if settings.common.nodes_per_job > max_nodes: raise ValueError( "Number of nodes per job is more than the maximum number of nodes used by manager" ) if max_nodes % settings.common.nodes_per_job != 0: raise ValueError( "Maximum number of nodes (maximum number of workers times nodes per task) " "needs to be a multiple of the number of nodes per job") if settings.common.nodes_per_job % settings.common.nodes_per_task != 0: raise ValueError( "Number of nodes per job needs to be a multiple of the number of nodes per task" ) max_blocks = max_nodes // settings.common.nodes_per_job # Create one construct to quickly merge dicts with a final check common_parsl_provider_construct = { "init_blocks": 0, # Update this at a later time of Parsl "max_blocks": max_blocks, "walltime": settings.cluster.walltime, "scheduler_options": f"{provider_header} " + f"\n{provider_header} ".join(scheduler_opts) + "\n", "nodes_per_block": settings.common.nodes_per_job, "worker_init": "\n".join(settings.cluster.task_startup_commands), **settings.parsl.provider.dict(skip_defaults=True, exclude={"partition", "launcher"}), } if settings.cluster.scheduler.lower( ) == "slurm" and "cores_per_node" not in common_parsl_provider_construct: common_parsl_provider_construct[ "cores_per_node"] = settings.common.cores_per_worker # TODO: uncomment after Parsl#1416 is resolved # if settings.cluster.scheduler.lower() == "slurm" and "mem_per_node" not in common_parsl_provider_construct: # common_parsl_provider_construct["mem_per_node"] = settings.common.memory_per_worker if settings.parsl.provider.launcher: common_parsl_provider_construct[ "launcher"] = settings.parsl.provider.launcher.build_launcher( ) if settings.cluster.scheduler == "slurm": # The Parsl SLURM constructor has a strange set of arguments provider = provider_class( settings.parsl.provider.partition, 
exclusive=settings.cluster.node_exclusivity, **common_parsl_provider_construct, ) else: provider = provider_class(**common_parsl_provider_construct) # The executor for Parsl is different for node parallel tasks and shared-memory tasks if node_parallel_tasks: # Tasks are launched from a single worker on the login node # TODO (wardlt): Remove assumption that there is only one Parsl worker running all tasks tasks_per_job = settings.common.nodes_per_job // settings.common.nodes_per_task logger.info( f"Preparing a HTEx to use node-parallel tasks with {tasks_per_job} workers" ) parsl_executor_construct = { "label": "QCFractal_Parsl_{}_Executor".format( settings.cluster.scheduler.title()), # Parsl will create one worker process per MPI task. Normally, Parsl prevents having # more processes than cores. However, as each worker will spend most of its time # waiting for the MPI task to complete, we can safely oversubscribe (e.g., more worker # processes than cores), which requires setting "cores_per_worker" to <1 "cores_per_worker": 1e-6, "max_workers": tasks_per_job, "provider": provider, "address": address_by_hostname(), **settings.parsl.executor.dict(skip_defaults=True), } else: parsl_executor_construct = { "label": "QCFractal_Parsl_{}_Executor".format( settings.cluster.scheduler.title()), "cores_per_worker": cores_per_task, "max_workers": settings.common.tasks_per_worker, "provider": provider, "address": address_by_hostname(), **settings.parsl.executor.dict(skip_defaults=True), } queue_client = Config( retries=settings.common.retries, executors=[HighThroughputExecutor(**parsl_executor_construct)]) else: raise KeyError( "Unknown adapter type '{}', available options: {}.\n" "This code should also be unreachable with pydantic Validation, so if " "you see this message, please report it to the QCFractal GitHub". format(settings.common.adapter, [getattr(AdapterEnum, v).value for v in AdapterEnum])) # Build out the manager itself # Compute max tasks max_concurrent_tasks = settings.common.tasks_per_worker * settings.common.max_workers if settings.manager.max_queued_tasks is None: # Tasks * jobs * buffer + 1 max_queued_tasks = ceil(max_concurrent_tasks * 2.00) + 1 else: max_queued_tasks = settings.manager.max_queued_tasks # The queue manager is configured differently for node-parallel and single-node tasks manager = qcfractal.queue.QueueManager( client, queue_client, max_tasks=max_queued_tasks, queue_tag=settings.manager.queue_tag, manager_name=settings.manager.manager_name, update_frequency=settings.manager.update_frequency, cores_per_task=cores_per_task, memory_per_task=memory_per_task, nodes_per_task=settings.common.nodes_per_task, scratch_directory=settings.common.scratch_directory, retries=settings.common.retries, verbose=settings.common.verbose, cores_per_rank=settings.common.cores_per_rank, configuration=settings, ) # Set stats correctly since we buffer the max tasks a bit manager.statistics.max_concurrent_tasks = max_concurrent_tasks # Add exit callbacks for cb in exit_callbacks: manager.add_exit_callback(cb[0], *cb[1], **cb[2]) # Either startup the manager or run until complete if settings.manager.test: success = manager.test(settings.manager.ntests) if success is False: raise ValueError("Testing was not successful, failing.") else: for signame in {"SIGHUP", "SIGINT", "SIGTERM"}: def stop(*args, **kwargs): manager.stop(signame) raise KeyboardInterrupt() signal.signal(getattr(signal, signame), stop) # Blocks until signal try: manager.start() except KeyboardInterrupt: pass
logger.info("Done with imports") # TODO: proper boolean switch here to switch between checkpointing and # monitoring, as they do not work together at the moment. # - see https://github.com/Parsl/parsl/issues/1014 config = parsl.config.Config( executors=[ parsl.executors.ThreadPoolExecutor(label="management", max_threads=20), parsl.executors.ThreadPoolExecutor(label="heavy", max_threads=3), ], # monitoring config from # https://parsl.readthedocs.io/en/latest/userguide/monitoring.html # modified to add hub_port - see https://github.com/Parsl/parsl/issues/1010 monitoring=MonitoringHub(hub_address=address_by_hostname(), logging_level=logging.INFO, resource_monitoring_interval=10, hub_port=30733)) # config.checkpoint_mode = 'task_exit' REPO_BASE = "REPO" logger.info("Getting checkpoint files") config.checkpoint_files = parsl.utils.get_all_checkpoints() logger.info("Checkpoint files: {}".format(config.checkpoint_files)) class RepoInfo: def __init__(self, repo_base, rerun=None):