def __init__(self, channels=[], worker_init='', cmd_timeout=30, parallelism=1, move_files=None):
    self.channels = channels
    self._label = 'ad-hoc'
    self.worker_init = worker_init
    self.cmd_timeout = cmd_timeout
    self.parallelism = parallelism
    self.move_files = move_files
    self.launcher = SimpleLauncher()
    self.init_blocks = self.min_blocks = self.max_blocks = len(channels)

    # This will be overridden by the DFK to the rundirs.
    self.script_dir = "."

    # In ad-hoc mode, nodes_per_block should be 1
    self.nodes_per_block = 1

    # Dictionary that keeps track of jobs, keyed on job_id
    self.resources = {}

    self.least_loaded = self._least_loaded()
    logger.debug("AdHoc provider initialized")
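# The constructor above caches a generator from self._least_loaded(), whose
# definition is not shown here. Below is a minimal sketch under the assumption
# that entries in self.resources record their channel under a 'channel' key
# and that the provider wants the channel with the fewest tracked jobs. This
# is an illustrative sketch, not the actual Parsl implementation.
def _least_loaded(self):
    """Repeatedly yield the channel currently tracking the fewest jobs."""
    while True:
        # Tally jobs per channel from the self.resources bookkeeping dict
        counts = {channel: 0 for channel in self.channels}
        for job in self.resources.values():
            counts[job['channel']] = counts.get(job['channel'], 0) + 1
        # With no channels configured there is nothing to balance over
        yield min(counts, key=counts.get) if counts else None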
def test_one_block():
    oneshot_provider = OneShotLocalProvider(
        channel=LocalChannel(),
        init_blocks=0,
        min_blocks=0,
        max_blocks=10,
        launcher=SimpleLauncher(),
    )

    config = Config(
        executors=[
            HighThroughputExecutor(
                label="htex_local",
                worker_debug=True,
                cores_per_worker=1,
                provider=oneshot_provider,
            )
        ],
        strategy='simple',
    )

    parsl.load(config)

    f = app()
    f.result()
    parsl.clear()

    assert oneshot_provider.recorded_submits == 1
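# OneShotLocalProvider is not defined in this snippet. A minimal sketch,
# assuming it is a LocalProvider subclass that simply counts submit() calls so
# the test can assert exactly one block was requested:
from parsl.providers import LocalProvider


class OneShotLocalProvider(LocalProvider):
    """LocalProvider that records how many blocks were submitted."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.recorded_submits = 0

    def submit(self, *args, **kwargs):
        # Count every block request before delegating to LocalProvider
        self.recorded_submits += 1
        return super().submit(*args, **kwargs)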
def local_config(log_dir: str, max_workers: int, prefetch: int = 0) -> Config:
    """Single node with a single task per worker

    Args:
        log_dir: Path to store monitoring DB and parsl logs
        max_workers: Maximum number of concurrent tasks
        prefetch: Number of tasks for ML workers to prefetch for inference
    Returns:
        (Config) Parsl configuration
    """
    return Config(
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc-worker",
                max_workers=max_workers,
                prefetch_capacity=prefetch,
                cpu_affinity='block',
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-worker",
                max_workers=1,
                prefetch_capacity=prefetch,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                )
            )
        ],
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.
    )
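# A brief usage sketch for the config above: tasks are routed to the QC or ML
# executor via the `executors` argument of the app decorator. The app names
# and bodies are illustrative placeholders.
import parsl
from parsl import python_app

parsl.load(local_config('parsl-logs', max_workers=4))


@python_app(executors=['qc-worker'])
def simulate(x):
    return 2 * x  # stand-in for a quantum-chemistry task


@python_app(executors=['ml-worker'])
def infer(y):
    return y + 1  # stand-in for an ML inference task


print(infer(simulate(1)).result())  # chains the two executors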
def fresh_config():
    return Config(
        executors=[
            HighThroughputExecutor(
                label="htex_local",
                worker_debug=True,
                cores_per_worker=1,
                provider=LocalProvider(
                    channel=LocalChannel(),
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),
                ),
            )
        ],
        strategy=None,
    )
def theta_nwchem_config(
        choice: str,
        log_dir: str,
        nodes_per_nwchem: int = 2,
        total_nodes: int = int(os.environ.get("COBALT_JOBSIZE", 1))
) -> Config:
    """Theta configuration to run NWChem

    Args:
        choice: Choice of the runtime configuration
        log_dir: Path to store monitoring DB and parsl logs
        nodes_per_nwchem: Number of nodes per NWChem computation
        total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE
    Returns:
        (Config) Parsl configuration
    """
    assert total_nodes % nodes_per_nwchem == 0, "NWChem node count not a multiple of nodes per task"
    nwc_workers = total_nodes // nodes_per_nwchem

    if choice == "htex":
        qc_exec = HighThroughputExecutor(
            address=address_by_hostname(),
            label="qc",
            max_workers=nwc_workers,
            cores_per_worker=1e-6,
            provider=LocalProvider(
                nodes_per_block=1,
                init_blocks=0,
                max_blocks=1,
                launcher=SimpleLauncher(),  # Places worker on the launch node
                worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
            ),
        )
    elif choice == 'thread':
        qc_exec = ThreadPoolExecutor(label='qc', max_threads=nwc_workers)
    else:
        raise ValueError(f'Choice "{choice}" not recognized')

    return Config(
        executors=[qc_exec],
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.
    )
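# Usage sketch: the `choice` argument selects the runtime at the call site.
# The log directory and node counts are illustrative values.
import parsl

# Thread-based QC workers, convenient for local testing
parsl.load(theta_nwchem_config('thread', 'parsl-logs', nodes_per_nwchem=2, total_nodes=4))

# HTEX workers on the launch node of a Cobalt allocation:
#   parsl.load(theta_nwchem_config('htex', 'parsl-logs'))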
from parsl.providers import LocalProvider
from parsl.channels import LocalChannel
from parsl.launchers import SimpleLauncher
from parsl.config import Config
from parsl.executors import ExtremeScaleExecutor

config = Config(
    executors=[
        ExtremeScaleExecutor(
            label="Extreme_Local",
            worker_debug=True,
            ranks_per_node=4,
            provider=LocalProvider(
                channel=LocalChannel(),
                init_blocks=1,
                max_blocks=1,
                launcher=SimpleLauncher(),
            )
        )
    ],
    strategy=None,
)
def theta_nwchem_config(log_dir: str,
                        nodes_per_nwchem: int = 2,
                        total_nodes: int = int(os.environ.get("COBALT_JOBSIZE", 1)),
                        ml_prefetch: int = 0) -> Config:
    """Theta configuration where QC workers sit on the launch node (to be able to aprun)
    and ML workers are placed on compute nodes

    Args:
        log_dir: Path to store monitoring DB and parsl logs
        nodes_per_nwchem: Number of nodes per NWChem computation
        total_nodes: Total number of nodes available. Default: COBALT_JOBSIZE
        ml_prefetch: Number of tasks for ML workers to prefetch
    Returns:
        (Config) Parsl configuration
    """
    assert total_nodes % nodes_per_nwchem == 0, "NWChem node count not a multiple of nodes per task"
    nwc_workers = total_nodes // nodes_per_nwchem

    return Config(
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc",
                max_workers=nwc_workers,
                cores_per_worker=1e-6,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=0,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml",
                max_workers=1,
                prefetch_capacity=ml_prefetch,
                provider=LocalProvider(
                    nodes_per_block=total_nodes,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=AprunLauncher(overrides='-d 64 --cc depth'),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
''',
                ),
            )
        ],
        monitoring=MonitoringHub(
            hub_address=address_by_hostname(),
            monitoring_debug=False,
            resource_monitoring_interval=10,
            logdir=log_dir,
            logging_endpoint=f'sqlite:///{os.path.join(log_dir, "monitoring.db")}'
        ),
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.
    )
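# The QC workers sit on the launch node because only launch (MOM) nodes may
# invoke aprun on Theta; each task then spreads its ranks over compute nodes.
# A hedged sketch of a matching task: the aprun flags, rank counts, and input
# path are illustrative assumptions, not taken from the original code.
from parsl import bash_app


@bash_app(executors=['qc'])
def run_nwchem(input_file: str, nodes: int, stdout=None, stderr=None):
    # Launched from the launch-node worker; aprun places ranks on compute nodes
    return f'aprun -n {nodes * 64} -N 64 nwchem {input_file}'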
def local_config(log_dir: str, max_workers: int, prefetch: int = 0) -> Config:
    """Single node with a single task per worker

    Args:
        log_dir: Path to store monitoring DB and parsl logs
        max_workers: Maximum number of concurrent tasks
        prefetch: Number of tasks for ML workers to prefetch for inference
    Returns:
        (Config) Parsl configuration
    """
    return Config(
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc-worker",
                max_workers=max_workers,
                prefetch_capacity=prefetch,
                cpu_affinity='block',
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-worker-tensorflow",
                max_workers=1,
                prefetch_capacity=prefetch,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    worker_init='sleep 30',  # Give enough time for other workers to exit (memory!)
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                )
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-worker-tensorflow-infer",  # Something about training and then running a model causes issues?
                max_workers=1,
                prefetch_capacity=prefetch,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    worker_init='sleep 30',  # Give enough time for other workers to exit (memory!)
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                )
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml-worker-torch",
                max_workers=1,
                prefetch_capacity=prefetch,
                provider=LocalProvider(
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    worker_init='sleep 30',  # Give enough time for other workers to exit (memory!)
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                )
            )
        ],
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.
    )
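# Splitting the ML work across per-framework executors keeps each library in
# its own worker process. A hedged routing sketch; app names and bodies are
# illustrative placeholders.
from parsl import python_app


@python_app(executors=['ml-worker-tensorflow'])
def train_model(dataset):
    ...  # TensorFlow training stays in its own process


@python_app(executors=['ml-worker-tensorflow-infer'])
def run_inference(model_path, batch):
    ...  # separate process sidesteps the train-then-infer issue noted above


@python_app(executors=['ml-worker-torch'])
def score_with_torch(batch):
    ...  # PyTorch never shares a process with TensorFlow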
def multisite_nwchem_config() -> Config:
    """Experimental multi-site configuration"""
    return Config(
        retries=1,
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc",
                max_workers=8,  # One task per node
                provider=CobaltProvider(
                    cmd_timeout=120,
                    nodes_per_block=8,
                    account='CSC249ADCD08',
                    queue='debug-cache-quad',
                    walltime="1:00:00",
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),  # Places worker on the launch node
                    scheduler_options='#COBALT --attrs enable_ssh=1',
                    worker_init='''
module load miniconda-3
export PATH=~/software/psi4/bin:$PATH
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env

# NWChem settings
export PATH="/home/lward/software/nwchem-6.8.1/bin/LINUX64:$PATH"
module load atp
export MPICH_GNI_MAX_VSHORT_MSG_SIZE=10000
export MPICH_GNI_MAX_EAGER_MSG_SIZE=131072
export MPICH_GNI_NUM_BUFS=300
export MPICH_GNI_NDREG_MAXSIZE=16777216
export MPICH_GNI_MBOX_PLACEMENT=nic
export MPICH_GNI_LMT_PATH=disabled
export COMEX_MAX_NB_OUTSTANDING=6
export LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2018.0.128/linux/compiler/lib/intel64_lin:$LD_LIBRARY_PATH
''',
                ),
            ),
            HighThroughputExecutor(
                address='localhost',  # Using an SSH tunnel
                worker_ports=(54382, 54008),
                label="ml",
                max_workers=1,
                working_dir='/homes/lward/parsl',
                worker_logdir_root='/homes/lward/parsl',
                provider=LocalProvider(
                    channel=SSHChannel('lambda5.cels.anl.gov', script_dir='/home/lward/parsl'),
                    nodes_per_block=1,
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher(),
                    worker_init='''
source /homes/lward/miniconda3/etc/profile.d/conda.sh
conda activate colmena_full
export CUDA_VISIBLE_DEVICES=17  # Pins to a GPU worker
''',
                ),
            )
        ],
        strategy=None,
    )
ThreadPoolExecutor(label="local_threads", max_threads=4) ], strategy=None, ) theta_nwchem_config = Config( executors=[ HighThroughputExecutor( address=address_by_hostname(), label="htex", max_workers=int(os.environ.get("COBALT_JOBSIZE", 1)), provider=LocalProvider( nodes_per_block=1, init_blocks=1, max_blocks=1, launcher=SimpleLauncher(), # Places worker on the launch node worker_init=''' module load miniconda-3 export PATH=~/software/psi4/bin:$PATH conda activate /lus/theta-fs0/projects/CSC249ADCD08/colmena/env ''', ), ), ThreadPoolExecutor(label="local_threads", max_threads=4) ], strategy=None, ) theta_interleaved_config = Config( executors=[ HighThroughputExecutor(
def test():
    import parsl
    from pyscf import lib, gto, scf
    import numpy as np
    import pandas as pd
    import logging
    from parsl.config import Config
    from parsl.providers import LocalProvider
    from parsl.channels import LocalChannel
    from parsl.launchers import SimpleLauncher
    from parsl.executors import ExtremeScaleExecutor

    ncore = 4
    config = Config(
        executors=[
            ExtremeScaleExecutor(
                label="Extreme_Local",
                worker_debug=True,
                ranks_per_node=ncore,
                provider=LocalProvider(
                    channel=LocalChannel(),
                    init_blocks=1,
                    max_blocks=1,
                    launcher=SimpleLauncher()
                )
            )
        ],
        strategy=None,
    )
    parsl.load(config)

    mol = gto.M(atom='H 0. 0. 0.; H 0. 0. 2.0',
                unit='bohr',
                ecp='bfd',
                basis='bfd_vtz')
    mf = scf.RHF(mol).run()
    mol.output = None
    mol.stdout = None
    mf.output = None
    mf.stdout = None
    mf.chkfile = None

    from pyqmc import (ExpCuspFunction, GaussianFunction, MultiplyWF, PySCFSlaterRHF,
                       JastrowSpin, initial_guess, EnergyAccumulator)
    from pyqmc.accumulators import PGradTransform, LinearTransform

    nconf = 1600
    basis = [
        ExpCuspFunction(2.0, 1.5),
        GaussianFunction(0.5),
        GaussianFunction(2.0),
        GaussianFunction(.25),
        GaussianFunction(1.0),
        GaussianFunction(4.0),
        GaussianFunction(8.0)
    ]
    wf = MultiplyWF(PySCFSlaterRHF(mol, mf), JastrowSpin(mol, basis, basis))
    coords = initial_guess(mol, nconf)
    energy_acc = EnergyAccumulator(mol)
    pgrad_acc = PGradTransform(
        energy_acc, LinearTransform(wf.parameters, ['wf2acoeff', 'wf2bcoeff']))

    from pyqmc.optsr import gradient_descent
    # `distvmc` (a distributed VMC driver) is assumed to be defined elsewhere
    # in the surrounding module; it is not imported in this snippet.
    gradient_descent(wf,
                     coords,
                     pgrad_acc,
                     vmc=distvmc,
                     vmcoptions={
                         'npartitions': ncore,
                         'nsteps': 100,
                         'nsteps_per': 100
                     })
def theta_persistent(log_dir: str,
                     nodes_per_nwchem: int = 1,
                     qc_nodes: int = 8,
                     ml_nodes: int = 8,
                     ml_prefetch: int = 0) -> Config:
    """Configuration where the application is persistent and sits on the Theta login node.

    Nodes will be requested from Cobalt using separate jobs for ML and QC tasks.

    Args:
        log_dir: Path to store monitoring DB and parsl logs
        nodes_per_nwchem: Number of nodes per NWChem computation
        qc_nodes: Number of nodes dedicated to QC tasks
        ml_nodes: Number of nodes dedicated to ML tasks
        ml_prefetch: Number of tasks for ML workers to prefetch for inference
    Returns:
        (Config) Parsl configuration
    """
    return Config(
        retries=8,
        executors=[
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="qc",
                max_workers=qc_nodes // nodes_per_nwchem,
                prefetch_capacity=ml_prefetch,
                provider=CobaltProvider(
                    account='CSC249ADCD08',
                    queue='debug-cache-quad' if qc_nodes <= 8 else None,
                    walltime='00:60:00',
                    nodes_per_block=qc_nodes,
                    init_blocks=0,
                    max_blocks=1,
                    launcher=SimpleLauncher(),
                    cmd_timeout=360,
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env
export OMP_NUM_THREADS=64
export KMP_INIT_AT_FORK=FALSE
export PYTHONPATH=$PYTHONPATH:$(pwd)
export PATH="/lus/theta-fs0/projects/CSC249ADCD08/software/nwchem-6.8.1/bin/LINUX64:$PATH"
mkdir -p scratch  # For the NWChem tasks
pwd
which nwchem
hostname
module load atp
export MPICH_GNI_MAX_VSHORT_MSG_SIZE=10000
export MPICH_GNI_MAX_EAGER_MSG_SIZE=131072
export MPICH_GNI_NUM_BUFS=300
export MPICH_GNI_NDREG_MAXSIZE=16777216
export MPICH_GNI_MBOX_PLACEMENT=nic
export MPICH_GNI_LMT_PATH=disabled
export COMEX_MAX_NB_OUTSTANDING=6
export LD_LIBRARY_PATH=/opt/intel/mkl/lib/intel64_lin/:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64_lin:$LD_LIBRARY_PATH
''',
                ),
            ),
            HighThroughputExecutor(
                address=address_by_hostname(),
                label="ml",
                max_workers=1,
                prefetch_capacity=ml_prefetch,
                provider=CobaltProvider(
                    account='CSC249ADCD08',
                    queue='debug-flat-quad',
                    nodes_per_block=ml_nodes,
                    scheduler_options='#COBALT --attrs enable_ssh=1',
                    walltime='00:60:00',
                    init_blocks=0,
                    max_blocks=1,
                    cmd_timeout=360,
                    launcher=AprunLauncher(overrides='-d 256 --cc depth -j 4'),  # Places worker on the compute node
                    worker_init='''
module load miniconda-3
conda activate /lus/theta-fs0/projects/CSC249ADCD08/edw/env''',
                ),
            )
        ],
        run_dir=log_dir,
        strategy='simple',
        max_idletime=15.
    )
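# Both providers use init_blocks=0, so a Cobalt job is only requested once a
# task arrives for that executor, and retries=8 re-runs tasks lost when a
# block's walltime expires. A brief usage sketch; app names are illustrative.
import parsl
from parsl import python_app

parsl.load(theta_persistent('parsl-logs', nodes_per_nwchem=2))


@python_app(executors=['qc'])
def relax_structure(molecule):
    ...  # first QC task triggers the 'qc' Cobalt job


@python_app(executors=['ml'])
def update_model(records):
    ...  # first ML task triggers the separate 'ml' Cobalt job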