def cluster(self, num_nodes):
    from distributed import Client
    from dask_jobqueue import PBSCluster

    cluster_ = PBSCluster(walltime='00:15:00', cores=36, memory='60GB',
                          processes=1)
    self.client = Client(cluster_)
    cluster_.scale(num_nodes)
def test_log_directory(tmpdir):
    shutil.rmtree(tmpdir.strpath, ignore_errors=True)
    with PBSCluster(cores=1, memory="1GB"):
        assert not os.path.exists(tmpdir.strpath)

    with PBSCluster(cores=1, memory="1GB", log_directory=tmpdir.strpath):
        assert os.path.exists(tmpdir.strpath)
def test_job_script():
    with PBSCluster(walltime='00:02:00', processes=4, threads=2,
                    memory='7GB') as cluster:
        job_script = cluster.job_script()
        assert '#PBS' in job_script
        assert '#PBS -N dask-worker' in job_script
        assert '#PBS -l select=1:ncpus=8:mem=27GB' in job_script
        assert '#PBS -l walltime=00:02:00' in job_script
        assert '#PBS -q' not in job_script
        assert '#PBS -A' not in job_script
        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--nthreads 2 --nprocs 4 --memory-limit 7GB' in job_script

    with PBSCluster(queue='regular', project='DaskOnPBS', processes=4,
                    threads=2, memory='7GB',
                    resource_spec='select=1:ncpus=24:mem=100GB') as cluster:
        job_script = cluster.job_script()
        assert '#PBS -q regular' in job_script
        assert '#PBS -N dask-worker' in job_script
        assert '#PBS -l select=1:ncpus=24:mem=100GB' in job_script
        assert '#PBS -l select=1:ncpus=8:mem=27GB' not in job_script
        assert '#PBS -l walltime=' in job_script
        assert '#PBS -A DaskOnPBS' in job_script
        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--nthreads 2 --nprocs 4 --memory-limit 7GB' in job_script
def test_informative_errors():
    with pytest.raises(ValueError) as info:
        PBSCluster(memory=None, cores=4)
    assert 'memory' in str(info.value)

    with pytest.raises(ValueError) as info:
        PBSCluster(memory='1GB', cores=None)
    assert 'cores' in str(info.value)
def test_informative_errors():
    with pytest.raises(ValueError) as info:
        PBSCluster(memory=None, cores=4)
    assert "memory" in str(info.value)

    with pytest.raises(ValueError) as info:
        PBSCluster(memory="1GB", cores=None)
    assert "cores" in str(info.value)
def create_cluster(self, queue, maxcore, memory, wpn, walltime):
    cluster = PBSCluster(
        queue=queue,
        cores=maxcore,
        memory=memory,
        processes=wpn,
        local_directory='$TMPDIR',
        walltime=walltime,
        # extra=['--nthreads', '1', '--lifetime', '55m', '--lifetime-stagger', '4m'],
        # resource_spec='select=1:ncpus=12:ompthreads=12:mem=109GB',
    )
    logger.warning(cluster.job_script())
    self.client = Client(cluster)
def test_adaptive(loop):
    with PBSCluster(walltime='00:02:00', processes=1, threads=2, memory='2GB',
                    local_directory='/tmp', job_extra=['-V'],
                    loop=loop) as cluster:
        cluster.adapt()
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(60) == 11

            assert cluster.jobs

            start = time()
            processes = cluster.worker_processes
            while len(client.scheduler_info()['workers']) != processes:
                sleep(0.1)
                assert time() < start + 10

            del future

            start = time()
            while len(client.scheduler_info()['workers']) > 0:
                sleep(0.100)
                assert time() < start + 10
def test_adaptive_grouped(loop):
    with PBSCluster(
        walltime="00:02:00",
        processes=1,
        cores=2,
        memory="2GB",
        local_directory="/tmp",
        job_extra=["-V"],
        loop=loop,
    ) as cluster:
        cluster.adapt(minimum=1)  # at least 1 worker
        with Client(cluster) as client:
            start = time()
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            start = time()
            while not cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            start = time()
            processes = cluster.worker_processes
            while len(client.scheduler_info()["workers"]) != processes:
                sleep(0.1)
                assert time() < start + QUEUE_WAIT
def test_adaptive_cores_mem(loop):
    with PBSCluster(walltime='00:02:00', processes=1, cores=2, memory='2GB',
                    local_directory='/tmp', job_extra=['-V'],
                    loop=loop) as cluster:
        cluster.adapt(minimum_cores=0, maximum_memory='4GB')
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            start = time()
            processes = cluster.worker_processes
            while len(client.scheduler_info()['workers']) != processes:
                sleep(0.1)
                assert time() < start + QUEUE_WAIT

            del future

            start = time()
            while cluster.pending_jobs or cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
            assert cluster.finished_jobs
def get_cluster(cluster_type, **kwargs):
    """Generic dask cluster wrapper"""

    # check input cluster type
    cluster_type = cluster_type.lower()
    cluster_list = ['lsf', 'pbs', 'slurm']
    if cluster_type not in cluster_list:
        msg = "Cluster type '{}' not supported".format(cluster_type)
        msg += '\nsupported cluster types: {}'.format(cluster_list)
        raise ValueError(msg)
    print("Dask cluster type: {}".format(cluster_type))

    # check input config name
    if 'config_name' in kwargs.keys():
        kwargs['config_name'] = check_config_name(kwargs['config_name'], cluster_type)
        print("Dask config name: {}".format(kwargs['config_name']))

    # check walltime format for each cluster type
    if 'walltime' in kwargs.keys():
        kwargs['walltime'] = check_walltime_format(kwargs["walltime"], cluster_type)
        print('Dask worker walltime: {}'.format(kwargs['walltime']))

    # initiate cluster object
    if cluster_type == 'lsf':
        cluster = LSFCluster(**kwargs)
    elif cluster_type == 'pbs':
        cluster = PBSCluster(**kwargs)
    elif cluster_type == 'slurm':
        cluster = SLURMCluster(**kwargs)

    return cluster
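# Hedged usage sketch for get_cluster() (not from the source): assumes
# dask-jobqueue is installed and that the queue/project names below exist on
# the target scheduler; all keyword values are illustrative placeholders.
if __name__ == "__main__":
    cluster = get_cluster('pbs', queue='regular', project='myproject',
                          cores=4, memory='16GB', walltime='01:00:00')
    cluster.scale(2)  # request two PBS jobs' worth of workers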
def test_basic(loop):
    with PBSCluster(walltime='00:02:00', processes=1, cores=2, memory='2GB',
                    local_directory='/tmp', job_extra=['-V'],
                    loop=loop) as cluster:
        with Client(cluster) as client:
            cluster.scale(2)

            start = time()
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            assert cluster.running_jobs

            workers = list(client.scheduler_info()['workers'].values())
            w = workers[0]
            assert w['memory_limit'] == 2e9
            assert w['ncores'] == 2

            cluster.scale(0)

            start = time()
            while cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
            assert not cluster.running_jobs
def test_command_template():
    with PBSCluster(cores=2, memory='4GB') as cluster:
        assert '%s -m distributed.cli.dask_worker' % (sys.executable) \
            in cluster._command_template
        assert ' --nthreads 2' in cluster._command_template
        assert ' --memory-limit ' in cluster._command_template
        assert ' --name ' in cluster._command_template

    with PBSCluster(cores=2, memory='4GB', death_timeout=60,
                    local_directory='/scratch',
                    extra=['--preload', 'mymodule']) as cluster:
        assert ' --death-timeout 60' in cluster._command_template
        assert ' --local-directory /scratch' in cluster._command_template
        assert ' --preload mymodule' in cluster._command_template
def _initialize_pbs_cluster(
    name: str = 'epigenomics-integration-pipeline',
    queue: str = 'batch',
    interface: str = 'ib0',
    cores: int = 2,
    processes: int = 2,
    memory: str = '220GB',
    walltime: str = '00:30:00',
    env_extra: List[str] = None,
    log_dir: str = 'logs',
    temp: str = None,
    **kwargs
) -> PBSCluster:
    """
    Initialize a dask distributed cluster for submission on an HPC system
    running PBS/TORQUE.

    arguments
        name:      job name
        queue:     queue used for submission
        interface: interconnect interface (e.g. ib0 = InfiniBand, eth0 = Ethernet).
                   This is system specific; find the proper interface first by
                   running 'ip addr'.
        cores:     number of cores per job
        processes: number of processes per job
        memory:    total memory per job, so memory = 120GB and processes = 2
                   means each worker process will have 60GB of usable memory
        walltime:  max. runtime for each job
        env_extra: extra arguments to use with the submission shell script
        log_dir:   directory for worker log files
        temp:      location of the local working, or temp, directory

    returns
        a PBSCluster
    """

    if not env_extra:
        env_extra = ['cd $PBS_O_WORKDIR']

    ## Ensure the log directory exists
    Path(log_dir).mkdir(parents=True, exist_ok=True)

    return PBSCluster(
        name=name,
        queue=queue,
        interface=interface,
        cores=cores,
        processes=processes,
        memory=memory,
        walltime=walltime,
        local_directory=temp,
        ## Helix requires this, kodiak doesn't like this
        resource_spec=f'nodes=1:ppn={cores}',
        job_extra=[
            f'-N {name}',
            f'-l mem={memory}',
            f'-e {log_dir}',
            f'-o {log_dir}'
        ],
        env_extra=env_extra
    )
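# Illustrative call of _initialize_pbs_cluster() (an assumption, not the
# author's invocation): queue, memory, and walltime values are placeholders
# for whatever the target PBS/TORQUE system accepts.
cluster = _initialize_pbs_cluster(queue='batch', cores=4, processes=4,
                                  memory='64GB', walltime='02:00:00',
                                  log_dir='logs')
cluster.scale(2)          # submit two worker jobs
client = Client(cluster)  # requires: from dask.distributed import Client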
def get_ClusterClient():
    import dask
    from dask_jobqueue import PBSCluster
    from dask.distributed import Client

    USER = os.environ['USER']
    cluster = PBSCluster(
        cores=1,
        memory='25GB',
        processes=1,
        queue='casper',
        local_directory=f'/glade/scratch/{USER}/dask-workers',
        log_directory=f'/glade/scratch/{USER}/dask-workers',
        resource_spec='select=1:ncpus=1:mem=25GB',
        project='NCGD0011',
        walltime='06:00:00',
        interface='ib0',
    )

    dask.config.set({
        'distributed.dashboard.link':
            'https://jupyterhub.hpc.ucar.edu/stable/user/{USER}/proxy/{port}/status'
    })
    client = Client(cluster)
    return cluster, client
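# Hedged usage sketch: get_ClusterClient() targets NCAR's Casper PBS system;
# the number of jobs requested below is an arbitrary placeholder.
cluster, client = get_ClusterClient()
cluster.scale(4)  # ask PBS for four single-core worker jobs
print(cluster.dashboard_link)
client.close()
cluster.close()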
def setup_cluster(run_strategy="local", ncore=2, nnodes=1):
    if run_strategy == "local":
        cluster = LocalCluster(n_workers=ncore, threads_per_worker=1)
    elif run_strategy == "PBSjobqueue":
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(
            cores=ncore,
            processes=ncore,
            resource_spec=f"nodes=1:ppn={ncore}",
            group='wagner',
            queue='secondary',
            memory='16G',
            walltime='02:00:00',
            env_extra=[
                'cd ${PBS_O_WORKDIR}',
                'export PYTHONPATH=/home/lkwagner/pyqmc:$PYTHONPATH',
                'export OMP_NUM_THREADS=1',
                'source /home/lkwagner/.bashrc',
                'conda activate pyscf'
            ],
            local_directory=os.getenv('TMPDIR', '/tmp'))
        cluster.submit_command = "/usr/local/torque-releases/torque-6.1.2-el7/bin/qsub"
        cluster.cancel_command = "/usr/local/torque-releases/torque-6.1.2-el7/bin/qdel"
        print(cluster.job_script())
        cluster.scale(nnodes)
    return Client(cluster), cluster
def test_basic(loop):
    with PBSCluster(walltime='00:02:00', processes=1, threads=2, memory='2GB',
                    local_directory='/tmp', job_extra=['-V'],
                    loop=loop) as cluster:
        with Client(cluster) as client:
            workers = cluster.start_workers(2)
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(60) == 11

            assert cluster.jobs

            info = client.scheduler_info()
            w = list(info['workers'].values())[0]
            assert w['memory_limit'] == 2e9
            assert w['ncores'] == 2

            cluster.stop_workers(workers)

            start = time()
            while len(client.scheduler_info()['workers']) > 0:
                sleep(0.100)
                assert time() < start + 10

            assert not cluster.jobs
def test_config_name_pbs_takes_custom_config():
    conf = {
        'queue': 'myqueue',
        'project': 'myproject',
        'ncpus': 1,
        'cores': 1,
        'memory': '2 GB',
        'walltime': '00:02',
        'job-extra': [],
        'name': 'myname',
        'processes': 1,
        'interface': None,
        'death-timeout': None,
        'local-directory': '/foo',
        'extra': [],
        'env-extra': [],
        'log-directory': None,
        'shebang': '#!/usr/bin/env bash',
        'job-cpu': None,
        'job-mem': None,
        'resource-spec': None
    }

    with dask.config.set({'jobqueue.pbs-config-name': conf}):
        with PBSCluster(config_name='pbs-config-name') as cluster:
            assert cluster.name == 'myname'
def test_config_name_pbs_takes_custom_config():
    conf = {
        "queue": "myqueue",
        "project": "myproject",
        "ncpus": 1,
        "cores": 1,
        "memory": "2 GB",
        "walltime": "00:02",
        "job-extra": [],
        "name": "myname",
        "processes": 1,
        "interface": None,
        "death-timeout": None,
        "local-directory": "/foo",
        "shared-temp-directory": None,
        "extra": [],
        "env-extra": [],
        "log-directory": None,
        "shebang": "#!/usr/bin/env bash",
        "job-cpu": None,
        "job-mem": None,
        "resource-spec": None,
    }

    with dask.config.set({"jobqueue.pbs-config-name": conf}):
        with PBSCluster(config_name="pbs-config-name") as cluster:
            assert cluster.job_name == "myname"
def test_adaptive_cores_mem(loop):
    with PBSCluster(
        walltime="00:02:00",
        processes=1,
        cores=2,
        memory="2GB",
        local_directory="/tmp",
        job_extra=["-V"],
        loop=loop,
    ) as cluster:
        cluster.adapt(minimum_cores=0, maximum_memory="4GB")
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            start = time()
            processes = cluster._dummy_job.worker_processes
            while len(client.scheduler_info()["workers"]) != processes:
                sleep(0.1)
                assert time() < start + QUEUE_WAIT

            del future

            start = time()
            while cluster.workers:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
def test_scale_cores_memory(loop):
    with PBSCluster(
        walltime="00:02:00",
        processes=1,
        cores=2,
        memory="2GB",
        local_directory="/tmp",
        job_extra=["-V"],
        loop=loop,
    ) as cluster:
        with Client(cluster) as client:
            cluster.scale(cores=2)
            client.wait_for_workers(1)

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            assert cluster.workers

            workers = list(client.scheduler_info()["workers"].values())
            w = workers[0]
            assert w["memory_limit"] == 2e9
            assert w["nthreads"] == 2

            cluster.scale(memory="0GB")

            start = time()
            while client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            assert not cluster.workers
def test_config(loop):  # noqa: F811
    with dask.config.set({
        'jobqueue.pbs.walltime': '00:02:00',
        'jobqueue.pbs.local-directory': '/foo'
    }):
        with PBSCluster(loop=loop) as cluster:
            assert '00:02:00' in cluster.job_script()
            assert '--local-directory /foo' in cluster.job_script()
def test_forward_ip():
    ip = '127.0.0.1'
    with PBSCluster(walltime='00:02:00', processes=4, cores=8, memory='28GB',
                    name='dask-worker', ip=ip) as cluster:
        assert cluster.local_cluster.scheduler.ip == ip

    default_ip = socket.gethostbyname('')
    with PBSCluster(walltime='00:02:00', processes=4, cores=8, memory='28GB',
                    name='dask-worker') as cluster:
        assert cluster.local_cluster.scheduler.ip == default_ip
def test_config(loop):
    with dask.config.set({
        "jobqueue.pbs.walltime": "00:02:00",
        "jobqueue.pbs.local-directory": "/foo"
    }):
        with PBSCluster(loop=loop, cores=1, memory="2GB") as cluster:
            assert "00:02:00" in cluster.job_script()
            assert "--local-directory /foo" in cluster.job_script()
def test_forward_ip():
    ip = "127.0.0.1"
    with PBSCluster(
        walltime="00:02:00",
        processes=4,
        cores=8,
        memory="28GB",
        name="dask-worker",
        host=ip,
    ) as cluster:
        assert cluster.local_cluster.scheduler.ip == ip

    default_ip = socket.gethostbyname("")
    with PBSCluster(
        walltime="00:02:00", processes=4, cores=8, memory="28GB", name="dask-worker"
    ) as cluster:
        assert cluster.local_cluster.scheduler.ip == default_ip
def test_command_template():
    with PBSCluster(cores=2, memory="4GB") as cluster:
        assert (
            "%s -m distributed.cli.dask_worker" % (sys.executable)
            in cluster._command_template
        )
        assert " --nthreads 2" in cluster._command_template
        assert " --memory-limit " in cluster._command_template
        assert " --name " in cluster._command_template

    with PBSCluster(
        cores=2,
        memory="4GB",
        death_timeout=60,
        local_directory="/scratch",
        extra=["--preload", "mymodule"],
    ) as cluster:
        assert " --death-timeout 60" in cluster._command_template
        assert " --local-directory /scratch" in cluster._command_template
        assert " --preload mymodule" in cluster._command_template
def dask(hardware='single', client=None, processes=False, n_workers=1,
         threads_per_worker=1, **kwargscluster):
    r"""Dask backend initialization.

    Create connection to drive computations using Dask distributed.

    Parameters
    ----------
    hardware : :obj:`str`, optional
        Hardware used to run Dask distributed. Currently available options
        are ``single`` for single-machine distribution, ``ssh`` for SSH-based
        multi-machine distribution, and ``pbs`` for PBS-based multi-machine
        distribution
    client : :obj:`str`, optional
        Name of scheduler (use ``None`` for ``hardware=single``).
    processes : :obj:`bool`, optional
        Whether to use processes (``True``) or threads (``False``).
    n_workers : :obj:`int`, optional
        Number of workers
    threads_per_worker : :obj:`int`, optional
        Number of threads per each worker
    kwargscluster:
        Additional parameters to be passed to the cluster creation routine

    Returns
    -------
    client : :obj:`dask.distributed.client.Client`
        Client
    cluster :
        Cluster

    Raises
    ------
    NotImplementedError
        If ``hardware`` is not ``single``, ``ssh``, or ``pbs``

    """
    if hardware == 'single':
        cluster = LocalCluster(processes=processes, n_workers=n_workers,
                               threads_per_worker=threads_per_worker)
    elif hardware == 'ssh':
        cluster = client
    elif hardware == 'pbs':
        if not jobqueue:
            raise ModuleNotFoundError('dask-jobqueue not installed. '
                                      'Run "pip install dask-jobqueue".')
        cluster = PBSCluster(**kwargscluster)
        cluster.scale(jobs=n_workers)
    else:
        raise NotImplementedError('hardware must be single, ssh, or pbs')
    client = Client(cluster)
    return client, cluster
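# Hedged usage sketch for the dask() backend (an assumption, not the source's
# example): the PBSCluster keyword values below are placeholders and require a
# working PBS scheduler plus dask-jobqueue.
client, cluster = dask(hardware='pbs', n_workers=2,
                       cores=4, memory='16GB', walltime='01:00:00')
print(client)
client.close()
cluster.close()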
def dask_distributed_setup(machine='cheyenne', cluster_kws={}, client_kws={}):
    if machine == 'cheyenne':
        cheyenne_cluster.update(cluster_kws)
        cluster = PBSCluster(**cheyenne_cluster)
        client = Client(cluster)
    else:
        raise NotImplementedError('only cheyenne is supported at this time')
    return cluster, client
def test_header():
    with PBSCluster(walltime='00:02:00', processes=4, threads=2,
                    memory='7GB') as cluster:
        assert '#PBS' in cluster.job_header
        assert '#PBS -N dask-worker' in cluster.job_header
        assert '#PBS -l select=1:ncpus=8:mem=27GB' in cluster.job_header
        assert '#PBS -l walltime=00:02:00' in cluster.job_header
        assert '#PBS -q' not in cluster.job_header
        assert '#PBS -A' not in cluster.job_header

    with PBSCluster(queue='regular', project='DaskOnPBS', processes=4,
                    threads=2, memory='7GB',
                    resource_spec='select=1:ncpus=24:mem=100GB') as cluster:
        assert '#PBS -q regular' in cluster.job_header
        assert '#PBS -N dask-worker' in cluster.job_header
        assert '#PBS -l select=1:ncpus=24:mem=100GB' in cluster.job_header
        assert '#PBS -l select=1:ncpus=8:mem=27GB' not in cluster.job_header
        assert '#PBS -l walltime=' in cluster.job_header
        assert '#PBS -A DaskOnPBS' in cluster.job_header

    with PBSCluster() as cluster:
        assert '#PBS -j oe' not in cluster.job_header
        assert '#PBS -N' in cluster.job_header
        assert '#PBS -l select=1:ncpus=' in cluster.job_header
        assert '#PBS -l walltime=' in cluster.job_header
        assert '#PBS -A' not in cluster.job_header
        assert '#PBS -q' not in cluster.job_header

    with PBSCluster(job_extra=['-j oe']) as cluster:
        assert '#PBS -j oe' in cluster.job_header
        assert '#PBS -N' in cluster.job_header
        assert '#PBS -l select=1:ncpus=' in cluster.job_header
        assert '#PBS -l walltime=' in cluster.job_header
        assert '#PBS -A' not in cluster.job_header
        assert '#PBS -q' not in cluster.job_header
def test_jobqueue_cluster_call(tmpdir):
    cluster = PBSCluster(cores=1, memory="1GB")

    path = tmpdir.join("test.py")
    path.write('print("this is the stdout")')

    out = cluster._call([sys.executable, path.strpath])
    assert out == "this is the stdout\n"

    path_with_error = tmpdir.join("non-zero-exit-code.py")
    path_with_error.write('print("this is the stdout")\n1/0')

    match = (
        "Command exited with non-zero exit code.+"
        "Exit code: 1.+"
        "stdout:\nthis is the stdout.+"
        "stderr:.+ZeroDivisionError"
    )
    match = re.compile(match, re.DOTALL)

    with pytest.raises(RuntimeError, match=match):
        cluster._call([sys.executable, path_with_error.strpath])
def test_basic_scale_edge_cases(loop):
    with PBSCluster(walltime='00:02:00', processes=1, cores=2, memory='2GB',
                    local_directory='/tmp', job_extra=['-V'],
                    loop=loop) as cluster:
        cluster.scale(2)
        cluster.scale(0)

        # Wait to see what happens
        sleep(0.2)

        assert not (cluster.pending_jobs or cluster.running_jobs)