Example #1
    def cluster(self, num_nodes):
        from distributed import Client
        from dask_jobqueue import PBSCluster

        cluster_ = PBSCluster(walltime='00:15:00', cores=36, memory='60GB', processes=1)
        self.client = Client(cluster_)
        cluster_.scale(num_nodes)
Example #2
def test_log_directory(tmpdir):
    shutil.rmtree(tmpdir.strpath, ignore_errors=True)
    with PBSCluster(cores=1, memory="1GB"):
        assert not os.path.exists(tmpdir.strpath)

    with PBSCluster(cores=1, memory="1GB", log_directory=tmpdir.strpath):
        assert os.path.exists(tmpdir.strpath)
Example #3
def test_job_script():
    with PBSCluster(walltime='00:02:00', processes=4, threads=2,
                    memory='7GB') as cluster:

        job_script = cluster.job_script()
        assert '#PBS' in job_script
        assert '#PBS -N dask-worker' in job_script
        assert '#PBS -l select=1:ncpus=8:mem=27GB' in job_script
        assert '#PBS -l walltime=00:02:00' in job_script
        assert '#PBS -q' not in job_script
        assert '#PBS -A' not in job_script

        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--nthreads 2 --nprocs 4 --memory-limit 7GB' in job_script

    with PBSCluster(queue='regular',
                    project='DaskOnPBS',
                    processes=4,
                    threads=2,
                    memory='7GB',
                    resource_spec='select=1:ncpus=24:mem=100GB') as cluster:

        job_script = cluster.job_script()
        assert '#PBS -q regular' in job_script
        assert '#PBS -N dask-worker' in job_script
        assert '#PBS -l select=1:ncpus=24:mem=100GB' in job_script
        assert '#PBS -l select=1:ncpus=8:mem=27GB' not in job_script
        assert '#PBS -l walltime=' in job_script
        assert '#PBS -A DaskOnPBS' in job_script

        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--nthreads 2 --nprocs 4 --memory-limit 7GB' in job_script
Example #4
def test_informative_errors():
    with pytest.raises(ValueError) as info:
        PBSCluster(memory=None, cores=4)
    assert 'memory' in str(info.value)

    with pytest.raises(ValueError) as info:
        PBSCluster(memory='1GB', cores=None)
    assert 'cores' in str(info.value)
Example #5
def test_informative_errors():
    with pytest.raises(ValueError) as info:
        PBSCluster(memory=None, cores=4)
    assert "memory" in str(info.value)

    with pytest.raises(ValueError) as info:
        PBSCluster(memory="1GB", cores=None)
    assert "cores" in str(info.value)
Example #6
    def create_cluster(self, queue, maxcore, memory, wpn, walltime):
        cluster = PBSCluster(
            queue=queue,
            cores=maxcore,
            memory=memory,
            processes=wpn,
            local_directory='$TMPDIR',
            walltime=walltime,
            # extra=['--nthreads', '1', '--lifetime', '55m', '--lifetime-stagger', '4m'],
            # resource_spec='select=1:ncpus=12:ompthreads=12:mem=109GB',
        )
        logger.warning(cluster.job_script())
        self.client = Client(cluster)
Example #7
def test_adaptive(loop):
    with PBSCluster(walltime='00:02:00',
                    processes=1,
                    threads=2,
                    memory='2GB',
                    local_directory='/tmp',
                    job_extra=['-V'],
                    loop=loop) as cluster:
        cluster.adapt()
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(60) == 11

            assert cluster.jobs

            start = time()
            processes = cluster.worker_processes
            while len(client.scheduler_info()['workers']) != processes:
                sleep(0.1)
                assert time() < start + 10

            del future

            start = time()
            while len(client.scheduler_info()['workers']) > 0:
                sleep(0.100)
                assert time() < start + 10
Example #8
def test_adaptive_grouped(loop):
    with PBSCluster(
            walltime="00:02:00",
            processes=1,
            cores=2,
            memory="2GB",
            local_directory="/tmp",
            job_extra=["-V"],
            loop=loop,
    ) as cluster:
        cluster.adapt(minimum=1)  # at least 1 worker
        with Client(cluster) as client:
            start = time()
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            start = time()
            while not cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            start = time()
            processes = cluster.worker_processes
            while len(client.scheduler_info()["workers"]) != processes:
                sleep(0.1)
                assert time() < start + QUEUE_WAIT
Example #9
def test_adaptive_cores_mem(loop):
    with PBSCluster(walltime='00:02:00',
                    processes=1,
                    cores=2,
                    memory='2GB',
                    local_directory='/tmp',
                    job_extra=['-V'],
                    loop=loop) as cluster:
        cluster.adapt(minimum_cores=0, maximum_memory='4GB')
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            start = time()
            processes = cluster.worker_processes
            while len(client.scheduler_info()['workers']) != processes:
                sleep(0.1)
                assert time() < start + QUEUE_WAIT

            del future

            start = time()
            while cluster.pending_jobs or cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            assert cluster.finished_jobs
Example #10
def get_cluster(cluster_type, **kwargs):
    """Generic dask cluster wrapper"""

    # check input cluster type
    cluster_type = cluster_type.lower()
    cluster_list = ['lsf', 'pbs', 'slurm']
    if cluster_type not in cluster_list:
        msg = "Cluster type '{}' not supported".format(cluster_type)
        msg += '\nsupported cluster types: {}'.format(cluster_list)
        raise ValueError(msg)
    print("Dask cluster type: {}".format(cluster_type))

    # check input config name
    if 'config_name' in kwargs.keys():
        kwargs['config_name'] = check_config_name(kwargs['config_name'], cluster_type)
        print("Dask config name: {}".format(kwargs['config_name']))

    # check walltime format for each cluster type
    if 'walltime' in kwargs.keys():
        kwargs['walltime'] = check_walltime_format(kwargs['walltime'], cluster_type)
        print('Dask worker walltime: {}'.format(kwargs['walltime']))

    # initiate cluster object
    if cluster_type == 'lsf':
        cluster = LSFCluster(**kwargs)
    elif cluster_type == 'pbs':
        cluster = PBSCluster(**kwargs)
    elif cluster_type == 'slurm':
        cluster = SLURMCluster(**kwargs)

    return cluster
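
A minimal usage sketch of the wrapper above, assuming check_config_name and check_walltime_format are defined in the same module; the keyword values are placeholders, not values from the original project:

from dask.distributed import Client

# Hypothetical call; config_name and walltime here are illustrative values.
cluster = get_cluster('pbs', config_name='pbs', walltime='00:30:00')
cluster.scale(4)  # target four workers (job/worker mapping depends on the dask-jobqueue version)
client = Client(cluster)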
Example #11
def test_basic(loop):
    with PBSCluster(walltime='00:02:00', processes=1, cores=2, memory='2GB', local_directory='/tmp',
                    job_extra=['-V'], loop=loop) as cluster:
        with Client(cluster) as client:

            cluster.scale(2)

            start = time()
            while not(cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11
            assert cluster.running_jobs

            workers = list(client.scheduler_info()['workers'].values())
            w = workers[0]
            assert w['memory_limit'] == 2e9
            assert w['ncores'] == 2

            cluster.scale(0)

            start = time()
            while cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            assert not cluster.running_jobs
Example #12
def test_command_template():
    with PBSCluster(cores=2, memory='4GB') as cluster:
        assert '%s -m distributed.cli.dask_worker' % (sys.executable) \
               in cluster._command_template
        assert ' --nthreads 2' in cluster._command_template
        assert ' --memory-limit ' in cluster._command_template
        assert ' --name ' in cluster._command_template

    with PBSCluster(cores=2,
                    memory='4GB',
                    death_timeout=60,
                    local_directory='/scratch',
                    extra=['--preload', 'mymodule']) as cluster:
        assert ' --death-timeout 60' in cluster._command_template
        assert ' --local-directory /scratch' in cluster._command_template
        assert ' --preload mymodule' in cluster._command_template
Example #13
def _initialize_pbs_cluster(
    name: str = 'epigenomics-integration-pipeline',
    queue: str = 'batch',
    interface: str = 'ib0',
    cores: int = 2,
    processes: int = 2,
    memory: str = '220GB',
    walltime: str = '00:30:00',
    env_extra: List[str] = None,
    log_dir: str = 'logs',
    temp: str = None,
    **kwargs
) -> PBSCluster:
    """
    Initialize a dask distributed cluster for submission on an HPC system running
    PBS/TORQUE.

    arguments
        name:      job name
        queue:     queue used for submission
        interface: interconnect interface (e.g. ib0 = Infiniband, eth0 = ethernet)
                   This is system specific and you should find the proper interface
                   first by running 'ip addr'.
        cores:     number of cores per job
        processes: number of processes per job
        memory:    total memory per job, e.g. memory = 120GB and processes = 2 means
                   each process will have 60GB of usable memory
        walltime:  max. runtime for each job
        env_extra: extra arguments to use with the submission shell script
        log_dir:   directory for the PBS stdout/stderr log files
        temp:      location of the local working, or temp, directory

    returns
        a PBSCluster
    """

    if not env_extra:
        env_extra = ['cd $PBS_O_WORKDIR']

    ## Ensure the log directory exists
    Path(log_dir).mkdir(parents=True, exist_ok=True)

    return PBSCluster(
        name=name,
        queue=queue,
        interface=interface,
        cores=cores,
        processes=processes,
        memory=memory,
        walltime=walltime,
        local_directory=temp,
        ## Helix requires this, kodiak doesn't like this
        resource_spec=f'nodes=1:ppn={cores}',
        job_extra=[
            f'-N {name}',
            f'-l mem={memory}',
            f'-e {log_dir}',
            f'-o {log_dir}'
        ],
        env_extra=env_extra
    )
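
A short, hedged usage sketch of the helper above; queue name, resource sizes, and scale target are illustrative rather than values from the original pipeline:

from dask.distributed import Client

# Illustrative values only; adjust to the target PBS system.
cluster = _initialize_pbs_cluster(
    name='epigenomics-integration-pipeline',
    queue='batch',
    cores=4,
    processes=4,
    memory='64GB',
    walltime='01:00:00',
    log_dir='logs',
)
cluster.scale(4)  # request workers; the job/worker mapping depends on the dask-jobqueue version
client = Client(cluster)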
Example #14
def get_ClusterClient():
    import dask
    from dask_jobqueue import PBSCluster
    from dask.distributed import Client

    USER = os.environ['USER']

    cluster = PBSCluster(
        cores=1,
        memory='25GB',
        processes=1,
        queue='casper',
        local_directory=f'/glade/scratch/{USER}/dask-workers',
        log_directory=f'/glade/scratch/{USER}/dask-workers',
        resource_spec='select=1:ncpus=1:mem=25GB',
        project='NCGD0011',
        walltime='06:00:00',
        interface='ib0',
    )

    dask.config.set({
        'distributed.dashboard.link':
        'https://jupyterhub.hpc.ucar.edu/stable/user/{USER}/proxy/{port}/status'
    })
    client = Client(cluster)
    return cluster, client
Example #15
def setup_cluster(run_strategy="local", ncore=2, nnodes=1):
    if run_strategy == "local":
        cluster = LocalCluster(n_workers=ncore, threads_per_worker=1)
    elif run_strategy == "PBSjobqueue":
        from dask_jobqueue import PBSCluster
        cluster = PBSCluster(
            cores=ncore,
            processes=ncore,
            resource_spec=f"nodes=1:ppn={ncore}",
            group='wagner',
            queue='secondary',
            memory='16G',
            walltime='02:00:00',
            env_extra=[
                'cd ${PBS_O_WORKDIR}',
                'export PYTHONPATH=/home/lkwagner/pyqmc:$PYTHONPATH',
                'export OMP_NUM_THREADS=1', 'source /home/lkwagner/.bashrc',
                'conda activate pyscf'
            ],
            local_directory=os.getenv('TMPDIR', '/tmp'))
        cluster.submit_command = "/usr/local/torque-releases/torque-6.1.2-el7/bin/qsub"
        cluster.cancel_command = "/usr/local/torque-releases/torque-6.1.2-el7/bin/qdel"
        print(cluster.job_script())
        cluster.scale(nnodes)

    return Client(cluster), cluster
Example #16
def test_basic(loop):
    with PBSCluster(walltime='00:02:00',
                    processes=1,
                    threads=2,
                    memory='2GB',
                    local_directory='/tmp',
                    job_extra=['-V'],
                    loop=loop) as cluster:
        with Client(cluster) as client:
            workers = cluster.start_workers(2)
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(60) == 11
            assert cluster.jobs

            info = client.scheduler_info()
            w = list(info['workers'].values())[0]
            assert w['memory_limit'] == 2e9
            assert w['ncores'] == 2

            cluster.stop_workers(workers)

            start = time()
            while len(client.scheduler_info()['workers']) > 0:
                sleep(0.100)
                assert time() < start + 10

            assert not cluster.jobs
Example #17
def test_config_name_pbs_takes_custom_config():
    conf = {
        'queue': 'myqueue',
        'project': 'myproject',
        'ncpus': 1,
        'cores': 1,
        'memory': '2 GB',
        'walltime': '00:02',
        'job-extra': [],
        'name': 'myname',
        'processes': 1,
        'interface': None,
        'death-timeout': None,
        'local-directory': '/foo',
        'extra': [],
        'env-extra': [],
        'log-directory': None,
        'shebang': '#!/usr/bin/env bash',
        'job-cpu': None,
        'job-mem': None,
        'resource-spec': None
    }

    with dask.config.set({'jobqueue.pbs-config-name': conf}):
        with PBSCluster(config_name='pbs-config-name') as cluster:
            assert cluster.name == 'myname'
Example #18
def test_config_name_pbs_takes_custom_config():
    conf = {
        "queue": "myqueue",
        "project": "myproject",
        "ncpus": 1,
        "cores": 1,
        "memory": "2 GB",
        "walltime": "00:02",
        "job-extra": [],
        "name": "myname",
        "processes": 1,
        "interface": None,
        "death-timeout": None,
        "local-directory": "/foo",
        "shared-temp-directory": None,
        "extra": [],
        "env-extra": [],
        "log-directory": None,
        "shebang": "#!/usr/bin/env bash",
        "job-cpu": None,
        "job-mem": None,
        "resource-spec": None,
    }

    with dask.config.set({"jobqueue.pbs-config-name": conf}):
        with PBSCluster(config_name="pbs-config-name") as cluster:
            assert cluster.job_name == "myname"
Example #19
def test_adaptive_cores_mem(loop):
    with PBSCluster(
            walltime="00:02:00",
            processes=1,
            cores=2,
            memory="2GB",
            local_directory="/tmp",
            job_extra=["-V"],
            loop=loop,
    ) as cluster:
        cluster.adapt(minimum_cores=0, maximum_memory="4GB")
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            start = time()
            processes = cluster._dummy_job.worker_processes
            while len(client.scheduler_info()["workers"]) != processes:
                sleep(0.1)
                assert time() < start + QUEUE_WAIT

            del future

            start = time()
            while cluster.workers:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
Example #20
def test_scale_cores_memory(loop):
    with PBSCluster(
            walltime="00:02:00",
            processes=1,
            cores=2,
            memory="2GB",
            local_directory="/tmp",
            job_extra=["-V"],
            loop=loop,
    ) as cluster:
        with Client(cluster) as client:

            cluster.scale(cores=2)
            client.wait_for_workers(1)

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11
            assert cluster.workers

            workers = list(client.scheduler_info()["workers"].values())
            w = workers[0]
            assert w["memory_limit"] == 2e9
            assert w["nthreads"] == 2

            cluster.scale(memory="0GB")

            start = time()
            while client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            assert not cluster.workers
Example #21
def test_config(loop):  # noqa: F811
    with dask.config.set({
            'jobqueue.pbs.walltime': '00:02:00',
            'jobqueue.pbs.local-directory': '/foo'
    }):
        with PBSCluster(loop=loop) as cluster:
            assert '00:02:00' in cluster.job_script()
            assert '--local-directory /foo' in cluster.job_script()
Example #22
def test_forward_ip():
    ip = '127.0.0.1'
    with PBSCluster(walltime='00:02:00',
                    processes=4,
                    cores=8,
                    memory='28GB',
                    name='dask-worker',
                    ip=ip) as cluster:
        assert cluster.local_cluster.scheduler.ip == ip

    default_ip = socket.gethostbyname('')
    with PBSCluster(walltime='00:02:00',
                    processes=4,
                    cores=8,
                    memory='28GB',
                    name='dask-worker') as cluster:
        assert cluster.local_cluster.scheduler.ip == default_ip
Example #23
def test_config(loop):
    with dask.config.set({
            "jobqueue.pbs.walltime": "00:02:00",
            "jobqueue.pbs.local-directory": "/foo"
    }):
        with PBSCluster(loop=loop, cores=1, memory="2GB") as cluster:
            assert "00:02:00" in cluster.job_script()
            assert "--local-directory /foo" in cluster.job_script()
Example #24
def test_forward_ip():
    ip = "127.0.0.1"
    with PBSCluster(
        walltime="00:02:00",
        processes=4,
        cores=8,
        memory="28GB",
        name="dask-worker",
        host=ip,
    ) as cluster:
        assert cluster.local_cluster.scheduler.ip == ip

    default_ip = socket.gethostbyname("")
    with PBSCluster(
        walltime="00:02:00", processes=4, cores=8, memory="28GB", name="dask-worker"
    ) as cluster:
        assert cluster.local_cluster.scheduler.ip == default_ip
Example #25
def test_command_template():
    with PBSCluster(cores=2, memory="4GB") as cluster:
        assert ("%s -m distributed.cli.dask_worker" % (sys.executable)
                in cluster._command_template)
        assert " --nthreads 2" in cluster._command_template
        assert " --memory-limit " in cluster._command_template
        assert " --name " in cluster._command_template

    with PBSCluster(
            cores=2,
            memory="4GB",
            death_timeout=60,
            local_directory="/scratch",
            extra=["--preload", "mymodule"],
    ) as cluster:
        assert " --death-timeout 60" in cluster._command_template
        assert " --local-directory /scratch" in cluster._command_template
        assert " --preload mymodule" in cluster._command_template
Example #26
def dask(hardware='single',
         client=None,
         processes=False,
         n_workers=1,
         threads_per_worker=1,
         **kwargscluster):
    r"""Dask backend initialization.

    Create connection to drive computations using Dask distributed.

    Parameters
    ----------
    hardware : :obj:`str`, optional
        Hardware used to run Dask distributed. Currently available options
        are ``single`` for single-machine distribution, ``ssh`` for
        SSH-based multi-machine distribution and ``pbs`` for
        PBS-based multi-machine distribution
    client : :obj:`str`, optional
        Name of scheduler (use ``None`` for ``hardware=single``).
    processes : :obj:`bool`, optional
        Whether to use processes (``True``) or threads (``False``).
    n_workers : :obj:`int`, optional
        Number of workers
    threads_per_worker : :obj:`int`, optional
        Number of threads per worker
    kwargscluster:
        Additional parameters to be passed to the cluster creation routine

    Returns
    -------
    client : :obj:`dask.distributed.client.Client`
        Client
    cluster :
        Cluster

    Raises
    ------
    NotImplementedError
        If ``hardware`` is not ``single``, ``ssh``, or ``pbs``

    """
    if hardware == 'single':
        cluster = LocalCluster(processes=processes,
                               n_workers=n_workers,
                               threads_per_worker=threads_per_worker)
    elif hardware == 'ssh':
        cluster = client
    elif hardware == 'pbs':
        if jobqueue == False:
            raise ModuleNotFoundError('dask-jobqueue not installed. ' \
                                      'Run "pip install dask-jobqueue".')
        cluster = PBSCluster(**kwargscluster)
        cluster.scale(jobs=n_workers)
    else:
        raise NotImplementedError('hardware must be single, ssh, or pbs')
    client = Client(cluster)
    return client, cluster
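
A hedged usage sketch of the backend selector above; jobqueue is a module-level flag set at import time in the original source, and the PBS keyword values below are placeholders forwarded through kwargscluster:

# Single machine: a LocalCluster with four single-threaded workers.
client, cluster = dask(hardware='single', n_workers=4, threads_per_worker=1)

# PBS: extra keywords go straight to PBSCluster; n_workers sets the number of jobs.
client, cluster = dask(hardware='pbs', n_workers=2,
                       cores=36, memory='100GB', walltime='01:00:00')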
Example #27
def dask_distributed_setup(machine='cheyenne', cluster_kws={}, client_kws={}):

    if machine == 'cheyenne':
        cheyenne_cluster.update(cluster_kws)
        cluster = PBSCluster(**cheyenne_cluster)
        client = Client(cluster)
    else:
        raise NotImplementedError('only cheyenne is supported at this time')

    return cluster, client
Example #28
def test_header():
    with PBSCluster(walltime='00:02:00', processes=4, threads=2,
                    memory='7GB') as cluster:

        assert '#PBS' in cluster.job_header
        assert '#PBS -N dask-worker' in cluster.job_header
        assert '#PBS -l select=1:ncpus=8:mem=27GB' in cluster.job_header
        assert '#PBS -l walltime=00:02:00' in cluster.job_header
        assert '#PBS -q' not in cluster.job_header
        assert '#PBS -A' not in cluster.job_header

    with PBSCluster(queue='regular',
                    project='DaskOnPBS',
                    processes=4,
                    threads=2,
                    memory='7GB',
                    resource_spec='select=1:ncpus=24:mem=100GB') as cluster:

        assert '#PBS -q regular' in cluster.job_header
        assert '#PBS -N dask-worker' in cluster.job_header
        assert '#PBS -l select=1:ncpus=24:mem=100GB' in cluster.job_header
        assert '#PBS -l select=1:ncpus=8:mem=27GB' not in cluster.job_header
        assert '#PBS -l walltime=' in cluster.job_header
        assert '#PBS -A DaskOnPBS' in cluster.job_header

    with PBSCluster() as cluster:

        assert '#PBS -j oe' not in cluster.job_header
        assert '#PBS -N' in cluster.job_header
        assert '#PBS -l select=1:ncpus=' in cluster.job_header
        assert '#PBS -l walltime=' in cluster.job_header
        assert '#PBS -A' not in cluster.job_header
        assert '#PBS -q' not in cluster.job_header

    with PBSCluster(job_extra=['-j oe']) as cluster:

        assert '#PBS -j oe' in cluster.job_header
        assert '#PBS -N' in cluster.job_header
        assert '#PBS -l select=1:ncpus=' in cluster.job_header
        assert '#PBS -l walltime=' in cluster.job_header
        assert '#PBS -A' not in cluster.job_header
        assert '#PBS -q' not in cluster.job_header
Example #29
def test_jobqueue_cluster_call(tmpdir):
    cluster = PBSCluster(cores=1, memory="1GB")

    path = tmpdir.join("test.py")
    path.write('print("this is the stdout")')

    out = cluster._call([sys.executable, path.strpath])
    assert out == "this is the stdout\n"

    path_with_error = tmpdir.join("non-zero-exit-code.py")
    path_with_error.write('print("this is the stdout")\n1/0')

    match = ("Command exited with non-zero exit code.+"
             "Exit code: 1.+"
             "stdout:\nthis is the stdout.+"
             "stderr:.+ZeroDivisionError")

    match = re.compile(match, re.DOTALL)
    with pytest.raises(RuntimeError, match=match):
        cluster._call([sys.executable, path_with_error.strpath])
Example #30
def test_basic_scale_edge_cases(loop):
    with PBSCluster(walltime='00:02:00', processes=1, cores=2, memory='2GB', local_directory='/tmp',
                    job_extra=['-V'], loop=loop) as cluster:

        cluster.scale(2)
        cluster.scale(0)

        # Wait to see what happens
        sleep(0.2)

        assert not(cluster.pending_jobs or cluster.running_jobs)