Example 1
import time

from distributed import Client

def init_cluster(num_workers, wait_for_all_workers=True):
    """
    Start up a dask cluster, optionally wait until all workers have been launched,
    and then return the resulting distributed.Client object.
    
    Args:
        num_workers:
            How many workers to launch.
        wait_for_all_workers:
            If True, pause until all workers have been launched before returning.
            Otherwise, just wait for a single worker to launch.
    
    Returns:
        distributed.Client
    """
    # Local import: LSFCluster probably isn't importable on your local machine,
    # so it's nice to avoid importing it when you're just running local tests without a cluster.
    from dask_jobqueue import LSFCluster
    cluster = LSFCluster(ip='0.0.0.0')
    cluster.scale(num_workers)

    required_workers = 1
    if wait_for_all_workers:
        required_workers = num_workers

    client = Client(cluster)
    while (client.status == "running"
           and len(cluster.scheduler.workers) < required_workers):
        print(
            f"Waiting for {required_workers - len(cluster.scheduler.workers)} workers..."
        )
        time.sleep(1.0)

    return client
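A minimal usage sketch for the helper above, assuming an LSF scheduler is reachable from the calling machine (the worker count is arbitrary):

client = init_cluster(num_workers=4)
print(client.scheduler_info())  # inspect the connected workers
client.close()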
Example 2
def test_informative_errors():
    with pytest.raises(ValueError) as info:
        LSFCluster(memory=None, cores=4)
    assert "memory" in str(info.value)

    with pytest.raises(ValueError) as info:
        LSFCluster(memory="1GB", cores=None)
    assert "cores" in str(info.value)
Example 3
def test_informative_errors():
    with pytest.raises(ValueError) as info:
        LSFCluster(memory=None, cores=4)
    assert 'memory' in str(info.value)

    with pytest.raises(ValueError) as info:
        LSFCluster(memory='1GB', cores=None)
    assert 'cores' in str(info.value)
Example 4
    def adapt(self, minimum, maximum, **kwargs):
        # Merge kwargs with default kwargs
        kwargs = {**self.cluster_kwargs, **kwargs}
        # Call parent adapt method (check values)
        super().adapt(minimum, maximum, **kwargs)
        # Make new cluster
        self._cluster = LSFCluster(**kwargs)
        # Make client
        self._client = Client(self._cluster)
        # Adapt cluster
        self._cluster.adapt(minimum=minimum, maximum=maximum)
        # Return client reference
        return self.client
Example 5
def test_header():
    with LSFCluster(walltime="00:02", processes=4, cores=8,
                    memory="8GB") as cluster:

        assert "#BSUB" in cluster.job_header
        assert "#BSUB -J dask-worker" in cluster.job_header
        assert "#BSUB -n 8" in cluster.job_header
        assert "#BSUB -M 8000" in cluster.job_header
        assert "#BSUB -W 00:02" in cluster.job_header
        assert "#BSUB -q" not in cluster.job_header
        assert "#BSUB -P" not in cluster.job_header
        assert "--name dask-worker--${JOB_ID}--" in cluster.job_script()

    with LSFCluster(
            queue="general",
            project="DaskOnLSF",
            processes=4,
            cores=8,
            memory="28GB",
            ncpus=24,
            mem=100000000000,
    ) as cluster:

        assert "#BSUB -q general" in cluster.job_header
        assert "#BSUB -J dask-worker" in cluster.job_header
        assert "#BSUB -n 24" in cluster.job_header
        assert "#BSUB -n 8" not in cluster.job_header
        assert "#BSUB -M 100000" in cluster.job_header
        assert "#BSUB -M 28000" not in cluster.job_header
        assert "#BSUB -W" in cluster.job_header
        assert "#BSUB -P DaskOnLSF" in cluster.job_header

    with LSFCluster(cores=4, memory="8GB") as cluster:

        assert "#BSUB -n" in cluster.job_header
        assert "#BSUB -W" in cluster.job_header
        assert "#BSUB -M" in cluster.job_header
        assert "#BSUB -q" not in cluster.job_header
        assert "#BSUB -P" not in cluster.job_header

    with LSFCluster(cores=4, memory="8GB",
                    job_extra=["-u [email protected]"]) as cluster:

        assert "#BSUB -u [email protected]" in cluster.job_header
        assert "#BSUB -n" in cluster.job_header
        assert "#BSUB -W" in cluster.job_header
        assert "#BSUB -M" in cluster.job_header
        assert "#BSUB -q" not in cluster.job_header
        assert "#BSUB -P" not in cluster.job_header
Example 6
def test_adaptive_grouped(loop):
    with LSFCluster(
        walltime="00:02",
        processes=1,
        cores=2,
        memory="2GB",
        local_directory="/tmp",
        loop=loop,
    ) as cluster:
        cluster.adapt(minimum=1)  # at least 1 worker
        with Client(cluster) as client:
            start = time()
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            start = time()
            while not cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            start = time()
            processes = cluster.worker_processes
            while len(client.scheduler_info()["workers"]) != processes:
                sleep(0.1)
                assert time() < start + QUEUE_WAIT
Example 7
def test_config_name_lsf_takes_custom_config():
    conf = {
        "queue": "myqueue",
        "project": "myproject",
        "ncpus": 1,
        "cores": 1,
        "mem": 2,
        "memory": "2 GB",
        "walltime": "00:02",
        "job-extra": [],
        "lsf-units": "TB",
        "name": "myname",
        "processes": 1,
        "interface": None,
        "death-timeout": None,
        "local-directory": "/foo",
        "extra": [],
        "env-extra": [],
        "log-directory": None,
        "shebang": "#!/usr/bin/env bash",
        "use-stdin": None,
    }

    with dask.config.set({"jobqueue.lsf-config-name": conf}):
        with LSFCluster(config_name="lsf-config-name") as cluster:
            assert cluster.job_name == "myname"
Example 8
def test_config(loop):
    with dask.config.set(
        {"jobqueue.lsf.walltime": "00:02", "jobqueue.lsf.local-directory": "/foo"}
    ):
        with LSFCluster(loop=loop, cores=1, memory="2GB") as cluster:
            assert "00:02" in cluster.job_script()
            assert "--local-directory /foo" in cluster.job_script()
Example 9
def get_cluster(cluster_type, **kwargs):
    """Generic dask cluster wrapper"""

    # check input cluster type
    cluster_type = cluster_type.lower()
    cluster_list = ['lsf','pbs','slurm']
    if cluster_type not in cluster_list:
        msg = "Cluster type '{}' not supported".format(cluster_type)
        msg += '\nsupported cluster types: {}'.format(cluster_list)
        raise ValueError(msg)
    print("Dask cluster type: {}".format(cluster_type))

    # check input config name; only report it when one was provided
    if 'config_name' in kwargs:
        kwargs['config_name'] = check_config_name(kwargs['config_name'], cluster_type)
        print("Dask config name: {}".format(kwargs['config_name']))

    # check walltime format for each cluster type
    if 'walltime' in kwargs:
        kwargs['walltime'] = check_walltime_format(kwargs['walltime'], cluster_type)
        print('Dask worker walltime: {}'.format(kwargs['walltime']))

    # initiate cluster object
    if cluster_type == 'lsf':
        cluster = LSFCluster(**kwargs)
    elif cluster_type == 'pbs':
        cluster = PBSCluster(**kwargs)
    elif cluster_type == 'slurm':
        cluster = SLURMCluster(**kwargs)

    return cluster
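A hypothetical call to the wrapper above (check_config_name and check_walltime_format are helpers from the same module; the argument values are placeholders):

cluster = get_cluster('lsf', config_name='lsf', walltime='04:00')
cluster.scale(10)  # submit 10 worker jobs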
Example 10
def test_basic(loop):
    with LSFCluster(walltime='00:02',
                    processes=1,
                    cores=2,
                    memory='2GB',
                    local_directory='/tmp',
                    loop=loop) as cluster:
        with Client(cluster) as client:
            cluster.start_workers(2)
            assert cluster.pending_jobs or cluster.running_jobs
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11
            assert cluster.running_jobs

            workers = list(client.scheduler_info()['workers'].values())
            w = workers[0]
            assert w['memory_limit'] == 2e9
            assert w['ncores'] == 2

            cluster.stop_workers(workers)

            start = time()
            while client.scheduler_info()['workers']:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            assert not cluster.running_jobs
Example 11
def test_basic(loop):
    with LSFCluster(
        walltime="00:02",
        processes=1,
        cores=2,
        memory="2GB",
        local_directory="/tmp",
        loop=loop,
    ) as cluster:
        with Client(cluster) as client:
            cluster.scale(2)
            assert cluster.pending_jobs or cluster.running_jobs
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11
            assert cluster.running_jobs

            workers = list(client.scheduler_info()["workers"].values())
            w = workers[0]
            assert w["memory_limit"] == 2e9
            assert w["nthreads"] == 2

            cluster.stop_workers(workers)

            start = time()
            while client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            assert not cluster.running_jobs
Example 12
def test_header():
    with LSFCluster(walltime='00:02', processes=4, cores=8,
                    memory='8GB') as cluster:

        assert '#BSUB' in cluster.job_header
        assert '#BSUB -J dask-worker' in cluster.job_header
        assert '#BSUB -n 8' in cluster.job_header
        assert '#BSUB -M 8000' in cluster.job_header
        assert '#BSUB -W 00:02' in cluster.job_header
        assert '#BSUB -q' not in cluster.job_header
        assert '#BSUB -P' not in cluster.job_header
        assert '--name dask-worker--${JOB_ID}--' in cluster.job_script()

    with LSFCluster(queue='general',
                    project='DaskOnLSF',
                    processes=4,
                    cores=8,
                    memory='28GB',
                    ncpus=24,
                    mem=100000000000) as cluster:

        assert '#BSUB -q general' in cluster.job_header
        assert '#BSUB -J dask-worker' in cluster.job_header
        assert '#BSUB -n 24' in cluster.job_header
        assert '#BSUB -n 8' not in cluster.job_header
        assert '#BSUB -M 100000' in cluster.job_header
        assert '#BSUB -M 28000' not in cluster.job_header
        assert '#BSUB -W' in cluster.job_header
        assert '#BSUB -P DaskOnLSF' in cluster.job_header

    with LSFCluster(cores=4, memory='8GB') as cluster:

        assert '#BSUB -n' in cluster.job_header
        assert '#BSUB -W' in cluster.job_header
        assert '#BSUB -M' in cluster.job_header
        assert '#BSUB -q' not in cluster.job_header
        assert '#BSUB -P' not in cluster.job_header

    with LSFCluster(cores=4, memory='8GB',
                    job_extra=['-u [email protected]']) as cluster:

        assert '#BSUB -u [email protected]' in cluster.job_header
        assert '#BSUB -n' in cluster.job_header
        assert '#BSUB -W' in cluster.job_header
        assert '#BSUB -M' in cluster.job_header
        assert '#BSUB -q' not in cluster.job_header
        assert '#BSUB -P' not in cluster.job_header
Example 13
def test_config(loop):  # noqa: F811
    with dask.config.set({
            'jobqueue.lsf.walltime': '00:02',
            'jobqueue.lsf.local-directory': '/foo'
    }):
        with LSFCluster(loop=loop, cores=1, memory='2GB') as cluster:
            assert '00:02' in cluster.job_script()
            assert '--local-directory /foo' in cluster.job_script()
Example 14
    def initializeLSFCluster(self,
                             cores=1,
                             memory="16GB",
                             processes=1,
                             death_timeout="600s",
                             queue="normal",
                             walltime="1:00",
                             ncpus=1,
                             threads_per_worker=2,
                             mem=16000,
                             **kwargs):
        """
        Initialize a dask_jobqueue.LSFCluster
        inspired by:
        https://github.com/janelia-cosem/fst/blob/master/fst/distributed.py
        LSFCluster API:
        https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.LSFCluster.html#dask_jobqueue.LSFCluster
        """

        # TODO: add group detection for `project` keyword

        if 1 <= threads_per_worker <= 2 * cores:
            tpw = threads_per_worker  # shorthand
            env_extra = [
                f"export NUM_MKL_THREADS={tpw}",
                f"export OPENBLAS_NUM_THREADS={tpw}",
                f"export OPENMP_NUM_THREADS={tpw}",
                f"export OMP_NUM_THREADS={tpw}",
            ]
        else:
            raise ValueError("Maximum of 2 threads per core")

        USER = os.environ["USER"]
        HOME = os.environ["HOME"]

        if "local_directory" not in kwargs:
            kwargs["local_directory"] = f"/scratch/{USER}/"

        if "log_directory" not in kwargs:
            log_dir = f"{HOME}/.dask_distributed/"
            Path(log_dir).mkdir(parents=False, exist_ok=True)
            kwargs["log_directory"] = log_dir

        cluster = LSFCluster(
            queue=queue,
            walltime=walltime,
            cores=cores,
            ncpus=ncpus,
            memory=memory,
            env_extra=env_extra,
            death_timeout=death_timeout,
            processes=processes,
            mem=mem,
            **kwargs,
        )
        self.setCluster(cluster)
Example 15
def test_job_script():
    with LSFCluster(walltime="00:02", processes=4, cores=8, memory="28GB") as cluster:

        job_script = cluster.job_script()
        assert "#BSUB" in job_script
        assert "#BSUB -J dask-worker" in job_script
        assert "#BSUB -n 8" in job_script
        assert "#BSUB -M 28000" in job_script
        assert "#BSUB -W 00:02" in job_script
        assert "#BSUB -q" not in cluster.job_header
        assert "#BSUB -P" not in cluster.job_header

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        assert "--nthreads 2 --nprocs 4 --memory-limit 7.00GB" in job_script

    with LSFCluster(
        queue="general",
        project="DaskOnLSF",
        processes=4,
        cores=8,
        memory="28GB",
        ncpus=24,
        mem=100000000000,
    ) as cluster:

        job_script = cluster.job_script()
        assert "#BSUB -q general" in cluster.job_header
        assert "#BSUB -J dask-worker" in cluster.job_header
        assert "#BSUB -n 24" in cluster.job_header
        assert "#BSUB -n 8" not in cluster.job_header
        assert "#BSUB -M 100000" in cluster.job_header
        assert "#BSUB -M 28000" not in cluster.job_header
        assert "#BSUB -W" in cluster.job_header
        assert "#BSUB -P DaskOnLSF" in cluster.job_header

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        assert "--nthreads 2 --nprocs 4 --memory-limit 7.00GB" in job_script
Example 16
def get_cluster(cluster_type, **kwargs):
    print("Using cluster type: {}".format(cluster_type))

    if cluster_type == 'LSF':
        cluster = LSFCluster(**kwargs)
    elif cluster_type == 'PBS':
        cluster = PBSCluster(**kwargs)
    elif cluster_type == 'SLURM':
        cluster = SLURMCluster(**kwargs)
    else:
        raise ValueError("Unsupported cluster type: {}".format(cluster_type))

    return cluster
Example 17
class LSFScheduler(Scheduler):

    # Constructor
    def __init__(self,
                 min_cores=1,
                 max_cores=1,
                 min_memory='1 GB',
                 max_memory='1 GB',
                 processes=1,
                 walltime='02:00',
                 **kwargs):
        # Call parent constructor
        super().__init__(min_cores=min_cores,
                         max_cores=max_cores,
                         min_memory=min_memory,
                         max_memory=max_memory)
        # Define cluster default parameters
        self.cluster_kwargs = {
            **{
                'memory': max_memory,
                'cores': min_cores,
                'processes': processes,
                'walltime': walltime
            },
            **kwargs
        }

    # Define adapt method
    def adapt(self, minimum, maximum, **kwargs):
        # Merge kwargs with default kwargs
        kwargs = {**self.cluster_kwargs, **kwargs}
        # Call parent adapt method (check values)
        super().adapt(minimum, maximum, **kwargs)
        # Make new cluster
        self._cluster = LSFCluster(**kwargs)
        # Make client
        self._client = Client(self._cluster)
        # Adapt cluster
        self._cluster.adapt(minimum=minimum, maximum=maximum)
        # Return client reference
        return self.client
Example 18
def start_dask_lsfcluster(cluster_size=5):
    """Start a dask cluster."""
    if cluster_size < 4:
        raise ValueError('Cluster is too small; need at least 4 workers')
    # Settings for Sanger farm
    memory_in_gb = 20
    cluster = LSFCluster(
        queue='normal',
        walltime='00:30',
        log_directory='{}/dask_logs'.format(os.getcwd()),
        cores=4,
        memory='{} Gb'.format(memory_in_gb),
        mem=memory_in_gb * 1e+9,  # should be in bytes
        lsf_units='mb',
        job_extra=[
            '-G team152', '-g /lt9/dask',
            '-R "select[mem>{}] rusage[mem={}]"'.format(
                int(memory_in_gb * 1e+3), int(memory_in_gb * 1e+3))
        ],
        use_stdin=True)

    # View the job submission from Dask
    # cluster.job_script()

    # Scale cluster
    cluster.scale(cluster_size)

    # auto-scale between 10 and 100 jobs
    # cluster.adapt(
    #     minimum_jobs=int(cluster_size/4),
    #     maximum_jobs=cluster_size
    # )
    # cluster.adapt(maximum_memory="10 TB")  # use core/memory limits

    client = Client(cluster, timeout=120)
    client.wait_for_workers(n_workers=cluster_size)
    # print(client.scheduler_info()['services'])

    return cluster, client
Example 19
def test_use_stdin(loop, config_value, constructor_value):
    """
    Verify that use-stdin is respected when passed via the
    config OR the LSFCluster() constructor
    """
    with dask.config.set({"jobqueue.lsf.use-stdin": config_value}):
        with LSFCluster(
            loop=loop, cores=1, memory="2GB", use_stdin=constructor_value
        ) as cluster:
            if constructor_value is not None:
                assert cluster._dummy_job.use_stdin == constructor_value
            else:
                assert cluster._dummy_job.use_stdin == config_value
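The config_value and constructor_value arguments are not pytest fixtures; in the test module this snippet comes from, they are presumably supplied by parametrization along these lines (the value lists are illustrative):

@pytest.mark.parametrize("config_value", [True, False, None])
@pytest.mark.parametrize("constructor_value", [True, False, None])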
Example 20
def setup_dask_lsf_cluster(
    n_workers: int,
    queue: str,
    memory_gigabytes: int,
    wall_time: str,
    environment_name: str,
) -> "LSFCluster":
    """Set up a dask cluster which integrates with an existing LSF queue manager to
    spawn and manage workers.

    Args:
        n_workers: The number of workers to spawn.
        queue: The queue to submit the workers to.
        memory_gigabytes: The maximum memory to request per worker in GB.
        wall_time: The maximum wall-clock time to spawn each worker for.
        environment_name: The conda environment to activate for each worker.

    Returns:
        The initialized cluster.
    """
    import dask
    from dask_jobqueue import LSFCluster

    env_extra = dask.config.get("jobqueue.lsf.env-extra", default=[])
    env_extra.append(f"conda activate {environment_name}")

    cluster = LSFCluster(
        queue=queue,
        cores=1,
        memory=f"{memory_gigabytes * 1e9}B",
        walltime=wall_time,
        local_directory="dask-worker-space",
        log_directory="dask-worker-logs",
        env_extra=env_extra,
    )
    cluster.scale(n=n_workers)

    return cluster
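A hypothetical invocation of the helper above (the queue and environment names are placeholders):

cluster = setup_dask_lsf_cluster(
    n_workers=10,
    queue="normal",
    memory_gigabytes=4,
    wall_time="02:00",
    environment_name="my-conda-env",
)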
Example 21
def test_job_script():
    with LSFCluster(walltime='00:02', processes=4, cores=8,
                    memory='28GB') as cluster:

        job_script = cluster.job_script()
        assert '#BSUB' in job_script
        assert '#BSUB -J dask-worker' in job_script
        assert '#BSUB -n 8' in job_script
        assert '#BSUB -M 28000' in job_script
        assert '#BSUB -W 00:02' in job_script
        assert '#BSUB -q' not in cluster.job_header
        assert '#BSUB -P' not in cluster.job_header

        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--nthreads 2 --nprocs 4 --memory-limit 7.00GB' in job_script

    with LSFCluster(queue='general',
                    project='DaskOnLSF',
                    processes=4,
                    cores=8,
                    memory='28GB',
                    ncpus=24,
                    mem=100000000000) as cluster:

        job_script = cluster.job_script()
        assert '#BSUB -q general' in cluster.job_header
        assert '#BSUB -J dask-worker' in cluster.job_header
        assert '#BSUB -n 24' in cluster.job_header
        assert '#BSUB -n 8' not in cluster.job_header
        assert '#BSUB -M 100000' in cluster.job_header
        assert '#BSUB -M 28000' not in cluster.job_header
        assert '#BSUB -W' in cluster.job_header
        assert '#BSUB -P DaskOnLSF' in cluster.job_header

        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--nthreads 2 --nprocs 4 --memory-limit 7.00GB' in job_script
Example 22
def lsf_unit_detection_helper(expected_unit, conf_text=None):
    temp_dir = tempfile.mkdtemp()
    current_lsf_envdir = os.environ.get("LSF_ENVDIR", None)
    os.environ["LSF_ENVDIR"] = temp_dir
    if conf_text is not None:
        with open(os.path.join(temp_dir, "lsf.conf"), "w") as conf_file:
            conf_file.write(conf_text)
    memory_string = "13GB"
    memory_base = parse_bytes(memory_string)
    correct_memory = lsf.lsf_format_bytes_ceil(memory_base, lsf_units=expected_unit)
    with LSFCluster(memory=memory_string, cores=1) as cluster:
        assert "#BSUB -M %s" % correct_memory in cluster.job_header
    rmtree(temp_dir)
    if current_lsf_envdir is None:
        del os.environ["LSF_ENVDIR"]
    else:
        os.environ["LSF_ENVDIR"] = current_lsf_envdir
Example 23
from contextlib import contextmanager

@contextmanager
def start_dask(num_workers, msg, logger):
    """Context manager used for starting/shutting down dask

    Args:
        num_workers (`int`): Number of dask workers
        msg (`str`): Message for timer
        logger: The logger being used

    Yields:
        client: Dask client
    """

    # Update dask
    with open("dask-config.yaml") as f:
        config = yaml.load(f, Loader=SafeLoader)
        dask.config.update(dask.config.config, config)

    cluster_type = next(iter(dask.config.config['jobqueue']))
    set_local_directory(cluster_type)

    if cluster_type == 'local':
        from dask.distributed import LocalCluster
        cluster = LocalCluster(n_workers=num_workers, threads_per_worker=1)
    else:
        if cluster_type == 'lsf':
            from dask_jobqueue import LSFCluster
            cluster = LSFCluster()
        elif cluster_type == 'slurm':
            from dask_jobqueue import SLURMCluster
            cluster = SLURMCluster()
        elif cluster_type == 'sge':
            from dask_jobqueue import SGECluster
            cluster = SGECluster()
        cluster.scale(num_workers)
    try:
        with io_util.Timing_Messager(f"Starting dask cluster for {msg}",
                                     logger):
            client = Client(cluster)
        io_util.print_with_datetime(
            f"Check {client.cluster.dashboard_link} for {msg} status.", logger)
        yield client
    finally:
        client.shutdown()
        client.close()
Example 24
def get_jobqueue_cluster(walltime='12:00',
                         ncpus=1,
                         cores=1,
                         local_directory=None,
                         memory='15GB',
                         env_extra=None,
                         **kwargs):
    """
    Instantiate a dask_jobqueue cluster using the LSF scheduler on the Janelia Research Campus compute cluster.
    This function wraps the class dask_jobqueue.LSFCLuster and instantiates this class with some sensible defaults.
    Extra kwargs added to this function will be passed to LSFCluster().
    The full API for the LSFCluster object can be found here:
    https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.LSFCluster.html#dask_jobqueue.LSFCluster
    Some of the functions requires dask-jobqueue < 0.7
    """
    import dask
    # this is necessary to ensure that workers get the job script from stdin
    dask.config.set({"jobqueue.lsf.use-stdin": True})
    from dask_jobqueue import LSFCluster
    import os

    if env_extra is None:
        env_extra = [
            "export NUM_MKL_THREADS=1",
            "export OPENBLAS_NUM_THREADS=1",
            "export OPENMP_NUM_THREADS=1",
            "export OMP_NUM_THREADS=1",
        ]

    if local_directory is None:
        local_directory = '/scratch/' + os.environ['USER'] + '/'

    cluster = LSFCluster(queue='normal',
                         walltime=walltime,
                         ncpus=ncpus,
                         cores=cores,
                         local_directory=local_directory,
                         memory=memory,
                         env_extra=env_extra,
                         job_extra=["-o /dev/null"],
                         **kwargs)
    return cluster
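A hypothetical invocation (extra keyword arguments such as project are forwarded to LSFCluster()):

cluster = get_jobqueue_cluster(walltime='04:00', project='myproject')
cluster.scale(10)  # request 10 workers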
Example 25
    def _build_lsf(self):
        from dask_jobqueue import LSFCluster
        num_jobs = math.ceil(self.num_workers / self.workers_per_job)
        cores = self.workers_per_job * self.cores_per_worker
        memory = self.workers_per_job * self.memory_per_worker
        jextra = ['-R rusage[mem={}]'.format(self.memory or "")]

        if "job_extra" in self.cluster_kwargs:
            self.cluster_kwargs["job_extra"].extend(jextra)
        elif "job_extra" not in self.cluster_kwargs:
            self.cluster_kwargs["job_extra"] = jextra

        cluster = LSFCluster(n_workers=self.num_workers,
                             processes=self.workers_per_job,
                             cores=cores,
                             memory=memory,
                             ncpus=cores,
                             **self.cluster_kwargs)

        return cluster
Example 26
def get_jobqueue_cluster(walltime="12:00",
                         cores=1,
                         local_directory=None,
                         memory="16GB",
                         env_extra=None,
                         **kwargs):
    """
    Instantiate a dask_jobqueue cluster using the LSF scheduler on the Janelia Research Campus compute cluster.
    This function wraps the class dask_jobqueue.LSFCLuster and instantiates this class with some sensible defaults.
    Extra kwargs added to this function will be passed to LSFCluster().
    The full API for the LSFCluster object can be found here:
    https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.LSFCluster.html#dask_jobqueue.LSFCluster

    """
    from dask_jobqueue import LSFCluster
    import os

    if env_extra is None:
        env_extra = [
            "export NUM_MKL_THREADS=1",
            "export OPENBLAS_NUM_THREADS=1",
            "export OPENMP_NUM_THREADS=1",
            "export OMP_NUM_THREADS=1",
        ]

    if local_directory is None:
        local_directory = "/scratch/" + os.environ["USER"] + "/"

    cluster = LSFCluster(queue="normal",
                         walltime=walltime,
                         cores=cores,
                         local_directory=local_directory,
                         memory=memory,
                         env_extra=env_extra,
                         job_extra=['-o /dev/null'],
                         **kwargs)
    return cluster
Example 27
def test_config_name_lsf_takes_custom_config():
    conf = {
        'queue': 'myqueue',
        'project': 'myproject',
        'ncpus': 1,
        'cores': 1,
        'mem': 2,
        'memory': '2 GB',
        'walltime': '00:02',
        'job-extra': [],
        'name': 'myname',
        'processes': 1,
        'interface': None,
        'death-timeout': None,
        'local-directory': '/foo',
        'extra': [],
        'env-extra': [],
        'log-directory': None,
        'shebang': '#!/usr/bin/env bash'
    }

    with dask.config.set({'jobqueue.lsf-config-name': conf}):
        with LSFCluster(config_name='lsf-config-name') as cluster:
            assert cluster.name == 'myname'
Example 28
def test_adaptive(loop):
    with LSFCluster(walltime='00:02',
                    processes=1,
                    cores=2,
                    memory='2GB',
                    local_directory='/tmp',
                    loop=loop) as cluster:
        cluster.adapt()
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)

            start = time()
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            assert future.result(QUEUE_WAIT) == 11

            start = time()
            processes = cluster.worker_processes
            while len(client.scheduler_info()['workers']) != processes:
                sleep(0.1)
                assert time() < start + QUEUE_WAIT

            del future

            start = time()
            while len(client.scheduler_info()['workers']) > 0:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            start = time()
            while cluster.pending_jobs or cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
            assert cluster.finished_jobs
Example 29
def get_cluster(cluster_type, **kwargs):
    """Generic dask cluster wrapper"""

    # check input cluster type
    cluster_type = cluster_type.lower()
    cluster_list = ['lsf','pbs','slurm','local']
    if cluster_type not in cluster_list:
        msg = "Cluster type '{}' not supported".format(cluster_type)
        msg += '\nsupported cluster types: {}'.format(cluster_list)
        raise ValueError(msg)
    print("Dask cluster type: {}".format(cluster_type))

    # No need to do the extra configuration checking if using LocalCluster
    if cluster_type == 'local':
        return LocalCluster()

    # check input config name; only report it when one was provided
    if 'config_name' in kwargs:
        kwargs['config_name'] = check_config_name(kwargs['config_name'], cluster_type)
        print("Dask config name: {}".format(kwargs['config_name']))

    # check walltime format for each cluster type
    if 'walltime' in kwargs:
        kwargs['walltime'] = check_walltime_format(kwargs['walltime'], cluster_type)
        print('Dask worker walltime: {}'.format(kwargs['walltime']))

    # initiate cluster object
    if cluster_type == 'lsf':
        cluster = LSFCluster(**kwargs)
    elif cluster_type == 'pbs':
        cluster = PBSCluster(**kwargs)
    elif cluster_type == 'slurm':
        cluster = SLURMCluster(**kwargs)

    # Print and write job command file for HPC cluster types
    print("JOB COMMAND CALLED FROM PYTHON:\n\n", cluster.job_script())
    with open('dask_command_run_from_python.txt', 'w') as f:
        f.write(cluster.job_script() + '\n')
    
    return cluster
Example 30
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            "pipeline_context",
            "Expected executor to be DaskExecutor got {}".format(
                pipeline_context.executor),
        )

        check.invariant(
            pipeline_context.instance.is_persistent,
            "Dask execution requires a persistent DagsterInstance",
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == "local":
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "yarn":
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "ssh":
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "pbs":
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "moab":
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "sge":
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == "lsf":
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "oar":
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "kube":
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"cluster_type must be one of ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube'), not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={"in_process": {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository()

                    dask_task_name = "%s.%s" % (pipeline_name, step.key)

                    recon_pipeline = recon_repo.get_reconstructable_pipeline(
                        pipeline_name)

                    future = client.submit(
                        query_on_dask_worker,
                        dependencies,
                        recon_pipeline,
                        pipeline_context.pipeline_run,
                        run_config,
                        [step.key],
                        pipeline_context.mode_def.name,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their
            # results on the master
            futures = dask.distributed.as_completed(execution_futures,
                                                    with_results=True)

            # Allow interrupts while waiting for the results from Dask
            for future, result in iterate_with_context(
                    raise_interrupts_immediately, futures):
                for step_event in result:
                    check.inst(step_event, DagsterEvent)
                    yield step_event
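The long if/elif chain above could equivalently be driven by a lookup table, which keeps the lazy imports but centralizes the supported types. A minimal sketch of that pattern (make_cluster is a hypothetical helper, not part of the executor above; the module paths are the real dask ones):

import importlib

CLUSTER_TYPES = {
    "local": ("dask.distributed", "LocalCluster"),
    "pbs": ("dask_jobqueue", "PBSCluster"),
    "lsf": ("dask_jobqueue", "LSFCluster"),
    "slurm": ("dask_jobqueue", "SLURMCluster"),
}

def make_cluster(cluster_type, **kwargs):
    # Look up the module/class pair, import lazily, and instantiate.
    try:
        module_name, class_name = CLUSTER_TYPES[cluster_type]
    except KeyError:
        raise ValueError(f"Unknown cluster type: {cluster_type}")
    cluster_cls = getattr(importlib.import_module(module_name), class_name)
    return cluster_cls(**kwargs)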