Example #1
import time

from distributed import Client

def init_cluster(num_workers, wait_for_all_workers=True):
    """
    Start up a dask cluster, optionally wait until all workers have been launched,
    and then return the resulting distributed.Client object.
    
    Args:
        num_workers:
            How many workers to launch.
        wait_for_all_workers:
            If True, pause until all workers have been launched before returning.
            Otherwise, just wait for a single worker to launch.
    
    Returns:
        distributed.Client
    """
    # Local import: LSFCluster probably isn't importable on your local machine,
    # so it's nice to avoid importing it when you're just running local tests without a cluster.
    from dask_jobqueue import LSFCluster
    cluster = LSFCluster(ip='0.0.0.0')
    cluster.scale(num_workers)

    required_workers = 1
    if wait_for_all_workers:
        required_workers = num_workers

    client = Client(cluster)
    # Wait for at least `required_workers` workers (all of them, or just one,
    # depending on wait_for_all_workers).
    while (client.status == "running"
           and len(cluster.scheduler.workers) < required_workers):
        print(
            f"Waiting for {required_workers - len(cluster.scheduler.workers)} workers..."
        )
        time.sleep(1.0)

    return client
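
A minimal usage sketch for init_cluster, assuming it runs on a host that can submit LSF jobs (the squaring workload is purely illustrative):

client = init_cluster(num_workers=8)
futures = client.map(lambda x: x ** 2, range(100))  # fan a toy computation out to the workers
print(sum(client.gather(futures)))                  # pull the results back to the driver
client.close()
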
Example #2

import os

from dask_jobqueue import LSFCluster
from distributed import Client

def start_dask_lsfcluster(cluster_size=5):
    """Start a dask cluster."""
    if cluster_size < 4:
        raise ValueError('cluster_size must be at least 4')
    # Settings for Sanger farm
    memory_in_gb = 20
    cluster = LSFCluster(
        queue='normal',
        walltime='00:30',
        log_directory='{}/dask_logs'.format(os.getcwd()),
        cores=4,
        memory='{}GB'.format(memory_in_gb),
        mem=int(memory_in_gb * 1e9),  # should be in bytes
        lsf_units='mb',
        job_extra=[
            '-G team152', '-g /lt9/dask',
            '-R "select[mem>{}] rusage[mem={}]"'.format(
                int(memory_in_gb * 1e+3), int(memory_in_gb * 1e+3))
        ],
        use_stdin=True)

    # View the job submission from Dask
    # cluster.job_script()

    # Scale cluster
    cluster.scale(cluster_size)

    # auto-scale between 10 and 100 jobs
    # cluster.adapt(
    #     minimum_jobs=int(cluster_size/4),
    #     maximum_jobs=cluster_size
    # )
    # cluster.adapt(maximum_memory="10 TB")  # use core/memory limits

    client = Client(cluster, timeout=120)
    client.wait_for_workers(n_workers=cluster_size)
    # print(client.scheduler_info()['services'])

    return cluster, client
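
A quick usage sketch for start_dask_lsfcluster, assuming the Sanger farm settings above apply (the dask.array computation is just a demo workload):

import dask.array as da

cluster, client = start_dask_lsfcluster(cluster_size=5)
x = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))
print(x.mean().compute())  # executes on the LSF workers
client.close()
cluster.close()
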
Example #3
def setup_dask_lsf_cluster(
    n_workers: int,
    queue: str,
    memory_gigabytes: int,
    wall_time: str,
    environment_name: str,
) -> "LSFCluster":
    """Set up a dask cluster which integrates with an existing LSF queue manager to
    spawn and manage workers.

    Args:
        n_workers: The number of workers to spawn.
        queue: The queue to submit the workers to.
        memory_gigabytes: The maximum memory to request per worker in GB.
        wall_time: The maximum wall-clock time to spawn each worker for.
        environment_name: The conda environment to activate for each worker.

    Returns:
        The initialized cluster.
    """
    import dask
    from dask_jobqueue import LSFCluster

    # Copy the list so appending doesn't mutate the global dask config in place.
    env_extra = list(dask.config.get("jobqueue.lsf.env-extra", default=[]))
    env_extra.append(f"conda activate {environment_name}")

    cluster = LSFCluster(
        queue=queue,
        cores=1,
        memory=f"{memory_gigabytes * 1e9}B",
        walltime=wall_time,
        local_directory="dask-worker-space",
        log_directory="dask-worker-logs",
        env_extra=env_extra,
    )
    cluster.scale(n=n_workers)

    return cluster
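
A usage sketch for setup_dask_lsf_cluster; the queue and environment names below are hypothetical placeholders for your site's values:

from distributed import Client

cluster = setup_dask_lsf_cluster(
    n_workers=10,
    queue="normal",             # hypothetical queue name
    memory_gigabytes=8,
    wall_time="01:00",
    environment_name="my-env",  # hypothetical conda environment
)
client = Client(cluster)
client.wait_for_workers(n_workers=1)  # block until at least one worker is up
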
Example #4
    def _init_dask(self):
        """
        Starts a dask cluster, according to the cluster type specified in the constructor.
        Sets self.client.
        Also writes useful URLs to graph-links.txt.

        If the 'cluster-type' is 'synchronous', then the cluster will be
        a special stub class (DebugCluster), which provides dummy
        implementations of a few functions from the DistributedCluster API.
        (Mostly just for convenient unit testing.)
        """

        # Consider using client.register_worker_callbacks() to configure
        # - faulthandler (later)
        # - excepthook?
        # - (okay, maybe it's just best to put that stuff in __init__.py, like in DSS)

        load_and_overwrite_dask_config(self.cluster_type, 'dask-config.yaml',
                                       True)
        self._write_driver_graph_urls()

        if self.cluster_type in JOBQUEUE_CLUSTERS:
            update_jobqueue_config_with_defaults(self.cluster_type)

            if self.cluster_type == "lsf":
                from dask_jobqueue import LSFCluster
                cluster = LSFCluster()  # ip='0.0.0.0'
            elif self.cluster_type == "sge":
                from dask_jobqueue import SGECluster
                cluster = SGECluster(ip='0.0.0.0')
            elif self.cluster_type == "slurm":
                from dask_jobqueue import SLURMCluster
                cluster = SLURMCluster(ip='0.0.0.0')
            else:
                raise AssertionError("Unimplemented jobqueue cluster")

            cluster.scale(self.num_workers)

        elif self.cluster_type == "local-cluster":
            cluster = LocalCluster(self.num_workers,
                                   threads_per_worker=1,
                                   processes=True,
                                   ip='0.0.0.0')

        elif self.cluster_type in ("synchronous", "processes"):
            cluster = None
            # synchronous/processes mode is for testing and debugging only
            assert dask.config.get('scheduler', self.cluster_type) == self.cluster_type, \
                "Inconsistency between the dask-config and the scheduler you chose."

            dask.config.set(scheduler=self.cluster_type)
            self.client = DebugClient(self.cluster_type)
        else:
            raise AssertionError("Unknown cluster type")

        dump_dask_config('full-dask-config.yaml')

        if cluster:
            dashboard = cluster.dashboard_link
            logger.info(f"Dashboard running on {dashboard}")
            dashboard_ip = extract_ip_from_link(dashboard)
            dashboard = dashboard.replace(dashboard_ip, socket.gethostname())
            logger.info(f"              a.k.a. {dashboard}")

            # Note: Overrides config value: distributed.comm.timeouts.connect
            self.client = Client(cluster, timeout='60s')

            # Wait for the workers to spin up.
            with Timer(f"Waiting for {self.num_workers} workers to launch",
                       logger) as wait_timer:
                while (self.wait_for_workers
                       and self.client.status == "running"
                       and len(self.client.cluster.scheduler.workers) <
                       self.num_workers):

                    if wait_timer.seconds > (60 * self.cluster_max_wait):
                        msg = (
                            f"Not all cluster workers could be launched within the "
                            "allotted time ({self.cluster_max_wait} minutes).\n"
                            "Try again or adjust the 'cluster-max-wait' setting.\n"
                        )
                        raise RuntimeError(msg)
                    time.sleep(0.1)

            if self.wait_for_workers and self.cluster_type == "lsf":
                self._write_worker_graph_urls('graph-links.txt')
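
When manual polling like the loop above isn't needed, distributed's built-in Client.wait_for_workers gives the same effect with a timeout; a minimal sketch, with cluster, num_workers, and cluster_max_wait standing in for the attributes used in the method:

from distributed import Client

client = Client(cluster, timeout='60s')
# Raises a TimeoutError if the workers don't arrive in time.
client.wait_for_workers(n_workers=num_workers, timeout=cluster_max_wait * 60)
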


Example #5

import sys
from timeit import timeit

from dask_jobqueue import LSFCluster
from distributed import Client

# sequential_main, distributed_main, and distributed_main2 are the timing
# targets; they are defined elsewhere in the original script (not shown here).

if __name__ == "__main__":
    cluster = LSFCluster(
        name='worker_bee',
        queue='general',  # the queue on Pegasus
        project='insarlab',  # your project name
        cores=2,
        memory='2GB',  # unused by Pegasus but a required param
        walltime='00:30',  # how long the worker will run for
        interface='ib0',  # which network to use. NECESSARY PARAM
        job_extra=[
            '-R "rusage[mem=2500]"',  # how to actually define memory usage
            "-o WORKER-%J.out"
        ],  # where to write worker output files
        python=sys.executable,  # Where to look for Python executable
        config_name='lsf')  # define your own config in a .yaml file
    cluster.scale(20)
    print("JOB FILE:", cluster.job_script())

    client = Client(cluster)
    print("Time to run sequential code:", timeit(stmt=sequential_main,
                                                 number=1))
    print("Time to run parallel code:", timeit(stmt=distributed_main,
                                               number=1))
    print("Time to run parallel code with ~0 data transfer:",
          timeit(stmt=distributed_main2, number=1))

    client.close()
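
The timing calls assume sequential_main, distributed_main, and distributed_main2 are defined earlier in the script; a hypothetical minimal pair, just to make the comparison concrete:

import numpy as np
import dask.array as da

def sequential_main():
    # Single-process NumPy baseline.
    x = np.random.random((5_000, 5_000))
    return (x + x.T).sum()

def distributed_main():
    # Same computation, spread across the LSF workers by dask.
    x = da.random.random((5_000, 5_000), chunks=(1_000, 1_000))
    return (x + x.T).sum().compute()
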
Example #6
    def activate_client(self,
                        library=('dask', 'LSF'),
                        num_processes=2,
                        timeout=1800):
        """
        Parameters
        ----------
        library : tuple(str, str), default ('dask', 'LSF')
            parallelism and scheduler tuple
        num_processes : int or None
            number of workers to run with the new client
            if None, num_processes will be adaptive
        timeout : int
            number of seconds to wait to fulfill the workers order
        """
        self.library = library
        if library is not None:
            _logger.debug(f"library is not None")
            assert library[0] in list(
                self.supported_libraries.keys()
            ), f"{library[0]} is not a supported parallelism. (supported parallelisms are {self.supported_libraries.keys()})"
            assert library[1] in list(
                self.supported_libraries[library[0]]
            ), f"{library[1]} is not a supported scheduler. (supported schedulers are {self.supported_libraries[library[0]]})"
        elif library is None:
            _logger.debug(f"library is None")
            self.client = None
            self._adapt = False
            self.num_processes = 0
            self.workers = {}
            return

        if library[0] == 'dask':
            _logger.debug("detected dask parallelism...")
            if library[1] == 'LSF':
                _logger.debug("detected LSF scheduler")
                from dask_jobqueue import LSFCluster
                _logger.debug("creating cluster...")
                cluster = LSFCluster()
                if num_processes is None:
                    _logger.debug("adaptive cluster")
                    self._adapt = True
                    cluster.adapt(minimum=1, interval='1s')
                else:
                    _logger.debug("nonadaptive cluster")
                    self._adapt = False
                    self.num_processes = num_processes
                    cluster.scale(self.num_processes)

                _logger.debug("creating client with cluster")
                self.client = distributed.Client(cluster, timeout=timeout)
                if not self._adapt:
                    while len(self.client.nthreads()) != self.num_processes:
                        _logger.debug(
                            "waiting for worker request fulfillment...")
                        time.sleep(5)
                worker_threads = self.client.nthreads()
                self.workers = {
                    i: _worker
                    for i, _worker in enumerate(worker_threads.keys())
                }
                _logger.debug(f"workers initialized: {self.workers}")
            else:
                raise Exception(
                    f"{library[1]} is supported, but without client-activation functionality!"
                )
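
Assuming this method lives on some manager class (call it ParallelManager; the class itself isn't shown in the snippet), usage might look like:

manager = ParallelManager()  # hypothetical host class
manager.activate_client(library=('dask', 'LSF'), num_processes=4, timeout=600)
result = manager.client.submit(sum, [1, 2, 3]).result()
manager.client.close()
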