def start_dask(workers):

    ######################################################
    # Setup dask cluster
    ######################################################

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           threads=2,
                           memory='4GB',
                           walltime='144:00:00')

    print('Starting up workers')
    workers = []
    for _ in range(config.num_hipergator_workers):
        workers.extend(cluster.start_workers(1))
        sleep(60)
    dask_client = Client(cluster)

    wait_time = 0
    while len(dask_client.scheduler_info()['workers']) < config.num_hipergator_workers:
        print('waiting on workers: {s} sec. so far'.format(s=wait_time))
        sleep(10)
        wait_time += 10

        # If 5 minutes goes by try adding them again
        if wait_time > 300:
            workers.extend(cluster.start_workers(1))

    print('All workers accounted for')
    # xr import must be after dask.array, and I think after setting
    # up the cluster/client.
    import dask.array as da
    import xarray as xr
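
For comparison, a minimal sketch of the same startup flow against the current dask_jobqueue API, where cores= replaces the old threads= argument and cluster.scale() replaces the removed start_workers(); the queue name and config.num_hipergator_workers are carried over from the example above.

from dask.distributed import Client
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(processes=1,
                       cores=2,                 # was threads=2 in the old API
                       memory='4GB',
                       queue='hpg2-compute',
                       walltime='144:00:00')
cluster.scale(config.num_hipergator_workers)    # replaces the start_workers() loop
dask_client = Client(cluster)
# block until SLURM has actually started the requested workers
dask_client.wait_for_workers(config.num_hipergator_workers)
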
Example #2
def test_run_sorters_dask():
    cache_folder = './local_cache'
    working_folder = 'test_run_sorters_dask'
    if os.path.exists(cache_folder):
        shutil.rmtree(cache_folder)
    if os.path.exists(working_folder):
        shutil.rmtree(working_folder)

    # create recording
    recording_dict = {}
    for i in range(8):
        rec, _ = toy_example(num_channels=4, duration=30, seed=0, num_segments=1)
        # make dumpable
        rec = rec.save(name=f'rec_{i}')
        recording_dict[f'rec_{i}'] = rec

    sorter_list = ['tridesclous', ]

    # create a dask Client for a slurm queue
    from dask.distributed import Client
    from dask_jobqueue import SLURMCluster

    python = '/home/samuel.garcia/.virtualenvs/py36/bin/python3.6'
    cluster = SLURMCluster(processes=1, cores=1, memory="12GB", python=python, walltime='12:00:00', )
    cluster.scale(5)
    client = Client(cluster)

    # dask
    t0 = time.perf_counter()
    run_sorters(sorter_list, recording_dict, working_folder,
                engine='dask', engine_kwargs={'client': client},
                with_output=False,
                mode_if_folder_exists='keep')
    t1 = time.perf_counter()
    print(t1 - t0)
def make_cluster():
    if socket.gethostname() == 'sgw1':

        # number of processing units per node. for ease of use, cores to the
        # number of CPU per node warning: this is the unitary increment by
        # which you can scale your number of workers inside your cluster.
        proc_per_worker = 24

        # total number of slurm node to request. Max number of dask workers
        # will be proc_per_worker * max_slurm_nodes
        max_slurm_nodes = 4

        cluster = SLURMCluster(
            workers=0,  # number of (initial slurm jobs)
            memory="16GB",
            # cores = number processing units per worker, can be
            # dask.Worker (processes) or threads of a worker's
            # ThreadPoolExecutor
            cores=proc_per_worker,
            # among those $cores workers, how many should be dask Workers,
            # (each worker will then have cores // processes threads inside
            # their ThreadPoolExecutor)
            # sets cpus-per-task=processes inside batch script
            processes=proc_per_worker,
            # job_extra=[get_sbatch_args(max_workers, proc_per_worker)],
        )
        # scale the number of unitary dask workers (and not batch jobs)
        cluster.scale(96)
    else:
        cluster = LocalCluster(
            n_workers=2, threads_per_worker=1, processes=False,
            dashboard_address=':7777'
        )
    return cluster
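
A small illustration of the cores/processes relationship described in the comments above (values are illustrative): cores is the per-job total, processes is how many dask workers share it, and each worker gets cores // processes threads.

from dask_jobqueue import SLURMCluster

# Illustrative values: each SLURM job hosts 6 dask workers, each with 24 // 6 = 4
# threads in its ThreadPoolExecutor.
cluster = SLURMCluster(cores=24, processes=6, memory="16GB")
# scale() counts dask workers, not SLURM jobs: 12 workers -> ceil(12 / 6) = 2 jobs
cluster.scale(12)
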
Example #4
def test_header():
    with SLURMCluster(walltime='00:02:00', processes=4, cores=8, memory='28GB') as cluster:

        assert '#SBATCH' in cluster.job_header
        assert '#SBATCH -J dask-worker' in cluster.job_header
        assert '#SBATCH -n 1' in cluster.job_header
        assert '#SBATCH --cpus-per-task=8' in cluster.job_header
        assert '#SBATCH --mem=27G' in cluster.job_header
        assert '#SBATCH -t 00:02:00' in cluster.job_header
        assert '#SBATCH -p' not in cluster.job_header
        assert '#SBATCH -A' not in cluster.job_header

    with SLURMCluster(queue='regular', project='DaskOnSlurm', processes=4, cores=8, memory='28GB',
                      job_cpu=16, job_mem='100G') as cluster:

        assert '#SBATCH --cpus-per-task=16' in cluster.job_header
        assert '#SBATCH --cpus-per-task=8' not in cluster.job_header
        assert '#SBATCH --mem=100G' in cluster.job_header
        assert '#SBATCH -t ' in cluster.job_header
        assert '#SBATCH -A DaskOnSlurm' in cluster.job_header
        assert '#SBATCH -p regular' in cluster.job_header

    with SLURMCluster(cores=4, memory='8GB') as cluster:

        assert '#SBATCH' in cluster.job_header
        assert '#SBATCH -J ' in cluster.job_header
        assert '#SBATCH -n 1' in cluster.job_header
        assert '#SBATCH -t ' in cluster.job_header
        assert '#SBATCH -p' not in cluster.job_header
        assert '#SBATCH -A' not in cluster.job_header
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects, so do not treat it as pure
        pure=False,
    )
    client.gather(futures)
    print('Shutting down dask workers')
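
A hypothetical call site for the helper above; my_train is a stand-in for any training entry point importable on the cluster nodes, and the job name and argument values are illustrative.

def my_train(n_epochs, lr=1e-3):
    # ... build the model, train, save checkpoints ...
    return n_epochs

train_on_jz_dask('fastmri-unet', my_train, 200, lr=3e-4)
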
Example #6
class ManagedSLURMCluster(ManagedCluster):
    """
    Args:
        project (str, optional): project name
        queue (str, optional): queue to submit to
        walltime (str, optional): maximum wall time
    """
    def __init__(self,
                 project=None,
                 queue=None,
                 walltime="24:00:00",
                 **kwargs):
        super().__init__(**kwargs)
        self._project = project
        self._queue = queue
        self._walltime = walltime

    def open(self):
        from dask_jobqueue import SLURMCluster

        args = {
            "cores": self.threads_per_worker,
            "processes": 1,
            "memory": self.memory,
            "project": self._project,
            "queue": self._queue,
            "walltime": self._walltime,
            "log_directory": "/tmp",
        }
        self._cluster = SLURMCluster(**args)
        self._cluster.scale(self.n_workers)
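
A sketch of how this class might be used, assuming the ManagedCluster base class (not shown here) accepts and exposes n_workers, threads_per_worker and memory, since open() reads them from self; all values are illustrative.

cluster = ManagedSLURMCluster(project='my_account',
                              queue='normal',
                              walltime='04:00:00',
                              n_workers=8,           # assumed base-class kwargs
                              threads_per_worker=4,
                              memory='16GB')
cluster.open()
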
def main(args):

    split_files = split_file(args.url_file)


    if args.distribute:
        extra_args = [
            "-J newsnet_worker",
            "--mail-type=ALL",
            "[email protected]",
            "--gres=nvme:100"]

        cluster = SLURMCluster(
            name="newsnet_worker",
            cores=20,
            memory="2GB",
            queue="small",
            walltime="3:00:00",
            local_directory='/tmp',
            log_directory=f"{os.environ.get('PWD')}/dask-worker-space",
            project=args.project,
            job_extra=extra_args)

        with Client(cluster) as client:
            print("\n\nLaunching Dask SLURM cluster...")
            cluster.scale(4)
            to_upload = f'{os.path.dirname(os.path.abspath(sys.argv[0]))}/parse_articles.py'
            client.upload_file(to_upload)
            print(to_upload)
            _ = [run_parse(args, file) for file in split_files]
            [os.remove(sf) for sf in split_files]
    else:
        with Client() as client:
            _ = [run_parse(args, file) for file in split_files]
            [os.remove(sf) for sf in split_files]
def slurm_cluster(n_workers, cores_per_worker, mem_per_worker, walltime,
                  dask_folder):
    """helper function to start a Dask Slurm-based cluster

    :param n_workers: maximum number of workers to use
    :param cores_per_worker: number of cores per worker
    :param mem_per_worker: maximum amount of RAM per worker
    :param walltime: maximum walltime for the workers
    :param dask_folder: folder to keep the workers' temporary data
    """
    dask.config.set({
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    })
    cluster = SLURMCluster(
        cores=cores_per_worker,
        processes=1,
        memory=mem_per_worker,
        walltime=walltime,
        log_directory=dask_folder / "logs",  # folder for SLURM logs for each worker
        local_directory=dask_folder,  # folder for workers data
    )
    cluster.adapt(minimum=1, maximum=n_workers)

    client = Client(cluster)
    return client
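
A possible call to the helper above; dask_folder has to support the / operator (e.g. a pathlib.Path), and the resource values are illustrative.

from pathlib import Path

client = slurm_cluster(n_workers=16,
                       cores_per_worker=4,
                       mem_per_worker='8GB',
                       walltime='02:00:00',
                       dask_folder=Path('/scratch/my_user/dask'))
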
Example #9
def run_HPC():
        
    #################
    # Setup dask cluster
    #################
    
    config = utils.read_config()
    num_workers = config["num_hipergator_workers"]
    
    #job args
    extra_args=[
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]
    
    cluster = SLURMCluster(
        processes=2,
        queue='hpg2-compute',
        cores=3, 
        memory='11GB', 
        walltime='24:00:00',
        job_extra=extra_args,
        local_directory="/home/b.weinstein/logs/", death_timeout=150)
    
    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)
    
    dask_client = Client(cluster)
    
    #Start dask
    dask_client.run_on_scheduler(start_tunnel)  
    run(config, debug=False)
def main(args):
    config_file = args.config_file

    # Configure on cluster
    if config_file:
        with open(config_file, 'r') as stream:
            inp = yaml.safe_load(stream)
        cores = inp['jobqueue']['slurm']['cores']
        memory = inp['jobqueue']['slurm']['memory']
        jobs = inp['jobqueue']['slurm']['jobs']
        cluster = SLURMCluster(
            cores=cores,
            memory=memory,
        )
        cluster.scale(jobs=jobs)

    # Configure locally
    else:
        cluster = LocalCluster()

    client = Client(cluster)
    raised_futures = client.map(sleep_more, range(100))
    progress(raised_futures)
    raised = client.gather(raised_futures)
    print('\n', raised)
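
The YAML file this script expects mirrors the jobqueue.slurm layout plus a custom jobs key; a minimal way to produce one from Python (values illustrative):

import yaml

cfg = {'jobqueue': {'slurm': {'cores': 4, 'memory': '16GB', 'jobs': 10}}}
with open('slurm_config.yaml', 'w') as f:
    yaml.safe_dump(cfg, f)
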
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects, so do not treat it as pure
        pure=False,
    )
    run_id = client.gather(futures)
    print(f'Train run id: {run_id}')
Example #12
def initialize_dask(n, factor = 5, slurm = False):

    if not slurm:
        cores =  len(os.sched_getaffinity(0))
        cluster = distributed.LocalCluster(processes = False,
                                           n_workers = 1,
                                           threads_per_worker = 1)

    else:
        n = min(100, n)
        py = './enter_conda.sh python3'
        params = {
            'python' : py,
            'cores' : 1,
            'memory' : '512MB',
            'walltime' : '180',
            'processes' : 1,
            'job_extra' : [
                '--qos use-everything',
                '--array 0-{0:d}'.format(n - 1),
                '--requeue',
                '--output "/dev/null"'
            ],
            'env_extra' : [
                'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}',
                'source /etc/profile.d/modules.sh',
                'cd {0!s}'.format(CONFIG['PATHS', 'root']),
            ]
        }
        cluster = SLURMCluster(**params)
        print(cluster.job_script())
        cluster.scale(1)

    print(cluster.dashboard_link)
    return distributed.Client(cluster)
Example #13
def dask_slurm_cluster(queue=None,
                       cores=None,
                       memory=None,
                       minimum_workers=None,
                       maximum_workers=None,
                       address=None,
                       port=None,
                       **kwargs):
    __doc__ = _doc_dask_slurm_cluster  # noqa

    queue = queue or DEFAULT_QUEUE
    cores = cores or DEFAULT_NUM_CORES
    memory = memory or DEFAULT_MEMORY
    minimum_workers = minimum_workers or DEFAULT_MINIMUM_WORKERS
    maximum_workers = maximum_workers or DEFAULT_MAXIMUM_WORKERS
    address = address or DEFAULT_ADDRESS
    port = port or DEFAULT_PORT

    cluster = SLURMCluster(queue=queue,
                           cores=cores,
                           memory=memory,
                           host=f'tcp://{address}:{port}',
                           **kwargs)
    cluster.adapt(minimum=minimum_workers, maximum=maximum_workers)
    return cluster
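
One way this wrapper might be called, assuming the DEFAULT_* fallbacks are defined elsewhere in the module; every value below is illustrative.

from dask.distributed import Client

cluster = dask_slurm_cluster(queue='normal',
                             cores=4,
                             memory='16GB',
                             minimum_workers=1,
                             maximum_workers=20,
                             address='10.0.0.1',
                             port=8786)
client = Client(cluster)
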
Example #14
class SwissFelCluster:
    def __init__(self, cores=8, memory="24 GB", workers=5):
        self.cluster = SLURMCluster(cores=cores, memory=memory)
        self.client = Client(self.cluster)
        self.ip = socket.gethostbyname(socket.gethostname())
        self.dashboard_port_scheduler = self.client._scheduler_identity.get(
            "services")["dashboard"]
        self.username = getpass.getuser()

    def _repr_html_(self):
        return self.client._repr_html_()

    def scale_workers(self, N_workers):
        self.cluster.scale(N_workers)

    def create_dashboard_tunnel(self, ssh_host="ra"):
        print(
            "Type the following command in a terminal; if the port is taken, change the first number in the command."
        )
        print(" ".join([
            f"jupdbport={self.dashboard_port_scheduler}",
            "&&",
            "ssh",
            "-f",
            "-L",
            f"$jupdbport:{self.ip}:{self.dashboard_port_scheduler}",
            f"{self.username}@{ssh_host}",
            "sleep 10",
            "&&",
            "firefox",
            "http://localhost:$jupdbport",
        ]))
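
Typical interactive use of the class above (values illustrative):

cluster = SwissFelCluster(cores=8, memory="24 GB")
cluster.scale_workers(10)
futures = cluster.client.map(lambda x: x ** 2, range(100))
results = cluster.client.gather(futures)
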
def start_dask_cluster(number_of_workers, mem_size="10GB"):

    #################
    # Setup dask cluster
    #################

    #job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory=mem_size,
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/dask/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=number_of_workers, maximum=number_of_workers)

    dask_client = Client(cluster)

    #Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
Example #17
def launch_dask_cluster(queue, nodes, localcluster):
    """
    Usage from script:
        from distributed import Client
        from lsst_dashboard.cli import launch_dask_cluster
        cluster, port = launch_dask_cluster('normal', 6, False)
        client = Client(cluster)
    """
    # Launch Dask Cluster
    if "lsst-dev" in host:
        # Set up allowed ports
        (scheduler_port, ) = find_available_ports(1, *DASK_ALLOWED_PORTS)
        (lsst_dashboard_port, ) = find_available_ports(
            1, *DASHBOARD_ALLOWED_PORTS)
        (dask_dashboard_port, ) = find_available_ports(
            1, *DASK_DASHBOARD_ALLOWED_PORTS)
    else:
        localcluster = True
        lsst_dashboard_port = 52001
        dask_dashboard_port = 52002

    if not localcluster:
        from dask_jobqueue import SLURMCluster

        print(
            f"...starting dask cluster using slurm on {host} (queue={queue})")
        procs_per_node = 6
        cluster = SLURMCluster(
            queue=queue,
            cores=24,
            processes=procs_per_node,
            memory="128GB",
            scheduler_port=scheduler_port,
            extra=[
                f'--worker-port {":".join(str(p) for p in DASK_ALLOWED_PORTS)}'
            ],
            dashboard_address=f":{dask_dashboard_port}",
        )

        print(f"...requesting {nodes} nodes")
        cluster.scale(nodes * procs_per_node)
        print(
            "run the command below from your local machine to forward ports to view the dashboard and dask diagnostics:"
        )
        print(
            f"\nssh -N -L {lsst_dashboard_port}:{host}:{lsst_dashboard_port} -L {dask_dashboard_port}:{host}:{dask_dashboard_port} {username}@{hostname}\n"
        )
    else:
        from dask.distributed import LocalCluster

        print(f"starting local dask cluster on {host}")
        cluster = LocalCluster(dashboard_address=f":{dask_dashboard_port}")

    print(
        f"### dask dashboard available at http://localhost:{dask_dashboard_port} ###"
    )
    return cluster, lsst_dashboard_port
Example #18
 def startdask(self):
     if self.local:
         self.daskclient = Client()
         self.daskclient.cluster.scale(self.n_workers)
     else:
         self.daskcluster = SLURMCluster(
             queue=self.queue, walltime=self.walltime,
             processes=self.processes, memory=self.memory,
             cores=self.cores, job_extra=self.job_extra)
         self.workers = self.daskcluster.start_workers(self.n_workers)
         self.daskclient = Client(self.daskcluster)
Example #19
 def __init__(self):
     print("Start Cluster")
     self.cluster = SLURMCluster(memory='16g',
                                 processes=1,
                                 cores=1,
                                 death_timeout=200,
                                 walltime="168:00:00",
                                 job_extra=['--partition=Sibirien'])
     self.cluster.start_workers(25)
     self.cli = Client(self.cluster.scheduler.address)
Example #20
def get_slurm_dask_client(n_workers):
    cluster = SLURMCluster(cores=24,
                           memory='128GB',
                           project="co_aiolos",
                           walltime="24:00:00",
                           queue="savio2_bigmem")

    cluster.scale(n_workers)
    client = Client(cluster)
    return client
Example #21
 def startdask(self):
     if self.local:
         self.daskclient = Client()
         self.daskclient.cluster.scale(self.n_workers)
     else:
         self.daskcluster = SLURMCluster(
             queue=self.queue, death_timeout=self.death_timeout,
             walltime=self.walltime, processes=self.processes,
             memory=self.memory, cores=self.cores,
             local_directory=self.working_directory,
             log_directory=self.working_directory,
             job_extra=self.job_extra)
         self.workers = self.daskcluster.start_workers(self.n_workers)
         self.daskclient = Client(self.daskcluster)
Example #22
def get_slurm_dask_client_bigmem(n_nodes):
    cluster = SLURMCluster(cores=24,
                           memory='128GB',
                           project="co_aiolos",
                           walltime="02:00:00",
                           queue="savio2_bigmem",
                           job_extra=['--qos="savio_lowprio"'])

    cluster.scale(n_nodes*6)
    client = Client(cluster)
    return client
Example #23
def test_job_script():
    with SLURMCluster(
        walltime="00:02:00", processes=4, cores=8, memory="28GB"
    ) as cluster:

        job_script = cluster.job_script()
        assert "#SBATCH" in job_script
        assert "#SBATCH -J dask-worker" in job_script
        assert "--memory-limit 7.00GB " in job_script
        assert "#SBATCH -n 1" in job_script
        assert "#SBATCH --cpus-per-task=8" in job_script
        assert "#SBATCH --mem=27G" in job_script
        assert "#SBATCH -t 00:02:00" in job_script
        assert "#SBATCH -p" not in job_script
        assert "#SBATCH -A" not in job_script

        assert "export " not in job_script

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        assert "--nthreads 2 --nprocs 4 --memory-limit 7.00GB" in job_script

    with SLURMCluster(
        walltime="00:02:00",
        processes=4,
        cores=8,
        memory="28GB",
        env_extra=[
            'export LANG="en_US.utf8"',
            'export LANGUAGE="en_US.utf8"',
            'export LC_ALL="en_US.utf8"',
        ],
    ) as cluster:
        job_script = cluster.job_script()
        assert "#SBATCH" in job_script
        assert "#SBATCH -J dask-worker" in job_script
        assert "#SBATCH -n 1" in job_script
        assert "#SBATCH --cpus-per-task=8" in job_script
        assert "#SBATCH --mem=27G" in job_script
        assert "#SBATCH -t 00:02:00" in job_script
        assert "#SBATCH -p" not in job_script
        assert "#SBATCH -A" not in job_script

        assert 'export LANG="en_US.utf8"' in job_script
        assert 'export LANGUAGE="en_US.utf8"' in job_script
        assert 'export LC_ALL="en_US.utf8"' in job_script

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        assert "--nthreads 2 --nprocs 4 --memory-limit 7.00GB" in job_script
Example #24
def get_slurm_dask_client(n_workers, n_cores):
    cluster = SLURMCluster(cores=n_cores,
                           memory='32GB',
                           project="co_aiolos",
                           walltime="02:00:00",
                           queue="savio2_gpu",
                           job_extra=['--gres=gpu:1','--cpus-per-task=2'])

    cluster.scale(n_workers)
    client = Client(cluster)
    return client
Example #25
def get_slurm_dask_client_savio3(n_nodes):
    cluster = SLURMCluster(cores=32,
                           memory='96GB',
                           project="co_aiolos",
                           walltime="72:00:00",
                           queue="savio3",
                           job_extra=['--qos="aiolos_savio3_normal"'])

    cluster.scale(n_nodes*32)
    client = Client(cluster)
    return client
Example #26
def get_slurm_dask_client_bigmem(n_nodes):
    cluster = SLURMCluster(cores=24,
                           memory='128GB',
                           project="co_aiolos",
                           walltime="02:00:00",
                           queue="savio2_bigmem",
                           local_directory='/global/home/users/qindan_zhu/myscratch/qindan_zhu/SatelliteNO2',
                           job_extra=['--qos="savio_lowprio"'])

    cluster.scale(n_nodes*4)
    client = Client(cluster)
    return client
Example #27
def get_slurm_dask_client_savio3(n_nodes):
    cluster = SLURMCluster(cores=32,
                           memory='96GB',
                           project="co_aiolos",
                           walltime="72:00:00",
                           queue="savio3",
                           local_directory='/global/home/users/qindan_zhu/myscratch/qindan_zhu/SatelliteNO2',
                           job_extra=['--qos="aiolos_savio3_normal"'])

    cluster.scale(n_nodes*8)
    client = Client(cluster)
    return client
Example #28
def _slurmclient(memory: int,
                 partition="epp,taskfarm",
                 account="epp") -> Client:
    # For slurm usage instructions see:
    # https://wiki.csc.warwick.ac.uk/twiki/bin/view/Desktop2018/CowUserGuide
    cluster = SLURMCluster(queue=partition,
                           memory=memory,
                           project=account,
                           cores=1,
                           walltime="24:00:00")
    cluster.adapt(minimum_jobs=1, maximum_jobs=200)
    return Client(address=cluster)
Example #29
 def getSlurmCluster(self, queue: str):
     self.logger.info(f"Initializing Slurm cluster using queue {queue}")
     cluster = self.slurm_clusters.setdefault(
         queue,
         SLURMCluster(cores=self.cores) if queue == "default" else
         SLURMCluster(queue=queue, cores=self.cores))
     cluster.adapt(minimum=1,
                   maximum=self.maxworkers,
                   interval="2s",
                   wait_count=500)
     print("CLUSTER JOB SCRIPT: " + cluster.job_script())
     return cluster
Example #30
def createSLURMCluster():
    cluster = SLURMCluster(queue=single_worker['queue'],
                           project=single_worker['project'],
                           cores=single_worker['cores'],
                           memory=single_worker['memory'],
                           walltime=single_worker['time'],
                           interface='ib0',
                           local_directory=single_worker['temp_folder'])

    cluster.scale(number_of_workers)
    client = Client(cluster)
    print(client)
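
The snippet above reads its settings from module-level globals that are not shown; a plausible shape for them (illustrative values):

single_worker = {
    'queue': 'normal',
    'project': 'my_account',
    'cores': 4,
    'memory': '16GB',
    'time': '12:00:00',
    'temp_folder': '/tmp/dask',
}
number_of_workers = 10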