def start_dask():
    ######################################################
    # Setup dask cluster
    ######################################################
    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           threads=2,
                           memory='4GB',
                           walltime='144:00:00')

    print('Starting up workers')
    workers = []
    for _ in range(config.num_hipergator_workers):
        workers.extend(cluster.start_workers(1))
        sleep(60)
    dask_client = Client(cluster)

    wait_time = 0
    while len(dask_client.scheduler_info()['workers']) < config.num_hipergator_workers:
        print('waiting on workers: {s} sec. so far'.format(s=wait_time))
        sleep(10)
        wait_time += 10

        # If 5 minutes goes by, try adding them again
        if wait_time > 300:
            workers.extend(cluster.start_workers(1))

    print('All workers accounted for')
    # xr import must be after dask.array, and I think after setting
    # up the cluster/client.
    import dask.array as da
    import xarray as xr
def test_run_sorters_dask():
    cache_folder = './local_cache'
    working_folder = 'test_run_sorters_dask'
    if os.path.exists(cache_folder):
        shutil.rmtree(cache_folder)
    if os.path.exists(working_folder):
        shutil.rmtree(working_folder)

    # create recording
    recording_dict = {}
    for i in range(8):
        rec, _ = toy_example(num_channels=4, duration=30, seed=0, num_segments=1)
        # make dumpable
        rec = rec.save(name=f'rec_{i}')
        recording_dict[f'rec_{i}'] = rec

    sorter_list = ['tridesclous', ]

    # create a dask Client for a slurm queue
    from dask.distributed import Client
    from dask_jobqueue import SLURMCluster

    python = '/home/samuel.garcia/.virtualenvs/py36/bin/python3.6'
    cluster = SLURMCluster(processes=1, cores=1, memory="12GB",
                           python=python, walltime='12:00:00')
    cluster.scale(5)
    client = Client(cluster)

    # dask
    t0 = time.perf_counter()
    run_sorters(sorter_list, recording_dict, working_folder,
                engine='dask', engine_kwargs={'client': client},
                with_output=False, mode_if_folder_exists='keep')
    t1 = time.perf_counter()
    print(t1 - t0)
def make_cluster():
    if socket.gethostname() == 'sgw1':
        # number of processing units per node. for ease of use, set cores to
        # the number of CPUs per node. warning: this is the unitary increment
        # by which you can scale the number of workers inside your cluster.
        proc_per_worker = 24
        # total number of slurm nodes to request. Max number of dask workers
        # will be proc_per_worker * max_slurm_nodes
        max_slurm_nodes = 4
        cluster = SLURMCluster(
            n_workers=0,  # number of initial slurm jobs
            memory="16GB",
            # cores = number of processing units per job, split between
            # dask Workers (processes) and the threads of each worker's
            # ThreadPoolExecutor
            cores=proc_per_worker,
            # among those $cores, how many should be dask Workers
            # (each worker will then have cores // processes threads inside
            # its ThreadPoolExecutor)
            # sets cpus-per-task=processes inside the batch script
            processes=proc_per_worker,
            # job_extra=[get_sbatch_args(max_workers, proc_per_worker)],
        )
        # scale the number of unitary dask workers (not batch jobs)
        cluster.scale(96)
    else:
        cluster = LocalCluster(
            n_workers=2,
            threads_per_worker=1,
            processes=False,
            dashboard_address=':7777'
        )
    return cluster
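# Hedged usage sketch for make_cluster() above, assuming it runs on a host
# whose hostname is not 'sgw1', so the LocalCluster branch is taken; the
# array computation is purely illustrative.
from dask.distributed import Client
import dask.array as da

cluster = make_cluster()
client = Client(cluster)
x = da.random.random((1000, 1000), chunks=(250, 250))
print(x.mean().compute())  # executes on the cluster's workers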
def test_header():
    with SLURMCluster(walltime='00:02:00', processes=4, cores=8,
                      memory='28GB') as cluster:

        assert '#SBATCH' in cluster.job_header
        assert '#SBATCH -J dask-worker' in cluster.job_header
        assert '#SBATCH -n 1' in cluster.job_header
        assert '#SBATCH --cpus-per-task=8' in cluster.job_header
        assert '#SBATCH --mem=27G' in cluster.job_header
        assert '#SBATCH -t 00:02:00' in cluster.job_header
        assert '#SBATCH -p' not in cluster.job_header
        assert '#SBATCH -A' not in cluster.job_header

    with SLURMCluster(queue='regular', project='DaskOnSlurm', processes=4,
                      cores=8, memory='28GB', job_cpu=16,
                      job_mem='100G') as cluster:

        assert '#SBATCH --cpus-per-task=16' in cluster.job_header
        assert '#SBATCH --cpus-per-task=8' not in cluster.job_header
        assert '#SBATCH --mem=100G' in cluster.job_header
        assert '#SBATCH -t ' in cluster.job_header
        assert '#SBATCH -A DaskOnSlurm' in cluster.job_header
        assert '#SBATCH -p regular' in cluster.job_header

    with SLURMCluster(cores=4, memory='8GB') as cluster:

        assert '#SBATCH' in cluster.job_header
        assert '#SBATCH -J ' in cluster.job_header
        assert '#SBATCH -n 1' in cluster.job_header
        assert '#SBATCH -t ' in cluster.job_header
        assert '#SBATCH -p' not in cluster.job_header
        assert '#SBATCH -A' not in cluster.job_header
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects
        pure=True,
    )
    client.gather(futures)
    print('Shutting down dask workers')
class ManagedSLURMCluster(ManagedCluster):
    """
    Args:
        project (str, optional): project name
        queue (str, optional): queue to submit to
        walltime (str, optional): maximum wall time
    """

    def __init__(self, project=None, queue=None, walltime="24:00:00", **kwargs):
        super().__init__(**kwargs)
        self._project = project
        self._queue = queue
        self._walltime = walltime

    def open(self):
        from dask_jobqueue import SLURMCluster

        args = {
            "cores": self.threads_per_worker,
            "processes": 1,
            "memory": self.memory,
            "project": self._project,
            "queue": self._queue,
            "walltime": self._walltime,
            "log_directory": "/tmp",
        }
        self._cluster = SLURMCluster(**args)
        self._cluster.scale(self.n_workers)
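# Hedged usage sketch for ManagedSLURMCluster above. The ManagedCluster base
# class is not shown in this snippet, so the constructor keywords n_workers,
# threads_per_worker, and memory are assumptions about its interface, and the
# project/queue names are hypothetical.
cluster = ManagedSLURMCluster(project="myproject",
                              queue="general",
                              walltime="04:00:00",
                              n_workers=4,
                              threads_per_worker=2,
                              memory="8GB")
cluster.open()  # creates the underlying SLURMCluster and scales it to n_workers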
def main(args):
    split_files = split_file(args.url_file)

    if args.distribute:
        # note: each entry needs its own comma, otherwise adjacent string
        # literals are silently concatenated into invalid sbatch arguments
        extra_args = [
            "-J newsnet_worker",
            "--mail-type=ALL",
            "[email protected]",
            "--gres=nvme:100"]
        cluster = SLURMCluster(
            name="newsnet_worker",
            cores=20,
            memory="2GB",
            queue="small",
            walltime="3:00:00",
            local_directory='/tmp',
            log_directory=f"{os.environ.get('PWD')}/dask-worker-space",
            project=args.project,
            job_extra=extra_args)

        with Client(cluster) as client:
            print("\n\nLaunching Dask SLURM cluster...")
            cluster.scale(4)
            to_upload = f'{os.path.dirname(os.path.abspath(sys.argv[0]))}/parse_articles.py'
            client.upload_file(to_upload)
            print(to_upload)
            _ = [run_parse(args, file) for file in split_files]
            [os.remove(sf) for sf in split_files]
    else:
        with Client() as client:
            _ = [run_parse(args, file) for file in split_files]
            [os.remove(sf) for sf in split_files]
def slurm_cluster(n_workers, cores_per_worker, mem_per_worker, walltime, dask_folder):
    """helper function to start a Dask SLURM-based cluster

    :param n_workers: maximum number of workers to use
    :param cores_per_worker: number of cores per worker
    :param mem_per_worker: maximum amount of RAM per worker
    :param walltime: maximum wall time for workers
    :param dask_folder: folder to keep workers' temporary data
    """
    dask.config.set({
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,   # avoid spilling to disk
    })

    cluster = SLURMCluster(
        cores=cores_per_worker,
        processes=1,
        memory=mem_per_worker,
        walltime=walltime,
        log_directory=dask_folder / "logs",  # folder for each worker's SLURM logs
        local_directory=dask_folder,         # folder for workers' data
    )
    cluster.adapt(minimum=1, maximum=n_workers)

    client = Client(cluster)
    return client
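# Hedged usage sketch for slurm_cluster() above; the argument values and the
# scratch path are illustrative assumptions, and it must run on a SLURM login
# node with dask and dask-jobqueue installed.
from pathlib import Path

client = slurm_cluster(n_workers=4,
                       cores_per_worker=2,
                       mem_per_worker="8GB",
                       walltime="01:00:00",
                       dask_folder=Path("/tmp/dask"))
print(client.submit(sum, [1, 2, 3]).result())  # -> 6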
def run_HPC():
    #################
    # Setup dask cluster
    #################
    config = utils.read_config()
    num_workers = config["num_hipergator_workers"]

    # job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(
        processes=2,
        queue='hpg2-compute',
        cores=3,
        memory='11GB',
        walltime='24:00:00',
        job_extra=extra_args,
        local_directory="/home/b.weinstein/logs/",
        death_timeout=150)

    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)

    run(config, debug=False)
def main(args):
    config_file = args.config_file

    # Configure on cluster
    if config_file:
        with open(config_file, 'r') as stream:
            inp = yaml.safe_load(stream)
        cores = inp['jobqueue']['slurm']['cores']
        memory = inp['jobqueue']['slurm']['memory']
        jobs = inp['jobqueue']['slurm']['jobs']
        cluster = SLURMCluster(
            cores=cores,
            memory=memory,
        )
        cluster.scale(jobs=jobs)
    # Configure locally
    else:
        cluster = LocalCluster()

    client = Client(cluster)
    raised_futures = client.map(sleep_more, range(100))
    progress(raised_futures)
    raised = client.gather(raised_futures)
    print('\n', raised)
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects
        pure=True,
    )
    run_id = client.gather(futures)
    print(f'Train run id: {run_id}')
def initialize_dask(n, factor=5, slurm=False):
    if not slurm:
        cores = len(os.sched_getaffinity(0))
        cluster = distributed.LocalCluster(processes=False,
                                           n_workers=1,
                                           threads_per_worker=1)
    else:
        n = min(100, n)
        py = './enter_conda.sh python3'
        params = {
            'python': py,
            'cores': 1,
            'memory': '512MB',
            'walltime': '180',
            'processes': 1,
            'job_extra': [
                '--qos use-everything',
                '--array 0-{0:d}'.format(n - 1),
                '--requeue',
                '--output "/dev/null"'
            ],
            'env_extra': [
                'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}',
                'source /etc/profile.d/modules.sh',
                'cd {0!s}'.format(CONFIG['PATHS', 'root']),
            ]
        }
        cluster = SLURMCluster(**params)
        print(cluster.job_script())
        cluster.scale(1)

    print(cluster.dashboard_link)
    return distributed.Client(cluster)
def dask_slurm_cluster(queue=None, cores=None, memory=None, minimum_workers=None,
                       maximum_workers=None, address=None, port=None, **kwargs):
    __doc__ = _doc_dask_slurm_cluster  # noqa
    queue = queue or DEFAULT_QUEUE
    cores = cores or DEFAULT_NUM_CORES
    memory = memory or DEFAULT_MEMORY
    minimum_workers = minimum_workers or DEFAULT_MINIMUM_WORKERS
    maximum_workers = maximum_workers or DEFAULT_MAXIMUM_WORKERS
    address = address or DEFAULT_ADDRESS
    port = port or DEFAULT_PORT
    cluster = SLURMCluster(queue=queue,
                           cores=cores,
                           memory=memory,
                           host=f'tcp://{address}:{port}',
                           **kwargs)
    cluster.adapt(minimum=minimum_workers, maximum=maximum_workers)
    return cluster
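# Hedged usage sketch for dask_slurm_cluster() above. The queue, sizing, and
# address values are illustrative assumptions; passing every argument
# explicitly means none of the DEFAULT_* fallbacks (defined elsewhere in the
# original module) need to exist.
from dask.distributed import Client

cluster = dask_slurm_cluster(queue='general', cores=4, memory='16GB',
                             minimum_workers=1, maximum_workers=8,
                             address='10.0.0.1', port=8786)
client = Client(cluster)  # adaptively scales between 1 and 8 workers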
class SwissFelCluster:
    def __init__(self, cores=8, memory="24 GB", workers=5):
        self.cluster = SLURMCluster(cores=cores, memory=memory)
        self.client = Client(self.cluster)
        self.ip = socket.gethostbyname(socket.gethostname())
        self.dashboard_port_scheduler = self.client._scheduler_identity.get(
            "services")["dashboard"]
        self.username = getpass.getuser()

    def _repr_html_(self):
        return self.client._repr_html_()

    def scale_workers(self, N_workers):
        self.cluster.scale(N_workers)

    def create_dashboard_tunnel(self, ssh_host="ra"):
        print(
            "type the following command in a terminal; if the port is taken, "
            "change the first number in the command."
        )
        print(" ".join([
            f"jupdbport={self.dashboard_port_scheduler}",
            "&&",
            "ssh",
            "-f",
            "-L",
            f"$jupdbport:{self.ip}:{self.dashboard_port_scheduler}",
            f"{self.username}@{ssh_host}",
            "sleep 10",
            "&&",
            "firefox",
            "http://localhost:$jupdbport",
        ]))
def start_dask_cluster(number_of_workers, mem_size="10GB"):
    #################
    # Setup dask cluster
    #################

    # job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory=mem_size,
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/dask/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=number_of_workers, maximum=number_of_workers)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
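# Hedged usage sketch for start_dask_cluster() above; start_tunnel comes from
# the original module and is not shown here, and the worker count is an
# illustrative assumption.
client = start_dask_cluster(number_of_workers=4, mem_size="10GB")
print(client.scheduler_info()["workers"])  # inspect the workers that joined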
def launch_dask_cluster(queue, nodes, localcluster):
    """
    Usage from script:

        from distributed import Client
        from lsst_dashboard.cli import launch_dask_cluster
        cluster, port = launch_dask_cluster('normal', 6, False)
        client = Client(cluster)
    """
    # Launch Dask Cluster
    if "lsst-dev" in host:
        # Set up allowed ports
        (scheduler_port, ) = find_available_ports(1, *DASK_ALLOWED_PORTS)
        (lsst_dashboard_port, ) = find_available_ports(
            1, *DASHBOARD_ALLOWED_PORTS)
        (dask_dashboard_port, ) = find_available_ports(
            1, *DASK_DASHBOARD_ALLOWED_PORTS)
    else:
        localcluster = True
        lsst_dashboard_port = 52001
        dask_dashboard_port = 52002

    if not localcluster:
        from dask_jobqueue import SLURMCluster

        print(f"...starting dask cluster using slurm on {host} (queue={queue})")
        procs_per_node = 6
        cluster = SLURMCluster(
            queue=queue,
            cores=24,
            processes=procs_per_node,
            memory="128GB",
            scheduler_port=scheduler_port,
            extra=[
                f'--worker-port {":".join(str(p) for p in DASK_ALLOWED_PORTS)}'
            ],
            dashboard_address=f":{dask_dashboard_port}",
        )
        print(f"...requesting {nodes} nodes")
        cluster.scale(nodes * procs_per_node)
        print(
            "run the command below from your local machine to forward ports "
            "for viewing the dashboard and dask diagnostics:"
        )
        print(
            f"\nssh -N -L {lsst_dashboard_port}:{host}:{lsst_dashboard_port} "
            f"-L {dask_dashboard_port}:{host}:{dask_dashboard_port} "
            f"{username}@{hostname}\n"
        )
    else:
        from dask.distributed import LocalCluster

        print(f"starting local dask cluster on {host}")
        cluster = LocalCluster(dashboard_address=f":{dask_dashboard_port}")
        print(
            f"### dask dashboard available at http://localhost:{dask_dashboard_port} ###"
        )

    return cluster, lsst_dashboard_port
def startdask(self):
    if self.local:
        self.daskclient = Client()
        self.daskclient.cluster.scale(self.n_workers)
    else:
        self.daskcluster = SLURMCluster(queue=self.queue,
                                        walltime=self.walltime,
                                        processes=self.processes,
                                        memory=self.memory,
                                        cores=self.cores,
                                        job_extra=self.job_extra)
        self.workers = self.daskcluster.start_workers(self.n_workers)
        self.daskclient = Client(self.daskcluster)
def __init__(self):
    print("Start Cluster")
    self.cluster = SLURMCluster(memory='16g',
                                processes=1,
                                cores=1,
                                death_timeout=200,
                                walltime="168:00:00",
                                job_extra=['--partition=Sibirien'])
    self.cluster.start_workers(25)
    self.cli = Client(self.cluster.scheduler.address)
def get_slurm_dask_client(n_workers):
    cluster = SLURMCluster(cores=24,
                           memory='128GB',
                           project="co_aiolos",
                           walltime="24:00:00",
                           queue="savio2_bigmem")
    cluster.scale(n_workers)
    client = Client(cluster)
    return client
def startdask(self):
    if self.local:
        self.daskclient = Client()
        self.daskclient.cluster.scale(self.n_workers)
    else:
        self.daskcluster = SLURMCluster(queue=self.queue,
                                        death_timeout=self.death_timeout,
                                        walltime=self.walltime,
                                        processes=self.processes,
                                        memory=self.memory,
                                        cores=self.cores,
                                        local_directory=self.working_directory,
                                        log_directory=self.working_directory,
                                        job_extra=self.job_extra)
        self.workers = self.daskcluster.start_workers(self.n_workers)
        self.daskclient = Client(self.daskcluster)
def get_slurm_dask_client_bigmem(n_nodes): cluster = SLURMCluster(cores=24, memory='128GB', project="co_aiolos", walltime="02:00:00", queue="savio2_bigmem", job_extra=['--qos="savio_lowprio"']) cluster.scale(n_nodes*6) client = Client(cluster) return client
def test_job_script():
    with SLURMCluster(
        walltime="00:02:00", processes=4, cores=8, memory="28GB"
    ) as cluster:
        job_script = cluster.job_script()
        assert "#SBATCH" in job_script
        assert "#SBATCH -J dask-worker" in job_script
        assert "--memory-limit 7.00GB " in job_script
        assert "#SBATCH -n 1" in job_script
        assert "#SBATCH --cpus-per-task=8" in job_script
        assert "#SBATCH --mem=27G" in job_script
        assert "#SBATCH -t 00:02:00" in job_script
        assert "#SBATCH -p" not in job_script
        assert "#SBATCH -A" not in job_script
        assert "export " not in job_script
        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        assert "--nthreads 2 --nprocs 4 --memory-limit 7.00GB" in job_script

    with SLURMCluster(
        walltime="00:02:00",
        processes=4,
        cores=8,
        memory="28GB",
        env_extra=[
            'export LANG="en_US.utf8"',
            'export LANGUAGE="en_US.utf8"',
            'export LC_ALL="en_US.utf8"',
        ],
    ) as cluster:
        job_script = cluster.job_script()
        assert "#SBATCH" in job_script
        assert "#SBATCH -J dask-worker" in job_script
        assert "#SBATCH -n 1" in job_script
        assert "#SBATCH --cpus-per-task=8" in job_script
        assert "#SBATCH --mem=27G" in job_script
        assert "#SBATCH -t 00:02:00" in job_script
        assert "#SBATCH -p" not in job_script
        assert "#SBATCH -A" not in job_script
        assert 'export LANG="en_US.utf8"' in job_script
        assert 'export LANGUAGE="en_US.utf8"' in job_script
        assert 'export LC_ALL="en_US.utf8"' in job_script
        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        assert "--nthreads 2 --nprocs 4 --memory-limit 7.00GB" in job_script
def get_slurm_dask_client(n_workers, n_cores):
    cluster = SLURMCluster(cores=n_cores,
                           memory='32GB',
                           project="co_aiolos",
                           walltime="02:00:00",
                           queue="savio2_gpu",
                           job_extra=['--gres=gpu:1', '--cpus-per-task=2'])
    cluster.scale(n_workers)
    client = Client(cluster)
    return client
def get_slurm_dask_client_savio3(n_nodes): cluster = SLURMCluster(cores=32, memory='96GB', project="co_aiolos", walltime="72:00:00", queue="savio3", job_extra=['--qos="aiolos_savio3_normal"']) cluster.scale(n_nodes*32) client = Client(cluster) return client
def get_slurm_dask_client_bigmem(n_nodes): cluster = SLURMCluster(cores=24, memory='128GB', project="co_aiolos", walltime="02:00:00", queue="savio2_bigmem", local_directory = '/global/home/users/qindan_zhu/myscratch/qindan_zhu/SatelliteNO2', job_extra=['--qos="savio_lowprio"']) cluster.scale(n_nodes*4) client = Client(cluster) return client
def get_slurm_dask_client_savio3(n_nodes): cluster = SLURMCluster(cores=32, memory='96GB', project="co_aiolos", walltime="72:00:00", queue="savio3", local_directory = '/global/home/users/qindan_zhu/myscratch/qindan_zhu/SatelliteNO2', job_extra=['--qos="aiolos_savio3_normal"']) cluster.scale(n_nodes*8) client = Client(cluster) return client
def _slurmclient(memory: int, partition="epp,taskfarm", account="epp") -> Client:
    # For slurm usage instructions see:
    # https://wiki.csc.warwick.ac.uk/twiki/bin/view/Desktop2018/CowUserGuide
    cluster = SLURMCluster(queue=partition,
                           memory=memory,
                           project=account,
                           cores=1,
                           walltime="24:00:00")
    cluster.adapt(minimum_jobs=1, maximum_jobs=200)
    return Client(address=cluster)
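# Hedged usage sketch for _slurmclient() above. Despite the int annotation,
# dask-jobqueue accepts a memory string such as "4GB"; the workload below is
# purely illustrative.
client = _slurmclient("4GB")
squares = client.gather(client.map(lambda x: x * x, range(10)))
print(squares)  # [0, 1, 4, ..., 81]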
def getSlurmCluster(self, queue: str):
    self.logger.info(f"Initializing Slurm cluster using queue {queue}")
    cluster = self.slurm_clusters.setdefault(
        queue,
        SLURMCluster(cores=self.cores) if queue == "default"
        else SLURMCluster(queue=queue, cores=self.cores))
    cluster.adapt(minimum=1, maximum=self.maxworkers, interval="2s",
                  wait_count=500)
    print("CLUSTER JOB SCRIPT: " + cluster.job_script())
    return cluster
def createSLURMCluster():
    cluster = SLURMCluster(queue=single_worker['queue'],
                           project=single_worker['project'],
                           cores=single_worker['cores'],
                           memory=single_worker['memory'],
                           walltime=single_worker['time'],
                           interface='ib0',
                           local_directory=single_worker['temp_folder'])
    cluster.scale(number_of_workers)
    client = Client(cluster)
    print(client)