def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())
    client = Client(cluster)
    future = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # pure=False because this function has potential side effects
        pure=False,
    )
    client.gather(future)
    print('Shutting down dask workers')
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())
    client = Client(cluster)
    future = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # pure=False because this function has potential side effects
        pure=False,
    )
    run_id = client.gather(future)
    print(f'Train run id: {run_id}')
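# Usage sketch for the two wrappers above (an assumption, not part of the
# source): each wrapper receives the training callable plus its arguments,
# ships it to the single SLURM-allocated worker, and blocks until the run
# completes. `train_unet` and its keyword arguments are hypothetical
# placeholders.
def train_unet(n_epochs=100, lr=1e-3):
    # ... build the model, train, save checkpoints, return a run id ...
    return 'run-id-placeholder'

if __name__ == '__main__':
    train_on_jz_dask('unet_training', train_unet, n_epochs=100, lr=1e-3)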
def run_HPC():
    #################
    # Setup dask cluster
    #################
    config = utils.read_config()
    num_workers = config["num_hipergator_workers"]

    # job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(
        processes=2,
        queue='hpg2-compute',
        cores=3,
        memory='11GB',
        walltime='24:00:00',
        job_extra=extra_args,
        local_directory="/home/b.weinstein/logs/",
        death_timeout=150)

    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)
    run(config, debug=False)
def start_dask_cluster(number_of_workers, mem_size="10GB"):
    #################
    # Setup dask cluster
    #################
    # job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(
        processes=1,
        queue='hpg2-compute',
        cores=1,
        memory=mem_size,
        walltime='24:00:00',
        job_extra=extra_args,
        local_directory="/home/b.weinstein/logs/dask/",
        death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=number_of_workers, maximum=number_of_workers)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
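# Usage sketch (an assumption, not part of the source): the helper above
# returns a connected Client, so a caller can map work across the adaptively
# held workers and close the connection when done. `process_tile` and the
# tile list are hypothetical placeholders.
def process_tile(path):
    # ... read and process one tile ...
    return path

if __name__ == '__main__':
    client = start_dask_cluster(number_of_workers=4, mem_size="10GB")
    results = client.gather(client.map(process_tile, ["tile_a.tif", "tile_b.tif"]))
    client.close()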
def initialize_dask(n, factor=5, slurm=False):
    if not slurm:
        cores = len(os.sched_getaffinity(0))
        cluster = distributed.LocalCluster(
            processes=False,
            n_workers=1,
            threads_per_worker=1)
    else:
        n = min(100, n)
        py = './enter_conda.sh python3'
        params = {
            'python': py,
            'cores': 1,
            'memory': '512MB',
            'walltime': '180',
            'processes': 1,
            'job_extra': [
                '--qos use-everything',
                '--array 0-{0:d}'.format(n - 1),
                '--requeue',
                '--output "/dev/null"'
            ],
            'env_extra': [
                'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}',
                'source /etc/profile.d/modules.sh',
                'cd {0!s}'.format(CONFIG['PATHS', 'root']),
            ]
        }
        cluster = SLURMCluster(**params)
        print(cluster.job_script())
        cluster.scale(1)

    print(cluster.dashboard_link)
    return distributed.Client(cluster)
def start(cpus=0, gpus=0, mem_size="10GB"):
    #################
    # Setup dask cluster
    #################
    # NOTE: if both cpus and gpus are requested, the GPU cluster below replaces
    # the CPU one; if neither is requested, `cluster` is never defined and
    # Client(cluster) raises a NameError.
    if cpus > 0:
        # job args
        extra_args = [
            "--error=/orange/idtrees-collab/logs/dask-worker-%j.err",
            "--account=ewhite",
            "--output=/orange/idtrees-collab/logs/dask-worker-%j.out"
        ]
        cluster = SLURMCluster(
            processes=1,
            queue='hpg2-compute',
            cores=1,
            memory=mem_size,
            walltime='24:00:00',
            job_extra=extra_args,
            extra=['--resources cpu=1'],
            scheduler_options={"dashboard_address": ":8781"},
            local_directory="/orange/idtrees-collab/tmp/",
            death_timeout=300)

        print(cluster.job_script())
        cluster.scale(cpus)

    if gpus:
        # job args
        extra_args = [
            "--error=/orange/idtrees-collab/logs/dask-worker-%j.err",
            "--account=ewhite",
            "--output=/orange/idtrees-collab/logs/dask-worker-%j.out",
            "--partition=gpu",
            "--gpus=1"
        ]
        cluster = SLURMCluster(
            processes=1,
            cores=1,
            memory=mem_size,
            walltime='24:00:00',
            job_extra=extra_args,
            extra=['--resources gpu=1'],
            scheduler_options={"dashboard_address": ":8787"},
            local_directory="/orange/idtrees-collab/tmp/",
            death_timeout=300)
        cluster.scale(gpus)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
def evaluate_pdnet_sense_dask(run_id, contrast, af, n_iter, cuda_visible_devices, n_samples):
    job_name = f'evaluate_pdnet_sense_{af}'
    if contrast is not None:
        job_name += f'_{contrast}'
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='160GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())
    client = Client(cluster)
    future = client.submit(
        # function to execute
        evaluate_pdnet_sense,
        # *args
        run_id,
        contrast,
        int(af),
        n_iter,
        n_samples,
        cuda_visible_devices,
        # pure=False because this function has potential side effects
        pure=False,
    )
    metrics_names, eval_res = client.gather(future)
    print(metrics_names)
    print(eval_res)
    print('Shutting down dask workers')
def run_HPC(data_paths):
    #################
    # Setup dask cluster
    #################
    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client, wait

    DeepForest_config = config.load_config()
    num_workers = DeepForest_config["num_hipergator_workers"]

    # job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(
        processes=1,
        queue='hpg2-compute',
        cores=1,
        memory='13GB',
        walltime='24:00:00',
        job_extra=extra_args,
        local_directory="/home/b.weinstein/logs/",
        death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)

    for site in data_paths:
        futures = dask_client.map(
            Generate.run,
            data_paths[site],
            site=site,
            DeepForest_config=DeepForest_config)
        wait(futures)
        print("{} complete".format(site))

    print("All sites complete")
def launch_dask_tasks(batch_sizes, save):
    job_name = 'dask_mnist_tf_example'
    cluster = SLURMCluster(
        cores=1,
        job_cpu=10,
        memory='10GB',
        job_name=job_name,
        walltime='1:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-dev',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
    )
    n_jobs = len(batch_sizes)
    cluster.scale(jobs=n_jobs)
    print(cluster.job_script())
    client = Client(cluster)
    futures = [
        client.submit(
            # function to execute
            train_dense_model,
            # *args
            None,
            save,
            batch_size,
            # saving checkpoints is a side effect, so the task is only
            # pure when save is False
            pure=not save,
        ) for batch_size in batch_sizes
    ]
    job_result = client.gather(futures)
    if all(job_result):
        print('All jobs finished without errors')
    else:
        print('One job errored out')
    print('Shutting down dask workers')
def eval_on_jz_dask(job_name, eval_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            # for now we can't use 4 GPUs because of
            # https://github.com/tensorflow/tensorflow/issues/39268
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())
    client = Client(cluster)
    future = client.submit(
        # function to execute
        eval_function,
        *args,
        **kwargs,
        # pure=False because this function has potential side effects
        pure=False,
    )
    metrics_names, eval_res = client.gather(future)
    print(metrics_names)
    print(eval_res)
    print('Shutting down dask workers')
def get_slurm_dask_client(n_workers, n_cores, n_processes):
    cluster = SLURMCluster(
        cores=n_cores,
        processes=n_processes,
        memory='80GB',
        interface='ib0',
        queue='standard',
        job_extra=[
            '-e slurm-%j.err',
            '-o slurm-%j.out',
            '--time=72:00:00 --requeue'])

    # Remove the --mem directive from the generated job header
    header_lines = cluster.job_header.split('\n')
    mem_pos = find_mem_pos(header_lines)
    header_lines = header_lines[:mem_pos] + header_lines[mem_pos + 1:]
    cluster.job_header = '\n'.join(header_lines)
    print(cluster.job_script())

    # Scale cluster to n_workers
    cluster.scale(n_workers)

    # Wait for cluster to start
    time.sleep(30)
    client = Client(cluster)
    print(client.scheduler_info())

    return client
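# Usage sketch (an assumption, not part of the source): the function above
# sleeps 30 s and then returns a connected Client, so a caller can submit
# work directly. The numpy workload below is a hypothetical placeholder.
import numpy as np

if __name__ == '__main__':
    client = get_slurm_dask_client(n_workers=4, n_cores=2, n_processes=1)
    future = client.submit(np.mean, np.arange(1e6))
    print(future.result())
    client.close()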
import logging
import time

import xarray as xr
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

variable = "tas"
uri = 'https://dataserver.nccs.nasa.gov/thredds/dodsC/bypass/CREATE-IP/reanalysis/MERRA2/mon/atmos/tas.ncml'

cluster = SLURMCluster(queue="myNodes")
cluster.adapt(minimum=1, maximum=4, interval="2s", wait_count=500)
print("CLUSTER JOB SCRIPT: " + cluster.job_script())

client = Client(cluster)

t0 = time.time()
dset: xr.Dataset = xr.open_dataset(uri)
da: xr.DataArray = dset[variable]
da2: xr.DataArray = da.groupby('time.month').mean('time')
da_monthly = da2.load()
print(da_monthly)
print(" Completed computation in " + str(time.time() - t0) + " seconds")

client.close()
cluster.close()
def main():
    parser = argparse.ArgumentParser(
        description='Simple example for using dask-jobqueue on SLURM')
    parser.add_argument('--proc_per_job', type=int, default=1,
                        help='Number of processes per job.')
    parser.add_argument('--cores_per_proc', type=float, default=2,
                        help='Number of cores per process.')
    parser.add_argument('--n_jobs', type=int, default=1,
                        help='Number of jobs')
    parser.add_argument('--array', type=int, default=0,
                        help='EXPERIMENTAL. If >0, then submit a job array '
                             'of this size. The total number of jobs will '
                             'be `array * n_jobs`.')
    parser.add_argument('--container', type=str,
                        help='Path to singularity container. If `None`, '
                             'then assumes conda environment.')
    parser.add_argument('--qos', type=str, help='QOS to use.')
    parser.add_argument('--dry', action='store_true',
                        help='Print job script and exit (no submission)')
    parser.add_argument('--load', type=int, default=1000,
                        help='Load for the function.')
    args = parser.parse_args()

    n_procs = args.proc_per_job * args.n_jobs

    params = {
        'cores': int(args.cores_per_proc * args.proc_per_job),
        'memory': '{0:d}00MB'.format(args.proc_per_job * 5),
        'processes': args.proc_per_job,
        # The name to assign to each worker
        'name': 'dask_test'
    }

    job_extra = ['--requeue']
    env_extra = []

    if args.qos is not None:
        job_extra.append('--qos {}'.format(args.qos))

    if args.array > 0:
        n_procs = n_procs * args.array
        job_extra.append('--array 0-{0:d}'.format(args.array - 1))
        # This is added to ensure that each worker has a unique ID.
        # This may be unnecessary.
        env_extra.append(
            'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}')

    if args.container is not None:
        # When using a container, dask needs to know how to enter the python
        # environment.
        # Note: the binding `-B ...` is cluster (OpenMind) specific but can
        # generalize. The binding is required since `singularity` will not
        # bind by default.
        cont = os.path.normpath(args.container)
        bind = cont.split(os.sep)[1]
        bind = '-B /{0!s}:/{0!s}'.format(bind)
        py = 'singularity exec {0!s} {1!s} python3'.format(bind, cont)
        params.update({'python': py})
        # Dask will generate a job script, but some elements will be missing
        # due to the way the singularity container interfaces with slurm:
        # the modules need to be initialized and singularity needs to be added.
        env_extra += [
            'source /etc/profile.d/modules.sh',
            'module add openmind/singularity/2.6.0']

    params.update({
        'job_extra': job_extra,
        'env_extra': env_extra})

    cluster = SLURMCluster(**params)

    # Display the job script.
    print(cluster.job_script())
    pprint(params)

    t0 = time.time()
    num_crunch(100)
    expected_dur = (time.time() - t0) * args.load
    print('Expected time of linear call: {0:f}'.format(expected_dur))

    if args.dry:
        return

    # Scale the cluster to the number of jobs.
    print('Scaling by {}'.format(args.n_jobs))
    cluster.scale(args.proc_per_job * args.n_jobs)

    # Setup a client that interfaces with the workers.
    client = distributed.Client(cluster)
    time.sleep(10)
    print(cluster)
    print(client)
    pprint(client.has_what())
    # pprint(client.scheduler_info())

    # Generate a transaction.
    futures = client.map(num_crunch, range(args.load))
    t0 = time.time()

    # Compute (and then discard) while keeping track of progress.
    distributed.progress(futures)
    dur = time.time() - t0

    msg = '\n\nSpeed up of {0:f}x ({1:f}/{2:f})'.format(
        expected_dur / dur, expected_dur, dur)
    print(msg)
    msg = 'Ideal speed up is {0:f}x'.format(n_procs)
    print(msg)
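# Example invocation of the script above (an assumption; the flags match the
# argparse definitions, the file name is a hypothetical placeholder):
#
#   python dask_slurm_example.py --n_jobs 2 --proc_per_job 4 --cores_per_proc 2
#
# With --dry, the script only prints the generated job script and exits.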
import sys
import time

sys.path.insert(0, '/home/albert7a/git/xscale')
import xscale

from dask_jobqueue import SLURMCluster
from dask.distributed import Client

cluster = SLURMCluster(
    cores=1,
    name='make_profiles',
    walltime='00:30:00',
    job_extra=[
        '--constraint=HSW24',
        '--exclusive',
        '--nodes=1',
        '--ntasks-per-node=24'
    ],
    memory='120GB',
    interface='ib0')
print(cluster.job_script())

cluster.scale(240)
client = Client(cluster)

# Wait until at least two workers have connected
nb_workers = 0
while True:
    nb_workers = len(client.scheduler_info()["workers"])
    if nb_workers >= 2:
        break
    time.sleep(1)

data_dir = '/store/CT1/hmg2840/lbrodeau/eNATL60/eNATL60-BLBT02-S/'
gridfile = '/store/CT1/hmg2840/lbrodeau/eNATL60/eNATL60-I/mesh_mask_eNATL60_3.6_lev1.nc4'
def main():
    # If multiprocessing, set cluster and job settings
    if cfg.MULTI_PHRASE_CNCPTS:
        cluster = SLURMCluster(
            queue='main',
            cores=8,
            memory="36000 MB",
            processes=7,
            job_extra=[
                '--job-name=extract_concepts',
                '--mail-type=NONE',
                '-e /home/students/suter/logs_suter/worker_logs/slurm-%j.err',
                '-o /home/students/suter/logs_suter/worker_logs/slurm-%j.out',
                '--time=24:00:00'
            ],
            env_extra=['PATH=/opt/slurm/bin:$PATH'])

    # Load files to process
    with open(cfg.FILES_TO_USE) as infile:
        files_to_use = infile.read().split('\n')

    # Load captions
    with open(cfg.DATA_DIR + "flickr30k-captions/results_20130124.token", "r") as infile:
        data = infile.read()
    caption_data = data.split('\n')

    # Load files that are already processed
    processed_files_dir = [
        f for f in os.listdir('data/phrase_concepts/')
        if f.endswith('.pickle')
    ]
    processed_samples = []

    # Save files that are already processed
    # (splitext drops the extension reliably; str.strip removes a character
    # set, not a suffix)
    for file in processed_files_dir:
        parts = file.split('_')
        image_ID = parts[-2]
        caption_ID = os.path.splitext(parts[-1])[0]
        processed_samples.append((image_ID, caption_ID))

    print(len(processed_samples))

    # Load dataset and get captions and entities
    dataset = sg.Dataset(cfg.DATA_DIR)
    dataset.get_captions()
    dataset.get_entities()

    # Cluster settings
    if cfg.MULTI_PHRASE_CNCPTS:
        cluster.scale(jobs=15)
        print(cluster.job_script())
        client = Client(cluster)

    # Initialize
    futures = []
    chunk_counter = 0

    # For each sample
    for line in caption_data:
        # Skip if empty
        if line.strip() == '':
            continue

        # Get ID and sent
        split_line = line.split('\t')
        ID = split_line[0]
        sent = split_line[1]

        # Get image and caption ID
        split_ID = ID.split('#')
        image_ID = os.path.splitext(split_ID[0])[0]
        caption_ID = split_ID[1]
        print(image_ID, caption_ID)

        # Skip if already processed
        if (image_ID, caption_ID) in processed_samples:
            continue

        # Skip if not in "files to process"
        if image_ID not in files_to_use:
            continue

        # Multiprocessing
        if cfg.MULTI_PHRASE_CNCPTS:
            # Count multi-processing chunks
            chunk_counter += 1

            # Run: get phrase concepts for given sample
            future = client.submit(get_phrase_concepts, line, dataset, cfg.BBOXES_DIR)
            futures.append(future)

            # Gather after 20 samples
            if chunk_counter >= 20:
                client.gather(futures)
                futures = []
                chunk_counter = 0

        # Without multi-processing
        else:
            get_phrase_concepts(line, dataset, cfg.BBOXES_DIR)

    # Final gathering of results
    if cfg.MULTI_PHRASE_CNCPTS:
        client.gather(futures)

    # Load all phrase concept files
    phrase_concepts_files = [
        f for f in os.listdir('data/phrase_concepts/')
        if f.endswith('.txt')
    ]

    # Write it all out to one file
    with open('data/phrase_concepts.txt', 'w') as outfile:
        outfile.write('')

    # For each file, load the content and copy it to the new file
    for file in phrase_concepts_files:
        # Load content
        with open('data/phrase_concepts/' + file, 'r') as infile:
            line = infile.read()
        line = line.split('\n')[0]

        # Write out to new file
        with open('data/phrase_concepts.txt', 'a') as outfile:
            outfile.write(line)
            outfile.write('\n')