def run_HPC():
    #################
    # Setup dask cluster
    #################
    config = utils.read_config()
    num_workers = config["num_hipergator_workers"]

    # job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(
        processes=2,
        queue='hpg2-compute',
        cores=3,
        memory='11GB',
        walltime='24:00:00',
        job_extra=extra_args,
        local_directory="/home/b.weinstein/logs/",
        death_timeout=150)

    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)

    run(config, debug=False)
import dask
from dask.distributed import Client
from dask_jobqueue import SLURMCluster


def slurm_cluster(n_workers, cores_per_worker, mem_per_worker, walltime, dask_folder):
    """Helper function to start a Dask SLURM-based cluster.

    :param n_workers: maximum number of workers to use
    :param cores_per_worker: number of cores per worker
    :param mem_per_worker: maximum amount of RAM per worker
    :param walltime: maximum walltime for workers
    :param dask_folder: folder to keep the workers' temporary data
    """
    dask.config.set({
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,   # avoid spilling to disk
    })

    cluster = SLURMCluster(
        cores=cores_per_worker,
        processes=1,
        memory=mem_per_worker,
        walltime=walltime,
        log_directory=dask_folder / "logs",  # folder for SLURM logs for each worker
        local_directory=dask_folder,         # folder for workers' data
    )
    cluster.adapt(minimum=1, maximum=n_workers)

    client = Client(cluster)
    return client
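# Hypothetical usage sketch for the slurm_cluster() helper above; the argument
# values and the _square task are assumptions for illustration, not taken from
# the original project.
from pathlib import Path


def _square(x):
    return x * x


client = slurm_cluster(
    n_workers=4,
    cores_per_worker=2,
    mem_per_worker="8GB",
    walltime="01:00:00",
    dask_folder=Path("dask_tmp"),
)
print(client.submit(_square, 3).result())  # prints 9 once a worker is up
client.shutdown()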
def start_dask_cluster(number_of_workers, mem_size="10GB"):
    #################
    # Setup dask cluster
    #################
    # job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory=mem_size,
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/dask/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=number_of_workers, maximum=number_of_workers)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
def dask_slurm_cluster(queue=None, cores=None, memory=None, minimum_workers=None,
                       maximum_workers=None, address=None, port=None, **kwargs):
    __doc__ = _doc_dask_slurm_cluster  # noqa

    queue = queue or DEFAULT_QUEUE
    cores = cores or DEFAULT_NUM_CORES
    memory = memory or DEFAULT_MEMORY
    minimum_workers = minimum_workers or DEFAULT_MINIMUM_WORKERS
    maximum_workers = maximum_workers or DEFAULT_MAXIMUM_WORKERS
    address = address or DEFAULT_ADDRESS
    port = port or DEFAULT_PORT

    cluster = SLURMCluster(queue=queue,
                           cores=cores,
                           memory=memory,
                           host=f'tcp://{address}:{port}',
                           **kwargs)
    cluster.adapt(minimum=minimum_workers, maximum=maximum_workers)
    return cluster
def train_eval_dealiasers(contrast='CORPD_FBK', n_epochs=200, n_samples=None,
                          model_name=None, model_size=None, loss='mae'):
    job_name = 'dealiasing_fastmri'
    model_specs = list(get_model_specs(force_res=True, dealiasing=True))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)

    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(train_cluster)

    futures = [client.submit(
        # function to execute
        train_dealiaser,
        model_fun=model_fun,
        model_kwargs=kwargs,
        run_id=f'{model_name}_{model_size}',
        n_scales=n_scales,
        contrast=contrast,
        n_epochs=n_epochs,
        n_samples=n_samples,
        loss=loss,
    ) for model_name, model_size, model_fun, kwargs, _, n_scales, _ in model_specs]

    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()

    # eval
    eval_dealiasers(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
        loss=loss,
    )
    return run_ids
def _slurmclient(memory: int, partition="epp,taskfarm", account="epp") -> Client:
    # For SLURM usage instructions see:
    # https://wiki.csc.warwick.ac.uk/twiki/bin/view/Desktop2018/CowUserGuide
    cluster = SLURMCluster(queue=partition,
                           memory=memory,
                           project=account,
                           cores=1,
                           walltime="24:00:00")
    cluster.adapt(minimum_jobs=1, maximum_jobs=200)
    return Client(cluster)
def dask_slurm_cluster(queue=None, cores=None, memory=None, minimum_workers=None,
                       maximum_workers=None):
    queue = queue or DEFAULT_QUEUE
    cores = cores or DEFAULT_NUM_CORES
    memory = memory or DEFAULT_MEMORY
    minimum_workers = minimum_workers or DEFAULT_MINIMUM_WORKERS
    maximum_workers = maximum_workers or DEFAULT_MAXIMUM_WORKERS

    cluster = SLURMCluster(queue=queue, cores=cores, memory=memory)
    cluster.adapt(minimum=minimum_workers, maximum=maximum_workers)
    return cluster
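# Hedged example of wiring the cluster returned by dask_slurm_cluster() to a
# Client; the queue, cores, and memory values here are assumptions, and the
# DEFAULT_* constants are expected to come from the surrounding module.
from dask.distributed import Client

cluster = dask_slurm_cluster(queue="batch", cores=4, memory="16GB",
                             minimum_workers=1, maximum_workers=10)
with Client(cluster) as client:
    print(client.dashboard_link)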
def run_HPC(data_paths):
    #################
    # Setup dask cluster
    #################
    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client, wait

    DeepForest_config = config.load_config()
    num_workers = DeepForest_config["num_hipergator_workers"]

    # job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory='13GB',
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)

    for site in data_paths:
        futures = dask_client.map(Generate.run,
                                  data_paths[site],
                                  site=site,
                                  DeepForest_config=DeepForest_config)
        wait(futures)
        print("{} complete".format(site))

    print("All sites complete")
def start_slurm_scheduler(account, cores, walltime, memory, processes, interface,
                          local_dir, scheduler_port, dash_port, num_workers,
                          adapt_min, adapt_max):
    # Choose either adaptive mode or a fixed number of workers (you can always
    # connect to the cluster and scale manually without adapt mode), but adapt
    # mode is the default since it is the most no-nonsense DWIM approach.
    adapt_mode = True
    if num_workers > -1:
        adapt_mode = False

    local_cluster_kwargs = {'scheduler_port': scheduler_port,
                            'dashboard_address': ':{}'.format(dash_port)}

    cluster = SLURMCluster(project=account,
                           cores=cores,
                           walltime=walltime,
                           memory=memory,
                           processes=processes,
                           interface=interface,
                           **local_cluster_kwargs)

    with cluster:
        click.echo("Scheduler address: {}".format(cluster.scheduler_address))
        click.echo("Dashboard port: {}".format(cluster.dashboard_link))

        if adapt_mode:
            cluster.adapt(minimum=adapt_min, maximum=adapt_max)
        else:
            cluster.scale(num_workers)

        # loop forever to block
        while True:
            # sleep so we avoid evaluating the loop too frequently
            time.sleep(2)
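# Sketch (not part of the original CLI) of how a separate process could attach to
# the long-running scheduler started by start_slurm_scheduler(); the host name and
# port are placeholders taken from the echoed scheduler address.
from dask.distributed import Client

client = Client("tcp://login-node.example.org:8786")
print(list(client.scheduler_info()["workers"]))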
def train_eval_plug_and_play(
        contrast='CORPD_FBK',
        n_epochs=200,
        n_samples=None,
        af=4,
        n_primal=5,
        loss='compound_mssim',
        train_partition='gpu_p1',
        model_name=None,
        model_size=None,
):
    job_name = 'plug_and_play'
    model_specs = list(get_model_specs(force_res=False, n_primal=n_primal))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)

    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
            f'--partition {train_partition}',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(train_cluster)

    futures = [
        client.submit(
            # function to execute
            train_xpdnet,
            model_fun=model_fun,
            model_kwargs=kwargs,
            model_size=model_size,
            multicoil=False,
            n_scales=n_scales,
            res=res,
            n_primal=n_primal,
            contrast=contrast,
            n_epochs=n_epochs,
            n_samples=n_samples,
            af=af,
            loss=loss,
        ) for _, model_size, model_fun, kwargs, _, n_scales, res in model_specs
    ]

    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()

    # eval
    eval_plug_and_play(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        af=af,
        n_primal=n_primal,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
    )
    return run_ids
#treefile = '../validation_data/16s/16s_salaminWstruct_aln.fasta.treefile'
#alnfile = '../validation_data/16s/16s_salaminWstruct_aln.fasta'
#treefile = '../validation_data/dengue/dengue_all.aln.fasta.treefile'
#alnfile = '../validation_data/dengue/dengue_all.aln.fasta'

# create distributed cluster
import time

from dask.distributed import Client
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(
    cores=10,
    memory="20 GB"
)
cluster.adapt(maximum=NCORE)
print(cluster.dashboard_link)
client = Client(cluster)

print('cluster deploy sleep')
# wait for cluster deploy
time.sleep(20)
print('done')

# use blast based annotation to assign codons to column ranges
allowed_symbols = [b'A', b'C', b'G', b'T']
allowed_transitions = [c1 + c2 for c1 in allowed_symbols for c2 in allowed_symbols if c1 != c2]
print('allowed transitions', allowed_transitions)
transition_dict = {c: i for i, c in enumerate(allowed_transitions)}
def eval_plug_and_play(
        run_ids,
        job_name='eval_pandp',
        contrast='CORPD_FBK',
        n_samples_train=None,
        n_epochs=200,
        af=4,
        n_primal=5,
        train_partition='gpu_p1',
        model_name=None,
        model_size=None,
):
    model_specs = list(get_model_specs(force_res=False, n_primal=n_primal))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)

    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='2:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    eval_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(eval_cluster)

    futures = [
        client.submit(
            # function to execute
            evaluate_xpdnet,
            model_fun=model_fun,
            model_kwargs=kwargs,
            run_id=run_id,
            multicoil=False,
            n_samples=50,
            contrast=contrast,
            af=af,
            n_epochs=n_epochs,
            n_scales=n_scales,
            res=res,
        ) for run_id, (_, _, model_fun, kwargs, _, n_scales, res) in zip(run_ids, model_specs)
    ]

    df_results = pd.DataFrame(columns='model_name model_size psnr ssim'.split())
    for (name, model_size, _, _, _, _, _), future in zip(model_specs, futures):
        _, eval_res = client.gather(future)
        df_results = df_results.append(dict(
            model_name=name,
            model_size=model_size,
            psnr=eval_res[0],
            ssim=eval_res[1],
        ), ignore_index=True)

    print(df_results)
    outputs_file = f'reconstruction_results_{n_samples_train}.csv'
    if model_name is not None:
        outputs_file = f'reconstruction_results_{n_samples_train}_{model_name}.csv'
    df_results.to_csv(outputs_file)

    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
    return run_ids
def main():
    # Distributed host for
    p = argparse.ArgumentParser(prog="process", description="Process the FOV pipeline")
    p.add_argument(
        "--min_jobs",
        type=int,
        default=300,
        help="Minimum number of jobs to use",
    )
    p.add_argument(
        "--max_jobs",
        type=int,
        default=300,
        help="Maximum number of jobs to use",
    )
    p.add_argument(
        "--walltime",
        type=int,
        default=5,
        help="Walltime in hours",
    )
    p.add_argument(
        "--up_time",
        type=int,
        default=10,
        help="Up time for the scheduler in hours",
    )
    args = p.parse_args()

    cluster = SLURMCluster(
        cores=2,
        memory="16GB",
        walltime="{}:00:00".format(args.walltime),
        queue="aics_cpu_general",
    )
    cluster.adapt(minimum_jobs=args.min_jobs, maximum_jobs=args.max_jobs)
    client = dask.distributed.Client(cluster)  # noqa

    connection_info = {}
    connection_info["HOSTNAME"] = socket.gethostname()
    connection_info["PORT"] = cluster.scheduler_info["address"].split(":")[-1]
    connection_info["DASHBOARD_PORT"] = cluster.scheduler_info["services"]["dashboard"]

    connection_str = (
        "ssh -A -J slurm-master -L {PORT}:{HOSTNAME}:{PORT} -L "
        "{DASHBOARD_PORT}:{HOSTNAME}:{DASHBOARD_PORT} {HOSTNAME}".format(
            **connection_info
        )
    )
    log.info(
        (
            "In a new terminal on the machine that you run the pipeline on, copy and paste "
            "the following string to forward ports to this server:"
        )
    )
    log.info(connection_str)
    log.info(" ")
    log.info("Then use the following command to kick off your FPP jobs:")
    log.info("fpp_process --distributed 1 --port {PORT}".format(**connection_info))
    log.info(" ")
    log.info("You can see the dashboard on:")
    log.info("localhost:{PORT}".format(**connection_info))
    log.info(" ")
    log.info("Command + C will teardown the server.")

    try:
        time.sleep(args.up_time * 60 * 60)
    except KeyboardInterrupt:
        log.info("Tearing down scheduler.")
        cluster.close()
        self._depencendies = " ".join(f"'{dep}'" for dep in dependencies)

    def setup(self, worker: Worker):
        os.system(f"pip install {self._depencendies}")


dependency_installer = DependencyInstaller([
    "git+https://github.com/andrzejnovak/boostedhiggs.git#egg=boostedhiggs",
    "scipy==1.8.1",
])

client = Client("tls://localhost:8786")
client.register_worker_plugin(dependency_installer)

else:
    cluster.adapt(minimum=args.scaleout)
    client = Client(cluster)

print("Waiting for at least one worker...")
client.wait_for_workers(1)

with performance_report(filename="dask-report.html"):
    output = processor.run_uproot_job(
        sample_dict,
        treename='Events',
        processor_instance=processor_object,
        executor=processor.dask_executor,
        executor_args={
            'client': client,
            # 'skipbadfiles': args.skipbadfiles,
            'schema': processor.NanoAODSchema,
            'retries': 3,
        },
def record_dataset(args):
    if args.resume and not args.overwrite:
        resume_args = yaml.load((Path(args.resume) / 'config.yaml').read_text())
        vars(args).update({k: v for k, v in vars(resume_args).items() if 'resume' not in k})

    args.ds_dir = Path(args.ds_dir)
    if args.ds_dir.is_dir():
        if args.resume:
            assert (args.ds_dir / 'seeds_recorded.txt').exists()
        elif args.overwrite:
            shutil.rmtree(args.ds_dir)
        else:
            raise ValueError('There is already a dataset with this name')
    args.ds_dir.mkdir(exist_ok=True)
    (args.ds_dir / 'config.yaml').write_text(yaml.dump(args))

    log_dir = DASK_LOGS_DIR.as_posix()
    if args.distributed:
        env_extra = [
            'module purge',
            f'source {CONDA_BASE_DIR}/bin/activate',
            f'conda activate {CONDA_ENV}',
            f'cd {PROJECT_DIR}',
            f'eval $(python -m job_runner.assign_gpu)',
            'export OMP_NUM_THREADS=1',
            'export MKL_NUM_THREADS=1',
        ]
        n_processes = args.n_processes_per_gpu
        log_path = (DASK_LOGS_DIR / 'all_logs.out').as_posix()
        cluster = SLURMCluster(cores=n_processes,
                               memory='160 GB',
                               queue=f'{SLURM_GPU_QUEUE}',
                               walltime='10:00:00',
                               processes=n_processes,
                               local_directory=log_dir,
                               log_directory=log_dir,
                               nthreads=1,
                               memory_monitor_interval='1000000000000000s',
                               env_extra=env_extra,
                               job_extra=[
                                   f'--qos={SLURM_QOS}',
                                   '--hint=nomultithread',
                                   '--gres=gpu:1',
                                   f'--output={log_path}',
                                   f'--error={log_path}'
                               ],
                               interface=DASK_NETWORK_INTERFACE)
        cluster.adapt(minimum_jobs=args.n_workers, maximum_jobs=args.n_workers)
    else:
        cluster = LocalCluster(local_directory=log_dir, processes=True, n_workers=4)

    client = Client(cluster)
    all_keys = record_dataset_dask(client=client,
                                   ds_dir=args.ds_dir,
                                   scene_kwargs=args.scene_kwargs,
                                   scene_cls=args.scene_cls,
                                   start_seed=0,
                                   n_chunks=int(args.n_chunks),
                                   n_frames_per_chunk=int(args.n_frames_per_chunk),
                                   resume=args.resume)

    n_train = int(args.train_ratio * len(all_keys))
    train_keys, val_keys = all_keys[:n_train], all_keys[n_train:]
    Path(args.ds_dir / 'keys.pkl').write_bytes(pickle.dumps(all_keys))
    Path(args.ds_dir / 'train_keys.pkl').write_bytes(pickle.dumps(train_keys))
    Path(args.ds_dir / 'val_keys.pkl').write_bytes(pickle.dumps(val_keys))

    client.close()
    del cluster
import distributed.joblib  # noqa: F401 -- registers the 'dask.distributed' joblib backend
from joblib import Parallel, parallel_backend
from skopt import BayesSearchCV
from skopt.space import Integer, Real

from molmimic.torch_model.torch_train import Molmimic

cluster = SLURMCluster(walltime="3-00:00:00",
                       memory="12000M",
                       cores=16,
                       project="muragroup",
                       queue="gpu",
                       gres="gpu:p100:1",
                       ntasks="1")
cluster.adapt()
client = Client(cluster)

space = {
    "learning_rate": Real(1e-6, 1e-1, prior='log-uniform'),
    "num_epochs": Integer(30, 500, prior='log-uniform'),
    "batch_size": Integer(1, 30, prior='log-uniform'),
    "dropout_depth": Integer(0, 1),  # boolean
    "dropout_width": Integer(0, 1),  # boolean
    "dropout_p": Real(0., 1., prior='log-uniform')
}

opt = BayesSearchCV(Molmimic(), space, n_iter=100)

with parallel_backend('dask.distributed', client=client):
    opt.fit("default")
def run(
    self,
    distributed: bool = False,
    clean: bool = False,
    debug: bool = False,
    **kwargs,
):
    """
    Run a flow with your steps.

    Parameters
    ----------
    distributed: bool
        Create a SLURMCluster to use for job distribution.
        Default: False (do not create a cluster)
    clean: bool
        Should the local staging directory be cleaned prior to this run.
        Default: False (do not clean)
    debug: bool
        A debug flag for the developer to use to manipulate how much data runs,
        how it is processed, etc.
        Default: False (do not debug)

    Notes
    -----
    Documentation on prefect:
    https://docs.prefect.io/core/

    Basic prefect example:
    https://docs.prefect.io/core/
    """
    # Initialize steps
    raw = steps.MappedRaw()
    invert = steps.MappedInvert()
    cumsum = steps.MappedSum()
    plot = steps.Plot()
    fancyplot = steps.Fancyplot()

    # Choose executor
    if distributed:
        # Log dir settings
        log_dir_name = datetime.now().isoformat().split(".")[0]  # Do not include ms
        log_dir = Path(f".logs/{log_dir_name}/")
        log_dir.mkdir(parents=True)

        # Spawn cluster
        cluster = SLURMCluster(
            cores=2,
            memory="32GB",
            walltime="10:00:00",
            queue="aics_cpu_general",
            local_directory=str(log_dir),
            log_directory=str(log_dir),
        )

        # Set adaptive scaling
        cluster.adapt(minimum_jobs=1, maximum_jobs=40)

    else:
        # Stop conflicts between Dask and OpenBLAS
        # Info here:
        # https://stackoverflow.com/questions/45086246/too-many-memory-regions-error-with-dask
        os.environ["OMP_NUM_THREADS"] = "1"

        # Spawn local cluster
        cluster = LocalCluster()

    # Log bokeh info
    if cluster.dashboard_link:
        log.info(f"Dask UI running at: {cluster.dashboard_link}")

    # Start local dask cluster
    exe = DaskExecutor(cluster.scheduler_address)

    # Configure your flow
    with Flow("example_step_workflow") as flow:
        # If your step utilizes a secondary flow with dask pass the executor address
        # If you want to clean the local staging directories pass clean
        # If you want to utilize some debugging functionality pass debug
        # If you don't utilize any of these, just pass the parameters you need.
        matrices = raw(
            distributed_executor_address=cluster.scheduler_address,
            clean=clean,
            debug=debug,
            **kwargs,  # Allows us to pass `--n {some integer}` or other params
        )
        inversions = invert(
            matrices,
            distributed_executor_address=cluster.scheduler_address,
            clean=clean,
            debug=debug,
        )
        vectors = cumsum(
            inversions,
            distributed_executor_address=cluster.scheduler_address,
            clean=clean,
            debug=debug,
        )
        plot(
            vectors,
            distributed_executor_address=cluster.scheduler_address,
            clean=clean,
            debug=debug,
        )
        fancyplot(
            vectors,
            distributed_executor_address=cluster.scheduler_address,
            clean=clean,
            debug=debug,
        )

    # Run flow and get ending state
    state = flow.run(executor=exe)

    # Get plot location
    log.info(f"Plot stored to: {plot.get_result(state, flow)}")

    # Close cluster
    if distributed:
        cluster.close()
import logging
import time
from typing import List, Optional, Tuple, Dict, Any

import xarray as xr
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

variable = "tas"
uri = 'https://dataserver.nccs.nasa.gov/thredds/dodsC/bypass/CREATE-IP/reanalysis/MERRA2/mon/atmos/tas.ncml'

cluster = SLURMCluster(queue="myNodes")
cluster.adapt(minimum=1, maximum=4, interval="2s", wait_count=500)
print("CLUSTER JOB SCRIPT: " + cluster.job_script())
client = Client(cluster)

t0 = time.time()
dset: xr.Dataset = xr.open_dataset(uri)
da: xr.DataArray = dset['tas']
da2: xr.DataArray = da.groupby('time.month').mean('time')
da_monthly = da2.load()
print(da_monthly)
print(" Completed computation in " + str(time.time() - t0) + " seconds")

client.close()
cluster.close()
## Problem on Occigen: the jobs terminate because of killed workers ...
from dask_jobqueue import SLURMCluster
from dask.distributed import Client

cluster = SLURMCluster(cores=28,
                       name='make_zarr',
                       walltime='00:20:00',
                       job_extra=['--constraint=BDW28', '--exclusive', '--nodes=1'],
                       memory='20GB')
print(cluster.job_script())
cluster.scale(1)
cluster.adapt(minimum=1, maximum=4)

client = Client(cluster)
client

import xarray as xr
import numpy as np
import glob
import time

data_dir = '/store/CT1/hmg2840/lbrodeau/eNATL60/eNATL60-BLBT02-S/'
tfiles = sorted(glob.glob(data_dir + '*/eNATL60-BLBT02_1h_*_gridT_20090701-20090701.nc'))
sfiles = sorted(glob.glob(data_dir + '*/eNATL60-BLBT02_1h_*_gridS_20090701-20090701.nc'))


def non_time_coords(ds):
    return [v for v in ds.data_vars if 'time_counter' not in ds[v].dims]


def drop_non_essential_vars_pop(ds):