def create_sge_cluster(self):
    workers = self.config.parallel
    queue = self.config.sge_options.queue
    queue = ",".join([q.strip() for q in queue.split(',')])
    memory = self.config.sge_options.memory
    processes = int(self.config.sge_options.processes)
    cores = int(self.config.sge_options.cores)
    resource_spec = self.config.sge_options.resource_spec
    job_extra = self.config.sge_options.job_extra

    print(
        "SGE:",
        "queue=", queue,
        "memory=", memory,
        "processes=", processes,
        "cores=", cores,
        "resource_spec=", resource_spec,
        "job_extra=", job_extra)

    cluster = SGECluster(
        queue=queue,
        processes=processes,
        memory=memory,
        cores=cores,
        resource_spec=resource_spec,
        name="sgains-tools",
        job_extra=job_extra,
        walltime='08:00:00',
        dashboard_address='0.0.0.0:28787',
    )
    cluster.adapt(minimum=workers, maximum=workers)

    print("SGE cluster dashboard link:", cluster.dashboard_link)
    print(cluster)
    print(cluster.job_script())
    # print(cluster.job_file())
    print("SGE cluster dashboard link:", cluster.dashboard_link)
    return cluster
def init_cluster(args):
    env_extra = [
        "#$ -e {}".format(args.log_dir or "/dev/null"),
        "#$ -o {}".format(args.log_dir or "/dev/null"),
        "#$ -pe serial {}".format(args.ngpus if args.ngpus > 0 else args.ncpus),
        "export LANG=en_US.UTF-8",
        "export LC_ALL=en_US.UTF-8",
        "export MKL_NUM_THREADS=1",
        "export NUMEXPR_NUM_THREADS=1",
        "export OMP_NUM_THREADS=1",
        "export DISABLE_MP_CACHE=1",
    ]
    cluster = SGECluster(
        queue=args.queue,
        resource_spec="h_vmem={}G,mem_req={}G".format(args.h_vmem, args.mem_req),
        walltime="720:00:00",
        name="test_Dask_PytorchDataloader",
        cores=args.ncpus,
        memory="{}G".format(args.mem_req),
        processes=1,
        interface="ib0",
        local_directory=".",
        env_extra=env_extra,
        spill_dir=".",
        extra=["--no-nanny"],
    )
    cluster.scale(args.jobs)
    return cluster
def setup_client_and_cluster(number_processes=1, number_jobs=1,
                             walltime="00:01:00", memory=1):
    """
    Set up the Dask client and cluster.

    Ensure that the number of workers is the right amount
    for your job and will be fully utilised.
    """
    print("Setting up Dask client and cluster ...")
    # number of workers used for number of partitions
    number_workers = number_processes * number_jobs
    # these are the requirements for a single worker
    cluster = SGECluster(
        interface="ib0",
        walltime=walltime,
        memory=f"{memory} G",
        resource_spec=f"h_vmem={memory}G",
        scheduler_options={"dashboard_address": ":2727"},
        job_extra=[
            "-V",  # export all environment variables
            f"-pe smp {number_processes}",
            f"-l disk={memory}G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=number_jobs)
    print("The resources of each worker are: ")
    print(cluster.job_script())
    return client, cluster
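# Usage sketch for setup_client_and_cluster above (the job counts, walltime, and
# memory values here are illustrative, not tuned recommendations): request 4 jobs
# of 2 processes each, run a trivial computation, then tear everything down.
def _example_setup_client_and_cluster():
    client, cluster = setup_client_and_cluster(
        number_processes=2, number_jobs=4, walltime="01:00:00", memory=4
    )
    try:
        # 8 workers in total (number_processes * number_jobs)
        total = sum(client.gather(client.map(lambda x: x * x, range(100))))
        print("sum of squares:", total)
    finally:
        client.close()
        cluster.close()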
def scale_to_sge(n_workers):
    queue = "q_gpu"
    queue_resource_spec = "q_gpu=TRUE"
    memory = "4GB"
    sge_log = "./logs"

    from dask_jobqueue import SGECluster
    cluster = SGECluster(
        queue=queue,
        memory=memory,
        cores=1,
        processes=1,
        log_directory=sge_log,
        local_directory=sge_log,
        resource_spec=queue_resource_spec,
    )
    cluster.scale_up(n_workers)
    return Client(cluster)
def make_client(cluster, type_workers, num_workers, log_dir, no_nanny=False):
    """
    The no_nanny option allows workers to create their own workers.
    Useful if you have GPU workers creating their own CPU workers for data loading.
    """
    if no_nanny:
        extra = ['--no-nanny', '--no-bokeh']
        processes = False
    else:
        extra = []
        processes = True
    if cluster == 'paris':
        from dask_jobqueue import SGECluster
        job_extra = [
            '-pe serial 1',
            '--stdout={}'.format(os.path.join(log_dir, '%jobid%_stdout.txt')),
            '--stderr={}'.format(os.path.join(log_dir, '%jobid%_stderr.txt'))
        ]
        cluster = SGECluster(
            queue='gaia.q,chronos.q,titan.q,zeus.q',
            resource_spec='h_vmem=2000000M,mem_req=2000M',
            job_extra=['-pe serial 1'],
            env_extra=[
                'source /sequoia/data1/rstrudel/miniconda3/etc/profile.d/conda.sh',
                'conda activate bullet',
                'export LANG=en_US.UTF-8',
                'export LC_ALL=en_US.UTF-8',
                'export PYTHONUNBUFFERED=non_empty'
            ],
            walltime='720:00:00',
            memory='4GB',
            extra=extra,
            cores=1,
            local_directory=os.path.join('/sequoia/data2', getpass.getuser(), 'dask'))
        cluster.start_workers(num_workers)
    elif cluster == 'grenoble':
        from bc.utils.dask_grenoble import GPUCluster
        dask_log_dir = log_dir.replace('agents', 'dask').replace('/seed', '-s')
        if not os.path.exists(dask_log_dir):
            os.mkdir(dask_log_dir)
        cluster = GPUCluster(
            extra=['--no-nanny', '--no-bokeh'],
            walltime='72:00:00',
            log_dir=dask_log_dir,
            besteffort=True,
            interface_node='edgar',
        )
        # cluster.start_workers(num_gpus)
        cluster.adapt(minimum=0, maximum=num_workers)
    elif cluster == 'local':
        cluster = LocalCluster(processes=processes)
    else:
        raise ValueError('Unknown cluster name: {}'.format(cluster))
    client = Client(cluster)
    return client
def test_job_script(tmpdir):
    log_directory = tmpdir.strpath
    with SGECluster(
        cores=6,
        processes=2,
        memory="12GB",
        queue="my-queue",
        project="my-project",
        walltime="02:00:00",
        env_extra=["export MY_VAR=my_var"],
        job_extra=["-w e", "-m e"],
        log_directory=log_directory,
        resource_spec="h_vmem=12G,mem_req=12G",
    ) as cluster:
        job_script = cluster.job_script()
        for each in [
            "--nprocs 2",
            "--nthreads 3",
            "--memory-limit 6.00GB",
            "-q my-queue",
            "-P my-project",
            "-l h_rt=02:00:00",
            "export MY_VAR=my_var",
            "#$ -w e",
            "#$ -m e",
            "#$ -e {}".format(log_directory),
            "#$ -o {}".format(log_directory),
            "-l h_vmem=12G,mem_req=12G",
            "#$ -cwd",
            "#$ -j y",
        ]:
            assert each in job_script
def test_config_name_sge_takes_custom_config():
    conf = {
        "queue": "myqueue",
        "project": "myproject",
        "ncpus": 1,
        "cores": 1,
        "memory": "2 GB",
        "walltime": "00:02",
        "job-extra": [],
        "name": "myname",
        "processes": 1,
        "interface": None,
        "death-timeout": None,
        "local-directory": "/foo",
        "extra": [],
        "env-extra": [],
        "log-directory": None,
        "shebang": "#!/usr/bin/env bash",
        "job-cpu": None,
        "job-mem": None,
        "resource-spec": None,
    }

    with dask.config.set({"jobqueue.sge-config-name": conf}):
        with SGECluster(config_name="sge-config-name") as cluster:
            assert cluster.name == "myname"
def test_basic(loop): with SGECluster(walltime="00:02:00", cores=8, processes=4, memory="2GB", loop=loop) as cluster: with Client(cluster, loop=loop) as client: cluster.scale(2) start = time() while not (cluster.pending_jobs or cluster.running_jobs): sleep(0.100) assert time() < start + QUEUE_WAIT future = client.submit(lambda x: x + 1, 10) assert future.result(QUEUE_WAIT) == 11 assert cluster.running_jobs workers = list(client.scheduler_info()["workers"].values()) w = workers[0] assert w["memory_limit"] == 2e9 / 4 assert w["nthreads"] == 2 cluster.scale(0) start = time() while cluster.running_jobs: sleep(0.100) assert time() < start + QUEUE_WAIT
def setup_cluster(
        memory='2G',
        gpus=0,
        log_dir=None,
        timeout_s=str(3600 * 24 * 7),  # a week
        proc_per_worker=1,
        cores_per_proc=1,
        env_extra=None,
        job_extra=None,
        grid='clsp',
        *args,
        **kwargs) -> SGECluster:
    if env_extra is None:
        env_extra = []
    # We're creating the "qsub"-like resource specification here
    resource_spec = ''
    queue = 'all.q'
    if grid == 'clsp':
        # Add memory specification (CLSP grid specific)
        qsub_mem_str = f'mem_free={memory},ram_free={memory}'.replace('GB', 'G')
        # Handle GPU jobs
        if gpus:
            # Num GPUs arg + limit hosts to c nodes (with PyTorch compatible GPUs)
            resource_spec += f',gpu={gpus},hostname=c*'
            # Set the queue as needed
            queue = 'g.q'
            # Check which GPU is free to use
            env_extra.append(
                f'export CUDA_VISIBLE_DEVICES=$(free-gpu -n {gpus})')
    elif grid == 'coe':
        # Add memory specification (COE grid specific)
        qsub_mem_str = f'mem_free={memory}'.replace('GB', 'G')
        # Handle GPU jobs
        if gpus:
            # Num GPUs arg
            resource_spec += f',gpu={gpus}'
            # Set the queue as needed
            queue = 'gpu.q'
    # Put the memory request first so the spec has no leading comma
    resource_spec = qsub_mem_str + resource_spec
    # Create a "mini cluster" that our jobs will get submitted to
    return SGECluster(
        queue=queue,
        walltime=timeout_s,
        processes=proc_per_worker,
        memory=memory,
        cores=cores_per_proc,
        resource_spec=resource_spec,
        log_directory=log_dir if log_dir is not None else 'log',
        job_extra=job_extra,
        env_extra=env_extra,  # e.g. ['export ENV_VARIABLE="SOMETHING"', 'source myscript.sh']
        *args,
        **kwargs,
    )
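# Usage sketch for setup_cluster above. The queue names and resource strings come
# from the function itself; the memory/GPU values and the scale() call here are
# illustrative assumptions, not prescribed settings.
def _example_setup_cluster_clsp():
    from dask.distributed import Client

    cluster = setup_cluster(memory='4G', gpus=1, grid='clsp', log_dir='logs')
    cluster.scale(2)  # ask SGE for two workers
    client = Client(cluster)
    return client, cluster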
def test_config_name_sge_takes_custom_config():
    conf = {
        'queue': 'myqueue',
        'project': 'myproject',
        'ncpus': 1,
        'cores': 1,
        'memory': '2 GB',
        'walltime': '00:02',
        'job-extra': [],
        'name': 'myname',
        'processes': 1,
        'interface': None,
        'death-timeout': None,
        'local-directory': '/foo',
        'extra': [],
        'env-extra': [],
        'log-directory': None,
        'shebang': '#!/usr/bin/env bash',
        'job-cpu': None,
        'job-mem': None,
        'resource-spec': None
    }

    with dask.config.set({'jobqueue.sge-config-name': conf}):
        with SGECluster(config_name='sge-config-name') as cluster:
            assert cluster.name == 'myname'
def test_basic(loop):  # noqa: F811
    with SGECluster(walltime='00:02:00', cores=8, processes=4, memory='2GB',
                    loop=loop) as cluster:
        print(cluster.job_script())
        with Client(cluster, loop=loop) as client:
            cluster.scale(2)

            start = time()
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11
            assert cluster.running_jobs

            workers = list(client.scheduler_info()['workers'].values())
            w = workers[0]
            assert w['memory_limit'] == 2e9 / 4
            assert w['ncores'] == 2

            cluster.scale(0)

            start = time()
            while cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
def test_basic(loop):  # noqa: F811
    with SGECluster(walltime='00:02:00', cores=8, processes=4, memory='28GB',
                    loop=loop) as cluster:
        with Client(cluster, loop=loop) as client:
            workers = cluster.start_workers(2)

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(60) == 11
            assert cluster.jobs

            info = client.scheduler_info()
            for w in info['workers'].values():
                assert w['memory_limit'] == 7e9
                assert w['ncores'] == 2

            cluster.stop_workers(workers)

            start = time()
            while len(client.scheduler_info()['workers']) > 0:
                sleep(0.100)
                assert time() < start + 10

            assert not cluster.jobs
def get_cluster(which="ccin2p3", scale=None, set_client=True, **kwargs): """ """ if which == "ccin2p3": from dask_jobqueue import SGECluster prop = dict(name="dask-worker", walltime="06:00:00", memory='8GB', death_timeout=120, project="P_ztf", resource_spec='sps=1', cores=1, processes=1) cluster = SGECluster(**{**prop,**kwargs}) else: raise NotImplementedError(f"only 'ccin2p3' cluster implemented {which} given") if scale is not None: cluster.scale( int(scale) ) return cluster
def process_dask(
    funcs,
    jobs=10,
    cores=3,
    processes=3,
    h_vmem=20,
    m_mem_free=5,
    h_rt=3000,
):
    cluster = SGECluster(
        n_workers=0,
        job_cls=None,
        loop=None,
        security=None,
        silence_logs='error',
        name=None,
        asynchronous=False,
        interface=None,
        host=None,
        protocol='tcp://',
        dashboard_address=':8787',
        config_name=None,
        processes=processes,
        queue='low.q',
        project="labxchem",
        cores=cores,
        memory="{}GB".format(h_vmem),
        walltime=h_rt,
        resource_spec="m_mem_free={}G,h_vmem={}G,h_rt={}".format(
            m_mem_free, h_vmem, h_rt),
    )
    cluster.scale(jobs=jobs)

    client = Client(cluster)

    results_futures = client.map(
        call,
        funcs,
    )
    results = client.gather(results_futures)

    return results
def init_cluster(name, args):
    resource_spec = "h_vmem={}G,mem_req={}G".format(args.h_vmem, args.mem_req)
    exclude_nodes = "&".join(["!" + x for x in args.exclude_nodes])
    if len(exclude_nodes) > 0:
        exclude_nodes = "#$ -l h=" + exclude_nodes
    env_extra = [
        "#$ -e {}".format(args.log_dir or "/dev/null"),
        "#$ -o {}".format(args.log_dir or "/dev/null"),
        "#$ -pe serial {}".format(
            args.ngpus if args.ngpus > 0 else args.ncpus),
        exclude_nodes,
        "source " + args.to_source if args.to_source is not None else "",
        "export LANG=en_US.UTF-8",
        "export LC_ALL=en_US.UTF-8",
        "export MKL_NUM_THREADS=1",
        "export NUMEXPR_NUM_THREADS=1",
        "export OMP_NUM_THREADS=1",
        "export DISABLE_MP_CACHE=1",
        "export TORCH_HOME=/sequoia/data1/rriochet/.torch",
    ]
    for var in args.export_var:
        env_extra.append(f'export {var}="{os.environ[var]}"')
    cluster = SGECluster(
        queue=args.queue,
        resource_spec=resource_spec,
        walltime="720:00:00",
        name=name,
        cores=args.ncpus,
        memory="{}G".format(args.mem_req),
        processes=1,
        interface="ib0",
        local_directory=args.log_dir,
        env_extra=env_extra,
        spill_dir=args.spill_dir,
        extra=["--no-nanny"],
    )
    # cluster.adapt(maximum_jobs=args.jobs)
    cluster.scale(args.jobs)
    return cluster
def get_client():
    dask.config.set({"distributed.admin.tick.limit": "300s"})
    cluster = SGECluster(
        queue="medium.q",
        project="labxchem",
        cores=10,
        processes=5,
        memory="64GB",
        resource_spec="m_mem_free=64G,redhat_release=rhel7",
        python="/dls/science/groups/i04-1/conor_dev/ccp4/build/bin/cctbx.python",
        walltime="03:00:00",
    )
    cluster.scale(60)
    time.sleep(15)
    client = Client(cluster)
    return client
def run(self, block: bool = True, cluster: bool = False,
        cluster_kwargs: dict = None, workers: int = 8,
        debug: bool = False) -> Union[Future, Any]:
    """Run the pipeline.

    Parameters
    ----------
    block
        When True (the default), block until completion. Otherwise, return a
        :class:`Future`.
    cluster
        When True, run on rhino's SGE cluster (default: False).
    cluster_kwargs
        A dict of keyword arguments to pass to :class:`SGECluster`. See
        ``CLUSTER_DEFAULTS`` for default values.
    workers
        Number of workers to use when running on the SGE cluster (default: 8).
    debug
        When True, disable the cluster and use the single-threaded dask
        scheduler for debugging.

    Returns
    -------
    If ``block`` is set, returns the result of running the pipeline.
    Otherwise returns a :class:`Future` which resolves when the pipeline
    is complete.

    """
    if cluster and not debug:
        from dask_jobqueue import SGECluster
        from dask.distributed import Client

        if cluster_kwargs is None:
            kwargs = CLUSTER_DEFAULTS
        else:
            kwargs = CLUSTER_DEFAULTS.copy()
            kwargs.update(cluster_kwargs)

        cluster = SGECluster(**kwargs)
        cluster.scale(workers)
        _ = Client(cluster)

    if not block and not debug:
        return self._run_async()
    else:
        return self._run_sync(debug)
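# Usage sketch for run() above: `pipeline` is a hypothetical instance of the class
# this method belongs to, and the worker count and memory override are illustrative
# assumptions (CLUSTER_DEFAULTS supplies the real defaults elsewhere in the module).
def _example_run_on_sge(pipeline):
    # Blocking run on the SGE cluster with 16 workers and an overridden memory request.
    result = pipeline.run(cluster=True, workers=16,
                          cluster_kwargs={"memory": "4GB"})
    return result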
def main():
    # dask cluster and client
    n_jobs = 20
    n_processes = 1
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="02:00:00",
        memory="48 G",
        resource_spec="h_vmem=48G",
        scheduler_options={
            "dashboard_address": ":7777",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            "-l disk=48G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-hia-space"]),
    )

    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # dask bag and process
    simulations = [f'emulator_Base_CLE_2020_{output}']
    #simulations = []
    #simulations.append(f'wrfchem_Base_CLE_2020_{output}')
    #simulations.append(f'wrfchem_Base_CLE_2050_{output}')
    #simulations.append(f'wrfchem_Base_MFR_2050_{output}')
    #simulations.append(f'wrfchem_SDS_MFR_2050_{output}')
    #for year in ['2020', '2030', '2040', '2050']:
    #    for scenario in ['Base_CLE', 'Base_MFR', 'SDS_MFR']:
    #        for sim in ['', '_RES', '_IND', '_TRA', '_AGR', '_ENE', '_NO_RES', '_NO_IND', '_NO_TRA', '_NO_AGR', '_NO_ENE']:
    #            simulations.append(f'emulator_{scenario}_{year}{sim}_{output}')

    print(f"predicting for {len(simulations)} custom outputs ...")
    bag_simulations = db.from_sequence(simulations, npartitions=n_workers)
    if output == "PM2_5_DRY":
        bag_simulations.map(health_impact_assessment_pm25).compute()
    elif output == "o3_6mDM8h":
        bag_simulations.map(health_impact_assessment_o3).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface='ib0',
        walltime='01:00:00',
        memory='64 G',
        resource_spec='h_vmem=64G',
        scheduler_options={
            'dashboard_address': ':5757',
        },
        job_extra=['-cwd', '-V', f'-pe smp {n_processes}'],
        local_directory=os.sep.join(
            [os.environ.get('PWD'), 'dask-worker-space']))

    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # regrid custom outputs to pop grid
    custom_outputs = glob.glob(path + 'ds*' + output + '.nc')
    custom_outputs_completed = glob.glob(path + 'ds*' + output + '_popgrid_0.05deg.nc')
    custom_outputs_completed = [
        f'{item[0:-19]}.nc' for item in custom_outputs_completed
    ]
    custom_outputs_remaining_set = set(custom_outputs) - set(custom_outputs_completed)
    custom_outputs_remaining = [item for item in custom_outputs_remaining_set]
    print(f'custom outputs remaining for {output}: {len(custom_outputs_remaining)}')

    # dask bag and process
    # run in 2,500 chunks over 30 cores, each chunk taking 5 minutes
    custom_outputs_remaining = custom_outputs_remaining[0:2500]
    print(f'predicting for {len(custom_outputs_remaining)} custom outputs ...')
    bag_custom_outputs = db.from_sequence(custom_outputs_remaining,
                                          npartitions=n_workers)
    bag_custom_outputs.map(regrid_to_pop).compute()

    time_end = time.time() - time_start
    print(
        f'completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours'
    )
    print(
        f'average time per custom output is {time_end / len(custom_outputs_remaining):0.2f} seconds'
    )

    client.close()
    cluster.close()
def main(): # dask cluster and client n_processes = 1 n_jobs = 35 n_workers = n_processes * n_jobs cluster = SGECluster( interface="ib0", walltime="48:00:00", memory=f"12 G", resource_spec=f"h_vmem=12G", scheduler_options={ "dashboard_address": ":5757", }, job_extra=[ "-cwd", "-V", f"-pe smp {n_processes}", f"-l disk=1G", ], local_directory=os.sep.join( [os.environ.get("PWD"), "dask-worker-space_popweighted_region"]), ) client = Client(cluster) cluster.scale(jobs=n_jobs) # main processing matrix_stacked = np.array( np.meshgrid( np.linspace(0, 1.4, 8), np.linspace(0, 1.4, 8), np.linspace(0, 1.4, 8), np.linspace(0, 1.4, 8), np.linspace(0, 1.4, 8), )).T.reshape(-1, 5) custom_inputs = [np.array(item).reshape(1, -1) for item in matrix_stacked] print(f"processing for {output} over {region} ...") outputs_popweighted = [] bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers) outputs_popweighted = bag_custom_inputs.map( popweight_outputs_for_input).compute() print("saving ...") joblib.dump( outputs_popweighted, f"/nobackup/earlacoa/machinelearning/data_annual/popweighted/popweighted_{region}_{output}_0.25deg_adjusted_scaled.joblib", ) client.close() cluster.close()
def start_dask(num_workers, msg, logger):
    """Context manager used for starting/shutting down dask.

    Args:
        num_workers (`int`): Number of dask workers
        msg (`str`): Message for timer
        logger: The logger being used

    Yields:
        client: Dask client
    """
    # Update dask
    with open("dask-config.yaml") as f:
        config = yaml.load(f, Loader=SafeLoader)
    dask.config.update(dask.config.config, config)

    cluster_type = next(iter(dask.config.config['jobqueue']))
    set_local_directory(cluster_type)

    if cluster_type == 'local':
        from dask.distributed import LocalCluster
        cluster = LocalCluster(n_workers=num_workers, threads_per_worker=1)
    else:
        if cluster_type == 'lsf':
            from dask_jobqueue import LSFCluster
            cluster = LSFCluster()
        elif cluster_type == 'slurm':
            from dask_jobqueue import SLURMCluster
            cluster = SLURMCluster()
        elif cluster_type == 'sge':
            from dask_jobqueue import SGECluster
            cluster = SGECluster()
        cluster.scale(num_workers)

    try:
        with io_util.Timing_Messager(f"Starting dask cluster for {msg}", logger):
            client = Client(cluster)
        io_util.print_with_datetime(
            f"Check {client.cluster.dashboard_link} for {msg} status.", logger)
        yield client
    finally:
        client.shutdown()
        client.close()
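# Usage sketch for start_dask above, assuming it is wrapped with
# contextlib.contextmanager elsewhere (it yields a client). The worker count,
# message, and work function here are illustrative.
def _example_start_dask(items, logger):
    with start_dask(num_workers=16, msg="example batch", logger=logger) as client:
        futures = client.map(lambda x: x + 1, items)
        return client.gather(futures)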
def test_complex_cancel_command(loop):
    with SGECluster(
        walltime="00:02:00", cores=1, processes=1, memory="2GB", loop=loop
    ) as cluster:
        username = "******"
        cluster.cancel_command = "qdel -u {}".format(username)

        cluster.scale(2)

        start = time()
        while not cluster.running_jobs:
            sleep(0.100)
            assert time() < start + QUEUE_WAIT

        cluster.stop_all_jobs()

        start = time()
        while cluster.running_jobs:
            sleep(0.100)
            assert time() < start + QUEUE_WAIT
def Start_Client(gpu_name):
    hostname = socket.gethostname()
    n_workers = 1
    n_cores = 1
    wks2 = "wn-wks2.fe.hhi.de"
    gpu1 = "wn-gpu1.fe.hhi.de"
    gpu2 = "wn-gpu-104-01.fe.hhi.de"
    if hostname == wks2:
        path = "/data/cluster/projects/infineon-radar/daq_x-har/3_Walking_converted/recording-2020-01-28_11-31-55"
        mem = "20G"  # Allocated memory is critical. For this example it must be at least 16GB
        q = "wn-37.q"  # Check current queue status on https://hpc-management.fe.hhi.de/wn/phpqstat/
        cluster = SGECluster(
            n_workers=n_workers,
            cores=n_cores,
            memory=mem,
            resource_spec=f"h_vmem={mem}",
            host=hostname,
            queue=q,
            job_extra=[
                "-v MKL_NUM_THREADS=1,NUMEXPR_NUM_THREADS=1,OMP_NUM_THREADS=1"
            ])
    elif hostname in (gpu1, gpu2):
        # Check current status with nvidia-smi and pick GPU from 0-3
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu_name
        cluster = LocalCluster(n_workers=n_workers,
                               threads_per_worker=n_cores,
                               host=hostname)
    else:
        raise ValueError(
            f"{hostname} is not a supported host. Please run this example on {wks2}, {gpu1} or {gpu2}."
        )
    client = Client(cluster)
    client.wait_for_workers(n_workers=n_workers)
    print(client)
    return client
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface='ib0',
        walltime='48:00:00',
        memory='12 G',
        resource_spec='h_vmem=12G',
        scheduler_options={
            'dashboard_address': ':5757',
        },
        job_extra=[
            '-cwd',
            '-V',
            f'-pe smp {n_processes}',
            '-l disk=1G',
        ],
        local_directory=os.sep.join(
            [os.environ.get('PWD'), 'dask-worker-space']))

    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    # main processing
    matrix_stacked = np.array(
        np.meshgrid(np.linspace(0, 1.5, 16),
                    np.linspace(0, 1.5, 16),
                    np.linspace(0, 1.5, 16),
                    np.linspace(0, 1.5, 16),
                    np.linspace(0, 1.5, 16))).T.reshape(-1, 5)
    custom_inputs = [np.array(item).reshape(1, -1) for item in matrix_stacked]

    print(f'processing for {output} over {region} ...')
    outputs_popweighted = []
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    outputs_popweighted = bag_custom_inputs.map(popweight_outputs_for_input).compute()

    print('saving ...')
    joblib.dump(
        outputs_popweighted,
        '/nobackup/earlacoa/machinelearning/data/popweighted/popweighted_' +
        region + '_' + output + '.joblib')

    client.close()
    cluster.close()
def test_complex_cancel_command(loop):
    with SGECluster(walltime="00:02:00", cores=1, processes=1, memory="2GB",
                    loop=loop) as cluster:
        with Client(cluster) as client:
            username = "******"
            cluster.cancel_command = "qdel -u {}".format(username)

            cluster.scale(2)

            start = time()
            while not client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            cluster.scale(0)

            start = time()
            while client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
def main(): # dask cluster and client number_processes = 1 number_jobs = 35 number_workers = number_processes * number_jobs cluster = SGECluster( interface="ib0", walltime="04:00:00", memory=f"2 G", resource_spec=f"h_vmem=2G", scheduler_options={ "dashboard_address": ":2727", }, job_extra=[ "-cwd", "-V", f"-pe smp {number_processes}", f"-l disk=1G", ], local_directory=os.sep.join( [os.environ.get("PWD"), "dask-worker-space"]), ) client = Client(cluster) cluster.scale(jobs=number_jobs) # main processing print("processing ...") results = [] bag = db.from_sequence(nums, npartitions=number_workers) results = bag.map(weird_function).compute() print("saving ...") joblib.dump(results, f"/nobackup/${USER}/results.joblib") client.close() cluster.close()
def main(): # dask cluster and client number_processes = 1 number_jobs = 35 number_workers = number_processes * number_jobs cluster = SGECluster( interface="ib0", walltime="04:00:00", memory=f"12 G", resource_spec=f"h_vmem=12G", scheduler_options={ "dashboard_address": ":2727", }, job_extra=[ "-cwd", "-V", f"-pe smp {number_processes}", f"-l disk=1G", ], local_directory=os.sep.join( [os.environ.get("PWD"), "dask-worker-space"]), ) client = Client(cluster) cluster.scale(jobs=number_jobs) # main processing print("processing ...") results = [] bag = db.from_sequence(sims, npartitions=number_workers) results = bag.map(create_ozone_metric).compute() print("complete") client.close() cluster.close()
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface='ib0',
        walltime='01:00:00',
        memory='2 G',
        resource_spec='h_vmem=2G',
        scheduler_options={
            'dashboard_address': ':5757',
        },
        project='admiralty',
        job_extra=[
            '-cwd',
            '-V',
            f'-pe smp {n_processes}',
            '-l disk=1G',
        ],
        local_directory=os.sep.join(
            [os.environ.get('PWD'), 'dask-worker-space']))

    client = Client(cluster)
    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # custom inputs
    matrix_stacked = np.array(
        np.meshgrid(
            # 1.5 and 16 for 0.1, 1.5 and 6 for 0.3, 1.4 and 8 for 0.2
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16))).T.reshape(-1, 5)
    custom_inputs_set = set(
        tuple(map(float, map("{:.1f}".format, item))) for item in matrix_stacked)

    custom_inputs_completed_filenames = glob.glob(
        '/nobackup/earlacoa/machinelearning/data/summary/ds*' + output + '*')
    custom_inputs_completed_list = []
    for custom_inputs_completed_filename in custom_inputs_completed_filenames:
        custom_inputs_completed_list.append([
            float(item) for item in re.findall(
                r'\d+\.\d+', custom_inputs_completed_filename)
        ])

    custom_inputs_completed_set = set(
        tuple(item) for item in custom_inputs_completed_list)
    custom_inputs_remaining_set = custom_inputs_set - custom_inputs_completed_set
    custom_inputs = [
        np.array(item).reshape(1, -1) for item in custom_inputs_remaining_set
    ]
    print(f'custom inputs remaining for {output}: {len(custom_inputs)}')

    # dask bag and process
    # run in 1,000 chunks over 30 cores, each chunk taking 1 hour
    custom_inputs = custom_inputs[0:5000]
    print(f'predicting for {len(custom_inputs)} custom inputs ...')
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    bag_custom_inputs.map(custom_predicts).compute()

    time_end = time.time() - time_start
    print(
        f'completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours'
    )
    print(
        f'average time per custom input is {time_end / len(custom_inputs):0.2f} seconds'
    )

    client.close()
    cluster.close()
def execute(self, pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context",
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        "pipeline_context",
        "Expected executor to be DaskExecutor got {}".format(
            pipeline_context.executor),
    )

    check.invariant(
        pipeline_context.instance.is_persistent,
        "Dask execution requires a persistent DagsterInstance",
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    cluster_type = self.cluster_type
    if cluster_type == "local":
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "yarn":
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "ssh":
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "pbs":
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "moab":
        from dask_jobqueue import MoabCluster

        cluster = MoabCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "sge":
        from dask_jobqueue import SGECluster

        cluster = SGECluster(**self.build_dict(pipeline_name))
    elif cluster_type == "lsf":
        from dask_jobqueue import LSFCluster

        cluster = LSFCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "slurm":
        from dask_jobqueue import SLURMCluster

        cluster = SLURMCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "oar":
        from dask_jobqueue import OARCluster

        cluster = OARCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "kube":
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**self.build_dict(pipeline_name))
    else:
        raise ValueError(
            f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                run_config = dict(pipeline_context.run_config,
                                  execution={"in_process": {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository()

                dask_task_name = "%s.%s" % (pipeline_name, step.key)

                recon_pipeline = recon_repo.get_reconstructable_pipeline(pipeline_name)

                future = client.submit(
                    query_on_dask_worker,
                    dependencies,
                    recon_pipeline,
                    pipeline_context.pipeline_run,
                    run_config,
                    [step.key],
                    pipeline_context.mode_def.name,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their
        # results to the master
        futures = dask.distributed.as_completed(execution_futures, with_results=True)

        # Allow interrupts while waiting for the results from Dask
        for future, result in iterate_with_context(raise_interrupts_immediately, futures):
            for step_event in result:
                check.inst(step_event, DagsterEvent)
                yield step_event
def _init_dask(self):
    """
    Starts a dask cluster, according to the cluster type specified in the constructor.
    Sets self.client.
    Also writes useful URLs to graph-links.txt.

    If the 'cluster-type' is 'synchronous', then the cluster will be
    a special stub class (DebugCluster), which provides dummy
    implementations of a few functions from the DistributedCluster API.
    (Mostly just for convenient unit testing.)
    """
    # Consider using client.register_worker_callbacks() to configure
    # - faulthandler (later)
    # - excepthook?
    # - (okay, maybe it's just best to put that stuff in __init__.py, like in DSS)

    load_and_overwrite_dask_config(self.cluster_type, 'dask-config.yaml', True)
    self._write_driver_graph_urls()

    if self.cluster_type in JOBQUEUE_CLUSTERS:
        update_jobqueue_config_with_defaults(self.cluster_type)

        if self.cluster_type == "lsf":
            from dask_jobqueue import LSFCluster
            cluster = LSFCluster()  # ip='0.0.0.0')
        elif self.cluster_type == "sge":
            from dask_jobqueue import SGECluster
            cluster = SGECluster(ip='0.0.0.0')
        elif self.cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster
            cluster = SLURMCluster(ip='0.0.0.0')
        else:
            raise AssertionError("Unimplemented jobqueue cluster")

        cluster.scale(self.num_workers)

    elif self.cluster_type == "local-cluster":
        cluster = LocalCluster(self.num_workers, threads_per_worker=1,
                               processes=True, ip='0.0.0.0')

    elif self.cluster_type in ("synchronous", "processes"):
        cluster = None
        # synchronous/processes mode is for testing and debugging only
        assert dask.config.get('scheduler', self.cluster_type) == self.cluster_type, \
            "Inconsistency between the dask-config and the scheduler you chose."

        dask.config.set(scheduler=self.cluster_type)
        self.client = DebugClient(self.cluster_type)
    else:
        raise AssertionError("Unknown cluster type")

    dump_dask_config('full-dask-config.yaml')

    if cluster:
        dashboard = cluster.dashboard_link
        logger.info(f"Dashboard running on {dashboard}")
        dashboard_ip = extract_ip_from_link(dashboard)
        dashboard = dashboard.replace(dashboard_ip, socket.gethostname())
        logger.info(f" a.k.a. {dashboard}")

        # Note: Overrides config value: distributed.comm.timeouts.connect
        self.client = Client(cluster, timeout='60s')

        # Wait for the workers to spin up.
        with Timer(f"Waiting for {self.num_workers} workers to launch", logger) as wait_timer:
            while (self.wait_for_workers
                   and self.client.status == "running"
                   and len(self.client.cluster.scheduler.workers) < self.num_workers):

                if wait_timer.seconds > (60 * self.cluster_max_wait):
                    msg = (f"Not all cluster workers could be launched within the "
                           f"allotted time ({self.cluster_max_wait} minutes).\n"
                           "Try again or adjust the 'cluster-max-wait' setting.\n")
                    raise RuntimeError(msg)
                time.sleep(0.1)

        if self.wait_for_workers and self.cluster_type == "lsf":
            self._write_worker_graph_urls('graph-links.txt')