from dask.distributed import Client


class GraphData:
    def __init__(self, graph_num_bars, dask_address):
        self.dask_client = Client(address=dask_address)
        self.currentValue = {'Memory': {'total_memory': 0, 'used_memory': 0},
                             'CPU': {'cpu_usage': 0},
                             'Cluster': {'n_workers': 0, 'total_threads': 0},
                             'Workers': []}
        self.update_dask_values()
        # Constants data
        self.mem_max_value = self.currentValue['Memory']['total_memory']
        self.util_max_value = 100
        self.graph_num_bars = graph_num_bars
        # Data for graphs
        self.cpu_util = [0] * graph_num_bars
        self.mem_util = [0] * graph_num_bars
        # Data for statistics
        self.n_workers = self.num_workers()
        self.total_mem = self.currentValue['Memory']['total_memory']
        self.used_mem = self.currentValue['Memory']['used_memory']

    def close_con(self):
        self.dask_client.close()

    def update_all(self):
        self.update_dask_values()
        self.n_workers = self.num_workers()
        self.mem_max_value = self.currentValue['Memory']['total_memory']
        self.total_mem = self.currentValue['Memory']['total_memory']
        self.used_mem = self.currentValue['Memory']['used_memory']
        self.cpu_util = self.update_graph_val(self.cpu_util, self.cpu_usage())
        self.mem_util = self.update_graph_val(self.mem_util, self.used_mem)

    def reset(self):
        self.cpu_util = [0] * self.graph_num_bars
        self.mem_util = [0] * self.graph_num_bars
        self.mem_max_value = 0
        self.total_mem = 0
        self.used_mem = 0

    def update_graph_val(self, values, new_val):
        values_num = len(values)
        if values_num > self.graph_num_bars:
            values = values[values_num - self.graph_num_bars - 1:]
        elif values_num < self.graph_num_bars:
            zero_pad = [0] * (self.graph_num_bars - values_num)
            values = zero_pad + values
        values.append(new_val)
        return values[1:]

    def update_dask_values(self):
        self.worker_info = self.dask_client.scheduler_info()['workers']
        self.currentValue['Memory']['total_memory'] = round(self.available_memory() / (1024**2), 2)
        self.currentValue['Memory']['used_memory'] = round(self.used_memory() / (1024**2), 2)
        self.currentValue['Memory']['used_memory_percent'] = \
            self.currentValue['Memory']['used_memory'] / self.currentValue['Memory']['total_memory']
        self.currentValue['CPU']['cpu_usage'] = self.cpu_usage()
        self.currentValue['Cluster']['n_workers'] = self.num_workers()
        # total thread count comes from num_threads(); num_workers() would
        # report the worker count here instead.
        self.currentValue['Cluster']['total_threads'] = self.num_threads()
        self.currentValue['Workers'] = self.get_worker_stats()

    def num_workers(self):
        return len(self.worker_info)

    def num_threads(self):
        return sum(worker['nthreads'] for worker in self.worker_info.values())

    def available_memory(self):
        return sum(info['memory_limit'] for info in self.worker_info.values())

    def used_memory(self):
        return sum(info['metrics']['memory'] for info in self.worker_info.values())

    def get_worker_stats(self):
        worker_stats = []
        for w, info in self.worker_info.items():
            stats = {'user': '******', 'id': 'filler', 'name': 'filler',
                     'rawtime': 1, 'time': 1, 'command': '', 'cpu': 1,
                     'memory': 1, 'local_ports': 'filler'}
            stats['address'] = w
            stats['nthreads'] = info['nthreads']
            stats['memory'] = round(info['metrics']['memory'] / (1024**2), 2)
            stats['memory_limit'] = round(info['memory_limit'] / (1024**2), 2)
            stats['cpu'] = info['metrics']['cpu']
            stats['read'] = round(info['metrics']['read_bytes'] / (1024**2), 2)
            stats['write'] = round(info['metrics']['write_bytes'] / (1024**2), 2)
            worker_stats.append(stats)
        return worker_stats

    def cpu_usage(self):
        """Average CPU utilization across all workers."""
        usages = [info['metrics']['cpu'] for info in self.worker_info.values()]
        if usages:
            return sum(usages) / len(usages)
        return 0
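# A minimal usage sketch for GraphData, assuming a throwaway LocalCluster; the
# bar count and polling cadence here are illustrative, not from the source:
if __name__ == "__main__":
    import time
    from dask.distributed import LocalCluster

    cluster = LocalCluster(n_workers=2, threads_per_worker=1)
    monitor = GraphData(graph_num_bars=30, dask_address=cluster.scheduler_address)
    for _ in range(3):
        monitor.update_all()
        print(f"workers={monitor.n_workers} "
              f"mem={monitor.used_mem}/{monitor.total_mem} MB "
              f"cpu={monitor.cpu_util[-1]}%")
        time.sleep(1)
    monitor.close_con()
    cluster.close()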
#!/usr/bin/env python
from dask_jobqueue import SLURMCluster
from dask.distributed import Client

cluster = SLURMCluster(cores=28,
                       name='test-jobqueue',
                       walltime='00:06:00',
                       job_extra=['--constraint=HSW24', '--exclusive', '--nodes=1'],
                       memory='120GB',
                       interface='enp5s0f0')
cluster.scale(196)
cluster  # notebook cell: display the cluster widget

client = Client(cluster)
client  # notebook cell: display the client summary

print('Currently working with '
      + str(len(client.scheduler_info()["workers"])) + ' workers')
def all_ping(client: Client):
    """Run ping() on every worker and report the wall time."""
    workers = list(client.scheduler_info()["workers"])
    start = time.time()
    # run() takes the target workers as a keyword; passed positionally,
    # `workers` would be forwarded as an argument to ping() instead.
    client.run(ping, workers=workers)
    stop = time.time()
    print(format_time(stop - start))
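# `ping` and `format_time` are not defined in the snippet above. format_time
# does exist in dask.utils; this no-op ping is an assumption for illustration:
import time
from dask.utils import format_time

def ping():
    # The work is trivial on purpose: the measured cost is the
    # scheduler-to-worker round trip.
    return None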
print(f"Initializing Local Dask cluster") client = Client() else: if scheduler is None: cluster = EDASCluster() print( "Initializing Dask-distributed cluster with scheduler address: " + cluster.scheduler_address) client = Client(cluster.scheduler_address, timeout=60) time.sleep(20) else: print("Initializing client with existing scheduler at: " + scheduler) client = Client(scheduler) scheduler_info = client.scheduler_info() workers: Dict = scheduler_info.pop("workers") print(" @@@@@@@ SCHEDULER INFO: " + str(scheduler_info)) print(f" N Workers: {len(workers)} ") start_time1 = time.time() job1 = Job.init("Test", "SCHEDULER_TEST", "jobId", domains, variables, operations, []) print("Running workflow for requestId " + job1.requestId) result1 = edasOpManager.buildTask(job1) print("Completed first workflow in time " + str(time.time() - start_time1)) start_time2 = time.time() job2 = Job.init("Test", "SCHEDULER_TEST", "jobId", domains, variables, operations, []) print("Running workflow for requestId " + job2.requestId)
def launch_python_post():
    curDir = os.path.dirname(os.path.abspath(__file__))
    logger = PyPostTools.pyPostLogger()
    logger.write("Initializing WRF Python Post-Processing Program")
    # Step 1: Load program settings
    logger.write(" 1. Application Initialization")
    logger.write(" - Loading control file, python_post_control.txt")
    _pySet = PyPostSettings.PyPostSettings()
    logger.write(" - Success!")
    logger.write(" - Testing Environmental Variables")
    try:
        dask_nodes = os.environ["PYTHON_POST_NODES"]
        dask_threads = os.environ["PYTHON_POST_THREADS"]
        postDir = os.environ["PYTHON_POST_DIR"]
        targetDir = os.environ["PYTHON_POST_TARG_DIR"]
    except KeyError:
        logger.write("***FAIL*** KeyError encountered while trying to access "
                     "important environmental variables, abort.")
        sys.exit("")
    logger.write(" - Success!")
    logger.write(" - Initializing Dask (" + str(dask_nodes)
                 + " Nodes Requested), Collecting routines needed")
    _routines = Routines.Routines()
    # Start Dask Tasks
    # cLoop = IOLoop.current()
    # t = Thread(target=cLoop.start, daemon=True)
    # t.start()
    logger.write(" - Async IO Loop initialized...")

    async def f(port):
        s = Scheduler(port=port)
        s = await s
        await s.finished()
        return 1

    # NOTE: gather() only schedules the coroutine; it needs a running event
    # loop (see the commented run_until_complete alternative below).
    asyncio.gather(f(scheduler_port))
    # asyncio.get_event_loop().run_until_complete(f(scheduler_port))
    logger.write(" - Dask Scheduler initialized (Port " + str(scheduler_port) + ")...")
    dask_client = Client("tcp://" + socket.gethostname() + ":" + str(scheduler_port))
    logger.write(" - Dask Client initialized...")
    logger.write(" - Writing Dask Worker Job Files...")
    with PyPostTools.cd(targetDir):
        writeFile1 = PyPostTools.write_job_file(socket.gethostname(), scheduler_port,
                                                project_name="Nowcast",
                                                queue="default",
                                                nodes=dask_nodes,
                                                wall_time=60,
                                                nProcs=1)
        writeFile2 = PyPostTools.write_worker_file(socket.gethostname(),
                                                   scheduler_port, nProcs=1)
        if not writeFile1 or not writeFile2:
            dask_client.close()
            logger.write(" - Failed to write job files, are you missing an "
                         "important parameter?")
            sys.exit("")
            return
        else:
            logger.write(" - Dask Worker Job File Written, Submitting to Queue.")
            PyPostTools.popen("chmod +x launch-worker.sh")
            PyPostTools.popen("chmod +x dask-worker.job")
            PyPostTools.popen("qsub dask-worker.job")
    # Wait here for workers.
    logger.write(" -> Worker Job submitted to queue, waiting for workers...")
    while len(dask_client.scheduler_info()['workers']) < int(dask_nodes):
        time.sleep(2)
    logger.write(" -> Workers are now connected.")
    # logger.write(" - Adding local packages to dask workers")
    # dask_client.upload_file("PyPostTools.py")
    # dask_client.upload_file("ArrayTools.py")
    # dask_client.upload_file("Calculation.py")
    # dask_client.upload_file("ColorMaps.py")
    # dask_client.upload_file("Conversions.py")
    # dask_client.upload_file("Plotting.py")
    # dask_client.upload_file("PyPostSettings.py")
    # dask_client.upload_file("Routines.py")
    logger.write(" - Success!")
    logger.write(" 1. Done.")
    logger.write(" 2. Start Post-Processing Calculations")
    calculation_future = start_calculations(dask_client, _routines, dask_threads)
    if calculation_future is not None:
        wait(calculation_future)
        result_calc = dask_client.gather(calculation_future)[0]
        if result_calc != 0:
            logger.write("***FAIL*** An error occurred in calculations method, "
                         "check worker logs for more info.")
            logger.close()
            sys.exit("")
    logger.write(" 2. Done.")
    logger.write(" 3. Generating Figures")
    logger.write(" - Collecting files from target directory (" + targetDir + ").")
    fList3 = sorted(glob.glob(targetDir + "WRFPRS_F*"))
    logger.write(" - " + str(len(fList3)) + " files have been found.")
    logger.write(" -> Pushing run_plotting_routines() to dask.")
    fullDict = _pySet.get_full_dict()
    plotting_future = start_plotting(dask_client, fullDict, dask_threads)
    wait(plotting_future)
    result_plot = dask_client.gather(plotting_future)[0]
    if result_plot != 0:
        logger.write("***FAIL*** An error occurred in plotting method, "
                     "check worker logs for more info.")
        logger.close()
        sys.exit("")
    logger.write(" 3. Done.")
    logger.write(" 4. Final Steps")
    logger.write(" 4. Done, Closing Dask Client.")
    dask_client.retire_workers(workers=dask_client.scheduler_info()['workers'],
                               close=True)
    dask_client.close()
    logger.write("All Steps Completed.")
    logger.write("***SUCCESS*** Program execution complete.")
    logger.close()
if __name__ == "__main__":
    if use_local_cluster:
        print(f"Creating local cluster with {ncpus_local} workers."
              f" Dashboard address: {dashboard_address}")
        client = Client(
            processes=True,
            dashboard_address=dashboard_address,
            n_workers=ncpus_local,
            threads_per_worker=1,
            memory_limit="4GB",
        )
    else:
        print(f"Creating Slurm cluster at {slurm_cluster_ip}."
              f" Dashboard address: {dashboard_address}")
        client = Client(parameters["slurm_cluster_ip"])
    parameters["ncpus"] = len(client.scheduler_info()["workers"])
    print("Cluster created!")

    datasets = parameters["grouping"].keys()
    # datasets = ["dy_m100_mg"]
    parameters["hist_vars"] = [
        "dimuon_mass",
        "dimuon_pt",
        "dimuon_eta",
        "dimuon_phi",
        "dimuon_dEta",
        "dimuon_dPhi",
        "dimuon_dR",
        "dimuon_rap",
        "dimuon_cos_theta_cs",
ask_workers = 2
memory = '4GB'
from dask_jobqueue import PBSCluster
from dask.distributed import Client
import dask.dataframe as dd
import xarray as xr

cluster = PBSCluster(cores=1, memory=memory, project='PerfTestPangeo',
                     walltime='04:00:00')
cluster.scale(ask_workers)
c = Client(cluster)
c  # notebook cell: display the client summary

from dask.utils import ensure_dict, format_bytes

wk = c.scheduler_info()["workers"]
text = "Workers= " + str(len(wk))
memory = [w["memory_limit"] for w in wk.values()]
cores = sum(w["nthreads"] for w in wk.values())
text += ", Cores=" + str(cores)
if all(memory):
    text += ", Memory=" + format_bytes(sum(memory))
print(text)
# Workers= 2, Cores=2, Memory=8.00 GB

%time ds = xr.open_zarr('/work/ALT/odatis/eNATL60/zarr/eNATL60-BLBT02-SSH-1h')  # 56.3 ms
%time mean = ds.sossheig.mean(dim='time_counter')  # 195 ms
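# The worker-summary idiom above generalizes to a small helper; a sketch over
# any distributed Client:
from dask.distributed import Client
from dask.utils import format_bytes

def cluster_summary(client: Client) -> str:
    workers = client.scheduler_info()["workers"]
    cores = sum(w["nthreads"] for w in workers.values())
    memory = [w["memory_limit"] for w in workers.values()]
    text = f"Workers={len(workers)}, Cores={cores}"
    if all(memory):
        text += f", Memory={format_bytes(sum(memory))}"
    return text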
model_params = {'n_estimators': [20, 30]}

# create random forest classifier model
rf = RandomForestClassifier(random_state=1)

# set up grid search meta-estimator
clf = GridSearchCV(rf, model_params, cv=3)

# DASK - TRAIN MODEL
# ### Fit Model with Dask
from joblib import Parallel, parallel_backend

with parallel_backend('dask'):
    model = clf.fit(X_train, y_train)

## Optional - print dask cluster config
import json
print(json.dumps(client.scheduler_info(), indent=4))

## stop CDSW workers
import cdsw
cdsw.stop_workers()

# print winning set of hyperparameters
from pprint import pprint
pprint(model.best_estimator_.get_params())

# generate predictions using the best-performing model
predictions = model.predict(X_test)
print(predictions)
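# The snippet above assumes a `client` created earlier in the session; on CDSW
# this would come from its Dask worker helpers, but any running Client is picked
# up by the joblib 'dask' backend. A local stand-in sketch (sizing illustrative):
from dask.distributed import Client
client = Client(n_workers=2, threads_per_worker=1)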
def start_dask_cluster(environment=os.path.basename(os.environ['CONDA_PREFIX']),
                       worker_profile='Medium Worker',
                       profile='default',
                       region='us-west-2',
                       endpoint=None,
                       worker_min=2,
                       worker_max=20,
                       adaptive_scaling=True,
                       wait_for_cluster=True,
                       cfile=None,
                       use_existing_cluster=True,
                       propagate_env=False):
    '''
    environment - should match the kernel running, and will be set automatically
    worker_profile - 'Small Worker', 'Medium Worker', or 'Pangeo Worker' (determines available memory in a worker)
    profile - 'default' is good, but others can be used
    region - AWS region
    endpoint - None by default matches region. Set correct endpoint to s3 buckets
    worker_min - minimum number of workers (for adaptive scaling)
    worker_max - maximum number of workers
    adaptive_scaling - Default True. If False, launches worker_max workers
    wait_for_cluster - Default True.
    cfile - None. Finds aws credentials in this file
    use_existing_cluster - Default True.
    propagate_env - Default False. Set to True when working with Cloud VRTs
    '''
    if not endpoint:
        endpoint = f's3.{region}.amazonaws.com'
    set_credentials2(profile=profile, region=region, endpoint=endpoint, cfile=cfile)

    try:
        gateway.list_clusters()
    except Exception:
        # `gateway` is unbound on the first call; create it here.
        gateway = Gateway()

    if gateway.list_clusters():
        print('Existing Dask clusters:')
        j = 0
        for c in gateway.list_clusters():
            print(f'Cluster Index c_idx: {j} / Name:', c.name, c.status)
            j += 1
    else:
        print('No Cluster running.')

    # TODO Check if worker_profile is the same, otherwise start new cluster
    if gateway.list_clusters() and use_existing_cluster:
        print('Using existing cluster [0].')
        cluster = gateway.connect(gateway.list_clusters()[0].name)
    else:
        print('Starting new cluster.')
        cluster = gateway.new_cluster(environment=environment, profile=worker_profile)

    if adaptive_scaling:
        print(f'Setting Adaptive Scaling min={worker_min}, max={worker_max}')
        cluster.adapt(minimum=worker_min, maximum=worker_max)
    else:
        print(f'Setting Fixed Scaling workers={worker_max}')
        cluster.scale(worker_max)

    try:
        client = Client(cluster)
        client.close()
        print('Reconnect client to clear cache')
    except Exception:
        pass
    client = Client(cluster)
    print('client.dashboard_link (for new browser tab/window or dashboard '
          f'searchbar in Jupyterhub):\n{client.dashboard_link}')

    if wait_for_cluster:
        target_workers = worker_min if adaptive_scaling else worker_max
        live_workers = len(list(cluster.scheduler_info['workers']))
        t = 0
        interval = 2
        print(f'Elapsed time to wait for {target_workers} live workers:\n'
              f'{live_workers}/{target_workers} workers - {t} seconds', end='')
        while live_workers < target_workers:
            sleep(interval)
            t += interval
            print(f'\r{live_workers}/{target_workers} workers - {t} seconds', end='')
            live_workers = len(client.scheduler_info()['workers'])
        print(f'\r{live_workers}/{target_workers} workers - {t} seconds')

    # We need to propagate credentials to the workers
    # set_credentials(profile=profile, region=region, endpoint=endpoint)
    if propagate_env:
        print('Propagating environment variables to workers')

        class InitWorker(WorkerPlugin):
            name = "init_worker"

            def __init__(self, filepath=None, script=None):
                self.data = {}
                if filepath:
                    if isinstance(filepath, str):
                        filepath = [filepath]
                    for file_ in filepath:
                        with open(file_, "rb") as f:
                            filename = os.path.basename(file_)
                            self.data[filename] = f.read()
                if script:
                    filename = f"{uuid.uuid1()}.py"
                    self.data[filename] = script

            async def setup(self, worker):
                responses = await asyncio.gather(*[
                    worker.upload_file(comm=None, filename=filename, data=data, load=True)
                    for filename, data in self.data.items()
                ])
                assert all(
                    len(data) == r["nbytes"]
                    for r, data in zip(responses, self.data.values()))

        script = f"""
\rimport os
\ros.environ["AWS_ACCESS_KEY_ID"] = "{os.getenv('AWS_ACCESS_KEY_ID')}"
\ros.environ["AWS_SECRET_ACCESS_KEY"] = "{os.getenv('AWS_SECRET_ACCESS_KEY')}"
\ros.environ["AWS_DEFAULT_REGION"] = "{os.getenv('AWS_DEFAULT_REGION')}"
\ros.environ["GDAL_DISABLE_READDIR_ON_OPEN"] = "EMPTY_DIR"
"""
        plugin = InitWorker(script=script)
        client.register_worker_plugin(plugin)

    return client, cluster
class ProcessManager(GenericProcessManager):
    manager: "ProcessManager" = None

    @classmethod
    def getManager(cls) -> Optional["ProcessManager"]:
        return cls.manager

    @classmethod
    def initManager(cls, serverConfiguration: Dict[str, str]) -> "ProcessManager":
        if cls.manager is None:
            cls.manager = ProcessManager(serverConfiguration)
        return cls.manager

    def __init__(self, serverConfiguration: Dict[str, str]):
        self.config = serverConfiguration
        self.logger = EDASLogger.getLogger()
        self.num_wps_requests = 0
        self.scheduler_address = serverConfiguration.get("scheduler.address", None)
        self.maxworkers = serverConfiguration.get("scheduler.maxworkers", 16)
        self.submitters = []
        self.slurm_clusters = {}
        self.active = True
        if self.scheduler_address is not None:
            if self.scheduler_address.lower().startswith("slurm"):
                scheduler_parms = self.scheduler_address.split(":")
                queue = "default" if len(scheduler_parms) < 2 else scheduler_parms[1]
                self.client = Client(self.getSlurmCluster(queue))
            else:
                self.logger.info("Initializing Dask-distributed cluster with "
                                 "scheduler address: " + self.scheduler_address)
                self.client = Client(self.scheduler_address, timeout=63)
        else:
            nWorkers = int(self.config.get("dask.nworkers", multiprocessing.cpu_count()))
            self.client = Client(LocalCluster(n_workers=nWorkers))
            self.scheduler_address = self.client.scheduler.address
            self.logger.info(f"Initializing Local Dask cluster with {nWorkers} workers, "
                             f"scheduler address = {self.scheduler_address}")
            self.client.submit(lambda x: edasOpManager.buildIndices(x), nWorkers)
        self.ncores = self.client.ncores()
        self.logger.info(f" ncores: {self.ncores}")
        self.scheduler_info = self.client.scheduler_info()
        self.workers: Dict = self.scheduler_info.pop("workers")
        self.logger.info(f" workers: {self.workers}")
        log_metrics = serverConfiguration.get("log.scheduler.metrics", False)
        if log_metrics:
            self.metricsThread = Thread(target=self.trackMetrics)
            self.metricsThread.start()

    def getSlurmCluster(self, queue: str):
        self.logger.info(f"Initializing Slurm cluster using queue {queue}")
        cluster = self.slurm_clusters.setdefault(
            queue, SLURMCluster() if queue == "default" else SLURMCluster(queue=queue))
        cluster.adapt(minimum=1, maximum=self.maxworkers, interval="2s", wait_count=500)
        print("CLUSTER JOB SCRIPT: " + cluster.job_script())
        return cluster

    def getCWTMetrics(self) -> Dict:
        metrics_data = {key: {} for key in
                        ['user_jobs_queued', 'user_jobs_running', 'wps_requests',
                         'cpu_ave', 'cpu_count', 'memory_usage', 'memory_available']}
        metrics = self.getProfileData()
        counts = metrics["counts"]
        workers = metrics["workers"]
        for key in ['tasks', 'processing', 'released', 'memory', 'saturated',
                    'waiting', 'waiting_data', 'unrunnable']:
            metrics_data['user_jobs_running'][key] = counts[key]
        for key in ['tasks', 'waiting', 'waiting_data', 'unrunnable']:
            metrics_data['user_jobs_queued'][key] = counts[key]
        for wId, wData in workers.items():
            worker_metrics = wData["metrics"]
            total_memory = wData["memory_limit"]
            memory_usage = worker_metrics["memory"]
            metrics_data['memory_usage'][wId] = memory_usage
            metrics_data['memory_available'][wId] = total_memory - memory_usage
            metrics_data['cpu_count'][wId] = wData["ncores"]
            metrics_data['cpu_ave'][wId] = worker_metrics["cpu"]
        return metrics_data

    def trackMetrics(self, sleepTime=1.0):
        isIdle = False
        self.logger.info(" ** TRACKING METRICS ** ")
        while self.active:
            metrics = self.getProfileData()
            counts = metrics["counts"]
            if counts['processing'] == 0:
                if not isIdle:
                    self.logger.info(" ** CLUSTER IS IDLE ** ")
                isIdle = True
            else:
                isIdle = False
                self.logger.info(f" METRICS: {metrics['counts']} ")
                workers = metrics["workers"]
                for key, value in workers.items():
                    self.logger.info(f" *** {key}: {value}")
                self.logger.info(f" HEALTH: {self.getHealth()}")
            time.sleep(sleepTime)

    def getWorkerMetrics(self):
        metrics = {}
        wkeys = ['ncores', 'memory_limit', 'last_seen', 'metrics']
        scheduler_info = self.client.scheduler_info()
        workers: Dict = scheduler_info.get("workers", {})
        for iW, worker in enumerate(workers.values()):
            metrics[f"W{iW}"] = {wkey: worker[wkey] for wkey in wkeys}
        return metrics

    def getDashboardAddress(self):
        stoks = self.scheduler_address.split(":")
        host_address = stoks[-2].strip("/")
        return f"http://{host_address}:8787"

    def getCounts(self) -> Dict:
        profile_address = f"{self.getDashboardAddress()}/json/counts.json"
        return requests.get(profile_address).json()

    def getHealth(self, mtype: str = "") -> str:
        profile_address = f"{self.getDashboardAddress()}/health"
        return requests.get(profile_address).text

    def getMetrics(self, mtype: str = "") -> Optional[Dict]:
        counts = self.getCounts()
        if counts['processing'] == 0:
            return None
        mtypes = mtype.split(",")
        metrics = {"counts": counts}
        if "processing" in mtypes:
            metrics["processing"] = self.client.processing()
        if "profile" in mtypes:
            metrics["profile"] = self.client.profile()
        return metrics

    def getProfileData(self, mtype: str = "") -> Dict:
        try:
            return {"counts": self.getCounts(), "workers": self.getWorkerMetrics()}
        except Exception:
            self.logger.error("Error in getProfileData")
            self.logger.error(traceback.format_exc())

    # Reference: scheduler dashboard endpoints, e.g. JSON routes
    # ("json/counts.json", "json/identity.json", "individual-plots.json",
    # "metrics" for Prometheus, "health") and Bokeh pages ("/status",
    # "/workers", "/tasks", "/system", "/stealing", "/events", "/counters",
    # "/profile", "/profile-server", "/graph", and the "individual-*" plots).

    def term(self):
        self.active = False
        self.client.close()

    def runProcess(self, job: Job) -> EDASDataset:
        start_time = time.time()
        try:
            self.logger.info(f"Running workflow for requestId: {job.requestId}, "
                             f"scheduler: {self.scheduler_address}")
            result = edasOpManager.buildTask(job)
            self.logger.info("Completed EDAS workflow in time "
                             + str(time.time() - start_time))
            return result
        except Exception as err:
            self.logger.error("Execution error: " + str(err))
            traceback.print_exc()

    def submitProcess(self, service: str, job: Job, resultHandler: ExecHandler):
        submitter: SubmissionThread = SubmissionThread(job, resultHandler)
        self.submitters.append(submitter)
        submitter.start()
num_hipergator_workers = 120
# start_workers() and the threads= argument come from an older dask-jobqueue
# API; newer releases use cores=/processes= and cluster.scale().
cluster = SLURMCluster(processes=1,
                       queue='hpg2-compute',
                       threads=1,
                       memory='4GB',
                       walltime='96:00:00',
                       death_timeout=600,
                       local_directory='/tmp/')
print('Starting up workers')
workers = cluster.start_workers(num_hipergator_workers)
dask_client = Client(cluster)

wait_time = 0
while len(dask_client.scheduler_info()['workers']) < num_hipergator_workers / 2:
    print('waiting on workers: {s} sec. so far'.format(s=wait_time))
    sleep(10)
    wait_time += 10
    # If 5 minutes goes by try adding them again
    if wait_time > 300:
        workers.extend(cluster.start_workers(1))
print('Most workers accounted for')

##################################################
# Main
species_info = pd.read_csv(config['species_list_file'])
def main():
    memory_limit = 128e9
    threads_per_worker = 4
    cluster = LocalCUDACluster(memory_limit=memory_limit,
                               threads_per_worker=threads_per_worker)
    client = Client(cluster)
    sched_info = client.scheduler_info()

    print('CLIENT: {}'.format(client))
    print('SCHEDULER INFO:\n{}'.format(json.dumps(sched_info, indent=2)))

    # Importing here in case RMM is used later on. Must start client prior
    # to importing cudf stuff if using RMM.
    from greenflow.dataframe_flow import (TaskSpecSchema, TaskGraph)

    # workers_names = \
    #     [iw['name'] for iw in client.scheduler_info()['workers'].values()]
    # nworkers = len(workers_names)

    _basedir = os.path.dirname(__file__)
    # mortgage_data_path = '/datasets/rapids_data/mortgage'
    mortgage_data_path = os.path.join(_basedir, 'mortgage_data')

    # Using some default csv files for testing.
    # csvfile_names = os.path.join(mortgage_data_path, 'names.csv')
    # acq_data_path = os.path.join(mortgage_data_path, 'acq')
    # perf_data_path = os.path.join(mortgage_data_path, 'perf')
    # csvfile_acqdata = os.path.join(acq_data_path, 'Acquisition_2000Q1.txt')
    # csvfile_perfdata = \
    #     os.path.join(perf_data_path, 'Performance_2000Q1.txt_0')
    # mortgage_etl_workflow_def(
    #     csvfile_names, csvfile_acqdata, csvfile_perfdata)

    greenflow_task_spec_list = mortgage_etl_workflow_def()

    start_year = 2000
    end_year = 2001  # end_year is inclusive
    # end_year = 2016  # end_year is inclusive
    # part_count = 16  # the number of data files to train against

    # create_dmatrix_serially - When False on same node if not enough host RAM
    # then it's a race condition when creating the dmatrix. Make sure enough
    # host RAM otherwise set to True.
    # create_dmatrix_serially = False
    # able to do 18 with create_dmatrix_serially set to True
    part_count = 18  # the number of data files to train against
    create_dmatrix_serially = True
    # part_count = 4  # the number of data files to train against

    # Use RAPIDS Memory Manager. Seems to work fine without it.
    use_rmm = False

    # Clean up intermediate dataframes in the xgboost training task.
    delete_dataframes = True

    mortgage_run_params_dict_list = generate_mortgage_greenflow_run_params_list(
        mortgage_data_path, start_year, end_year, part_count,
        greenflow_task_spec_list)

    _basedir = os.path.dirname(__file__)
    mortgage_lib_module = os.path.join(_basedir, 'mortgage_greenflow_plugins.py')

    filter_dask_logger = False

    mortgage_workflow_runner_task = {
        TaskSpecSchema.task_id:
            MortgageTaskNames.dask_mortgage_workflow_runner_task_name,
        TaskSpecSchema.node_type: 'DaskMortgageWorkflowRunner',
        TaskSpecSchema.conf: {
            'mortgage_run_params_dict_list': mortgage_run_params_dict_list,
            'client': client,
            'use_rmm': use_rmm,
            'filter_dask_logger': filter_dask_logger,
        },
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    dxgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': 1,
        'distributed_dask': True,
        'loss': 'ls',
        # 'objective': 'gpu:reg:linear',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    dxgb_trainer_task = {
        TaskSpecSchema.task_id: MortgageTaskNames.dask_xgb_trainer_task_name,
        TaskSpecSchema.node_type: 'DaskXgbMortgageTrainer',
        TaskSpecSchema.conf: {
            'create_dmatrix_serially': create_dmatrix_serially,
            'delete_dataframes': delete_dataframes,
            'dxgb_gpu_params': dxgb_gpu_params,
            'client': client,
            'filter_dask_logger': filter_dask_logger
        },
        TaskSpecSchema.inputs: [
            MortgageTaskNames.dask_mortgage_workflow_runner_task_name
        ],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    task_spec_list = [mortgage_workflow_runner_task, dxgb_trainer_task]
    out_list = [MortgageTaskNames.dask_xgb_trainer_task_name]
    task_graph = TaskGraph(task_spec_list)
    (bst,) = task_graph.run(out_list)

    print('XGBOOST BOOSTER:\n', bst)
def random_search_cv(est, X, y, param_dist, n_iter=100, cv=3, client=None,
                     log2file=False, log2tbx=False):
    if client is None:
        client = Client()
    if log2file or log2tbx:
        log_dir = make_logdir()
    if log2file:
        logger = make_filelogger(log_dir)
    if log2tbx:
        writer = make_summarywriter(log_dir)

    cv = check_cv(cv)
    source = source_for_param_space(param_dist)
    layout = cv_curve_and_table_for_source(source)
    h = bp.show(layout, notebook_handle=True)

    # Seed the cluster with one evaluation per worker, then keep it saturated.
    nworkers = len(client.scheduler_info()['workers'])
    param_iter = iter(ParameterSampler(param_dist, n_iter=n_iter))
    initial_params = [next(param_iter) for _ in range(nworkers)]
    initial_futures = [evaluate(est, X, y, params, cv, client)
                       for params in initial_params]
    params_map = dict(zip(initial_futures, initial_params))
    af = as_completed(initial_futures)

    for step, future in enumerate(af):
        mean_train_score, mean_test_score = future.result()
        source.stream({
            'index': [len(source.data['index'])],
            'mean_test_score': [mean_test_score],
            'cummax_score': [max(np.append(source.data['cummax_score'],
                                           mean_test_score))],
            'tstamp': [pd.datetime.fromtimestamp(time())],
            'params': [str(params_map[future])],
        })
        push_notebook(h)
        if log2file:
            logger.info('scoring', extra={
                'mean_test_score': mean_test_score,
                'mean_train_score': mean_train_score,
                'tstamp': time(),
                'params': params_map[future],
            })
        if log2tbx:
            writer.add_scalars(
                'search-results/cv-curve', {
                    'mean_test_score': np.array(mean_test_score),
                    'cummax_score': np.array(
                        max(np.append(source.data['cummax_score'],
                                      mean_test_score))),
                }, step)
        # Replace the finished future with a fresh parameter sample, if any remain.
        try:
            params = next(param_iter)
            f = evaluate(est, X, y, params, cv, client)
            params_map[f] = params
            af.add(f)
        except StopIteration:
            pass
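# `evaluate` is not defined in the snippet above. A plausible minimal sketch:
# submit a cross-validated fit to the cluster and return a future resolving to
# (mean_train_score, mean_test_score). The use of sklearn's clone and
# cross_validate here is an assumption, not the source's implementation:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import cross_validate

def evaluate(est, X, y, params, cv, client):
    def _fit_and_score():
        scores = cross_validate(clone(est).set_params(**params), X, y,
                                cv=cv, return_train_score=True)
        return (float(np.mean(scores['train_score'])),
                float(np.mean(scores['test_score'])))
    return client.submit(_fit_and_score, pure=False)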
class LargeELMRegressor(BasicELM, RegressorMixin):
    """ELM Regressor for larger-than-memory problems.

    Uses `Dask <https://dask.org>`_ for batch analysis of data in Parquet files.

    .. attention:: Why do I need Parquet files?

        Parquet files provide necessary information about the data without loading
        whole file content from disk. It makes a tremendous runtime difference
        compared to simpler `.csv` or `.json` file formats. Reading from files saves
        memory by loading data in small chunks, supporting arbitrary large input
        files. It also solves current memory leaks with Numpy matrix inputs in Dask.

        Any data format can be easily converted to Parquet, see
        `Analytical methods <techniques.html>`_ section.

        HDF5 is almost as good as Parquet, but performs worse with Dask due to
        internal data layout.

    .. todo: Write converters.

    .. todo: Memo about number of workers: one is good, several cover disk read
        latency but need more memory. On one machine matrix operators always run
        in parallel, do not benefit from Dask.

    .. todo: Memory consumption with large number of neurons - 100,000 neurons
        require 200GB or swap space, with read+write reaching 1GB/s. Suggested a
        fast SSD, or HDD + extra workers to hide swap latency. Mention that Dask
        is not the perfect solution, kept here for future updates. And it actually
        solves stuff larger than memory, albeit at a very high time+swap cost.

    .. todo: Avoid large batch sizes as workers can fail, safe bet is 2000-5000 range.

    .. todo: Fast HtH and in-place Cholesky solver.

    .. todo: Pro tip in documentation: run ELM with dummy 1000 data samples and
        1e+9 regularization. This will test possible memory issues for workers
        without wasting your time on computing full HH.

    .. todo: Option to keep full HH permanently somewhere at disk. Saves before
        the final step, avoids failures from memory issues during Cholesky solver.

    .. todo: GPU + batch Cholesky solver, for both ELM and LargeELM.

    Requirements
    ------------
        * Pandas
        * pyarrow
        * python-snappy

    Parameters
    ----------
    batch_size : int
        Batch size used for both data samples and hidden neurons. With batch
        Cholesky solver, allows for very large numbers of hidden neurons of over
        100,000; limited only by the computation time and disk swap space.

        .. hint:: Include bias and original features for best performance.

            ELM will include a bias term (1 extra feature), and the original
            features with `include_original_features=True`. For optimal
            performance, choose `batch_size` to be equal or evenly divide the
            `n_neurons + 1 (bias) + n_inputs (if include_original_features=True)`.

        .. todo:: Exact batch_size vs. GPU performance
    """

    def __del__(self):
        if hasattr(self, 'client_'):
            self.client_.close()
            self.cluster_.close()

    def _setup_dask_client(self):
        self.cluster_ = LocalCluster(n_workers=4, threads_per_worker=1,
                                     local_dir="/Users/akusok/wrkdir/dask-temp",
                                     memory_limit="8GB")
        self.client_ = Client(self.cluster_)

        W_list = [hl.projection_.components_ for hl in self.hidden_layers_]
        W_dask = [da.from_array(_dense(W), chunks=self.bsize_) for W in W_list]
        self.W_ = self.client_.persist(W_dask)

        def foo():
            import os
            os.environ['OMP_NUM_THREADS'] = '1'
        self.client_.run(foo)

        print("Running on:", self.client_)
        try:
            dashboard = self.client_.scheduler_info()['address'].split(":")
            dashboard[0] = "http"
            dashboard[-1] = str(self.client_.scheduler_info()['services']['dashboard'])
            print("Dashboard at", ":".join(dashboard))
        except Exception:
            pass

    def _project(self, X_dask):
        """Compute hidden layer output with Dask functionality."""
        H_list = []
        for hl, W in zip(self.hidden_layers_, self.W_):
            if hl.hidden_layer_ == HiddenLayerType.PAIRWISE:
                H0 = X_dask.map_blocks(pairwise_distances, W,
                                       dtype=X_dask.dtype,
                                       chunks=(X_dask.chunks[0], (W.shape[0],)),
                                       metric=hl.pairwise_metric)
            else:
                XW_dask = da.dot(X_dask, W.transpose())
                if hl.ufunc_ is dummy:
                    H0 = XW_dask
                elif hl.ufunc_ is np.tanh:
                    H0 = da.tanh(XW_dask)
                else:
                    H0 = XW_dask.map_blocks(hl.ufunc_)
            H_list.append(H0)

        if self.include_original_features:
            H_list.append(X_dask)
        H_list.append(da.ones((X_dask.shape[0], 1)))

        H_dask = da.concatenate(H_list, axis=1).rechunk(self.bsize_)
        return H_dask

    def _compute(self, X, y, sync_every, HH=None, HY=None):
        """Computing matrices HH and HY, the actually long part.

        .. todo: actually distributed computations that scatter batches of data
            file names, and reduce-sum the HH,HY matrices.
        """
        # processing files
        for i, X_file, y_file in zip(range(len(X)), X, y):
            X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
            Y_dask = dd.read_parquet(y_file).to_dask_array(lengths=True)
            H_dask = self._project(X_dask)

            if HH is None:  # first iteration
                HH = da.dot(H_dask.transpose(), H_dask)
                HY = da.dot(H_dask.transpose(), Y_dask)
            else:
                HH += da.dot(H_dask.transpose(), H_dask)
                HY += da.dot(H_dask.transpose(), Y_dask)

            # synchronization
            if sync_every is not None and i % sync_every == 0:
                wait([HH, HY])
                HH, HY = self.client_.persist([HH, HY])

        # finishing solution
        if sync_every is not None:
            wait([HH, HY])
        return HH, HY

    def _solve(self, HH, HY):
        """Compute output weights from HH and HY using Dask functionality."""
        # make HH/HY divisible by chunk size
        n_features, _ = HH.shape
        padding = 0
        if n_features > self.bsize_ and n_features % self.bsize_ > 0:
            print("Adjusting batch size {} to n_features {}".format(self.bsize_, n_features))
            padding = self.bsize_ - (n_features % self.bsize_)
            P01 = da.zeros((n_features, padding))
            P10 = da.zeros((padding, n_features))
            P11 = da.zeros((padding, padding))
            HH = da.block([[HH, P01], [P10, P11]])
            P1 = da.zeros((padding, HY.shape[1]))
            HY = da.block([[HY], [P1]])

        # rechunk, add bias, and solve
        HH = HH.rechunk(self.bsize_) + self.alpha * da.eye(HH.shape[1], chunks=self.bsize_)
        HY = HY.rechunk(self.bsize_)
        B = da.linalg.solve(HH, HY, sym_pos=True)
        if padding > 0:
            B = B[:n_features]
        return B

    def fit(self, X, y=None, sync_every=10):
        """Fits an ELM with data in a bunch of files.

        Model will use the set of features from the first file. Same features
        must have same names across the whole dataset.

        .. todo: Check what happens if features are in different order or missing.

        Does **not** support sparse data.

        .. todo: Check if some sparse data would work.
        .. todo: Check that sync_every does not affect results
        .. todo: Add single precision
        .. todo: Parquet file format examples in documentation

        Original features and bias are added to the end of data, for easier
        rechunk-merge. This way full chunks of hidden neuron outputs stay intact.

        Parameters
        ----------
        X : [str]
            List of input data files in Parquet format.
        y : [str]
            List of target data files in Parquet format.
        sync_every : int or None
            Synchronize computations after this many files are processed. None
            for running without synchronization. Less synchronization improves
            run speed with smaller data files, but may result in large swap
            space usage for large data problems. Use smaller number for more
            frequent synchronization if swap space becomes a problem.
        """
        if not _is_list_of_strings(X) or not _is_list_of_strings(y):
            raise ValueError("Expected X and y as lists of file names.")
        if len(X) != len(y):
            raise ValueError("Expected X and y as lists of files with the same length. "
                             "Got len(X)={} and len(y)={}".format(len(X), len(y)))

        # read first file and get parameters
        X_dask = dd.read_parquet(X[0]).to_dask_array(lengths=True)
        Y_dask = dd.read_parquet(y[0]).to_dask_array(lengths=True)

        n_samples, n_features = X_dask.shape
        if hasattr(self, 'n_features_') and self.n_features_ != n_features:
            raise ValueError('Shape of input is different from what was seen in `fit`')

        _, n_outputs = Y_dask.shape
        if hasattr(self, 'n_outputs_') and self.n_outputs_ != n_outputs:
            raise ValueError('Shape of outputs is different from what was seen in `fit`')

        # set batch size, default is bsize=2000 or all-at-once with less than 10_000 samples
        self.bsize_ = self.batch_size
        if self.bsize_ is None:
            self.bsize_ = n_samples if n_samples < 10 * 1000 else 2000

        # init model if not fit yet
        if not hasattr(self, 'hidden_layers_'):
            self.n_features_ = n_features
            self.n_outputs_ = n_outputs
            X_sample = X_dask[:10].compute()
            self._init_hidden_layers(X_sample)
            self._setup_dask_client()

        HH, HY = self._compute(X, y, sync_every=sync_every)
        self.B = self._solve(HH, HY)
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Prediction works with both lists of Parquet files and numeric arrays.

        Parameters
        ----------
        X : array-like, [str]
            Input data as list of Parquet files, or as a numeric array.

        Returns
        -------
        Yh : array, shape (n_samples, n_outputs)
            Predicted values for all input samples.

        .. attention:: Returns all outputs as a single in-memory array!

            Danger of running out of memory for high-dimensional outputs, if a
            large set of input files is provided. Feed data in smaller batches
            in such case.
        """
        check_is_fitted(self, 'is_fitted_')
        if _is_list_of_strings(X):
            Yh_list = []
            # processing files
            for X_file in X:
                X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
                H_dask = self._project(X_dask)
                Yh_list.append(da.dot(H_dask, self.B))
            Yh_dask = da.concatenate(Yh_list, axis=0)
            return Yh_dask.compute()
        else:
            X = check_array(X, accept_sparse=True)
            H = [np.ones((X.shape[0], 1))]
            if self.include_original_features:
                H.append(_dense(X))
            H.extend([hl.transform(X) for hl in self.hidden_layers_])
            return np.hstack(H) @ self.B.compute()
zone = sys.argv[3]
expired = int(sys.argv[4])

with open('post.pkl', 'rb') as fopen:
    post_message = cloudpickle.load(fopen)

while True:
    try:
        client = Client('dask:8786')
        break
    except Exception:
        time.sleep(5)

now = datetime.now()
while True:
    workers = client.scheduler_info()['workers']
    # Reset the idle timer whenever any worker is still executing tasks.
    if any(v['metrics']['executing'] != 0 for k, v in workers.items()):
        now = datetime.now()
    if (datetime.now() - now).seconds > expired:
        slack_msg = """
Gracefully deleted Dask cluster.
*Time shutdown*: {exec_date}
*Dask cluster name*: {dask_name}
""".format(exec_date=str(datetime.now()), dask_name=name)
        post_message(slack_msg)
        compute = googleapiclient.discovery.build('compute', 'v1')
        compute.instances().delete(project=project, zone=zone, instance=name).execute()
        break
    time.sleep(5)  # poll periodically instead of busy-looping
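# The idleness test above can be factored into a small helper over the same
# scheduler_info() metrics; a sketch:
def cluster_is_busy(client) -> bool:
    workers = client.scheduler_info()['workers']
    return any(w['metrics']['executing'] != 0 for w in workers.values())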
# Methods of the CMIP6_ozone class (the class statement is truncated in the source).
def setup_logging(self):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

def convert_to_toz(self):
    scenario = "ssp585"
    baseurl = "../oceanography/cmip6/ozone-absorption/"
    ds = self.get_input4mpis_forcing(scenario, baseurl)
    self.convert_vmro3_to_toz(scenario, ds, baseurl)

def main():
    ozone = CMIP6_ozone()
    ozone.setup_logging()
    ozone.convert_to_toz()

if __name__ == '__main__':
    np.warnings.filterwarnings('ignore')
    # https://docs.dask.org/en/latest/diagnostics-distributed.html
    from dask.distributed import Client
    dask.config.set(scheduler='processes')
    client = Client()
    status = client.scheduler_info()['services']
    print("Dask started with status at: http://localhost:{}/status".format(status["dashboard"]))
    print(client)
    main()
class Adaptor:
    adr = ""
    client = None
    workers = []
    queues = []

    def __init__(self, Sworker, scheduler_info):
        with open(scheduler_info) as f:
            s = json.load(f)
        self.adr = s["address"]
        # msgpack does not serialize large dask messages; restrict serializers.
        self.client = Client(self.adr, serializers=['dask', 'pickle'])
        dask.config.set({
            "distributed.deploy.lost-worker-timeout": 60,
            "distributed.workers.memory.spill": 0.97,
            "distributed.workers.memory.target": 0.95,
            "distributed.workers.memory.terminate": 0.99
        })
        self.workers = [comm.get_address_host_port(i, strict=False)
                        for i in self.client.scheduler_info()["workers"].keys()]
        while len(self.workers) != Sworker:
            self.workers = [comm.get_address_host_port(i, strict=False)
                            for i in self.client.scheduler_info()["workers"].keys()]
        Variable("workers").set(self.workers)

    def create_array(self, name, shape, chunksize, dtype, timedim):
        chunks_in_each_dim = [shape[i] // chunksize[i] for i in range(len(shape))]
        l = list(itertools.product(*[range(i) for i in chunks_in_each_dim]))
        items = []
        for m in l:
            f = Future(key=("deisa-" + name, m), inform=True, deisa=True)
            d = da.from_delayed(dask.delayed(f), shape=chunksize, dtype=dtype)
            items.append([list(m), d])
        ll = self.array_sort(items)
        arrays = da.block(ll)
        return arrays

    def create_array_list(self, name, shape, chunksize, dtype, timedim):
        # list of arrays, one for each time step.
        chunks_in_each_dim = [shape[i] // chunksize[i] for i in range(len(shape))]
        l = list(itertools.product(*[range(i) for i in chunks_in_each_dim]))
        items = []
        for m in l:
            f = Future(key=("deisa-" + name, m), inform=True, deisa=True)
            d = da.from_delayed(dask.delayed(f), shape=chunksize, dtype=dtype)
            items.append([list(m), d])
        ll = self.array_sort(items)
        arrays = []  # initialize the result list
        for i in ll:
            arrays.append(da.block(i))
        return arrays

    def array_sort(self, ListDs):
        if len(ListDs[0][0]) == 0:
            return ListDs[0][1]
        else:
            dico = dict()
            for e in ListDs:
                dico.setdefault(e[0][0], []).append([e[0][1:], e[1]])
            return [self.array_sort(dico[k]) for k in sorted(dico.keys())]

    def get_data(self, as_list=False):
        arrays = dict()
        self.arrays_desc = Queue("Arrays").get()
        for name in self.arrays_desc:
            if not as_list:
                arrays[name] = self.create_array(
                    name, self.arrays_desc[name]["sizes"],
                    self.arrays_desc[name]["subsizes"],
                    self.arrays_desc[name]["dtype"],
                    self.arrays_desc[name]["timedim"])
            else:  # TODO test this
                arrays[name] = self.create_array_list(
                    name, self.arrays_desc[name]["sizes"],
                    self.arrays_desc[name]["subsizes"],
                    self.arrays_desc[name]["dtype"],
                    self.arrays_desc[name]["timedim"])
        # Barrier after the creation of all the dask arrays
        e = Event("Done")
        e.set()
        return arrays
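# A hedged usage sketch for the Adaptor above; the worker count and the
# scheduler-info file name are illustrative, not from the source:
adaptor = Adaptor(4, "scheduler.json")
arrays = adaptor.get_data()  # dict of dask arrays keyed by field name
for name, arr in arrays.items():
    print(name, arr.shape, arr.dtype)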
# for zarr storage
os.environ['TMPDIR'] = '/blue/adamginsburg/adamginsburg/tmp'

if __name__ == "__main__":
    # need to be in main block for dask to work
    from dask.distributed import Client

    if os.getenv('SLURM_MEM_PER_NODE'):
        memlim_total = int(os.getenv('SLURM_MEM_PER_NODE')) / 1024  # GB
        ntasks = int(os.getenv('SLURM_NTASKS'))
        memlim = memlim_total / ntasks
        print(f"Memory limit is {memlim} GB")
    else:
        memlim = 1
        ntasks = 8
    client = Client(memory_limit=f'{memlim}GB', n_workers=ntasks)
    nworkers = len(client.scheduler_info()['workers'])
    print(f"Client scheduler info: {client.scheduler_info()['services']}")
    print(f"Number of workers: {nworkers} (should be equal to ntasks={ntasks})")
    print(f"Client scheduler info: {client.scheduler_info()}")
    print(f"Client vers: {client.get_versions(check=True)}")

    if os.getenv('ENVIRONMENT') == 'BATCH':
        pass
    else:
        from dask.diagnostics import ProgressBar
        pbar = ProgressBar()
        pbar.register()

    assert tempfile.gettempdir() == '/blue/adamginsburg/adamginsburg/tmp'

    basepath = '/orange/adamginsburg/ALMA_IMF/2017.1.01355.L/imaging_results'
# Setup dask cluster
######################################################
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
from dask import delayed
import dask

cluster = SLURMCluster(processes=1,
                       queue='hpg2-compute',
                       cores=1,
                       memory='10GB',
                       walltime='96:00:00',
                       job_extra=['--qos ewhite-b'],
                       death_timeout=600,
                       local_directory='/tmp/',
                       interface='ib0')
print('Starting up workers')
workers = cluster.start_workers(hindcast_config.num_hipergator_workers)
dask_client = Client(cluster)

wait_time = 0
while len(dask_client.scheduler_info()['workers']) < hindcast_config.num_hipergator_workers:
    print('waiting on workers: {s} sec. so far'.format(s=wait_time))
    sleep(10)
    wait_time += 10
    # If 5 minutes goes by try adding them again
    if wait_time > 300:
        workers.extend(cluster.start_workers(1))
print('All workers accounted for')

# xr import must be after dask.array, and I think after setting
# up the cluster/client.
import dask.array as da
import xarray as xr
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)

start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

if __name__ == "__main__":
    # client = Client()
    client = Client(processes=True, n_workers=2, threads_per_worker=1,
                    memory_limit='12GB')
    print(client.scheduler_info()['services'])
    logger.info("client ready at ... {} ... at {}".format(
        client.scheduler_info()['services'],
        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    soilgrids = {
        'sand': 'soil250_grid_sand_sd',
        'silt': 'soil250_grid_silt_sd',
        'clay': 'soil250_grid_clay_sd',
        'rock': 'soil250_grid_coarsefrag_sd',
        'bd': 'soil250_grid_bulkdens_sd',
        'soc': 'soil250_grid_soc_sd',
        'awc': 'soil250_grid_awc_sd',
        'k_sat': 'soil250_grid_k_sat_sd'
    }
def launch_python_post():
    curDir = os.path.dirname(os.path.abspath(__file__))
    logger = PyPostTools.pyPostLogger()
    logger.write("Initializing WRF Python Post-Processing Program")
    # Step 1: Load program settings
    logger.write(" 1. Application Initialization")
    logger.write(" - Loading control file, python_post_control.txt")
    _pySet = PyPostSettings.PyPostSettings()
    logger.write(" - Success!")
    logger.write(" - Testing Environmental Variables")
    try:
        dask_nodes = os.environ["PYTHON_POST_NODES"]
        dask_threads = os.environ["PYTHON_POST_THREADS"]
        postDir = os.environ["PYTHON_POST_DIR"]
        targetDir = os.environ["PYTHON_POST_TARG_DIR"]
    except KeyError:
        logger.write("***FAIL*** KeyError encountered while trying to access "
                     "important environmental variables, abort.")
        sys.exit("")
    logger.write(" - Success!")
    logger.write(" - Initializing Dask (" + str(dask_nodes)
                 + " Nodes Requested), Collecting routines needed")
    _routines = Routines.Routines()
    logger.write(" - Async IO Loop initialized...")

    def f(scheduler_port):
        async def g(port):
            s = Scheduler(port=port)
            await s
            await s.finished()
        asyncio.get_event_loop().run_until_complete(g(scheduler_port))

    # Starts the scheduler in its own process - needed as otherwise it will
    # occupy the program and make it do an infinite loop
    process = Process(target=f, args=(scheduler_port,))
    process.start()
    logger.write(" - Dask Scheduler initialized (Port " + str(scheduler_port) + ")...")
    try:
        dask_client = Client("tcp://" + socket.gethostname() + ":" + str(scheduler_port),
                             timeout=30)
    except OSError:
        logger.write(" <-> Dask Client could not be created, timeout error.")
        process.terminate()
        sys.exit()
    logger.write(" - Dask Client initialized...")
    logger.write(" - Writing Dask Worker Job Files...")
    with PyPostTools.cd(targetDir):
        writeFile = PyPostTools.write_job_file(socket.gethostname(), scheduler_port,
                                               project_name="climate_severe",
                                               queue="debug-cache-quad",
                                               nodes=dask_nodes,
                                               wall_time=60,
                                               nProcs=1)
        if not writeFile:
            dask_client.close()
            logger.write(" - Failed to write job file, are you missing an "
                         "important parameter?")
            sys.exit("")
            return
        else:
            logger.write(" - Dask Worker Job File Written, Submitting to Queue.")
            PyPostTools.popen("chmod +x dask-worker.job")
            PyPostTools.popen("qsub dask-worker.job")
    # Wait here for workers.
    logger.write(" -> Worker Job submitted to queue, waiting for workers...")
    while len(dask_client.scheduler_info()['workers']) < int(dask_nodes):
        time.sleep(2)
    logger.write(" -> Workers are now connected.")
    logger.write(" - Success!")
    logger.write(" 1. Done.")
    logger.write(" 2. Start Post-Processing Calculations")
    start_calculations(dask_client, _routines, dask_threads, process)
    logger.write(" 2. Done.")
    logger.write(" 3. Generating Figures")
    logger.write(" - Collecting files from target directory (" + targetDir + ").")
    fList3 = sorted(glob.glob(targetDir + "WRFPRS_F*"))
    logger.write(" - " + str(len(fList3)) + " files have been found.")
    logger.write(" -> Pushing run_plotting_routines() to dask.")
    fullDict = _pySet.get_full_dict()
    plotting_future = start_plotting(dask_client, fullDict, dask_threads, process)
    wait(plotting_future)
    result_plot = dask_client.gather(plotting_future)[0]
    if result_plot != 0:
        logger.write("***FAIL*** An error occurred in plotting method, "
                     "check worker logs for more info.")
        logger.close()
        sys.exit("")
    logger.write(" 3. Done.")
    logger.write(" 4. Final Steps")
    logger.write(" 4. Done, Closing Dask Client.")
    # Close the client object
    dask_client.retire_workers(workers=dask_client.scheduler_info()['workers'],
                               close=True)
    dask_client.close()
    logger.write("All Steps Completed.")
    logger.write("***SUCCESS*** Program execution complete.")
    logger.close()
    del dask_client
    process.terminate()
from dask_mpi import initialize
initialize()

import socket
from distributed.scheduler import logger
import dask.array as da
from dask.distributed import Client

client = Client()  # Connect this local process to remote workers

host = client.run_on_scheduler(socket.gethostname)
port = client.scheduler_info()['services']['dashboard']
login_node_address = 'supercomputer.university.edu'  # Change this to the address/domain of your login node

logger.info(f'ssh -N -L {port}:{host}:{port} {login_node_address}')
logger.info('HELLO' * 10)
print('WORLD' * 10)

x = da.random.random((200, 10_000, 5_000), chunks=(20, 1_000, 1_000))
y = x.std(axis=0)
y = y.compute()
print(y)
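# dask_mpi scripts such as the one above are launched under MPI rather than run
# directly; a sketch of the launch command (the script name and process count
# are hypothetical):
#
#   mpirun -np 4 python dask_mpi_tunnel.py
#
# With dask_mpi.initialize(), rank 0 becomes the scheduler, rank 1 runs this
# client code, and the remaining ranks become workers.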
# (fragment: the opening of the SLURMCluster(...) call is truncated in the source)
                     name='make_profiles',
                     walltime='00:30:00',
                     job_extra=['--constraint=HSW24', '--exclusive', '--nodes=1'],
                     memory='120GB',
                     interface='ib0')
cluster.scale(196)
cluster  # notebook cell: display the cluster widget

from dask.distributed import Client
client = Client(cluster)
client  # notebook cell: display the client summary

import time

nb_workers = 0
while True:
    nb_workers = len(client.scheduler_info()["workers"])
    if nb_workers >= 2:
        break
    time.sleep(1)
print(nb_workers)

import sys, glob
import numpy as np
import xarray as xr

sys.path.insert(0, "/scratch/cnt0024/hmg2840/albert7a/DEV/git/xscale")
import xscale.spectral.fft as xfft
import xscale
import Wavenum_freq_spec_func as wfs
import time
from dask.distributed import Client
import dask.array as da
from time import sleep

"""
A perpetual dask scheduler to test dask-top

Will print out the scheduler address and run jobs every few seconds until Ctrl+C
"""

def sqrt(x):
    return x**0.5

if __name__ == '__main__':
    # threads_per_worker is the documented keyword; `nthreads` is not a
    # Client argument.
    client = Client(n_workers=2, threads_per_worker=1, memory_limit='512mb',
                    dashboard_address=8787)
    print(client.scheduler_info())
    sleep(3)
    while True:
        x = client.scatter(da.random.random((1000, 1000), chunks=(50, 50)))
        _ = client.submit(sqrt, x).result().compute()
        sleep(3)
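# Several snippets above poll scheduler_info() in a loop until enough workers
# arrive; recent distributed releases expose this directly on the client, e.g.:
#
#   client.wait_for_workers(n_workers=2)
#
# which blocks until the requested number of workers is connected.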