def main():
    args = get_args()
    client = Client('127.0.0.1:8786')
    ncores = sum(client.ncores().values())
    pd.set_option('display.large_repr', 'truncate')
    pd.set_option('display.max_columns', 0)  # noqa
    pd.set_option('display.max_rows', 1000)  # noqa
    cann_group_df = make_cann_group_df(num_products=100)
    df = read_df(args, cann_group_df['productKey'])
    logger.info('Setting index')
    df = df.set_index('customerKey', drop=True)
    logger.info('Repartitioning')
    df = df.repartition(npartitions=ncores)
    logger.info('Mapping Cann Group')
    df['cannGroupKey'] = df['productKey'].map(cann_group_df['cannGroupKey'])
    logger.info('Persisting')
    df = client.persist(df)
    logger.info('Cann Groups')
    for cann_group_key in cann_group_df['cannGroupKey'].unique().tolist():
        print('Filtering Cann Group %s' % cann_group_key)
        cann_df = df[df['cannGroupKey'] == cann_group_key]
        print('This df: %s' % (len(cann_df),))
        with Timer('%s' % (cann_group_key,)):
            calculate_switching(cann_df)
    return
def init_client():
    import argparse
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('-scheduler', required=False)
    parser.add_argument('-expect-workers', type=int, default=0)
    args, unknown = parser.parse_known_args()
    if args.scheduler:
        from dask.distributed import Client
        client = Client(args.scheduler)
        if args.expect_workers > 0:
            # Poll until the scheduler reports the expected number of workers.
            # Note: len(client.ncores()) counts workers, not cores.
            while True:
                num_workers = len(client.ncores())
                if num_workers >= args.expect_workers:
                    break
                print('Client waiting for workers (have %s expect %s)'
                      % (num_workers, args.expect_workers), flush=True)
                import time
                time.sleep(5)
    else:
        client = None
    return client
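A minimal usage sketch for init_client above; run_pipeline is a hypothetical placeholder for the actual workload, and when no -scheduler argument is given the client is None and Dask falls back to its default local scheduling:

# Hypothetical driver for init_client(); run_pipeline is a placeholder name.
if __name__ == '__main__':
    client = init_client()  # None unless -scheduler was passed
    try:
        run_pipeline()  # Dask collections pick up the client implicitly if one exists
    finally:
        if client is not None:
            client.close()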
def get_total_cores(cluster: SpecCluster, client: Client) -> int:
    """Retrieve the total number of cores from a Dask cluster object."""
    # Client.ncores() returns a mapping of worker address -> core count for
    # every worker known to the scheduler; summing the values gives the total.
    return sum(client.ncores().values())
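A quick sanity check of get_total_cores, as a sketch assuming a local cluster (the worker and thread counts here are illustrative):

# Illustrative check: 2 workers x 4 threads each should report 8 cores total.
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=2, threads_per_worker=4)
client = Client(cluster)
assert get_total_cores(cluster, client) == 2 * 4
client.close()
cluster.close()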
def map_wrapper(function_item, list_items):
    from dask.distributed import Client
    import dask.bag as db
    c = Client()
    # Total cores across all workers, used as the partition count.
    # (The original used len(c.ncores().values()), which counts workers,
    # not cores, despite the variable name.)
    NCORES = sum(c.ncores().values())
    b0 = db.from_sequence(list_items, npartitions=NCORES)
    list_items = list(db.map(function_item, b0).compute())
    return list_items
from random import randint
from typing import Any

from dask.distributed import Client, Queue


def pool_broadcast(client: Client, action: Any, *args: Any, **kwargs: Any):
    """Call ``action(*args, **kwargs)`` on every worker thread.

    This function blocks until all tasks are complete. The expectation is
    that it is called at the very beginning on an empty pool; if called on
    a busy pool it will block until all active tasks are complete.

    The broadcast is achieved by blocking every task until all tasks have
    started. Every worker does the following:

    1. Let the primary task know this task has started
    2. Perform action
    3. Wait for all other tasks to start
    4. Finish

    Steps (1) and (3) are achieved using distributed Queues: step (1) is a
    non-blocking ``put`` and step (3) is a blocking ``get``.

    :param client: Dask client object
    :param action: Callable ``action(*args, **kwargs)``
    :param args: Ordered arguments to action
    :param kwargs: Named arguments to action
    """
    postfix = "-{:02x}".format(randint(0, 1 << 64))
    total_worker_threads = sum(client.ncores().values())
    q1 = Queue("q1" + postfix, client=client, maxsize=total_worker_threads)
    q2 = Queue("q2" + postfix, client=client, maxsize=total_worker_threads)
    ff = [
        client.submit(
            _bcast_action,
            q1,
            q2,
            i,
            action,
            args,
            kwargs,
            key="broadcast_action_{:04d}{}".format(i, postfix),
        )
        for i in range(total_worker_threads)
    ]
    tks = set()
    for _ in range(total_worker_threads):
        tks.add(q1.get())  # blocking
    assert len(tks) == total_worker_threads
    # At this point all workers have launched; allow them to continue.
    for i in range(total_worker_threads):
        q2.put(i)  # should not block
    # Block until all done and return the results.
    return [f.result() for f in ff]
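The worker-side helper _bcast_action is referenced above but not shown; a minimal sketch consistent with the four-step protocol in the docstring might look like this (the name comes from the snippet, the body is an assumption):

# Assumed implementation of _bcast_action; the original is not included above.
def _bcast_action(q1, q2, idx, action, args, kwargs):
    q1.put(idx)                       # (1) announce that this task has started
    result = action(*args, **kwargs)  # (2) perform the action
    q2.get()                          # (3) block until all tasks have started
    return result                     # (4) finish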
def map_wrapper(function_item, list_items, other_args=None):
    from dask.distributed import Client
    import dask.bag as db
    c = Client()
    # Number of workers, minus two kept in reserve.
    NCORES = len(c.ncores()) - 2
    b0 = db.from_sequence(list_items, npartitions=NCORES)
    if other_args is not None:
        list_items = list(db.map(function_item, b0, other_args).compute())
    else:
        list_items = list(db.map(function_item, b0).compute())
    return list_items
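A minimal call of the wrapper above (the square function is purely illustrative; note the wrapper assumes the default local cluster starts more than two workers, since it subtracts two from the worker count):

# Illustrative usage of map_wrapper: apply a function to each element in parallel.
def square(x):
    return x * x

print(map_wrapper(square, [1, 2, 3, 4]))  # -> [1, 4, 9, 16]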
def run_dask_compute(h5_main):
    raw_data = h5_main[()]
    # Earlier attempts, kept for reference:
    # cpu_cores = int(cpu_cores/8)
    # dask_raw_data = da.from_array(raw_data, chunks='auto')
    # cluster = LocalCluster(n_workers=cpu_cores/8)
    # client = Client(cluster, processes=True)
    # map = dask_raw_data.map_blocks(find_all_peaks, [20, 60], num_steps=30)
    # results = map.compute()
    client = Client(processes=False)
    dask_raw_data = client.scatter(raw_data)
    # Note: args and kwargs are passed as plain positional arguments here,
    # not unpacked with * and **.
    args = [[20, 60]]
    kwargs = {'num_steps': 30}
    L = client.submit(find_all_peaks, dask_raw_data, args, kwargs)
    dask_results = client.compute(L)
    cores = client.ncores()
    client.close()
    return cores
def run_dask_compute(h5_main, proc=True):
    raw_data = h5_main[()]
    # cpu_cores = int(cpu_cores/8)
    dask_raw_data = da.from_array(raw_data, chunks='auto')
    # cluster = LocalCluster(n_workers=cpu_cores/8)
    # client = Client(cluster, processes=True)
    # map = dask_raw_data.map_blocks(find_all_peaks, [20, 60], num_steps=30)
    # results = map.compute()
    client = Client(processes=proc)
    L = client.map(find_all_peaks, dask_raw_data, width_bounds=[20, 60], num_steps=30)
    dask_results = client.gather(L)
    cores = client.ncores()
    client.close()
    return dask_results
def main():
    # print('XGBOOST_BUILD_DOC is ' + os.environ['XGBOOST_BUILD_DOC'])
    parser = argparse.ArgumentParser("rapidssample")
    parser.add_argument("--data_dir", type=str, help="location of data")
    parser.add_argument("--num_gpu", type=int, help="Number of GPUs to use", default=1)
    parser.add_argument("--part_count", type=int, help="Number of data files to train against", default=2)
    parser.add_argument("--end_year", type=int, help="Year to end the data load", default=2000)
    parser.add_argument("--cpu_predictor", type=str, help="Flag to use CPU for prediction", default='False')
    parser.add_argument('-f', type=str, default='')  # added for notebook execution scenarios
    args = parser.parse_args()

    data_dir = args.data_dir
    num_gpu = args.num_gpu
    part_count = args.part_count
    end_year = args.end_year
    cpu_predictor = args.cpu_predictor.lower() in ('yes', 'true', 't', 'y', '1')

    if cpu_predictor:
        print('Training with CPUs requires num_gpu = 1')
        num_gpu = 1

    print('data_dir = {0}'.format(data_dir))
    print('num_gpu = {0}'.format(num_gpu))
    print('part_count = {0}'.format(part_count))
    # part_count = part_count + 1  # adding one because the usage below is not inclusive
    print('end_year = {0}'.format(end_year))
    print('cpu_predictor = {0}'.format(cpu_predictor))

    import subprocess
    cmd = "hostname --all-ip-addresses"
    process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    IPADDR = str(output.decode()).split()[0]

    cluster = LocalCUDACluster(ip=IPADDR, n_workers=num_gpu)
    client = Client(cluster)
    print(client.ncores())

    # To download data for this notebook, visit
    # https://rapidsai.github.io/demos/datasets/mortgage-data
    # and update the following paths accordingly.
    acq_data_path = "{0}/acq".format(data_dir)          # "/rapids/data/mortgage/acq"
    perf_data_path = "{0}/perf".format(data_dir)        # "/rapids/data/mortgage/perf"
    col_names_path = "{0}/names.csv".format(data_dir)   # "/rapids/data/mortgage/names.csv"
    start_year = 2000
    # end_year = 2000   # end_year is inclusive -- converted to parameter
    # part_count = 2    # the number of data files to train against -- converted to parameter

    client.run(initialize_rmm_pool)
    print(client.ncores())
    # NOTE: The ETL calculates additional features which are then dropped before
    # creating the XGBoost DMatrix. This can be optimized to avoid calculating
    # the dropped features.
print("Reading ...") t1 = datetime.datetime.now() gpu_dfs = [] gpu_time = 0 quarter = 1 year = start_year count = 0 while year <= end_year: for file in glob(os.path.join(perf_data_path + "/Performance_" + str(year) + "Q" + str(quarter) + "*")): if count < part_count: gpu_dfs.append(process_quarter_gpu(client, col_names_path, acq_data_path, year=year, quarter=quarter, perf_file=file)) count += 1 print('file: {0}'.format(file)) print('count: {0}'.format(count)) quarter += 1 if quarter == 5: year += 1 quarter = 1 wait(gpu_dfs) t2 = datetime.datetime.now() print("Reading time ...") print(t2-t1) print('len(gpu_dfs) is {0}'.format(len(gpu_dfs))) client.run(cudf._gdf.rmm_finalize) client.run(initialize_rmm_no_pool) client print(client.ncores()) dxgb_gpu_params = { 'nround': 100, 'max_depth': 8, 'max_leaves': 2**8, 'alpha': 0.9, 'eta': 0.1, 'gamma': 0.1, 'learning_rate': 0.1, 'subsample': 1, 'reg_lambda': 1, 'scale_pos_weight': 2, 'min_child_weight': 30, 'tree_method': 'gpu_hist', 'n_gpus': 1, 'distributed_dask': True, 'loss': 'ls', 'objective': 'gpu:reg:linear', 'max_features': 'auto', 'criterion': 'friedman_mse', 'grow_policy': 'lossguide', 'verbose': True } if cpu_predictor: print('Training using CPUs') dxgb_gpu_params['predictor'] = 'cpu_predictor' dxgb_gpu_params['tree_method'] = 'hist' dxgb_gpu_params['objective'] = 'reg:linear' else: print('Training using GPUs') print('Training parameters are {0}'.format(dxgb_gpu_params)) gpu_dfs = [delayed(DataFrame.from_arrow)(gpu_df) for gpu_df in gpu_dfs[:part_count]] gpu_dfs = [gpu_df for gpu_df in gpu_dfs] wait(gpu_dfs) tmp_map = [(gpu_df, list(client.who_has(gpu_df).values())[0]) for gpu_df in gpu_dfs] new_map = {} for key, value in tmp_map: if value not in new_map: new_map[value] = [key] else: new_map[value].append(key) del(tmp_map) gpu_dfs = [] for list_delayed in new_map.values(): gpu_dfs.append(delayed(cudf.concat)(list_delayed)) del(new_map) gpu_dfs = [(gpu_df[['delinquency_12']], gpu_df[delayed(list)(gpu_df.columns.difference(['delinquency_12']))]) for gpu_df in gpu_dfs] gpu_dfs = [(gpu_df[0].persist(), gpu_df[1].persist()) for gpu_df in gpu_dfs] gpu_dfs = [dask.delayed(xgb.DMatrix)(gpu_df[1], gpu_df[0]) for gpu_df in gpu_dfs] gpu_dfs = [gpu_df.persist() for gpu_df in gpu_dfs] gc.collect() wait(gpu_dfs) labels = None t1 = datetime.datetime.now() bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels, num_boost_round=dxgb_gpu_params['nround']) t2 = datetime.datetime.now() print("Training time ...") print(t2-t1) print('str(bst) is {0}'.format(str(bst))) print('Exiting script')
from dask.distributed import Client, LocalCluster

from jmetal.util.observer import ProgressBarObserver, VisualizerObserver
from jmetal.util.termination_criterion import StoppingByEvaluations
from pymsa.core.score import SumOfPairs, PercentageOfTotallyConservedColumns

from sequoya.algorithm.multiobjective.nsgaii import DistributedNSGAII
from sequoya.operator import SPXMSA, ShiftClosedGapGroups
from sequoya.problem import BAliBASE
from sequoya.util.solution import get_representative_set
from sequoya.util.visualization import MSAPlot

if __name__ == '__main__':
    # setup Dask client (web interface will be initialized at http://127.0.0.1:8787/workers)
    cluster = LocalCluster(n_workers=4, processes=True)
    client = Client(cluster)

    ncores = sum(client.ncores().values())
    print(f'{ncores} cores available')

    # creates the problem
    problem = BAliBASE(
        instance='BB20019',
        path='../resources',
        score_list=[SumOfPairs(), PercentageOfTotallyConservedColumns()])

    # creates the algorithm
    max_evaluations = 200000
    reference_point = [-175000, -1.35]

    algorithm = DistributedNSGAII(
        problem=problem,
import dask
import numpy as np

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('-scheduler', required=False)
    parser.add_argument('-expect-workers', type=int, default=0)
    args, unknown = parser.parse_known_args()
    if args.scheduler:
        from dask.distributed import Client
        client = Client(args.scheduler)
        if args.expect_workers > 0:
            while True:
                num_workers = len(client.ncores())
                if num_workers >= args.expect_workers:
                    break
                print('Client waiting for workers (have %s expect %s)'
                      % (num_workers, args.expect_workers), flush=True)
                import time
                time.sleep(5)


def encode_task_graph(graph):
    from task_bench_core import ffi, c
    return np.frombuffer(ffi.buffer(ffi.addressof(graph), ffi.sizeof(graph)), dtype=np.ubyte)


def decode_task_graph(graph_array):
    from task_bench_core import ffi, c
    return ffi.cast("task_graph_t *", graph_array.ctypes.data)[0]
import s3fs
import dask.dataframe as dd
import dask.distributed
import os
from time import ctime

s3url = os.environ['s3url']
schurl = os.environ['schurl']

print(ctime(), 'running daskclientapp.. hang tight...')
df = dd.read_csv(s3url, storage_options={'anon': True})

from dask.distributed import Client
client = Client(schurl)
print(ctime(), client.ncores())
print(ctime(), df.head())

# modify the below statement based on your s3 dataset
dfg = df.groupby('VendorID').agg({
    'passenger_count': 'count',
    'trip_distance': 'sum'
}).astype(int).reset_index().rename(columns={
    'passenger_count': 'Trip Count'
}).compute()
print(ctime(), dfg)
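The script above reads its configuration from the s3url and schurl environment variables; a sketch of launching it (the filename, dataset path, and scheduler address are all illustrative assumptions):

# Illustrative launcher: set the expected environment variables, then run the app.
import os
import subprocess

os.environ['s3url'] = 's3://example-bucket/trips.csv'   # hypothetical dataset path
os.environ['schurl'] = 'tcp://127.0.0.1:8786'           # hypothetical scheduler address
subprocess.run(['python', 'daskclientapp.py'], check=True)  # assumed script name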
class ProcessManager(GenericProcessManager):
    manager: "ProcessManager" = None

    @classmethod
    def getManager(cls) -> Optional["ProcessManager"]:
        return cls.manager

    @classmethod
    def initManager(cls, serverConfiguration: Dict[str, str]) -> "ProcessManager":
        if cls.manager is None:
            cls.manager = ProcessManager(serverConfiguration)
        return cls.manager

    def __init__(self, serverConfiguration: Dict[str, str]):
        self.config = serverConfiguration
        self.logger = EDASLogger.getLogger()
        self.num_wps_requests = 0
        self.scheduler_address = serverConfiguration.get("scheduler.address", None)
        self.submitters = []
        self.active = True
        if self.scheduler_address is not None:
            self.logger.info("Initializing Dask-distributed cluster with scheduler address: "
                             + self.scheduler_address)
            self.client = Client(self.scheduler_address, timeout=60)
        else:
            nWorkers = int(self.config.get("dask.nworkers", multiprocessing.cpu_count()))
            self.client = Client(LocalCluster(n_workers=nWorkers))
            self.scheduler_address = self.client.scheduler.address
            self.logger.info(
                f"Initializing local Dask cluster with {nWorkers} workers, "
                f"scheduler address = {self.scheduler_address}")
            self.client.submit(lambda x: edasOpManager.buildIndices(x), nWorkers)
        self.ncores = self.client.ncores()
        self.logger.info(f" ncores: {self.ncores}")
        self.scheduler_info = self.client.scheduler_info()
        self.workers: Dict = self.scheduler_info.pop("workers")
        self.logger.info(f" workers: {self.workers}")
        log_metrics = serverConfiguration.get("log.scheduler.metrics", False)
        if log_metrics:
            self.metricsThread = Thread(target=self.trackMetrics)
            self.metricsThread.start()

    def getCWTMetrics(self) -> Dict:
        metrics_data = {key: {} for key in [
            'user_jobs_queued', 'user_jobs_running', 'wps_requests',
            'cpu_ave', 'cpu_count', 'memory_usage', 'memory_available']}
        metrics = self.getProfileData()
        counts = metrics["counts"]
        workers = metrics["workers"]
        for key in ['tasks', 'processing', 'released', 'memory', 'saturated',
                    'waiting', 'waiting_data', 'unrunnable']:
            metrics_data['user_jobs_running'][key] = counts[key]
        for key in ['tasks', 'waiting', 'waiting_data', 'unrunnable']:
            metrics_data['user_jobs_queued'][key] = counts[key]
        for wId, wData in workers.items():
            worker_metrics = wData["metrics"]
            total_memory = wData["memory_limit"]
            memory_usage = worker_metrics["memory"]
            metrics_data['memory_usage'][wId] = memory_usage
            metrics_data['memory_available'][wId] = total_memory - memory_usage
            metrics_data['cpu_count'][wId] = wData["ncores"]
            metrics_data['cpu_ave'][wId] = worker_metrics["cpu"]
        return metrics_data

    def trackMetrics(self, sleepTime=1.0):
        isIdle = False
        self.logger.info(" ** TRACKING METRICS ** ")
        while self.active:
            metrics = self.getProfileData()
            counts = metrics["counts"]
            if counts['processing'] == 0:
                if not isIdle:
                    self.logger.info(" ** CLUSTER IS IDLE ** ")
                isIdle = True
            else:
                isIdle = False
                self.logger.info(f" METRICS: {metrics['counts']} ")
                workers = metrics["workers"]
                for key, value in workers.items():
                    self.logger.info(f" *** {key}: {value}")
                self.logger.info(f" HEALTH: {self.getHealth()}")
            time.sleep(sleepTime)

    def getWorkerMetrics(self):
        metrics = {}
        wkeys = ['ncores', 'memory_limit', 'last_seen', 'metrics']
        scheduler_info = self.client.scheduler_info()
        workers: Dict = scheduler_info.get("workers", {})
        for iW, worker in enumerate(workers.values()):
            metrics[f"W{iW}"] = {wkey: worker[wkey] for wkey in wkeys}
        return metrics

    def getDashboardAddress(self):
        stoks = self.scheduler_address.split(":")
        host_address = stoks[-2].strip("/")
        return f"http://{host_address}:8787"
    def getCounts(self) -> Dict:
        profile_address = f"{self.getDashboardAddress()}/json/counts.json"
        return requests.get(profile_address).json()

    def getHealth(self, mtype: str = "") -> str:
        profile_address = f"{self.getDashboardAddress()}/health"
        return requests.get(profile_address).text

    def getMetrics(self, mtype: str = "") -> Optional[Dict]:
        counts = self.getCounts()
        if counts['processing'] == 0:
            return None
        mtypes = mtype.split(",")
        metrics = {"counts": counts}
        if "processing" in mtypes:
            metrics["processing"] = self.client.processing()
        if "profile" in mtypes:
            metrics["profile"] = self.client.profile()
        return metrics

    def getProfileData(self, mtype: str = "") -> Dict:
        try:
            return {
                "counts": self.getCounts(),
                "workers": self.getWorkerMetrics()
            }
        except Exception as err:
            self.logger.error("Error in getProfileData")
            self.logger.error(traceback.format_exc())

    def term(self):
        self.active = False
        self.client.close()

    def runProcess(self, job: Job) -> EDASDataset:
        start_time = time.time()
        try:
            self.logger.info(
                f"Running workflow for requestId: {job.requestId}, scheduler: {self.scheduler_address}")
            result = edasOpManager.buildTask(job)
            self.logger.info("Completed EDAS workflow in time " + str(time.time() - start_time))
            return result
        except Exception as err:
            self.logger.error("Execution error: " + str(err))
            traceback.print_exc()

    def submitProcess(self, service: str, job: Job, resultHandler: ExecHandler):
        submitter: SubmissionThread = SubmissionThread(job, resultHandler)
        self.submitters.append(submitter)
        submitter.start()
fileset = {
    'DoubleMuon': [
        'root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_DoubleMuParked.root',
        'root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012C_DoubleMuParked.root',
    ],
    'ZZ to 4mu': [
        'root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/ZZTo4mu.root'
    ]
}

exe_args = {
    'client': client,
    'savemetrics': True,
    'schema': processor.NanoEvents,
    'align_clusters': True
}

# Wait until at least four workers have joined the cluster.
# (len(client.ncores()) counts workers, not cores, despite the message below.)
while len(client.ncores()) < 4:
    print('Waiting for more cores to spin up, currently there are {0} available...'.format(len(client.ncores())))
    print('Dask client info ->', client)
    time.sleep(10)

proc = MyProcessor()
hists = processor.run_uproot_job(fileset,
                                 treename="Events",
                                 processor_instance=proc,
                                 executor=processor.dask_executor,
                                 executor_args=exe_args)
print(hists)
def open():
    # Note: this function shadows the builtin open().
    scheduler = Client(schedulerAddress)
    # scheduler = Client('127.0.0.1:8786')
    print("Running on %d cores" % sum(scheduler.ncores().values()))
    return scheduler
default="ips.txt", help="location of the nodes") return parser.parse_args() if __name__ == '__main__': args = parse_args() # setup the game game, agents, ratings = game_setup(args.num_agents) # run the game matches on the cluster nodes = get_nodes(args.ip_file) print("Connecting to cluster scheduler {} with workers:".format(nodes[0])) client = Client(nodes[0] + ':8786') for worker, cores in client.ncores().items(): print("{:>35} {} cores".format(worker, cores)) client.upload_file('game.py') start = default_timer() matches = run_games(game, agents, args.num_matches, client) check_status(matches) print("Game run in {:.2f}".format(default_timer() - start)) # here we could do something with failed matches (errors) # run rating evaluations start = default_timer() compute_ratings(matches, ratings) print("Skills computed in {:.2f}".format(default_timer() - start))