import os

import dask.dataframe as dd
from dask.distributed import Client, get_client


def get_peeker():
    logger.info("getting new peeker")
    try:
        # Reattach to an already-running scheduler if its address is known
        scheduler_address = os.environ[SCH_ADDR_KEY]
        client = get_client(scheduler_address)
    except Exception as e:
        logger.info("starting new dask client", exception=str(e))
        client = Client()
        # Persist the new scheduler's address so later processes can reattach
        with open(".env", "w") as fp:
            fp.write(f"{SCH_ADDR_KEY}={client.scheduler.address}")
    n_part = sum(client.nthreads().values())
    logger.info("dask client acquired",
                dashboard=client.dashboard_link,
                scheduler=client.scheduler.address,
                n_threads=n_part)
    app_data = load_entire_app_data()
    # One dataframe partition per available thread
    ddf = dd.from_pandas(app_data, npartitions=n_part).persist()
    cat_filt_cols = app_discrete_cols
    num_filt_cols = [chain_len_col]
    return FootballPeeker(ddf, cat_filt_cols, num_filt_cols)
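The point of writing the scheduler address to .env is that a separate process can later reattach to the same scheduler instead of spawning a second cluster. A minimal sketch of that reattach side, assuming the same .env layout and a hypothetical value for SCH_ADDR_KEY:

import os
from dask.distributed import Client

SCH_ADDR_KEY = "DASK_SCHEDULER_ADDRESS"  # assumed key name

# Read back the address that get_peeker() wrote to .env ...
with open(".env") as fp:
    addr = dict(line.strip().split("=", 1) for line in fp)[SCH_ADDR_KEY]

# ... and connect to the existing scheduler rather than starting a new one.
client = Client(addr)
print(client.scheduler.address, sum(client.nthreads().values()))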
import time

from dask.distributed import wait

# Just wait for a valid worker, not an empty worker.
# The safest approach is to track futures!
run_history = []
futures = []

# Submit all jobs
counter = 0
while not dispatcher.is_complete():
    # Are there workers available?
    # nthreads() dynamically queries the scheduler for its capacity.
    # On a local cluster, capacity can be increased by scale(); on a
    # distributed one, by starting extra workers, e.g.:
    #   dask-worker 'tcp://127.0.0.1:45739' --nprocs 1 --nthreads 1
    total_compute_power = sum(client.nthreads().values())
    if len(futures) < total_compute_power:
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print(f"Scheduling job {counter} at {now}")
        counter += 1
        # submit a new job
        run_info = dispatcher.get_next_run()
        futures.append(client.submit(fit_and_score, run_info))

    # Even if the cluster was not under full load, check whether anything
    # has completed since the last iteration, then loop around immediately
    # to see if more jobs can be submitted
    try:
        done_futures = wait(futures, timeout=0,
                            return_when='FIRST_COMPLETED').done
    except Exception:
        # wait() raises if the timeout expires before any future finishes
        done_futures = set()
    # Assumed completion of the truncated snippet: collect finished runs
    # and free their worker slots
    for future in done_futures:
        run_history.append(future.result())
        futures.remove(future)
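For this submit-and-poll pattern, dask's as_completed offers a cleaner alternative: seed it with one future per thread, then add a replacement each time a result arrives. A minimal sketch, with a hypothetical fit_and_score and a plain list standing in for the dispatcher:

from dask.distributed import Client, as_completed


def fit_and_score(run_info):  # hypothetical stand-in for the real task
    return run_info * 2


client = Client(n_workers=2, threads_per_worker=1)
pending = list(range(20))  # hypothetical queue of run_infos
capacity = sum(client.nthreads().values())

# Seed one future per available thread ...
seq = as_completed([client.submit(fit_and_score, pending.pop())
                    for _ in range(min(capacity, len(pending)))])

# ... then top the cluster back up each time a future finishes.
run_history = []
for future in seq:
    run_history.append(future.result())
    if pending:
        seq.add(client.submit(fit_and_score, pending.pop()))

client.close()
print(len(run_history))  # 20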
class DaskParallelRunner(BaseRunner):
    """Interface to submit and collect a job in a distributed fashion.

    DaskParallelRunner is intended to comply with the bridge design pattern.
    Nevertheless, to reduce the amount of code within single-vs-parallel
    implementations, DaskParallelRunner wraps a BaseRunner object which
    is then executed in parallel on n_workers.

    This class then is constructed by passing a BaseRunner that implements
    a run() method, and is capable of doing so in a serial fashion. Then,
    this wrapper class called DaskParallelRunner uses dask to initialize
    N instances of BaseRunner that actively wait for a RunInfo to produce a
    RunValue object.

    To be more precise, the work model is then:
    1- The smbo.intensifier dictates "what" to run (a configuration/instance/seed)
       via a RunInfo object.
    2- A tae_runner takes this RunInfo object and launches the task via
       tae_runner.submit_run(). In the case of DaskParallelRunner, n_workers
       receive a pickle-object of DaskParallelRunner.single_worker, each with a
       run() method coming from DaskParallelRunner.single_worker.run()
    3- RunInfo objects are run in a distributed fashion, and their results are
       available locally to each worker. Such results are collected by
       DaskParallelRunner.get_finished_runs() and then passed to the SMBO.
    4- Exceptions are also locally available to each worker and need to be
       collected.

    Dask works with Future objects, which are managed via the
    DaskParallelRunner.client.

    Attributes
    ----------
    results
    ta
    stats
    run_obj
    par_factor
    cost_for_crash
    abort_on_first_run_crash
    n_workers
    futures
    client

    Parameters
    ----------
    single_worker: BaseRunner
        A runner to run in a distributed fashion
    n_workers: int
        Number of workers to use for distributed run. Will be ignored if
        ``dask_client`` is not ``None``.
    patience: int
        How long to wait for workers to become available if one fails
    output_directory: str, optional
        If given, this will be used for the dask worker directory and for
        storing server information. If a dask client is passed, it will only
        be used for storing server information as the worker directory must
        be set by the program/user starting the workers.
    dask_client: dask.distributed.Client
        User-created dask client, can be used to start a dask cluster and
        then attach SMAC to it.
""" def __init__( self, single_worker: BaseRunner, n_workers: int, patience: int = 5, output_directory: typing.Optional[str] = None, dask_client: typing.Optional[dask.distributed.Client] = None, ): super(DaskParallelRunner, self).__init__( ta=single_worker.ta, stats=single_worker.stats, run_obj=single_worker.run_obj, par_factor=single_worker.par_factor, cost_for_crash=single_worker.cost_for_crash, abort_on_first_run_crash=single_worker.abort_on_first_run_crash, ) # The single worker, which is replicated on a need # basis to every compute node self.single_worker = single_worker self.n_workers = n_workers # How much time to wait for workers to be available self.patience = patience self.output_directory = output_directory # Because a run() method can have pynisher, we need to prevent the multiprocessing # workers to be instantiated as demonic - this cannot be passed via worker_kwargs dask.config.set({'distributed.worker.daemon': False}) if dask_client is None: self.close_client_at_del = True self.client = Client(n_workers=self.n_workers, processes=True, threads_per_worker=1, local_directory=output_directory) if self.output_directory: self.scheduler_file = os.path.join(self.output_directory, '.dask_scheduler_file') self.client.write_scheduler_file( scheduler_file=self.scheduler_file) else: self.close_client_at_del = False self.client = dask_client self.futures = [] # type: typing.List[Future] self.scheduler_info = self.client._get_scheduler_info() def submit_run(self, run_info: RunInfo) -> None: """This function submits a configuration embedded in a run_info object, and uses one of the workers to produce a result locally to each worker. The execution of a configuration follows this procedure: 1- SMBO/intensifier generates a run_info 2- SMBO calls submit_run so that a worker launches the run_info 3- submit_run internally calls self.run(). it does so via a call to self.run_wrapper() which contains common code that any run() method will otherwise have to implement, like capping check. Child classes must implement a run() method. All results will be only available locally to each worker, so the main node needs to collect them. Parameters ---------- run_info: RunInfo An object containing the configuration and the necessary data to run it """ # Check for resources or block till one is available if not self._workers_available(): wait(self.futures, return_when='FIRST_COMPLETED').done self._extract_completed_runs_from_futures() # In code check to make sure that there are resources if not self._workers_available(): warnings.warn( "No workers are available. This could mean workers crashed" "Waiting for new workers...") time.sleep(self.patience) if not self._workers_available(): raise ValueError( "Tried to execute a job, but no worker was " "available. This likely means that a worker crashed " "or no workers were properly configured.") # At this point we can submit the job self.futures.append( self.client.submit(self.single_worker.run_wrapper, run_info)) def get_finished_runs( self) -> typing.List[typing.Tuple[RunInfo, RunValue]]: """This method returns any finished configuration, and returns a list with the results of exercising the configurations. This class keeps populating results to self.results until a call to get_finished runs is done. In this case, the self.results list is emptied and all RunValues produced by running run() are returned. 
        Returns
        -------
        List[RunInfo, RunValue]: A list of RunValues (and respective
            RunInfo), that is, the results of executing the submitted
            configurations
        """
        # Proactively see if more configs have finished
        self._extract_completed_runs_from_futures()

        results_list = []
        while self.results:
            results_list.append(self.results.pop())
        return results_list

    def _extract_completed_runs_from_futures(self) -> None:
        """A run is over when a future's done() is True. This function
        collects the completed futures and moves them from self.futures to
        self.results.

        We make sure futures never exceed the capacity of the scheduler.
        """
        # In-code check to make sure we don't exceed resource allocation
        if len(self.futures) > sum(self.client.nthreads().values()):
            warnings.warn("More running jobs than resources available. "
                          "Should not have more futures/runs in remote "
                          "workers than the number of workers. This could "
                          "mean a worker crashed and was not able to be "
                          "recovered by dask.")

        # A future is removed from the list of futures as an indication
        # that a worker is available to take in an extra job
        done_futures = [f for f in self.futures if f.done()]
        for future in done_futures:
            self.results.append(future.result())
            self.futures.remove(future)

    def wait(self) -> None:
        """SMBO/intensifier might need to wait for runs to finish before
        making a decision. This method waits until at least one run
        completes.
        """
        if self.futures:
            wait(self.futures, return_when='FIRST_COMPLETED')

    def pending_runs(self) -> bool:
        """Whether or not there are configs still running. Generally, if the
        runner is serial, launching a run instantly returns its result. On
        parallel runners, there might be pending configurations to complete.
        """
        # If there are futures available, it translates
        # to runs still not finished/processed
        return len(self.futures) > 0

    def run(
        self,
        config: Configuration,
        instance: str,
        cutoff: typing.Optional[float] = None,
        seed: int = 12345,
        budget: typing.Optional[float] = None,
        instance_specific: str = "0",
    ) -> typing.Tuple[StatusType, float, float, typing.Dict]:
        """This method only complies with the abstract parent class. In the
        parallel case, we call the single worker's run() method.

        Parameters
        ----------
        config : Configuration
            dictionary param -> value
        instance : string
            problem instance
        cutoff : float, optional
            Wallclock time limit of the target algorithm. If no value is
            provided no limit will be enforced.
        seed : int
            random seed
        budget : float, optional
            A positive, real-valued number representing an arbitrary limit
            to the target algorithm. Handled by the target algorithm
            internally.
        instance_specific : str
            instance specific information (e.g., domain file or solution)

        Returns
        -------
        status : StatusType
            enum of StatusType (int) {SUCCESS, TIMEOUT, CRASHED, ABORT}
        cost : float
            cost/regret/quality (float) (None, if not returned by TA)
        runtime : float
            runtime (None if not returned by TA)
        additional_info : dict
            all further additional run information
        """
        return self.single_worker.run(
            config=config,
            instance=instance,
            cutoff=cutoff,
            seed=seed,
            budget=budget,
            instance_specific=instance_specific,
        )

    def num_workers(self) -> int:
        """Total number of workers available.
        This number is dynamic, as more resources can be allocated.
        """
        return sum(self.client.nthreads().values())

    def _workers_available(self) -> bool:
        """Query if there are workers available, which means that there are
        resources to launch a dask job.
        """
        total_compute_power = sum(self.client.nthreads().values())
        return len(self.futures) < total_compute_power

    def __del__(self) -> None:
        """Make sure that when this object gets deleted, the client is
        terminated. This is only done if the client was created by the dask
        runner.
        """
        if self.close_client_at_del:
            self.client.close()
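Stripped of the SMAC plumbing, the runner's core cycle is plain dask: cap in-flight futures at the thread count, wait for the first completion when full, then harvest done futures. A minimal sketch, with a hypothetical run_wrapper standing in for the wrapped BaseRunner:

from dask.distributed import Client, wait


def run_wrapper(run_info):  # hypothetical stand-in for single_worker.run_wrapper
    return run_info, run_info ** 2  # a (RunInfo, RunValue)-like pair


client = Client(n_workers=2, threads_per_worker=1)
futures, results = [], []

for run_info in range(6):
    # Mirror _workers_available(): allow one in-flight future per thread.
    while len(futures) >= sum(client.nthreads().values()):
        wait(futures, return_when='FIRST_COMPLETED')
        # Mirror _extract_completed_runs_from_futures().
        for f in [f for f in futures if f.done()]:
            results.append(f.result())
            futures.remove(f)
    futures.append(client.submit(run_wrapper, run_info))

# Drain whatever is still running.
results.extend(f.result() for f in wait(futures).done)
client.close()
print(results)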
import numpy as np
import pystare as ps  # assumed: the STARE spatial-indexing library
from timeit import default_timer as timer
from dask.distributed import Client


def hex16(sid):
    # Assumed helper: format a spatial id as a 16-digit hex string.
    return "0x%016x" % sid


def main():
    # client = Client(processes=False)  # threads?
    client = Client()

    size = 10000000
    shards = 12
    shape = [size]
    lat = np.random.rand(size) * 180.0 - 90.0
    lon = np.random.rand(size) * 360.0 - 180.0
    resolution_ = 8
    resolution = np.full(shape, resolution_, dtype=np.int64)

    print('')
    # Serial baseline
    serial_start = timer()
    s_sids = ps.from_latlon(lat, lon, resolution_)
    s_sidsstr = [hex16(s_sids[i]) for i in range(len(s_sids))]
    serial_end = timer()
    print('time s_sids: ', serial_end - serial_start)

    def w_from_latlon(llr):
        # Unpack one (lat, lon, resolution) shard
        return ps.from_latlon(llr[0], llr[1], int(llr[2][0]))

    dask_start = timer()
    shard_size = int(size / shards)
    shard_bins = np.arange(shards + 1) * shard_size
    shard_bins[-1] = size

    lat_shards = [lat[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)]
    lon_shards = [lon[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)]
    res_shards = [resolution[shard_bins[i]:shard_bins[i + 1]]
                  for i in range(shards)]
    llr_shards = [[lat_shards[i], lon_shards[i], res_shards[i]]
                  for i in range(shards)]

    ## future = client.submit(func, big_data)    # bad
    ##
    ## big_future = client.scatter(big_data)     # good
    ## future = client.submit(func, big_future)  # good
    big_future = client.scatter(llr_shards)
    sid = client.map(w_from_latlon, big_future)  # futures

    sid_cat = np.concatenate([i.result() for i in sid])
    sidsstr = [hex16(sid_cat[i]) for i in range(len(sid_cat))]
    dask_end = timer()

    sids = sid_cat
    print('')
    # Spot-check 20 random entries against the serial results
    for i in np.array(np.random.rand(20) * size, dtype=np.int64):
        print("%09i" % i, sidsstr[i], s_sidsstr[i], ' ', sids[i] - s_sids[i])
    print('')
    print('dask total threads: ', sum(client.nthreads().values()))
    print('size:   ', size)
    print('shards: ', shards)
    print('')
    print('time sids:   ', dask_end - dask_start)
    print('time s_sids: ', serial_end - serial_start)
    print('parallel speed up: ',
          (serial_end - serial_start) / (dask_end - dask_start))

    client.close()
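One collection detail worth noting: np.concatenate([i.result() for i in sid]) fetches shard results one round trip at a time, while client.gather pulls them concurrently. A minimal sketch of the same scatter/map/gather shape, with a trivial stand-in for the STARE call:

import numpy as np
from dask.distributed import Client


def square_chunk(chunk):  # hypothetical stand-in for w_from_latlon
    return chunk ** 2


client = Client(n_workers=2, threads_per_worker=1)
chunks = np.array_split(np.arange(1_000_000), 12)

# Scatter the shards once, map over the resulting futures ...
futures = client.map(square_chunk, client.scatter(chunks))
# ... and fetch every result in a single concurrent gather.
out = np.concatenate(client.gather(futures))

client.close()
print(out[:5])  # [ 0  1  4  9 16]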
}

# initialize the random seed generator
random_generator = np.random.default_rng(RAND_SEED)
all_seeds = random_generator.choice(MAX_SEED, TOTAL_RUN, replace=False)
my_seed_generator = list(runautopilots.chunks(all_seeds, CHUNK_SIZE))
ndebug = len(DEBUG_NUMS)
my_seed_generator[0][0:ndebug] = DEBUG_NUMS

# initialize best list
best_list = runautopilots.reset_best_simulations(NUM_BEST)

# if parallel, initialize client
if not SERIES:
    client = Client()
    ncores = sum(client.ncores().values())
    nthreads = sum(client.nthreads().values())
    runtype = f"Parallel with [Cores:{ncores:.0f}] x [Threads:{nthreads:.0f}]"
else:
    runtype = "Series"

# needed for each autopilot
my_mission = mission.mission(**class_params["mission_params"])

t_start = time.time()
if not DEBUG:
    # initialize autopilot list
    ap_template = autopilot.ap_nn(my_mission.survey_lines, **autopilot_params)
    all_autopilot_list = []
# python local.py

def hello_world(n):
    import time
    time.sleep(1)
    return "Hello world " + str(n)


if __name__ == '__main__':
    from dask.distributed import Client, LocalCluster
    import time
    import numpy as np

    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
    client = Client(cluster)
    print(cluster)
    print(client.nthreads())

    start = time.time()
    res = client.map(hello_world, np.arange(36))
    for r in res:
        print(r.result())
    client.close()
    print(time.time() - start)
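With four single-threaded workers, the 36 one-second tasks above should finish in roughly nine seconds. If that is too slow, a LocalCluster can be grown at runtime, which is the scale() route the earlier polling snippet's comments allude to. A minimal sketch:

from dask.distributed import Client, LocalCluster

if __name__ == '__main__':
    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
    client = Client(cluster)
    print(sum(client.nthreads().values()))  # 4 task slots

    cluster.scale(8)            # request four more workers
    client.wait_for_workers(8)  # block until they have registered
    print(sum(client.nthreads().values()))  # 8 task slots

    client.close()
    cluster.close()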