def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())
    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # pure=False because this function has potential side effects
        pure=False,
    )
    client.gather(futures)
    print('Shutting down dask workers')
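# A minimal, runnable sketch (not part of the original source) of why
# pure=False matters for side-effecting tasks: with pure=True (the default),
# Dask deduplicates submissions that share a function and arguments, so an
# impure task may silently run only once.
from dask.distributed import Client

def log_once(msg):
    print(msg)  # side effect
    return msg

if __name__ == '__main__':
    demo_client = Client(processes=False)  # in-process cluster, for illustration only
    a = demo_client.submit(log_once, 'hello', pure=True)
    b = demo_client.submit(log_once, 'hello', pure=True)
    assert a.key == b.key   # deduplicated: a single task runs
    c = demo_client.submit(log_once, 'hello', pure=False)
    d = demo_client.submit(log_once, 'hello', pure=False)
    assert c.key != d.key   # two distinct tasks run
    demo_client.close()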
def main():
    n_mutation = 100
    client = Client('scheduler:8786')
    futures = client.map(initialize_network, range(n_mutation))
    results = client.gather(futures)
    results.sort(key=lambda x: -x[1])
    truncated = [x[0] for x in results[:3]]
    futures = []
    for i, seed in enumerate(truncated):
        name = 'top-{}'.format(i)
        futures.append(client.submit(initialize_network, seed, store=True, name=name))
    results = client.gather(futures)
    print(results, flush=True)
    for g in range(10):
        futures = []
        for seed in range(n_mutation):
            futures.append(client.submit(update_network, seed, g + 1))
        results = client.gather(futures)
        results.sort(key=lambda x: -x[1])
        truncated = [x[0] for x in results[:3]]
        futures = []
        for i, seed in enumerate(truncated):
            name = 'top-{}'.format(i)
            futures.append(client.submit(update_network, seed, g + 1, store=True, name=name))
        results = client.gather(futures)
        print(results, flush=True)
def main():
    """Run two stages of mock dask tasks against the configured scheduler."""
    host = os.getenv('DASK_SCHEDULER_HOST', default='localhost')
    port = os.getenv('DASK_SCHEDULER_PORT', default=8786)
    print(host, port)
    client = Client('{}:{}'.format(host, port))
    # client.run(init_logging)
    # client.run_on_scheduler(init_logging)

    # Run some mock functions and gather a result
    data = client.map(print_listdir, range(10))
    future = client.submit(print_values, data)
    progress(future)
    print('')
    result = client.gather(future)
    print(result)

    # Run a second stage which runs some additional processing.
    print('here A')
    data_a = client.map(set_value, range(100))
    print('here B')
    data_b = client.map(square, data_a)
    print('here C')
    data_c = client.map(neg, data_b)
    print('here D')
    # Submit a function application to the scheduler
    total = client.submit(sum, data_c)
    print('here E')
    progress(total)
    print(total.result())
    print('here F')
def run_simulations_dask(clearance_heights, xgaps, Ds, tilts, kwargs):
    # Create client
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over all input combinations
    futures = []
    for clearance_height in clearance_heights:
        for xgap in xgaps:
            for tilt in tilts:
                for D in Ds:
                    futures.append(client.submit(simulate_single,
                                                 clearance_height=clearance_height,
                                                 xgap=xgap,
                                                 tilt=tilt,
                                                 D=D,
                                                 **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    print('FINISHED!')
    return res
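# A compact alternative sketch (an editorial assumption, not from the source):
# the nested loops above enumerate a full parameter grid, which
# itertools.product expresses in one comprehension. `simulate_single` is the
# same worker function used above.
from itertools import product

def submit_grid(client, clearance_heights, xgaps, tilts, Ds, **kwargs):
    futures = [
        client.submit(simulate_single, clearance_height=ch, xgap=xgap,
                      tilt=tilt, D=D, **kwargs)
        for ch, xgap, tilt, D in product(clearance_heights, xgaps, tilts, Ds)
    ]
    return client.gather(futures)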
def handle(self, *args, **options):
    # Unpack variables
    name = options['name']
    model = options['model']
    segmentation = options['segmentation']
    spatial_aggregation = options['spatial_aggregation']
    categorical_variables = options['categorical_variables']
    scheduler_file = options['scheduler']

    # datacube query
    gwf_kwargs = {k: options[k] for k in ['product', 'lat', 'long', 'region']}
    iterable = gwf_query(**gwf_kwargs)

    # Start cluster and run
    client = Client(scheduler_file=scheduler_file)
    client.restart()
    C = client.map(predict_object, iterable, pure=False,
                   model_name=model,
                   segmentation_name=segmentation,
                   categorical_variables=categorical_variables,
                   aggregation=spatial_aggregation,
                   name=name)
    result = client.gather(C)
    print('Successfully ran prediction on %d tiles' % sum(result))
    print('%d tiles failed' % result.count(False))
def test_insert(pod):
    # Write with workers
    label = "my_label"
    repo = Repo(pod=pod)
    # Create collection and label
    collection = repo.create_collection(schema, "my_collection")
    token = pod.token
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    with timeit(f"\nWRITE ({pod.protocol})"):
        fut = client.map(insert, args)
        assert sum(client.gather(fut)) == 10_519_200
    client.close()
    cluster.close()

    # Merge everything and read series
    with timeit(f"\nMERGE ({pod.protocol})"):
        collection.merge()
    with timeit(f"\nREAD ({pod.protocol})"):
        series = collection / label
        df = series["2015-01-01":"2015-01-02"].df()
        assert len(df) == 1440
        df = series["2015-12-31":"2016-01-02"].df()
        assert len(df) == 2880
def rank_populations(self, top: float = 0.5):
    """Given all the populations, rank them according to the given
    fitness function

    Inputs:
    =======
    populations (List): populations to evaluate
    top (float): fraction of top populations to return

    Outputs:
    ========
    best_populations (List): top populations
    """
    client = Client()
    client_input = [(self.data, p, self.yield_column) for p in self.populations]
    futures = client.map(GeneticAlgorithm.evaluate_fitness, client_input)
    ranking = client.gather(futures)
    client.close()

    # return top performing populations
    top_n = int(top * len(self.populations))
    return [self.populations[i] for i in argsort(ranking)[-top_n:]]
def DASK_batch_mult(matrix_input, vector_input, workers, batch_size,
                    input_size, output_channels):
    client = Client(n_workers=workers)
    results = []
    batch_no = matrix_input.shape[0] // batch_size
    for i in range(batch_no):
        batch = client.scatter(matrix_input[i * batch_size:(i + 1) * batch_size])
        results.append(client.submit(convolution_mean, batch, vector_input,
                                     batch_size, vector_input.shape[0]))
    wait(results)
    data = client.gather(results)
    out_tensor = np.empty((batch_size * batch_no, output_channels, input_size, input_size))
    for i in range(batch_no):
        out_tensor[i * batch_size:(i + 1) * batch_size] = data[i].reshape(
            batch_size, output_channels, input_size, input_size)
    client.shutdown()
    return out_tensor
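# A stand-alone sketch (an editorial assumption, not from the source): scatter
# is also useful for the operand shared by every batch. Broadcasting the
# shared array once and passing the resulting future to each submit avoids
# re-serializing the same data for every task.
import numpy as np
from dask.distributed import Client

def scatter_shared_demo():
    client = Client(n_workers=2, threads_per_worker=1)
    shared = client.scatter(np.ones(1_000_000), broadcast=True)  # one copy per worker
    futures = [client.submit(np.dot, np.full(1_000_000, float(i)), shared)
               for i in range(4)]
    results = client.gather(futures)
    client.close()
    return results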
class DaskParallelRunner(object):
    """Run the simulations using dask.distributed on a cluster.

    This requires some set up on the cluster (see the dask.distributed
    documentation).

    TO BE DOCUMENTED.
    """

    def __init__(self, client, chunk=10):
        if isinstance(client, str):
            from dask.distributed import Client
            self.client = Client(client)
        else:
            self.client = client
        self.chunk = chunk

    def __call__(self, function, argument_list):
        def function_with_single_numerical_threads(args):
            lib.set_max_numerical_threads(1)
            return function(*args)

        # Split the argument list into chunks of size `self.chunk` and submit
        # one map call per chunk
        argument_list = list(argument_list)
        n = self.chunk
        futures = []
        for i in range(0, len(argument_list), n):
            args = argument_list[i:i + n]
            futures += self.client.map(function_with_single_numerical_threads, list(args))
        results = self.client.gather(futures, direct=False)
        return results
def get_words(data):
    '''Find all high-frequency words in a given column of strings (or other
    data whose elements are all strings). A distributed client `c` is used.

    :param data:
    :type : pd.Series
    '''
    assert isinstance(data, pd.Series)
    assert all(isinstance(i, str) for i in data)
    from dask.distributed import Client
    c = Client()
    lines = list(data)
    tasks = c.map(word_frequency, lines)
    allDicts = c.gather(tasks)
    # Merge the per-line counts, summing the frequency of each word
    allDict = {}
    for dic in allDicts:
        for key, count in dic.items():
            allDict[key] = allDict.get(key, 0) + count
    words = pd.Series(list(allDict.keys()), index=list(allDict.values()))
    words = words.sort_index()
    threshold = int(words.shape[0] / 50)
    words = words.loc[threshold:]
    c.close()
    return words
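# A shorter alternative sketch (an editorial assumption, not from the source):
# collections.Counter expresses the merge step above more directly, summing
# per-line word counts key-wise into a single mapping.
from collections import Counter

def merge_counts(dicts):
    total = Counter()
    for d in dicts:
        total.update(d)  # adds counts key-wise
    return total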
def run_simulations_dask(tilts, kwargs):
    # Create client
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []
    for tilt in tilts:
        futures.append(client.submit(simulate_single, tilt=tilt, **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    print('FINISHED!')
    return res
def handle(self, *args, **options):
    # Unpack variables
    model_id = options['model_id']
    out_dir = options['out_dir']

    # Create output dir if it does not exist
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # datacube query
    gwf_kwargs = {k: options[k] for k in ['product', 'lat', 'long', 'region']}
    iterable = gwf_query(**gwf_kwargs)

    # Start cluster and run
    client = Client()
    client.restart()
    C = client.map(predict_pixel_tile, iterable,
                   model_id=model_id, outdir=out_dir)
    filename_list = client.gather(C)
    print(filename_list)
def load_data_parallel(data_path, num_processes, image_variable="abi",
                       count_variable="flash_counts", time_variable="time"):
    cluster = LocalCluster(n_workers=num_processes, threads_per_worker=1)
    client = Client(cluster)
    data_files = sorted(glob(join(data_path, "*.nc")))
    data_jobs = []
    for data_file in data_files:
        data_jobs.append(client.submit(load_single_data_file, data_file,
                                       image_variable=image_variable,
                                       count_variable=count_variable,
                                       time_variable=time_variable))
    wait(data_jobs)
    data_results = client.gather(data_jobs)
    all_images = np.concatenate([d[0] for d in data_results])
    all_counts = np.concatenate([d[1] for d in data_results])
    all_time = pd.DatetimeIndex(np.concatenate([d[2] for d in data_results]))
    client.close()
    cluster.close()
    del client
    del cluster
    return all_images, all_counts, all_time
class DaskHandler(IProcessingHandler):
    """This class wraps all Dask related functions."""

    def __init__(self, number_of_workers, class_cb: Callable, brain_class,
                 worker_log_level=logging.WARNING):
        super().__init__(number_of_workers)
        self._client: Optional[Client] = None
        self._cluster: Optional[LocalCluster] = None
        self.class_cb = class_cb
        self.brain_class = brain_class
        self.worker_log_level = worker_log_level

    def init_framework(self):
        if self._client:
            raise RuntimeError("Dask client already initialized.")
        # threads_per_worker must be one because atari-env is not thread-safe,
        # and since we lower the thread count from the default we must
        # increase the number of workers.
        self._cluster = LocalCluster(processes=True, asynchronous=False,
                                     threads_per_worker=1,
                                     silence_logs=self.worker_log_level,
                                     n_workers=self.number_of_workers,
                                     memory_pause_fraction=False,
                                     lifetime='1 hour',
                                     lifetime_stagger='5 minutes',
                                     lifetime_restart=True,
                                     interface="lo")
        self._client = Client(self._cluster)
        self._client.register_worker_plugin(
            _CreatorPlugin(self.class_cb, self.brain_class), name="creator-plugin")
        logging.info("Dask dashboard available at port: "
                     + str(self._client.scheduler_info()["services"]["dashboard"]))

    def map(self, func, *iterable):
        if not self._client:
            raise RuntimeError("Dask client not initialized. "
                               "Call \"init_framework\" before calling \"map\"")
        return self._client.gather(self._client.map(func, *iterable))

    def cleanup_framework(self):
        self._client.shutdown()
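# A minimal sketch of a worker plugin (an editorial assumption: _CreatorPlugin
# itself is not shown above). register_worker_plugin ships an instance to
# every current and future worker and calls setup() on each.
from dask.distributed import WorkerPlugin

class InitPlugin(WorkerPlugin):
    def __init__(self, factory):
        self.factory = factory  # must be picklable: it is shipped to workers

    def setup(self, worker):
        worker.shared_object = self.factory()  # runs once per worker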
def run_simulations_dask(xgaps, numpanelss, sensorsxs, kwargs):
    # Create client
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []
    for numpanels in numpanelss:
        for xgap in xgaps:
            for ii in sensorsxs:
                futures.append(client.submit(simulate_single, xgap=xgap,
                                             numpanels=numpanels, sensorx=ii,
                                             **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    print('FINISHED!')
    return res
def run_simulations_dask(daylist, posxs, moduleWiths, kwargs):
    # Create client
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []
    for daydate in daylist:
        for posx in posxs:
            for moduleWith in moduleWiths:
                futures.append(client.submit(simulate_single, daydate=daydate,
                                             posx=posx, moduleWith=moduleWith,
                                             **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    print('FINISHED!')
    return res
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())
    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # pure=False because this function has potential side effects
        pure=False,
    )
    run_id = client.gather(futures)
    print(f'Train run id: {run_id}')
def start_futures():
    t = time()
    isins = get_isins()
    client = Client('127.0.0.1:8786')
    data = client.map(load_data, isins)
    params_a = client.map(get_param, data, ['param_a'] * len(isins))
    params_b = client.map(get_param, data, ['param_b'] * len(isins))
    result_a = client.map(task_a, isins, params_a, params_b)
    group_args = list(chain(*zip(isins, result_a, params_b)))
    result_group = client.submit(task_group_alter, *group_args)
    result_b = client.map(task_b, isins, params_b, [result_group] * len(isins))
    result_c = client.map(task_c, isins, params_b)
    result = client.gather([result_group] + result_a + result_b + result_c)
    total = time() - t
    print(total)
    print(len(result))
    with open('/Users/vladimirmarunov/git/dask-test/res.txt', 'w') as f:
        f.write('{}\n'.format(total))
        json.dump(result, f, indent=4)
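# A minimal, runnable sketch (not from the source) of the chaining pattern
# above: futures returned by client.map can be passed straight into later
# map/submit calls; Dask builds the dependency graph and keeps intermediate
# data on the workers rather than round-tripping it through the client.
from dask.distributed import Client

def _inc(x):
    return x + 1

def _add(x, y):
    return x + y

if __name__ == '__main__':
    demo_client = Client(processes=False)
    xs = demo_client.map(_inc, range(5))        # stage 1
    ys = demo_client.map(_add, xs, [10] * 5)    # stage 2 consumes stage-1 futures
    total = demo_client.submit(sum, ys)         # reduction over a list of futures
    print(demo_client.gather(total))            # 65
    demo_client.close()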
def main(args):
    config_file = args.config_file

    # Configure on cluster
    if config_file:
        with open(config_file, 'r') as stream:
            inp = yaml.safe_load(stream)
        cores = inp['jobqueue']['slurm']['cores']
        memory = inp['jobqueue']['slurm']['memory']
        jobs = inp['jobqueue']['slurm']['jobs']
        cluster = SLURMCluster(
            cores=cores,
            memory=memory,
        )
        cluster.scale(jobs=jobs)
    # Configure locally
    else:
        cluster = LocalCluster()

    client = Client(cluster)
    raised_futures = client.map(sleep_more, range(100))
    progress(raised_futures)
    raised = client.gather(raised_futures)
    print('\n', raised)
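# Shape of the expected YAML config, inferred from the keys read above
# (the values here are made-up examples):
#   jobqueue:
#     slurm:
#       cores: 4
#       memory: 16GB
#       jobs: 10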
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Configuration yaml file")
    parser.add_argument("-p", "--proc", type=int, default=1,
                        help="Number of processors")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    # time_files = get_cam_output_times(config["model_path"], time_var=config["time_var"],
    #                                   file_start=config["model_file_start"],
    #                                   file_end=config["model_file_end"])
    if not exists(config["out_path"]):
        makedirs(config["out_path"])
    # print(time_files)
    # filenames = np.sort(time_files["filename"].unique())
    filenames = sorted(glob(join(config["model_path"],
                                 config["model_file_start"] + "*" + config["model_file_end"])))
    if "dt" not in config:
        config["dt"] = 1800
    if args.proc == 1:
        for filename in filenames:
            process_cesm_file_subset(filename,
                                     staggered_variables=config["staggered_variables"],
                                     out_variables=config["out_variables"],
                                     subset_variable=config["subset_variable"],
                                     subset_threshold=config["subset_threshold"],
                                     out_path=config["out_path"],
                                     out_format=config["out_format"],
                                     dt=config["dt"])
    else:
        cluster = LocalCluster(n_workers=0)
        cluster.scale(args.proc)
        client = Client(cluster)
        print(client)
        futures = client.map(process_cesm_file_subset, filenames,
                             staggered_variables=config["staggered_variables"],
                             out_variables=config["out_variables"],
                             subset_variable=config["subset_variable"],
                             subset_threshold=config["subset_threshold"],
                             out_path=config["out_path"],
                             out_start=config["out_start"],
                             out_format=config["out_format"],
                             dt=config["dt"])
        out = client.gather(futures)
        print(out)
        client.close()
    return
def eval_parameter_grid(run_ids, job_name, eval_function, parameter_grid, n_gpus=1):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='5:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:{n_gpus}',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    eval_cluster.scale(n_parameters_config)
    client = Client(eval_cluster)
    # Pop the training-only parameters so they are not passed to eval_function,
    # but remember them for reporting
    original_parameters = []
    for params in parameters:
        original_params = {}
        original_params['n_samples'] = params.pop('n_samples', None)
        original_params['loss'] = params.pop('loss', 'mae')
        original_params['fixed_masks'] = params.pop('fixed_masks', False)
        original_parameters.append(original_params)
    futures = [
        client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=50,
            **params,
        )
        for run_id, params in zip(run_ids, parameters)
    ]
    for params, original_params, future in zip(parameters, original_parameters, futures):
        metrics_names, eval_res = client.gather(future)
        params.update(original_params)
        print('Parameters', params)
        print(metrics_names)
        print(eval_res)
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
def train_eval_dealiasers(contrast='CORPD_FBK', n_epochs=200, n_samples=None,
                          model_name=None, model_size=None, loss='mae'):
    job_name = 'dealiasing_fastmri'
    model_specs = list(get_model_specs(force_res=True, dealiasing=True))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(train_cluster)
    futures = [
        client.submit(
            # function to execute
            train_dealiaser,
            model_fun=model_fun,
            model_kwargs=kwargs,
            run_id=f'{model_name}_{model_size}',
            n_scales=n_scales,
            contrast=contrast,
            n_epochs=n_epochs,
            n_samples=n_samples,
            loss=loss,
        )
        for model_name, model_size, model_fun, kwargs, _, n_scales, _ in model_specs
    ]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # eval
    eval_dealiasers(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
        loss=loss,
    )
    return run_ids
def _get_online_sp():
    client = Client()  # start local workers as threads
    # TODO: Figure out a way to not hardwire the pages
    futures = client.map(request_online, range(1, 48))
    df = pd.concat(client.gather(futures)).reset_index(drop=True)
    cleaned_df = df[(df['snow_depth'] != '')
                    & (df['lat'] != '')
                    & (df['lon'] != '')].sort_values(by='time').reset_index(drop=True)
    cleaned_df.loc[:, 'lon'] = cleaned_df['lon'].astype(float)
    cleaned_df.loc[:, 'lat'] = cleaned_df['lat'].astype(float)
    return cleaned_df
def distribute(func, parameters, scheduler_addr=None):
    """Run the function with the parameters in parallel, distributed."""
    client = None
    try:
        if scheduler_addr:
            addr = scheduler_addr
        elif not hasattr(parameters[0], 'scheduler_addr'):
            raise RuntimeError('The parameters or distribute() need a scheduler_addr parameter.')
        else:
            addr = parameters[0].scheduler_addr
        client = Client(addr)
        futures = client.map(func, parameters)
        # Block until all tasks are done and collect the actual results;
        # the futures themselves would be useless once the client is closed
        results = client.gather(futures)
    except Exception as e:
        print('Distributed run failed.')
        raise e
    finally:
        if client is not None:
            client.close()
    return results
def eval_parameter_grid(job_name, eval_function, parameter_grid, run_ids,
                        n_samples_eval=None):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    assert n_parameters_config == len(run_ids), \
        'Not enough run ids provided for grid evaluation'
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='60GB',
        job_name=job_name,
        walltime='3:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    eval_cluster.scale(n_parameters_config)
    client = Client(eval_cluster)
    # Pop n_samples from each config so it is not passed to eval_function,
    # but keep it for reporting
    n_samples_list = []
    for params in parameters:
        n_samples = params.pop('n_samples', -1)
        n_samples_list.append(n_samples)
    futures = [
        client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=n_samples_eval,
            **params,
        )
        for run_id, params in zip(run_ids, parameters)
    ]
    results = []
    for params, future, n_samples in zip(parameters, futures, n_samples_list):
        metrics_names, eval_res = client.gather(future)
        if n_samples != -1:
            params.update({'n_samples': n_samples})
        results.append((params, eval_res))
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
    return metrics_names, results
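# A small illustration (not from the source) of sklearn's ParameterGrid, used
# above: it expands a dict of lists into the cross-product of configurations,
# one dask job each.
from sklearn.model_selection import ParameterGrid

grid = {'loss': ['mae', 'mse'], 'n_samples': [100, 1000]}
print(list(ParameterGrid(grid)))
# [{'loss': 'mae', 'n_samples': 100}, {'loss': 'mae', 'n_samples': 1000},
#  {'loss': 'mse', 'n_samples': 100}, {'loss': 'mse', 'n_samples': 1000}]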
def test_gc():
    # Create pod, repo & collection
    pod = POD.from_uri("memory://")
    token = pod.token
    label = "my_label"
    repo = Repo(pod=pod)
    clc = repo.create_collection(schema, "my_collection")

    # Start cluster & schedule concurrent writes & gc
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    insert_fut = client.map(insert, args)
    gc_fut = client.submit(do_squash_and_gc, token)
    assert sum(client.gather(insert_fut)) == 10_519_200
    client.gather(gc_fut)
    client.close()
    cluster.close()

    # Read data back
    clc.merge()
    frm = clc.series("my_label").frame()
    assert len(frm) == 10_519_200
@attr.s
class daskerator(object):
    _DSCH = {
        'd': 'distributed',
        't': 'threads',
        'p': 'processes',
        's': 'synchronous',
    }

    def _get_sched(mp_type) -> str:
        if mp_type in daskerator._DSCH:
            return daskerator._DSCH[mp_type]
        else:
            return mp_type

    mp_type = attr.ib(default='s', type=str, converter=_get_sched,
                      validator=attr.validators.in_(
                          list(_DSCH.keys()) + list(_DSCH.values())))
    sch_add = attr.ib(default='', type=str)

    @sch_add.validator
    def check_dask_opts(instance, attribute, value):
        if instance.mp_type != 'distributed' and value != '':
            raise ValueError('Only distributed dask can accept scheduler address.')

    _client = attr.ib(default=None)
    _cluster = attr.ib(default=None)

    def __attrs_post_init__(self):
        if self.mp_type[0] == 'd':
            from dask.distributed import Client, LocalCluster
            dbg("Creating distributed client object.")
            if self.sch_add == '':
                dbg("Creating new cluster on localhost.")
                self._cluster = LocalCluster()
                self._client = Client(self._cluster)
            else:
                dbg(f"Existing scheduler address: {self.sch_add}")
                self._client = Client(self.sch_add)
            log.info(self._client)

    @curry
    def run_dask(self, func, iterator):
        dbg(f'Scheduler: {self.mp_type}')
        if self.mp_type[0] == 'd':
            dbg('Using dask client')
            return self._client.gather(self._client.map(func, iterator))
        else:
            dbg('Not using dask client.')
            return compute(*map(delayed(func), iterator), scheduler=self.mp_type)
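# A runnable sketch (not from the source) of the non-distributed branch of
# run_dask above: dask.compute accepts a pluggable scheduler name, which is
# what the _DSCH table abbreviates. 'threads', 'processes', or 'synchronous'
# all work without a client.
import dask
from dask import delayed

def _square(x):
    return x * x

results = dask.compute(*map(delayed(_square), range(4)), scheduler='synchronous')
# results == (0, 1, 4, 9)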
def generate_captchas():
    # start Dask distributed client with 6 processes / 1 thread per process
    client = DaskClient(n_workers=6, threads_per_worker=1)
    # submit future functions to cluster
    futures = []
    for i in range(10000):
        futures.append(client.submit(synth_captcha, pure=False))
    # execute and compute results (synchronous / blocking!)
    results = client.gather(futures)
    print(len(results))
    # stop & release client
    client.close()
def convert_batch(apkFilenameList):
    """Convert APK files to AppGene files in batch.

    Dask creates multiple threads or uses multiple nodes to execute the
    convertSingleApk function.

    Args:
        apkFilenameList: A list of the base filenames of APK files available
            from the HTTP interface to be converted

    Returns:
        A list of conversion result objects
    """
    client = Client(daskSchedulerConnection)
    # One APK file per new task
    futures = client.map(convertSingleApk, apkFilenameList)
    # Await until all tasks are done
    results = client.gather(futures)
    return list(results)
def main():
    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
    client = Client(cluster)
    print("started cluster")
    num_layers = [2, 3, 4]
    num_neurons = [20, 40, 60]
    futures = []
    for l in num_layers:
        for n in num_neurons:
            futures.append(client.submit(train_random_model, l, n))
    results = client.gather(futures)
    print(results)
    client.close()
    return