def run_hg38_variant_retrieval(client: Client = None, force: bool = False) -> List[Future]:
    """
    Executes the genomic variant retrieval step of the ETL pipeline for hg38 variants.

    Arguments:
        client: a Dask Client object
        force:  if True, datasets are downloaded even if they exist locally

    Returns:
        A list of Futures, one per chromosome variant build.
    """
    client = get_client() if client is None else client
    futures = []

    for chrom in Globals().var_human_chromosomes:
        ## Download from Ensembl
        dl = client.submit(download_hg38_variant_build, chrom, force=force)

        ## Decompress
        dl_unzip = client.submit(_unzip, dl, force=force)

        futures.append(dl_unzip)

    return futures
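# A minimal usage sketch for the retrieval step above. The scheduler address
# is an assumption; everything else is the function defined above plus the
# standard dask.distributed API.
def gather_hg38_variants():
    from dask.distributed import Client

    client = Client('scheduler:8786')  # hypothetical scheduler address
    futures = run_hg38_variant_retrieval(client=client, force=False)
    # gather() blocks until every chromosome build is downloaded and unzipped
    return client.gather(futures)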
def main():
    n_mutation = 100
    client = Client('scheduler:8786')

    # Initial generation: score every candidate, sort descending by score,
    # and keep the top three.
    futures = client.map(initialize_network, range(n_mutation))
    results = client.gather(futures)
    results.sort(key=lambda x: -x[1])
    truncated = [x[0] for x in results[:3]]

    # Re-run and persist the top three candidates.
    futures = []
    for i, seed in enumerate(truncated):
        name = 'top-{}'.format(i)
        futures.append(
            client.submit(initialize_network, seed, store=True, name=name))
    results = client.gather(futures)
    print(results, flush=True)

    for g in range(10):
        # Update every candidate for this generation.
        futures = []
        for seed in range(n_mutation):
            futures.append(client.submit(update_network, seed, g + 1))
        results = client.gather(futures)
        results.sort(key=lambda x: -x[1])
        truncated = [x[0] for x in results[:3]]

        # Re-run and persist the top three of this generation.
        futures = []
        for i, seed in enumerate(truncated):
            name = 'top-{}'.format(i)
            futures.append(
                client.submit(update_network, seed, g + 1, store=True, name=name))
        results = client.gather(futures)
        print(results, flush=True)
def main(): """.""" host = os.getenv('DASK_SCHEDULER_HOST', default='localhost') port = os.getenv('DASK_SCHEDULER_PORT', default=8786) print(host, port) client = Client('{}:{}'.format(host, port)) # client.run(init_logging) # client.run_on_scheduler(init_logging) # Run some mock functions and gather a result data = client.map(print_listdir, range(10)) future = client.submit(print_values, data) progress(future) print('') result = client.gather(future) print(result) # Run a second stage which runs some additional processing. print('here A') data_a = client.map(set_value, range(100)) print('here B') data_b = client.map(square, data_a) print('here C') data_c = client.map(neg, data_b) print('here D') # Submit a function application to the scheduler total = client.submit(sum, data_c) print('here E') progress(total) print(total.result()) print('here F')
def log2tf_winsorizer(meta: pd.DataFrame, counts: pd.DataFrame,
                      log2_transform: bool, threads: int) -> pd.DataFrame:
    """Builds a cluster structure and calculates the mean values."""
    cluster_names = meta['cell_type'].drop_duplicates().tolist()

    def mstats_winsorizer(s):
        return mstats.winsorize(s, limits=[0, 0.05])

    ##########
    # def WinsorizeSampleList(data):
    #     # quantiles = data.quantile([0.95])
    #     # q_05 = quantiles.loc[0.05]
    #     # q_95 = quantiles.loc[0.95]
    #     quantiles = data.quantile([0.25, 0.75])
    #     q_25 = quantiles.loc[0.25]
    #     q_75 = quantiles.loc[0.75]
    #     step = (q_75 - q_25) * 1.5
    #     # return data[(data.values >= q_05) & (data.values <= q_95)]
    #     return data[(data.values >= q_75 + step)]
    ###########

    # Winsorizer function
    def winsorizer_process(cluster_count, log2_transform):
        winsorized_cluster_count_array = cluster_count.apply(mstats_winsorizer, axis=1)
        if log2_transform:
            winsorized_cluster_count_array = winsorized_cluster_count_array.apply(
                lambda x: np.log2(x + 1))
        return pd.DataFrame.from_records(winsorized_cluster_count_array,
                                         index=cluster_count.index,
                                         columns=cluster_count.columns)

    # One count chunk per cell-type cluster, selected by sample index.
    chunks = [
        counts.loc[:, meta[meta['cell_type'] == cluster_name].index]
        for cluster_name in cluster_names
    ]

    # Concatenate the individual dataframes
    def df_concatenate(dfs):
        return pd.concat(dfs, axis=1)

    # Start a Dask client and submit the per-cluster jobs in parallel
    client = Client()
    L = [
        client.submit(winsorizer_process, chunk, log2_transform)
        for chunk in chunks
    ]
    future = client.submit(df_concatenate, L)
    result = future.result()
    return result
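# A minimal usage sketch for log2tf_winsorizer with hypothetical toy data,
# shaped to satisfy the function's structural assumptions: `meta` carries a
# 'cell_type' column indexed by sample name, and the columns of `counts` are
# those same sample names.
def winsorizer_example():
    meta = pd.DataFrame({'cell_type': ['B', 'B', 'T']},
                        index=['s1', 's2', 's3'])
    counts = pd.DataFrame(np.random.rand(5, 3), columns=['s1', 's2', 's3'])
    return log2tf_winsorizer(meta, counts, log2_transform=True, threads=1)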
def threadExample(self):
    world_rank = self.comms.comm.Get_rank()
    world_size = self.comms.comm.Get_size()
    client = Client()

    exec_time = 0
    exec_time -= time.time()
    # Submit the send/recv pair so they run concurrently in background
    # threads or processes.
    a = client.submit(self.sendToRank, 1, world_size)
    b = client.submit(self.recvFromRank, 0, world_size)
    print(a.result(), b.result())
    exec_time += time.time()
    print("Dask Time Taken : " + str(exec_time))
def test_recv_any_rank(n_trials, ucx_cluster):
    client = Client(ucx_cluster)

    try:
        cb = CommsContext(comms_p2p=True)
        cb.init()

        dfs = [
            client.submit(func_test_recv_any_rank,
                          cb.sessionId,
                          n_trials,
                          random.random(),
                          workers=[w])
            for w in cb.worker_addresses
        ]

        wait(dfs)

        result = [x.result() for x in dfs]
        assert result
    finally:
        cb.destroy()
        client.close()
class QAExtractor(Extractor):
    def __init__(self, client):
        if isinstance(client, str):
            logger.info('Input is a str, inferring it to be a scheduler address. Initializing client')
            self.addr = client
            self.client = Client(client)  # , serializers=['msgpack', 'dask'], deserializers=['msgpack', 'dask']
            logger.info(self.client)
        else:
            self.client = client
            self.addr = None

    def __del__(self):
        # Only close clients this extractor created itself.
        if self.addr is not None:
            self.client.close()

    def extract(self, query, context):
        result = self.client.submit(QAExtractor._extract, query, context,
                                    resources={'qa': 1})
        return result.result()

    @classmethod
    def _extract(cls, query, context):
        # Runs on the worker: look up the QA model registered as a plugin.
        worker = get_worker()
        dp = None
        for plg in worker.plugins:
            if 'qa_extractor' in plg:
                dp = worker.plugins[plg]
                break
        if dp is None:
            raise Exception('No QA plugin registered')
        model = dp.model
        answer, score = model.extract(query, context)
        return answer, score
def run_simulations_dask(xgaps, numpanelss, sensorsxs, kwargs):
    # Create client
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []
    for numpanels in numpanelss:
        for xgap in xgaps:
            for ii in sensorsxs:
                futures.append(client.submit(simulate_single,
                                             xgap=xgap,
                                             numpanels=numpanels,
                                             sensorx=ii,
                                             **kwargs))

    # Get results for all simulations (gather blocks until all finish;
    # res is overwritten with a status string below)
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
def start_futures():
    t = time()
    isins = get_isins()
    client = Client('127.0.0.1:8786')

    data = client.map(load_data, isins)
    params_a = client.map(get_param, data, ['param_a'] * len(isins))
    params_b = client.map(get_param, data, ['param_b'] * len(isins))
    result_a = client.map(task_a, isins, params_a, params_b)

    # Interleave (isin, result_a, param_b) triples into one flat argument list.
    group_args = list(chain(*zip(isins, result_a, params_b)))
    result_group = client.submit(task_group_alter, *group_args)

    result_b = client.map(task_b, isins, params_b, [result_group] * len(isins))
    result_c = client.map(task_c, isins, params_b)

    result = client.gather([result_group] + result_a + result_b + result_c)
    total = time() - t
    print(total)
    print(len(result))

    with open('/Users/vladimirmarunov/git/dask-test/res.txt', 'w') as f:
        f.write('{}\n'.format(total))
        json.dump(result, f, indent=4)
def _submit(self, cluster: "ClusterType", client: Client, f: Callable,
            *args, **kwargs) -> Future:
    # For normal tasks, we maintain the Dask default that functions are pure
    # (by default). An explicit `pure` attribute on the cluster takes
    # precedence over one passed via kwargs. Note kwargs is a dict, so the
    # fallback lookup must use .get(), not getattr().
    kwargs.update(
        {"pure": getattr(cluster, "pure", kwargs.get("pure", True))})
    return client.submit(f, *args, **kwargs)
def plugin_f_and_f(dump, plugin, params, user_pk):
    """Fire and forget plugin on dask"""
    dask_client = Client(settings.DASK_SCHEDULER_URL)
    fire_and_forget(
        dask_client.submit(run_plugin, dump, plugin, params, user_pk))
def subdivideCheck(client: distributed.Client, lower: float, upper: float,
                   func: Callable[[float], float], step: float,
                   eps: float) -> distributed.Future:
    """Subdivide the range lower <= x <= upper into segments of size at most 1,
    and then call findZero(func, l, u, eps) on each."""
    if (upper - lower) <= 1:
        # Proxy for determining if this interval should be searched: 1 in 4 chance.
        if 1 == randint(0, 3):
            return client.submit(piece, lower, upper, func, step, eps)
        else:
            return []
    else:
        # Proxy for recursive exploration of a data space or structure.
        mid = (upper + lower) / 2.
        rl = subdivideCheck(client, lower, mid, func, step, eps)
        rr = subdivideCheck(client, mid, upper, func, step, eps)
        return client.submit(concat, rl, rr)
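# A minimal usage sketch for subdivideCheck; the function, step, and eps
# values are hypothetical. The result is either a Future (possibly a tree of
# concat() tasks) or [] when the top-level interval was skipped.
def subdivide_example():
    client = distributed.Client()
    result = subdivideCheck(client, lower=0.0, upper=16.0,
                            func=lambda x: x * x - 2.0, step=0.1, eps=1e-6)
    if isinstance(result, distributed.Future):
        return client.gather(result)
    return []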
def run_simulations_dask(clearance_heights, xgaps, Ds, tilts, kwargs):
    # Create client
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []
    for clearance_height in clearance_heights:
        for xgap in xgaps:
            for tilt in tilts:
                for D in Ds:
                    futures.append(client.submit(simulate_single,
                                                 clearance_height=clearance_height,
                                                 xgap=xgap,
                                                 tilt=tilt,
                                                 D=D,
                                                 **kwargs))

    # Get results for all simulations (gather blocks until all finish;
    # res is overwritten with a status string below)
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
def test_allreduce(cluster):
    client = Client(cluster)

    try:
        cb = CommsContext()
        cb.init()

        start = time.time()
        dfs = [
            client.submit(func_test_allreduce,
                          cb.sessionId,
                          random.random(),
                          workers=[w])
            for w in cb.worker_addresses
        ]
        wait(dfs)

        print("Time: " + str(time.time() - start))

        results = [x.result() for x in dfs]
        print(str(results))
        assert all(results)
    finally:
        cb.destroy()
        client.close()
def run_simulations_dask(tilts, kwargs):
    # Create client
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []

    # Add Iterations HERE
    for tilt in tilts:
        futures.append(client.submit(simulate_single, tilt=tilt, **kwargs))

    # Get results for all simulations (gather blocks until all finish;
    # res is overwritten with a status string below)
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
def main():
    client = Client(address=DASK_SCHEDULER_URL)  # , asynchronous=True
    print(client)
    # client.restart()
    # client.close(10)
    # scheduler_info = client.scheduler_info()
    # from pprint import pprint
    # pprint(scheduler_info)

    x = client.submit(add, 1, 2)
    # print("status:{},key:{},done():{},result:{}".format(x.status, x.key, x.done(), x.result()))
    print("status:{},key:{},done():{}".format(x.status, x.key, x.done()))

    # pure=False forces a fresh task key for each call, so y and z are
    # computed independently.
    y = client.submit(np.random.random, 1000, pure=False)
    print(y.key)
    z = client.submit(np.random.random, 1000, pure=False)
    print(z.key)
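# For contrast, a minimal sketch of Dask's default pure=True behaviour: the
# task key is derived from hashing the function and its arguments, so two
# identical submissions share one key and the computation runs only once.
def pure_submit_sketch(client):
    y = client.submit(np.random.random, 1000)  # pure=True is the default
    z = client.submit(np.random.random, 1000)
    assert y.key == z.key  # deduplicated, unlike the pure=False calls above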
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())

    client = Client(cluster)
    future = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects, but is submitted as pure
        # so identical submissions are deduplicated
        pure=True,
    )
    run_id = client.gather(future)
    print(f'Train run id: {run_id}')
def DASK_batch_mult(matrix_input, vector_input, workers, batch_size,
                    input_size, output_channels):
    client = Client(n_workers=workers)
    results = []
    batch_no = matrix_input.shape[0] // batch_size

    for i in range(batch_no):
        # Scatter each batch so the large array is shipped to the workers
        # directly rather than being embedded in the task graph.
        batch = client.scatter(matrix_input[i * batch_size:(i + 1) * batch_size])
        results.append(
            client.submit(convolution_mean, batch, vector_input, batch_size,
                          vector_input.shape[0]))

    wait(results)
    data = client.gather(results)

    out_tensor = np.empty(
        (batch_size * batch_no, output_channels, input_size, input_size))
    for i in range(batch_no):
        out_tensor[i * batch_size:(i + 1) * batch_size] = data[i].reshape(
            batch_size, output_channels, input_size, input_size)

    client.shutdown()
    return out_tensor
def test_recv_any_rank(n_trials, ucx_cluster):
    client = Client(ucx_cluster)

    try:
        cb = CommsContext(comms_p2p=True)
        cb.init()

        dfs = [
            client.submit(func_test_recv_any_rank,
                          cb.sessionId,
                          n_trials,
                          random.random(),
                          workers=[w])
            for w in cb.worker_addresses
        ]

        wait(dfs)

        result = [x.result() for x in dfs]
        assert result
    finally:
        cb.destroy()
        client.close()
def data_processing(input_file):
    try:
        featured_index_dict = fetch_pickle_FromS3('featured_index_dict.pkl')
        data_dataframe = pd.read_csv(input_file)
        total_rows = data_dataframe.shape[0]
        data_X = data_transformation(data_dataframe, featured_index_dict)
        print('1) data_X.shape: ', data_X.shape)

        # Create the (in-process) Dask client once, not once per model.
        client = Client(processes=False) if total_rows > 5 else None

        for pickled_model in PICKLED_MODELS:
            # Load Model
            model = fetch_pickle_FromS3(pickled_model)
            model_name = secure_filename(pickled_model).rsplit('.', 1)[0]

            # Make prediction; offload to Dask only for larger inputs.
            if client is not None:
                print('2-if) data_X.shape: ', data_X.shape)
                prediction = client.submit(model.predict, data_X).result().tolist()
            else:
                print('2-else) data_X.shape: ', data_X.shape)
                prediction = model.predict(data_X).tolist()

            print('3) data_X.shape: ', data_X.shape)
            prediction_series = pd.Series(prediction)
            data_dataframe[model_name] = prediction_series

        return data_dataframe
    except Exception as e:
        print(str(e))
        raise e
def load_data_parallel(data_path, num_processes, image_variable="abi",
                       count_variable="flash_counts", time_variable="time"):
    cluster = LocalCluster(n_workers=num_processes, threads_per_worker=1)
    client = Client(cluster)

    data_files = sorted(glob(join(data_path, "*.nc")))
    data_jobs = []
    for data_file in data_files:
        data_jobs.append(
            client.submit(load_single_data_file, data_file,
                          image_variable=image_variable,
                          count_variable=count_variable,
                          time_variable=time_variable))
    wait(data_jobs)
    data_results = client.gather(data_jobs)

    all_images = np.concatenate([d[0] for d in data_results])
    all_counts = np.concatenate([d[1] for d in data_results])
    all_time = pd.DatetimeIndex(np.concatenate([d[2] for d in data_results]))

    client.close()
    cluster.close()
    del client
    del cluster
    return all_images, all_counts, all_time
def run_simulations_dask(daylist, posxs, moduleWiths, kwargs):
    # Create client
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []

    # Add Iterations HERE
    for daydate in daylist:
        for posx in posxs:
            for moduleWith in moduleWiths:
                futures.append(
                    client.submit(simulate_single,
                                  daydate=daydate,
                                  posx=posx,
                                  moduleWith=moduleWith,
                                  **kwargs))

    # Get results for all simulations (gather blocks until all finish;
    # res is overwritten with a status string below)
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
def test_dask_connection():
    cluster = LocalCluster(
        scheduler_port=0,
        silence_logs=True,
        processes=False,
        asynchronous=False,
    )
    client = Client(cluster, asynchronous=False)

    def square(x):
        return x ** 2

    def neg(x):
        return -x

    # Run a computation on Dask: sum of -x**2 for x in 0..9 is -285.
    a = client.map(square, range(10))
    b = client.map(neg, a)
    total = client.submit(sum, b)
    result = total.result()

    if result != -285:
        raise AssertionError("Result is " + str(result))
    else:
        print("The result is correct!!!")

    client.close()
    cluster.close()
    return True
def _find_ports_for_workers(client: Client, worker_addresses: Iterable[str],
                            local_listen_port: int) -> Dict[str, int]:
    """Find an open port on each worker.

    LightGBM distributed training uses TCP sockets by default, and this method
    is used to identify open ports on each worker so LightGBM can reliably
    create those sockets.

    Parameters
    ----------
    client : dask.distributed.Client
        Dask client.
    worker_addresses : Iterable[str]
        An iterable of addresses for workers in the cluster. These are strings
        of the form ``<protocol>://<host>:port``.
    local_listen_port : int
        First port to try when searching for open ports.

    Returns
    -------
    result : Dict[str, int]
        Dictionary where keys are worker addresses and values are an open port
        for LightGBM to use.
    """
    lightgbm_ports = set()
    worker_ip_to_port = {}
    for worker_address in worker_addresses:
        port = client.submit(
            func=_find_open_port,
            workers=[worker_address],
            worker_ip=urlparse(worker_address).hostname,
            local_listen_port=local_listen_port,
            ports_to_skip=lightgbm_ports
        ).result()
        lightgbm_ports.add(port)
        worker_ip_to_port[worker_address] = port

    return worker_ip_to_port
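# A minimal usage sketch (the scheduler address and starting port are
# assumptions; the worker addresses are read from the live scheduler):
def find_ports_example():
    client = Client('scheduler:8786')  # hypothetical address
    worker_addresses = client.scheduler_info()['workers'].keys()
    return _find_ports_for_workers(client,
                                   worker_addresses=worker_addresses,
                                   local_listen_port=12400)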
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)
    print(cluster.job_script())

    client = Client(cluster)
    future = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects, but is submitted as pure
        # so identical submissions are deduplicated
        pure=True,
    )
    client.gather(future)
    print('Shutting down dask workers')
def test_send_recv(n_trials):
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    cb = CommsContext(comms_p2p=True)
    cb.init()
    cb = default_comms()

    start = time.time()
    dfs = [
        client.submit(func_test_send_recv,
                      cb.sessionId,
                      n_trials,
                      random.random(),
                      workers=[w])
        for w in cb.worker_addresses
    ]
    wait(dfs)
    print("Time: " + str(time.time() - start))

    result = [x.result() for x in dfs]
    print(str(result))
    assert result

    cb.destroy()
    client.close()
    cluster.close()
def predict_outcome(data_X, data_dataframe):
    pickled_models = fetch_pickle_FromS3('pickled_models.pkl')
    total_rows = data_dataframe.shape[0]
    small_df = data_dataframe.filter(['customerID'], axis=1)

    # Create the (in-process) Dask client once, not once per model.
    client = Client(processes=False) if total_rows > 5 else None

    for i in range(1, len(pickled_models) + 1):
        print('2) data_X.shape.in_predict_outcome: ', data_X.shape)

        # Load Model
        model_rank = i
        print('model_rank: ', i)
        model = pickled_models[i][0]
        model_name = 'Rank ' + str(model_rank) + ': ' + pickled_models[i][1]
        print('model_name: ', model_name)

        # Make prediction; offload to Dask only for larger inputs.
        if client is not None:
            print('2-if) data_X.shape: ', data_X.shape)
            prediction = client.submit(model.predict, data_X).result().tolist()
        else:
            print('2-else) data_X.shape: ', data_X.shape)
            prediction = model.predict(data_X).tolist()

        print('3) data_X.shape: ', data_X.shape)
        prediction_series = pd.Series(prediction)
        data_dataframe[model_name] = prediction_series
        small_df[model_name] = prediction_series

    return data_dataframe, small_df
def plugin_f_and_f(dump, plugin, params):
    """Fire and forget plugin on dask"""
    dask_client = Client(settings.DASK_SCHEDULER_URL)
    fire_and_forget(
        dask_client.submit(run_plugin, dump, plugin,
                           settings.ELASTICSEARCH_URL, params))
def index_f_and_f(dump_pk, user_pk):
    """Run all plugins for a new index on dask"""
    dask_client = Client(settings.DASK_SCHEDULER_URL)
    fire_and_forget(
        dask_client.submit(unzip_then_run, dump_pk, user_pk,
                           settings.ELASTICSEARCH_URL))
def run_nta_dask(parameters, input_dfs, tracer_df=None, jobid="00000000",
                 verbose=True):
    dask_client = Client(processes=False)
    return dask_client.submit(run_nta, parameters, input_dfs, tracer_df,
                              jobid, verbose)
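# A minimal usage sketch (hypothetical arguments): the caller receives a
# Future and decides when to block on it. The in-process client created by
# run_nta_dask must stay alive until the result is collected.
def run_nta_example(parameters, input_dfs):
    future = run_nta_dask(parameters, input_dfs)
    return future.result()  # blocks until run_nta completes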
def main():
    client = Client('localhost:8786')

    A = client.map(set_value, range(100))
    B = client.map(square, A)
    C = client.map(neg, B)
    total = client.submit(sum, C)

    # progress() renders a progress bar as a side effect and returns None,
    # so it should not be wrapped in print().
    progress(total)
    print(total.result())