def training(self, examplesDf, daskClient: Client = None):
    clusters = []
    groupSize = self.CONSTS.k * self.CONSTS.representationThr
    for label, group in examplesDf.groupby('label'):
        for chunk in range(0, len(group), groupSize):
            subgroup = group[chunk:chunk + groupSize]
            subgroupDf = pd.DataFrame(iter(subgroup['item']))
            if daskClient:
                daskClient.scatter(subgroupDf)
            clusters += self.trainGroup(subgroupDf, label)
    return clusters

def run_photoz_dask(runs, modelD, galcat, output_dir, fit_bands, ip_dask):
    """Run the photo-z on a Dask cluster."""

    path_out = Path(output_dir) / 'pzcat.pq'
    if path_out.exists():
        print('Photo-z catalogue already exists.')
        return

    # If not specified, we start up a local cluster.
    client = Client(ip_dask) if ip_dask is not None else Client()

    xnew_modelD = client.scatter(fix_model(modelD, fit_bands))
    # xnew_modelD = fix_model(modelD, fit_bands)

    galcat = dd.read_parquet(str(output_dir / 'galcat_in.pq'))

    # npartitions = int(302138 / 10) + 1
    npartitions = int(9900 / 10) + 1
    galcat = galcat.reset_index().repartition(npartitions=npartitions).set_index('ref_id')

    ebvD = dict(runs.EBV)
    pzcat = galcat.map_partitions(bcnz.fit.photoz_flatten, xnew_modelD, ebvD, fit_bands)
    pzcat = pzcat.repartition(npartitions=100)

    pzcat = dask.optimize(pzcat)[0]
    pzcat.to_parquet(str(path_out))

def DASK_batch_mult(matrix_input, vector_input, workers, batch_size, input_size,
                    output_channels):
    client = Client(n_workers=workers)
    results = []
    batch_no = matrix_input.shape[0] // batch_size
    for i in range(batch_no):
        batch = client.scatter(matrix_input[i * batch_size:i * batch_size + batch_size])
        results.append(
            client.submit(convolution_mean, batch, vector_input, batch_size,
                          vector_input.shape[0]))
    wait(results)
    data = client.gather(results)
    out_tensor = np.empty(
        (batch_size * batch_no, output_channels, input_size, input_size))
    for i in range(batch_no):
        out_tensor[i * batch_size:i * batch_size + batch_size] = data[i].reshape(
            batch_size, output_channels, input_size, input_size)
    client.shutdown()
    return out_tensor

def fit_spiking_likelihood(position, spikes, is_training, place_bin_centers,
                           place_bin_edges, is_track_interior, penalty=1E1,
                           knot_spacing=30):
    """Estimate the place field model.

    Parameters
    ----------
    position : ndarray, shape (n_time,)
    spikes : ndarray, shape (n_time, n_neurons)
    place_bin_centers : ndarray, shape (n_place_bins,)
    penalty : float, optional
    knot_spacing : float, optional

    Returns
    -------
    spiking_likelihood : function

    """
    if np.any(np.ptp(place_bin_edges, axis=0) <= knot_spacing):
        logging.warning("Range of position is smaller than knot spacing.")

    is_training = np.asarray(is_training).astype(float)
    include = ~np.isclose(is_training, 0.0) & ~np.any(np.isnan(position), axis=1)
    is_training = is_training[include]
    position = position[include]
    spikes = spikes[include]

    design_matrix = make_spline_design_matrix(position, place_bin_edges, knot_spacing)

    try:
        client = get_client()
    except ValueError:
        client = Client()

    dm = client.scatter(np.asarray(design_matrix), broadcast=True)
    place_field_coefficients = [
        fit_glm(is_spike, dm, is_training, penalty).params
        for is_spike in spikes.T
    ]
    place_field_coefficients = np.stack(
        dask.compute(*place_field_coefficients), axis=1)

    predict_matrix = make_spline_predict_matrix(design_matrix.design_info,
                                                place_bin_centers)
    place_conditional_intensity = get_firing_rate(
        predict_matrix, place_field_coefficients, sampling_frequency=1)

    return partial(spiking_likelihood,
                   design_matrix=design_matrix,
                   place_field_coefficients=place_field_coefficients,
                   place_conditional_intensity=place_conditional_intensity,
                   is_track_interior=is_track_interior)

def estimate_place_fields(position, spikes, place_bin_centers, place_bin_edges,
                          penalty=1E-1, knot_spacing=10):
    '''Gives the conditional intensity of the neurons' spiking with respect to
    position.

    Parameters
    ----------
    position : ndarray, shape (n_time, n_position_dims)
    spikes : ndarray, shape (n_time, n_neurons)
    place_bin_centers : ndarray, shape (n_bins, n_position_dims)
    place_bin_edges : ndarray, shape (n_bins + 1, n_position_dims)
    penalty : float, optional
    knot_spacing : int, optional

    Returns
    -------
    conditional_intensity : ndarray, shape (n_bins, n_neurons)

    '''
    if np.any(np.ptp(place_bin_edges, axis=0) <= knot_spacing):
        logging.warning("Range of position is smaller than knot spacing.")

    design_matrix = make_spline_design_matrix(position, place_bin_edges, knot_spacing)
    design_info = design_matrix.design_info

    try:
        client = get_client()
    except ValueError:
        client = Client()

    design_matrix = client.scatter(np.asarray(design_matrix), broadcast=True)
    results = [fit_glm(is_spike, design_matrix, penalty) for is_spike in spikes.T]
    results = dask.compute(*results)

    predict_matrix = make_spline_predict_matrix(design_info, place_bin_centers)
    place_fields = np.stack(
        [get_firing_rate(predict_matrix, result) for result in results], axis=1)

    DIMS = ['position', 'neuron']
    if position.shape[1] == 1:
        names = ['position']
        coords = {'position': place_bin_centers.squeeze()}
    elif position.shape[1] == 2:
        names = ['x_position', 'y_position']
        coords = {
            'position': pd.MultiIndex.from_arrays(place_bin_centers.T.tolist(),
                                                  names=names)
        }

    return xr.DataArray(data=place_fields, coords=coords, dims=DIMS)

def build_histogram(self, client: Client = None):
    """
    Use numpy histogram2d to build out the counts for each cell.  Important if we later filter out
    cells that have insufficient density (soundings per cell).

    Parameters
    ----------
    client
        optional dask client; if provided, the work is mapped across the cluster
    """
    if self.x_range is None:
        self.construct_base_grid()

    # numpy histogram2d is slow as hell
    # self.cell_count, xedges, yedges = np.histogram2d(self.x, self.y, bins=(self.x_range, self.y_range))
    bins = np.array([len(self.x_range) - 1, len(self.y_range) - 1])
    if client is not None:
        # first index of chunks is the chunks in the 1st dim
        strt = 0
        chnks = []
        for c in self.x.chunks[0]:
            chnks.append([strt, strt + c])
            strt += c
        bin_futs = client.scatter([bins] * len(chnks))
        range_futs = client.scatter([self.ranges] * len(chnks))
        x_futs = client.scatter([self.x[c[0]:c[1]].values for c in chnks])
        y_futs = client.scatter([self.y[c[0]:c[1]].values for c in chnks])
        rslt = client.map(hist2d_numba_seq, x_futs, y_futs, bin_futs, range_futs)
        summed_rslt = client.submit(_hist2d_add, rslt)
        self.cell_count = summed_rslt.result()
    else:
        try:
            self.cell_count = hist2d_numba_seq(self.x.values, self.y.values, bins, self.ranges)
        except AttributeError:  # numpy workflow
            self.cell_count = hist2d_numba_seq(self.x, self.y, bins, self.ranges)

def train(self, algorithm, imgs, labels):
    params = self.get_params(algorithm)
    model = self.get_model(algorithm, params)
    logging.info("Training %s with the following parameters:" % (algorithm))
    logging.info(params)

    dask_client = Client(DASK_IP_ADRESS)
    img_train, img_test, lbl_train, lbl_test = train_test_split(
        self.imgs, self.labels, test_size=0.2)

    futures_img_train = dask_client.scatter(img_train)
    futures_img_test = dask_client.scatter(img_test)
    futures_lbl_train = dask_client.scatter(lbl_train)
    futures_lbl_test = dask_client.scatter(lbl_test)

    future_model_fit = dask_client.submit(model.fit, futures_img_train,
                                          futures_lbl_train)
    model = future_model_fit.result()

    future_score_train = dask_client.submit(model.score, futures_img_train,
                                            futures_lbl_train)
    future_score_test = dask_client.submit(model.score, futures_img_test,
                                           futures_lbl_test)
    score_test = future_score_test.result()
    score_train = future_score_train.result()

    logging.info("Training complete, saving model %s to file" % (algorithm))
    # saving the model to file
    with self.hdfs_client.write('/' + str(self.model_folder) + str(algorithm) + ".model") as writer:
        joblib.dump(model, writer)
    logging.info("Score on training set: %.4f, score on test set: %.4f"
                 % (score_train, score_test))
    return score_train, score_test

def run_dask_compute(h5_main):
    raw_data = h5_main[()]
    # cpu_cores = int(cpu_cores/8)
    # dask_raw_data = da.from_array(raw_data, chunks='auto')
    # cluster = LocalCluster(n_workers=cpu_cores/8)
    # client = Client(cluster, processes=True)
    # map = dask_raw_data.map_blocks(find_all_peaks, [20, 60], num_steps=30)
    # results = map.compute()
    client = Client(processes=False)
    dask_raw_data = client.scatter(raw_data)
    args = [[20, 60]]
    kwargs = {'num_steps': 30}
    L = client.submit(find_all_peaks, dask_raw_data, args, kwargs)
    dask_results = client.compute(L)
    cores = client.ncores()
    client.close()
    return cores

def main():
    x = np.random.normal(size=(1000000, 5))
    y = x.mean(axis=1)

    cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit='1G')
    client = Client(cluster)
    print(client)

    print("scattering")
    [x_ref, y_ref] = client.scatter([x, y], broadcast=True)

    jobs = []
    for e in range(1, 30):
        print(e)
        jobs.append(client.submit(train_rf, e, x_ref, y_ref))

    for job in as_completed(jobs):
        print(job.result())
        del job
        client.rebalance()

    client.close()
    cluster.close()
    return

def run_dask(instances, cluster):
    from dask.distributed import Client
    client = Client(cluster)

    graphs = {}
    instance_to_graph = {}
    instances = list(instances)
    for (i, instance) in enumerate(instances):
        if instance.graph not in graphs:
            graphs[instance.graph] = client.scatter([instance.graph], broadcast=True)[0]
        inst = instance._replace(graph=None)
        instance_to_graph[inst] = graphs[instance.graph]
        instances[i] = inst

    results = client.map(process_dask,
                         ((instance_to_graph[i], i) for i in instances))
    return client.gather(results)

def main():
    # Data creation
    times = pd.date_range('2000-01-01', periods=300)  # to stress the system more, increase the period value
    x = range(1)
    y = range(int(14e3))
    cube = xr.DataArray(np.random.rand(len(times), len(x), len(y)),
                        coords=[times, x, y],
                        dims=['time', 'x', 'y'])
    pixels_pairs = np.argwhere(cube.isel(time=0).values)

    # Client
    client = Client(processes=False, n_workers=1, threads_per_worker=1)
    # client = Client()
    url = 'http://localhost:8787/status'
    webbrowser.open_new(url)

    for row_idx in cube.x.values:
        row = cube.isel(dict([('x', row_idx)]))
        px_list = [ith for ith in pixels_pairs if ith[0] == row_idx]
        output_carrier = pd.DataFrame(index=cube.time.values, columns=cube.y.values)
        chunks = np.array_split(px_list, multiprocessing.cpu_count() * 4)

        rowi = client.scatter(row, broadcast=True)
        futures = client.map(function, chunks, **{'data': rowi, 'parameter': 10})
        for future, result in as_completed(futures, with_results=True):
            output_carrier.update(result)
        cube[:, row_idx] = output_carrier.values

    print(cube)
    client.close()

def main():
    scheduler = os.environ.get('DASK_SCHEDULER', 'scheduler:8786')
    client = Client(scheduler)

    path, _ = os.path.split(DEM_TINDEX)
    dems = get_target_tiles(DEM_TINDEX, path)
    tindex = get_tif_fragments(LAS_TINDEX)
    frags = client.scatter(tindex, broadcast=True)

    print('submitting tasks', flush=True)
    futures = [
        client.submit(merge_frags, fname, bounds[1:], frags)
        for fname, bounds in zip(dems['location'], dems.bounds.itertuples())
    ]
    dems['processed'] = client.gather(futures)
    print('tasks gathered', flush=True)

    dems.to_file(DEM_TINDEX.replace('.gpkg', '_out.gpkg'), driver='GPKG')
    client.close()
    sys.exit(0)

class ClientFuture():

    def __init__(self, local_client_n_workers, local_client_threads_per_worker,
                 use_dashboard=True):
        self.use_dashboard = use_dashboard
        if use_dashboard:
            self.dashboard_address = ':8787'
        else:
            self.dashboard_address = None
        host_ip = get_host_ip_address()
        self.local_cluster = LocalCluster(
            n_workers=local_client_n_workers,
            threads_per_worker=local_client_threads_per_worker,
            processes=True,
            host=host_ip,
            dashboard_address=self.dashboard_address)
        self.local_client = Client(address=self.local_cluster, timeout='2s')

    def submit(self, func, *args, **kwargs):
        future = self.local_client.submit(func, *args, **kwargs)
        return future

    def scatter(self, *args):
        scattered_args = self.local_client.scatter(args, broadcast=True)
        return scattered_args

    def get_dashboard_link(self):
        if self.use_dashboard:
            print('local cluster: ', self.local_cluster.dashboard_link)
        else:
            print('dashboard disabled')

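# --- Hypothetical usage sketch for the ClientFuture wrapper above; the worker
# function `add` and the argument values are illustrative only, and
# get_host_ip_address() is assumed to resolve to a reachable interface.
def add(x, y):
    return x + y


cf = ClientFuture(local_client_n_workers=2,
                  local_client_threads_per_worker=1,
                  use_dashboard=False)
x_f, y_f = cf.scatter(10, 32)              # both arguments broadcast to every worker
print(cf.submit(add, x_f, y_f).result())   # -> 42
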
##### Loading up the ensemble results if needed #####
if ensemble:
    try:
        ens_res, bestens = get_best_ensembles(CTDparams, params_preproc_ens, dataseed)
    except FileNotFoundError:
        print("Ensemble not found, skipping\n")
        continue

##### Preparing data #####
X, y, delaymask = get_X_y(dataseed, params_preproc_test, monkey, region, taskvar)
Xalpha = X[:, delaymask][:, ::5]
Xfut, Xalphafut, yfut = client.scatter((X, Xalpha, y), broadcast=True)

if not ensemble:
    nneurons = X.shape[2]
    bestens = [np.arange(nneurons) for i in range(5)]

if permutes:
    np.random.seed(dataseed)
    permseeds = np.random.randint(0, 999999, permutes)
else:
    permseeds = [None]

outerxval = KFold(n_splits=nouterfolds)
subxval = KFold(n_splits=2)

acc_test_futs = []

    count += 1
    if results != []:
        return results
    return None


for i in range(parallel_execs):
    if i != parallel_execs - 1:
        scatter_data = data_sentence[int(i * len(data_sentence) / parallel_execs):
                                     int((i + 1) * len(data_sentence) / parallel_execs)]
    else:
        scatter_data = data_sentence[int(i * len(data_sentence) / parallel_execs):]
    futures_array.append(client.scatter(scatter_data))

for i in range(parallel_execs):
    delayed_array.append(delayed(find_key)(futures_array[i], 'amet'))

results_array = compute(*delayed_array)
results_array = list(results_array)

final_result = list()
for index in range(len(results_array)):
    if results_array[index] is not None:
        for item in results_array[index]:
            item[0] = item[0] + int(index * len(data_sentence) / parallel_execs)
        final_result.extend(results_array[index])

print(final_result)

def run(self, client: DaskClient):
    """
    Run the algorithm.

    Parameters
    ----------
    client : DaskClient
        A client to Dask.
    rj : RedisClient
        A Redis client, a rejson.Client.

    Notes
    -----
    This function runs the adaptive algorithm. Because it's asynchronous,
    this function should return if
    ``"reset" in rj.keys() and rj.jsonget("reset")``.
    """
    rj = self.redis_client()

    answers: List = []
    logger.info(f"Starting {self.ident}")

    def submit(fn: str, *args, allow_other_workers=True, **kwargs):
        if "workers" in kwargs:
            kwargs.update({"allow_other_workers": allow_other_workers})
        return client.submit(getattr(type(self), fn), *args, **kwargs)

    update = False
    queries = np.array([])
    scores = np.array([])
    n_model_updates = 0
    rj.jsonset(f"alg-perf-{self.ident}", root, [])
    save_deadline = 0.0  # right away
    data: List[Dict[str, Any]] = []
    error_raised: List[int] = []
    for k in itertools.count():
        try:
            loop_start = time()
            datum = {"iteration": k, "ident": self.ident, "time": time()}

            answers = self.get_answers(rj, clear=True)
            datum["num_answers"] = len(answers)
            self_future = client.scatter(self)

            _start = time()
            if len(queries) and len(scores):
                queries_f = client.scatter(queries)
                scores_f = client.scatter(scores)
            else:
                queries_f = scores_f = []

            if update:
                datum["cleared_queries"] = True
                __start = time()
                self.clear_queries(rj)
                datum["time_clearing"] = time() - __start
            else:
                datum["cleared_queries"] = False

            done = distributed.Event(name="pa_finished")
            done.clear()

            workers = list(client.has_what())
            random.shuffle(workers)

            f_post = submit(
                "post_queries",
                self_future,
                queries_f,
                scores_f,
                done=done,
                workers=workers[0],
            )
            f_model = submit(
                "process_answers",
                self_future,
                answers,
                workers=workers[1],
            )
            f_search = submit(
                "get_queries",
                self_future,
                stop=done,
                workers=workers[2],
            )

            time_model = 0.0
            time_post = 0.0
            time_search = 0.0

            def _model_done(_):
                nonlocal time_model
                nonlocal done
                done.set()
                time_model += time() - _start

            def _post_done(_):
                nonlocal time_post
                time_post += time() - _start

            def _search_done(_):
                nonlocal time_search
                time_search += time() - _start

            f_model.add_done_callback(_model_done)
            f_post.add_done_callback(_post_done)
            f_search.add_done_callback(_search_done)

            # Future.result raises errors automatically
            posted = f_post.result()
            new_self, update = f_model.result()
            queries, scores, search_meta = f_search.result()

            _datum_update = {
                "n_queries_posted": posted,
                "n_queries_scored": len(queries),
                "n_queries_in_db": rj.zcard(f"alg-{self.ident}-queries"),
                "model_updated": update,
                "n_model_updates": n_model_updates,
                "time_posting_queries": time_post,
                "time_model_update": time_model,
                "time_search": time_search,
                "time": time(),
                **search_meta,
            }
            datum.update(_datum_update)

            if update:
                _s = time()
                self.__dict__.update(new_self.__dict__)
                datum["time_update"] = time() - _s
                n_model_updates += 1

            if time() > save_deadline + 1e-3:
                save_deadline = time() + 60
                _s = time()
                self.save()
                datum["time_save"] = time() - _s

            datum["time_loop"] = time() - loop_start
            data.append(datum)
            logger.info(datum)

            posting_deadline = data[0]["time"] + 2 * 60
            if time() >= posting_deadline or k == 10 or k == 20:
                flush_logger(logger)
                keys = data[-1].keys()
                to_post = {}
                for _k in keys:
                    vals = [d.get(_k, None) for d in data]
                    vals = [v for v in vals if v]
                    if not len(vals):
                        continue
                    if isinstance(vals[0], (int, np.integer)):
                        Type = int
                    elif isinstance(vals[0], (float, np.floating)):
                        Type = float
                    else:
                        continue
                    _update = {
                        f"{_k}_median": np.median(vals),
                        f"{_k}_mean": np.mean(vals),
                        f"{_k}_min": np.min(vals),
                        f"{_k}_max": np.max(vals),
                    }
                    if _k == "time":
                        _update = {"time": _update["time_median"]}
                    to_post.update({_k: Type(v) for _k, v in _update.items()})
                try:
                    rj.jsonarrappend(f"alg-perf-{self.ident}", root, to_post)
                except ResponseError as e:
                    if ("could not perform this operation on a key that doesn't exist"
                            in str(e)):
                        # I think this happens when the frontend deletes
                        # the database when /reset is triggered
                        pass
                    else:
                        raise e
                data = []

            if "reset" in rj.keys() and rj.jsonget("reset", root):
                logger.warning(f"Resetting {self.ident}")
                self.reset(client, rj, futures=[f_model, f_post, f_search])
                break
        except Exception as e:
            logger.exception(e)
            flush_logger(logger)
            error_raised.append(k)
            __n = 5
            if np.diff(error_raised[-__n:]).tolist() == [1] * (__n - 1):
                logger.exception(e)
                flush_logger(logger)
                raise e
    return True

def main():
    # client = Client(processes=False)  # threads?
    client = Client()

    size = 10000000
    shards = 12

    shape = [size]
    lat = np.random.rand(size) * 180.0 - 90.0
    lon = np.random.rand(size) * 360.0 - 180.0
    resolution_ = 8
    resolution = np.full(shape, resolution_, dtype=np.int64)
    print('')

    serial_start = timer()
    s_sids = ps.from_latlon(lat, lon, resolution_)
    s_sidsstr = [hex16(s_sids[i]) for i in range(len(s_sids))]
    serial_end = timer()
    print('time s_sids: ', serial_end - serial_start)

    def w_from_latlon(llr):
        sids = ps.from_latlon(llr[0], llr[1], int(llr[2][0]))
        return sids

    # def w_from_latlon1(lat, lon, res):
    #     return ps.from_latlon(np.array([lat], dtype=np.double),
    #                           np.array([lon], dtype=np.double),
    #                           int(res))

    # sid = ps.from_latlon(lat, lon, resolution)
    # sid = client.map(w_from_latlon1, lat, lon, resolution)  # futures

    dask_start = timer()
    shard_size = int(size / shards)
    shard_bins = np.arange(shards + 1) * shard_size
    shard_bins[-1] = size

    lat_shards = [lat[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)]
    lon_shards = [lon[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)]
    res_shards = [resolution[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)]

    llr_shards = []
    for i in range(shards):
        llr_shards.append([lat_shards[i], lon_shards[i], res_shards[i]])

    ## future = client.submit(func, big_data)     # bad
    ##
    ## big_future = client.scatter(big_data)      # good
    ## future = client.submit(func, big_future)   # good

    # sid = client.map(w_from_latlon, llr_shards)  # futures
    big_future = client.scatter(llr_shards)
    sid = client.map(w_from_latlon, big_future)  # futures

    sid_cat = np.concatenate([i.result() for i in sid])
    sidsstr = [hex16(sid_cat[i]) for i in range(len(sid_cat))]
    dask_end = timer()

    sids = sid_cat

    print('')
    for i in np.array(np.random.rand(20) * size, dtype=np.int64):
        print("%09i" % i, sidsstr[i], s_sidsstr[i], ' ', sids[i] - s_sids[i])
    print('')
    print('dask total threads: ', sum(client.nthreads().values()))
    print('size:   ', size)
    print('shards: ', shards)
    print('')
    print('time sids:   ', dask_end - dask_start)
    print('time s_sids: ', serial_end - serial_start)
    print('parallel speed up: ',
          (serial_end - serial_start) / (dask_end - dask_start))

    client.close()

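# --- Minimal sketch (not from the snippets above) of the scatter-before-submit
# idiom referenced in the "bad / good" comments: upload large data once with
# client.scatter(), then pass the resulting Future to submit/map instead of the
# raw object. `process` and `big_data` are placeholder names.
import numpy as np
from dask.distributed import Client


def process(arr):
    # runs on a worker; receives the already-uploaded array
    return float(arr.mean())


if __name__ == '__main__':
    client = Client()                       # local cluster
    big_data = np.random.random(1_000_000)
    big_future = client.scatter(big_data)   # upload once, get a Future
    result = client.submit(process, big_future).result()
    print(result)
    client.close()
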
from dask.distributed import Client
import dask.array as da
from time import sleep


def sqrt(x):
    return x**0.5


if __name__ == '__main__':
    client = Client(n_workers=2, threads_per_worker=1, memory_limit='512mb',
                    dashboard_address=8787)
    print(client.scheduler_info())
    sleep(3)

    while True:
        x = client.scatter(da.random.random((1000, 1000), chunks=(50, 50)))
        _ = client.submit(sqrt, x).result().compute()
        sleep(3)

def parse_alignment_bam(
    input_bam: Path,
    fragment_df: FragmentDf,
    alignment_table: Path = None,
    read_table: Path = None,
    overlap_table: Path = None,
    alignment_summary: Path = None,
    read_summary: Path = None,
    chunksize: int = 50000,
    n_workers: int = 1,
):
    """Filter alignments to keep only alignments that contribute to contacts

    Parameters
    ----------
    input_bam : str
        Path to a namesorted bam with unfiltered alignments
    chunksize : int
        The alignments are batched for processing, this controls the batch size
    """
    source_aligns = NameSortedBamSource(input_bam, metadata={})
    source_aligns.discover()

    parallel = n_workers > 1
    fragment_df = fragment_df.set_index(["fragment_id"]).sort_index()  # .rename_axis("index", axis=0)

    if parallel:
        from dask.distributed import Client, LocalCluster
        from time import sleep

        cluster = LocalCluster(processes=True, n_workers=n_workers, threads_per_worker=1)
        client = Client(cluster)
        fragment_df = client.scatter(fragment_df)

    writers = dict(
        alignment_table=TableWriter(alignment_table),
        read_table=TableWriter(read_table),
        overlap_table=TableWriter(overlap_table),
    )

    batch_progress_bar = tqdm(total=None, desc="Alignments submitted: ",
                              unit=" alignments", position=0)
    alignment_progress = AlignmentProgress(position=1)
    read_progress = ReadProgress(position=2)
    # perc_alignment_bar = tqdm(total=None, desc="Alignments processed: ", unit=" alignments", position=1)

    # stream that holds the raw alignment dfs
    bam_stream = Stream()

    # stream that holds the filtered/processed alignments
    if parallel:
        filtered_align_stream = (
            bam_stream.scatter()
            .map(filter_read_alignments, fragment_df=fragment_df)
            .buffer(n_workers)
            .gather())
    else:
        filtered_align_stream = bam_stream.map(filter_read_alignments,
                                               fragment_df=fragment_df)

    # write the alignments using the table writer, updating progress bar as we go
    align_sink = (  # noqa: F841
        filtered_align_stream.pluck("alignment_table")
        .accumulate(alignment_progress, returns_state=True, start=alignment_progress)
        .sink(writers["alignment_table"]))

    read_sink = (  # noqa: F841
        filtered_align_stream.pluck("read_table")
        .accumulate(read_progress, returns_state=True, start=read_progress)
        .sink(writers["read_table"]))

    overlap_sink = filtered_align_stream.pluck("overlap_table").sink(
        writers["overlap_table"])  # noqa: F841

    for batch_idx, align_df in enumerate(source_aligns.read_chunked(chunksize=chunksize)):
        bam_stream.emit(align_df)
        batch_progress_bar.update(len(align_df))
        batch_progress_bar.set_postfix({"batches": batch_idx})

    if parallel:
        while True:
            processing = client.processing()
            still_running = [len(v) > 0 for k, v in processing.items()]
            if any(still_running):
                sleep(10)
            else:
                break
        client.close()
        cluster.close()

    batch_progress_bar.close()
    alignment_progress.close()
    alignment_progress.save(alignment_summary)
    read_progress.close()
    read_progress.save(read_summary)
    sys.stderr.write("\n\n\n")
    sys.stdout.write("\n")

    return read_progress.final_stats()

import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client

client = Client("10.110.122.238:8888")

df = pd.read_csv('trainingData.csv')
future = client.scatter(df)                      # send dataframe to one worker
ddf = dd.from_delayed([future], meta=df)         # build dask.dataframe on remote data
ddf = ddf.repartition(npartitions=20).persist()  # split
client.rebalance(ddf)                            # spread around all of your workers

def run_JK_distributed_massboosted(df, param):
    '''Receives the pandas dataframe with the objects containing the temperature
    decrements and the parameter object, runs the kSZ statistic, and generates
    jackknifes. Everything runs on the cluster, so the current terminal does not
    need to request many cpus.

    df: dataframe object containing the variables for the calculation
    param: param file for this calculation
    NJK: how many subgroups we will make to run the calculation'''
    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS

    # set up cluster
    cluster = SGECluster(
        walltime='172800',
        processes=1,
        cores=1,
        env_extra=[
            '#$-pe sge_pe %i' % Ncores,
            '-l m_core=%i' % Ncores,
            'mkdir -p /tmp/pag227/dask/dask-scratch',
            'export NUMBA_NUM_THREADS=%i' % Ncores,
            'export OMP_NUM_THREADS=%i' % Ncores,
            # 'export OMP_NUM_THREADS=1',  # noqa
        ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(30)
    # end setting up cluster

    # send full dataset to the cluster
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(get_pairwise_ksz_massboosted,
                                    future_fullDataset, future_params,
                                    multithreading=True)
    # done with the full dataset

    jk_results = []
    futureData = []  # data to be sent in jk or bootstrap in galaxy space
    for j in range(Ngroups):
        df_bs = df.copy()
        choose = np.random.choice(len(df), len(df))
        df_bs['dT'] = df.dT.values[choose]
        futureData.append(client.scatter(df_bs))

    if param.JK_RESAMPLING_METHOD.lower() == "bs_dt_mass_boosted_est":
        get_pw_func = get_pairwise_ksz_massboosted
    elif param.JK_RESAMPLING_METHOD.lower() == 'bs_dt_mass_boosted_est_debiased':  # noqa
        get_pw_func = get_pairwise_ksz_massboosted_debiased

    for j in range(Ngroups):
        jk_results.append(
            client.submit(get_pw_func, futureData[j], future_params,
                          multithreading=True))

    # extract results
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)

    client.close()
    # cluster.close()

    return fullDataset_results, jk_results

    matched, ambiguous, failed = pickle.load(f)

modifiers_dict = {
    k: bool(int(v))
    for k, v in read_pairs_list('data/modifiers.txt')
}
modifiers_dict[None] = True

candidates = make_candidates(matched, modifiers_dict.keys())
print('Number of candidates : {}'.format(len(candidates)))

if True:
    # Dask processing
    cluster = LocalCluster(n_workers=48)
    client = Client(cluster)

    b = db.from_sequence(failed, partition_size=200)
    # Broadcast the list of candidates to the workers
    [c] = client.scatter([candidates], broadcast=True)
    r = b.map(_fn, c)
    f = client.compute(r)
    progress(f)
    matching_results = f.result()
else:
    # Multiprocessing
    matching_results = []
    with Pool(40) as p:
        for simple_result in tqdm(p.imap(_fn, failed, chunksize=300),
                                  total=len(failed)):
            matching_results.append(simple_result)

matching_results = sorted(matching_results, key=lambda x: x[1][0][0], reverse=True)

epsilon = args['epsilon']        # convergence stopping criterion
M = args['dictatoms']            # dimensionality of the learned dictionary
R = int(args['pnonzero'] * P)    # enforces sparsity

u_new = da.zeros(T)
v = da.zeros(P)

max_iterations = P * 10
file_D = os.path.join(args['dictionary'], "{}_D.txt".format(args["prefix"]))
file_z = os.path.join(args['output'], "{}_z.txt".format(args["prefix"]))

# Start the loop!
for m in range(M):
    # Randomly generate an integer, broadcast it, and use it to seed the RNG.
    seed = np.random.randint(max_iterations + 1, high=4294967295)
    _SEED_ = client.scatter(seed, broadcast=True)
    np.random.seed(_SEED_.result())

    # Create a dense random vector,
    # then subtract off the mean and normalize it.
    u_old = da.random.random(T)
    u_old = dask_normalize(u_old).compute()

    # Set the loop criteria.
    num_iterations = 0
    delta = 2 * epsilon

    # Start the inner loop: this learns a single atom.
    while num_iterations < max_iterations and delta > epsilon:
        _U_ = client.scatter(u_old, broadcast=True)

    n.symbols = None
    n.scores = None
    n.event = None
    n.char = None
    n.eventype = None
    n.AAevent = 0

for i, l in enumerate(tree.leaf_nodes()):
    l.event = {}
    l.scores = {}
    l.symbols = {}
    l.char = {}
    l.calc = {}

print('scattering tree')
remote_tree = client.scatter(pickle.dumps(tree), broadcast=True)
row_index = client.scatter(row_index, broadcast=True)
print('done')

retmatsize = (len(tree.nodes()), align_array.shape[0])

for annot_index, annot_row in annotation.iterrows():
    # indexing starts at 1 for blast
    ##### switch to sending the coordinates and masking for the matrix
    for j, codon in enumerate(range(annot_row.qstart - 1, annot_row.qend - 1, 3)):
        keep_codons += [count, count, count]
        keep_positions += [codon, codon + 1, codon + 2]
        count += 1

print('selecting positions')
print('positions to analyze:', len(keep_positions))
mapping = dict(zip(keep_positions, keep_codons))

args = parser.parse_args()

# Cluster scheduler
cluster = args.scheduler
client = Client(cluster)
print(client)
client.upload_file("/nfs/paper-big-data-engines/utils.py")  # Allow workers to use module
client.upload_file("/nfs/paper-big-data-engines/incrementation/Increment.py")

# Read images
paths = crawl_dir(os.path.abspath(args.bb_dir))
client.scatter(paths)

results = []
for path in paths:
    img = client.submit(read_img, path, start=start, args=args)

    # Increment the data n times:
    for _ in range(args.iterations):
        img = client.submit(increment, img, delay=args.delay, start=start, args=args)

    # Save the data
    results.append(client.submit(save_results, img, start=start,

import time

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from dask.distributed import Client

client = Client('tcp://172.17.0.2:8786')

df = pd.read_csv("train.csv")


def demo(df):
    X = df.drop(labels='Activity', axis=1)
    y = df['Activity']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
    gbm = GradientBoostingClassifier(learning_rate=0.05, max_features=106,
                                     n_estimators=300)
    gbm.fit(X_train, y_train)
    result = gbm.predict(X_test)
    score = accuracy_score(y_test, result)
    return score


start = time.time()
big_future = client.scatter(df)
output = client.submit(demo, big_future)
outcome = output.result()
# outcome = client.gather(output)
print(outcome)
print("Time_taken:", (time.time() - start) % 60)

from Generators.gen_wordlist import generate_data
from dask import delayed, compute
from dask.distributed import Client
import sys

if len(sys.argv) < 3:
    print("USAGE ./sort.py <size of file> <scheduler url>")
    exit(1)

num_words = int(sys.argv[1])
sched_IP = sys.argv[2]

data = generate_data(num_words)
client = Client(sched_IP)
result = [x for x in data.split("\n") if x != '']


def sort(data):
    content = data.split("\n")
    return "\n".join(sorted(content))


future = client.scatter(data)
task = delayed(sort)(future)
output = compute(task)
print("Done")

parser.add_argument("--benchmark", action="store_true", help="benchmark results") args = parser.parse_args() # Cluster scheduler cluster = args.scheduler client = Client(cluster) print(client) client.upload_file("/nfs/paper-big-data-engines/utils.py") client.upload_file("/nfs/paper-big-data-engines/bidsApp-examples/Example.py") from Example import run_group, run_participant, site_crawler, subject_crawler # Retrieve all subject path subjects = subject_crawler(args.bids_dir) client.scatter(subjects) results = list() for subject in subjects: results.append( client.submit( run_participant, subject_id=subject[1], start=start, args=args, site=subject[0], ) ) client.gather(results)
    else:
        AA_mutation = sparseND.COO(
            coords=(AAeventindex,
                    np.ones(len(AAeventindex)) * column,
                    AAeventypes),
            data=np.ones(len(AAeventindex), ),
            shape=(matsize[0], matsize[1], len(transitiondict_AA)),
            dtype=np.int32)
    count += 1

    print('FINAL SAVE !')
    save_mats(count, runName, AA_mutation, nucleotide_mutation)
    print('DONE ! ')
    brake.set(False)
    return None


####### start the sankof algo here #######################
print('starting sankof')

# scale cluster
# scatter the blank tree and row index for each process
# remote_tree = client.scatter(tree)
remote_index = client.scatter(IDindex)

inq = Queue('inq')
outq = Queue('outq')
lock = Lock('x')
stopiter = Variable(False)
brake = Variable(True)

saver_started = False
workers_started = False

# start workers
for workers in range(NCORE * ncpu):
    w = client.submit(
        calculate_small_parsimony,
        inq=None,
        outq=None,
        stopiter=stopiter,
        treefile=treefile,
        bootstrap_replicates=bootstrap_replicates,

class DaskHashBag(HashBag):
    """A HashBag that uses the `Dask <http://dask.org>`_ library."""

    def start_client(self, **kwargs):
        global client
        from dask.distributed import Client

        try:
            client = Client(**kwargs)
            self.client = Client(**kwargs)
        except Exception as e:
            log.warn(e)

    def __init__(self, it=(), npartitions=None, client=None, **kwargs):
        self.client = client
        self.kwargs = kwargs
        self.try_npartitions = npartitions
        if kwargs:
            self.start_client(**kwargs)

        if isinstance(it, db.Bag):
            self.bag = it
        else:
            it = list(it)
            npartitions = npartitions or len(it) or None
            self.bag = db.from_sequence(it, npartitions=npartitions)

    def new(self, it):
        npartitions = max(self.try_npartitions or 1, self.bag.npartitions or 1)
        return DaskHashBag(it, npartitions=npartitions, client=self.client)

    def __repr__(self):
        kwargs = {"npartitions": self.bag.npartitions, **self.kwargs}
        args = (f"{k}={v.__repr__()}" for k, v in kwargs.items())
        return "DaskHashBag(%s)" % ", ".join(args)

    def load(self, *f):
        cls = self.__class__
        from io import TextIOBase

        if isinstance(f, TextIOBase):
            return cls(robust_json_loads_lines(f), client=self.client)
        else:
            log.info(f"Reading {f} with {self.client}?")
            return cls(
                db.read_text(f).map_partitions(robust_json_loads_lines),
                client=self.client,
            )

    @classmethod
    def concat(cls, hashbags):
        return hashbags[0].new(db.concat([hb.bag for hb in hashbags]))

    def take(self, n):
        self.bag = self.bag.take(n, npartitions=-1, compute=False)
        return self

    def persist(self):
        try:
            self.bag = self.bag.persist()
        except Exception as e:
            log.error(e)
        return self

    def __iter__(self):
        return iter(self.bag.compute())

    def __len__(self):
        self.persist()
        return self.bag.count().compute()

    def pipe(self, func, *args, **kwargs):
        newargs = list(args)
        newkwargs = dict(kwargs)
        if self.client:
            try:
                if newargs:
                    newargs = self.client.scatter(newargs, broadcast=True)
                if newkwargs:
                    newkwargs = self.client.scatter(newkwargs, broadcast=True)
            except Exception:
                log.debug(f"Scattering for {func.__name__} failed!")

        @functools.wraps(func)
        def listify(x, *args, **kwargs):
            return list(func(x, *args, **kwargs))

        return self.new(self.bag.map_partitions(listify, *newargs, **newkwargs))

    def fold_tree(self, key, binop):
        return self.new(self.bag.foldby(key, binop=binop).map(lambda x: x[1]))

    def fold(self, key, binop):
        import pandas as pd

        def combine(df):
            return functools.reduce(binop, df.table)

        df = self.bag.map(lambda t: {'table': t}).to_dataframe(meta={'table': 'object'})
        keymeta = pd.Series([key(t) for t in df.table.head(1)])
        index = df.table.apply(key, meta=keymeta)
        groups = df.assign(index=index).set_index("index").groupby("index")
        return self.new(groups.apply(combine).to_bag())

    def offset(self, get_attr, set_attr, default=0):
        d = self.bag.map(lambda t: {'table': t, get_attr: t.get(get_attr, default)})
        df = d.to_dataframe(meta={'table': 'object', get_attr: 'int'})
        vs = df[get_attr].cumsum() - df[get_attr]

        def setval(x, v):
            x[set_attr] = v
            return x

        return self.new(self.bag.map(setval, vs.to_bag()))

    def dump(self, f, **kwargs):
        from io import TextIOBase

        if isinstance(f, TextIOBase):
            HashBag.dump(self.bag.compute(), f)
            return self
        else:
            self.bag.map(json_dump).to_textfiles(f, last_endline=True)
            return self.load(f)
