def benchmark(fn, args, filetype=None):
    """Benchmark a call of the "fn" function on the "args" tuple.

    "args" may have a Kwargs instance at the end. If "filetype" is provided,
    it may be used to convert columns to categorical dtypes after reading
    (the call is assumed to be a "loading" step).
    """
    posargs = list(args)
    kwargs = {}

    # Remove Kwargs instance at end of posargs list, if one exists
    if posargs and isinstance(posargs[-1], Kwargs):
        lastarg = posargs.pop()
        kwargs.update(lastarg)

    if DEBUG:
        printable_posargs = ', '.join([str(posarg.head()) if hasattr(posarg, 'head')
                                       else str(posarg) for posarg in posargs])
        printable_kwargs = ', '.join(['{}={}'.format(k, v) for k, v in kwargs.items()])
        print('DEBUG: {}({}{})'.format(fn.__name__, printable_posargs,
                                       ', ' + printable_kwargs if printable_kwargs else ''),
              flush=True)

    # Benchmark fn when run on posargs and kwargs
    start = time.time()
    res = fn(*posargs, **kwargs)

    # If we're loading data
    if filetype is not None:
        if filetype not in filetypes_storing_categories:
            opts = odict()
            if p.dftype == 'pandas':
                opts['copy'] = False
            for c in p.categories:
                res[c] = res[c].astype('category', **opts)

        # Force loading (--cache=persist was provided)
        if p.dftype == 'dask' and DD_FORCE_LOAD:
            if DASK_CLIENT is not None:
                # 2017-04-28: This combination leads to a large drop in
                # aggregation performance (both --distributed and
                # --cache=persist were provided)
                res = DASK_CLIENT.persist(res)
                distributed.wait(res)
            else:
                if DEBUG:
                    print("DEBUG: Force-loading Dask dataframe", flush=True)
                res = res.persist()

    end = time.time()
    return end - start, res
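# Hypothetical usage sketch for benchmark() above. `load_csv_pandas` and the
# Kwargs payload are illustrative stand-ins (not defined in this snippet); the
# module-level globals the function relies on (DEBUG, p, DD_FORCE_LOAD,
# DASK_CLIENT, filetypes_storing_categories) are assumed to be configured by
# the surrounding driver script.
#
#   duration, df = benchmark(load_csv_pandas,
#                            ('census.csv', Kwargs(usecols=None)),
#                            filetype='csv')
#   print('load took {:.3f}s'.format(duration))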
    tar_spwtab.putcol(columnname="EFFECTIVE_BW", value=write_bandwidth)
    tar_spwtab.putcol(columnname="RESOLUTION", value=write_bandwidth)
    tar_spwtab.putcol(columnname="NUM_CHAN", value=[count] * nspw)
    t7 = time()
    print('Done, elapsed %.3fs' % (t7 - t6))
    print('Finished channel-averaging the MS')


if __name__ == '__main__':
    c = Client('172.31.99.84:8786')
    """
    This program can ONLY be run once. If an exception occurs, delete the
    files in the modify_vis directory, re-copy the source files into it,
    and then run it again.
    """
    print('cpu_count: %d' % cpu_count())
    s1 = time()
    avg_channel = [2, 4, 8, 16, 32, 64]
    length = len(avg_channel)
    dst = arl_path('source_data/modify_vis/day2_copy_avgchannel')
    dsts = [dst + str(avg) for avg in avg_channel]
    avg_ms = c.map(modify_ms, dsts, avg_channel)
    wait(avg_ms)
    s2 = time()
    print('Total time: %.3fs' % (s2 - s1))
def _fit( model, params, X_train, y_train, X_test, y_test, additional_calls, fit_params=None, scorer=None, random_state=None, ): original_model = model fit_params = fit_params or {} client = default_client() rng = check_random_state(random_state) info = {} models = {} scores = {} for ident, param in enumerate(params): model = client.submit(_create_model, original_model, ident, **param) info[ident] = [] models[ident] = model # assume everything in fit_params is small and make it concrete fit_params = yield client.compute(fit_params) # Convert testing data into a single element on the cluster # This assumes that it fits into memory on a single worker if isinstance(X_test, da.Array): X_test = client.compute(X_test) else: X_test = yield client.scatter(X_test) if isinstance(y_test, da.Array): y_test = client.compute(y_test) else: y_test = yield client.scatter(y_test) # Convert to batches of delayed objects of numpy arrays X_train, y_train = dask.persist(X_train, y_train) X_train = sorted(futures_of(X_train), key=lambda f: f.key) y_train = sorted(futures_of(y_train), key=lambda f: f.key) assert len(X_train) == len(y_train) # Order by which we process training data futures order = [] def get_futures(partial_fit_calls): """ Policy to get training data futures Currently we compute once, and then keep in memory. Presumably in the future we'll want to let data drop and recompute. This function handles that policy internally, and also controls random access to training data. """ # Shuffle blocks going forward to get uniform-but-random access while partial_fit_calls >= len(order): L = list(range(len(X_train))) rng.shuffle(L) order.extend(L) j = order[partial_fit_calls] return X_train[j], y_train[j] # Submit initial partial_fit and score computations on first batch of data X_future, y_future = get_futures(0) X_future_2, y_future_2 = get_futures(1) _models = {} _scores = {} _specs = {} d_partial_fit = dask.delayed(_partial_fit) d_score = dask.delayed(_score) for ident, model in models.items(): model = d_partial_fit(model, X_future, y_future, fit_params) score = d_score(model, X_test, y_test, scorer) spec = d_partial_fit(model, X_future_2, y_future_2, fit_params) _models[ident] = model _scores[ident] = score _specs[ident] = spec _models, _scores, _specs = dask.persist( _models, _scores, _specs, priority={tuple(_specs.values()): -1}) _models = {k: list(v.dask.values())[0] for k, v in _models.items()} _scores = {k: list(v.dask.values())[0] for k, v in _scores.items()} _specs = {k: list(v.dask.values())[0] for k, v in _specs.items()} models.update(_models) scores.update(_scores) speculative = _specs new_scores = list(_scores.values()) history = [] # async for future, result in seq: while True: metas = yield client.gather(new_scores) for meta in metas: ident = meta["model_id"] info[ident].append(meta) history.append(meta) instructions = additional_calls(info) bad = set(models) - set(instructions) # Delete the futures of bad models. 
This cancels speculative tasks for ident in bad: del models[ident] del scores[ident] del info[ident] if not any(instructions.values()): break _models = {} _scores = {} _specs = {} for ident, k in instructions.items(): start = info[ident][-1]["partial_fit_calls"] + 1 if k: k -= 1 model = speculative.pop(ident) for i in range(k): X_future, y_future = get_futures(start + i) model = d_partial_fit(model, X_future, y_future, fit_params) score = d_score(model, X_test, y_test, scorer) X_future, y_future = get_futures(start + k) spec = d_partial_fit(model, X_future, y_future, fit_params) _models[ident] = model _scores[ident] = score _specs[ident] = spec _models2, _scores2, _specs2 = dask.persist( _models, _scores, _specs, priority={tuple(_specs.values()): -1}) _models2 = { k: v if isinstance(v, Future) else list(v.dask.values())[0] for k, v in _models2.items() } _scores2 = {k: list(v.dask.values())[0] for k, v in _scores2.items()} _specs2 = {k: list(v.dask.values())[0] for k, v in _specs2.items()} models.update(_models2) scores.update(_scores2) speculative = _specs2 new_scores = list(_scores2.values()) models = { k: client.submit(operator.getitem, v, 0) for k, v in models.items() } yield wait(models) scores = yield client.gather(scores) best = max(scores.items(), key=lambda x: x[1]["score"]) info = defaultdict(list) for h in history: info[h["model_id"]].append(h) info = dict(info) raise gen.Return(Results(info, models, history, best))
def generate_prediction(sensor_path, size=11, chunk_size=500, classes=21,
                        savedir=".", use_dask=False, client=None):
    """Write tfrecords of raster indices for prediction data

    Args:
        chunk_size: number of images per tfrecord
        size: N x N image size
        savedir: directory to save tfrecords
        use_dask: optional dask client to parallelize computation
    Returns:
        filenames: list of tfrecord paths
    """
    with rasterio.open(sensor_path) as src:
        cols, rows = np.meshgrid(np.arange(src.shape[1]), np.arange(src.shape[0]))

    # turn the raster indices into a dataframe of coords
    results = pd.DataFrame({"rows": np.ravel(rows), "cols": np.ravel(cols)})
    print("There are {} sensor pixels in the prediction data".format(results.shape[0]))

    # Create chunks to write
    results["chunk"] = np.arange(len(results)) // chunk_size
    basename = os.path.splitext(os.path.basename(sensor_path))[0]
    filenames = []

    if use_dask:
        if client is None:
            raise ValueError("use_dask is {} but no client specified".format(use_dask))
        for g, df in results.groupby("chunk"):
            coordinates = zip(df.rows, df.cols)
            filename = "{}/{}_{}.tfrecord".format(savedir, basename, g)
            # Submit to dask client
            fn = client.submit(_record_wrapper_,
                               sensor_path=sensor_path,
                               index_iterable=coordinates,
                               size=size,
                               classes=classes,
                               filename=filename,
                               train=False)
            filenames.append(fn)
        wait(filenames)
        filenames = [x.result() for x in filenames]
    else:
        for g, df in results.groupby("chunk"):
            filename = "{}/{}_{}.tfrecord".format(savedir, basename, g)
            coordinates = zip(df.rows, df.cols)
            # Write record
            fn = _record_wrapper_(sensor_path=sensor_path,
                                  index_iterable=coordinates,
                                  size=size,
                                  classes=classes,
                                  filename=filename,
                                  train=False)
            filenames.append(fn)

    return filenames
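# Small self-contained illustration of the chunk-indexing trick used above:
# integer division of the running row number by chunk_size yields a group id,
# so groupby("chunk") walks the rows in fixed-size blocks.
import numpy as np
import pandas as pd

demo = pd.DataFrame({"rows": np.arange(7)})
demo["chunk"] = np.arange(len(demo)) // 3
print(demo.groupby("chunk").size().tolist())  # [3, 3, 1]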
print(a)
volume = lx * ly * lz
print("Dataset volume is {} ({} GByte)".format(volume, (volume * 8) / 2**30))

a = client.persist(a)
a = da.concatenate([a, a], axis=1)
a = da.concatenate([a, a], axis=1)
a = da.concatenate([a, a], axis=1)
a = da.concatenate([a, a], axis=1)
a = da.concatenate([a, a], axis=1)

# da.concatenate does not merge chunks, even if they span the entire dimension,
# so we need to rechunk:
a = a.rechunk(chunks=(1, -1, -1))
a = client.persist(a)
print(a)
wait(a)

# Slice in Z (fast)
start_time = timeit.default_timer()
sliced = a[7, :, :]
sliced = sliced.compute()
print(timeit.default_timer() - start_time)

# Slice in Y (fast)
start_time = timeit.default_timer()
sliced = a[:, 7, :]
sliced = sliced.compute()
print(timeit.default_timer() - start_time)

# Slice in X (extremely slow, why?)
start_time = timeit.default_timer()
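# A small, self-contained sketch (independent of the arrays above) showing why
# the rechunk step matters: concatenating keeps the original chunk boundaries,
# so the joined axis stays split into many chunks until it is rechunked.
import dask.array as da

x = da.ones((4, 8), chunks=(4, 8))
joined = da.concatenate([x, x], axis=1)
print(joined.chunks)                   # ((4,), (8, 8)) -- still two chunks on axis 1
print(joined.rechunk((4, -1)).chunks)  # ((4,), (16,))  -- merged into one chunk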
def generate_hand_annotations(DEBUG, BASE_PATH, FILEPATH, SIZE, config, dask_client): #Generate tfrecords dirname = "hand_annotations/" annotations_file = BASE_PATH + dirname + "crops/hand_annotations.csv" class_file = utilities.create_classes(annotations_file) if DEBUG: tfrecords.create_tfrecords(annotations_file=annotations_file, class_file=class_file, image_min_side=config["image-min-side"], backbone_model=config["backbone"], size=SIZE, savedir=FILEPATH + dirname + "tfrecords/") else: #Collect annotation files for each tile annotations_file= BASE_PATH + dirname + "crops/hand_annotations.csv" df = pd.read_csv(annotations_file, names=["image_path","xmin","ymin","xmax","ymax","label"]) #enforce dtype, as there might be errors df.xmin = df.xmin.astype(pd.Int64Dtype()) df.ymin = df.ymin.astype(pd.Int64Dtype()) df.xmax = df.xmax.astype(pd.Int64Dtype()) df.ymax = df.ymax.astype(pd.Int64Dtype()) #Randomize rows df = df.sample(frac=1) #split pandas frame into chunks images = df.image_path.unique() indices = np.arange(len(images)) size = 500 chunk_list = [ ] #Split dataframe into chunks of images and write to file for i in range(ceil(len(indices) / size)): image_indices = indices[i * size:(i * size) + size] selected_images = images[image_indices] split_frame = df[df.image_path.isin(selected_images)] filename = BASE_PATH + dirname + "crops/hand_annotations{}.csv".format(i) split_frame.to_csv(filename, header=False,index=False) chunk_list.append(filename) print(" Created {} files to create tfrecords".format(len(chunk_list))) #Apply create tfrecords to each futures = dask_client.map( tfrecords.create_tfrecords, chunk_list, class_file=class_file, image_min_side=config["image-min-side"], backbone_model=config["backbone"], size=SIZE, savedir=FILEPATH + dirname + "tfrecords/") wait(futures) for future in futures: try: local_annotations = future.result() except Exception as e: print("future {} failed with {}".format(future, e))
def main(client): import cudf import dask_cudf product_reviews_df = read_tables() product_reviews_df = product_reviews_df[ ~product_reviews_df.pr_review_content.isnull()].reset_index(drop=True) product_reviews_df[ "pr_review_content"] = product_reviews_df.pr_review_content.str.lower( ) product_reviews_df[ "pr_review_content"] = product_reviews_df.pr_review_content.str.replace( [".", "?", "!"], [eol_char], regex=False) sentences = product_reviews_df.map_partitions( create_sentences_from_reviews) # need the global position in the sentence tokenized df sentences["x"] = 1 sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum() del sentences["x"] word_df = sentences.map_partitions( create_words_from_sentences, global_position_column="sentence_tokenized_global_pos", ) # These files come from the official TPCx-BB kit # We extracted them from bigbenchqueriesmr.jar neg_sent_df = load_sentiment_words("negativeSentiment.txt", "NEG") pos_sent_df = load_sentiment_words("positiveSentiment.txt", "POS") sent_df = cudf.concat([pos_sent_df, neg_sent_df]) sent_df = dask_cudf.from_cudf(sent_df, npartitions=1) word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word") temp = word_sentence_sentiment.merge( sentences, how="left", left_on="sentence_idx_global_pos", right_on="sentence_tokenized_global_pos", ) temp = temp[["review_idx_global_pos", "word", "sentiment", "sentence"]] product_reviews_df = product_reviews_df[["pr_item_sk", "pr_review_sk"]] product_reviews_df["pr_review_sk"] = product_reviews_df[ "pr_review_sk"].astype("int32") final = temp.merge( product_reviews_df, how="inner", left_on="review_idx_global_pos", right_on="pr_review_sk", ) final = final.rename( columns={ "pr_item_sk": "item_sk", "sentence": "review_sentence", "word": "sentiment_word", }) keepcols = ["item_sk", "review_sentence", "sentiment", "sentiment_word"] final = final[keepcols].persist() # with sf100, there are 3.2M postive and negative review sentences(rows) final = final.sort_values(by=keepcols) final = final.persist() wait(final) return final
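# Minimal pandas stand-in (not part of the query above) for the cumulative-sum
# trick used on `sentences`: a column of ones followed by cumsum() gives every
# row a 1-based global position, which later serves as the join key between
# the sentence and word tables.
import pandas as pd

sentences_demo = pd.DataFrame({"sentence": ["a b", "c", "d e f"]})
sentences_demo["x"] = 1
sentences_demo["sentence_tokenized_global_pos"] = sentences_demo.x.cumsum()
print(sentences_demo["sentence_tokenized_global_pos"].tolist())  # [1, 2, 3]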
def main(client, config): import cudf import dask_cudf date_dim_df, store_returns_df, web_returns_df, product_reviews_df = benchmark( read_tables, config=config, compute_result=config["get_read_time"], dask_profile=config["dask_profile"], ) # filter date table date_dim_df = date_dim_df.merge( date_dim_df, on=["d_week_seq"], how="outer", suffixes=("", "_r") ) date_dim_df = date_dim_df[date_dim_df.d_date_r.isin(q19_returns_dates)].reset_index( drop=True ) date_dim_df = date_dim_df[["d_date_sk"]].drop_duplicates() sr_merged_df = store_returns_df.merge( date_dim_df, left_on=["sr_returned_date_sk"], right_on=["d_date_sk"], how="inner", ) sr_merged_df = sr_merged_df[["sr_item_sk", "sr_return_quantity"]] sr_grouped_df = ( sr_merged_df.groupby(["sr_item_sk"]) .agg({"sr_return_quantity": "sum"}) .reset_index() .rename(columns={"sr_return_quantity": "sr_item_qty"}) ) sr_grouped_df = sr_grouped_df[sr_grouped_df["sr_item_qty"] > 0] wr_merged_df = web_returns_df.merge( date_dim_df, left_on=["wr_returned_date_sk"], right_on=["d_date_sk"], how="inner", ) wr_merged_df = wr_merged_df[["wr_item_sk", "wr_return_quantity"]] wr_grouped_df = ( wr_merged_df.groupby(["wr_item_sk"]) .agg({"wr_return_quantity": "sum"}) .reset_index() .rename(columns={"wr_return_quantity": "wr_item_qty"}) ) wr_grouped_df = wr_grouped_df[wr_grouped_df["wr_item_qty"] > 0].reset_index( drop=True ) sr_wr_merged_df = sr_grouped_df.merge( wr_grouped_df, left_on=["sr_item_sk"], right_on=["wr_item_sk"], how="inner" ) sr_wr_merged_df = sr_wr_merged_df[["sr_item_sk", "sr_item_qty", "wr_item_qty"]] product_reviews_df = product_reviews_df[ ~product_reviews_df.pr_review_content.isnull() ].reset_index(drop=True) product_reviews_df["pr_item_sk"] = product_reviews_df["pr_item_sk"].astype("int32") sr_wr_merged_df["sr_item_sk"] = sr_wr_merged_df["sr_item_sk"].astype("int32") merged_df = product_reviews_df.merge( sr_wr_merged_df, left_on=["pr_item_sk"], right_on=["sr_item_sk"], how="inner" ) cols_keep = [ "pr_item_sk", "pr_review_content", "pr_review_sk", "sr_item_qty", "wr_item_qty", ] merged_df = merged_df[cols_keep] merged_df["tolerance_flag"] = ( (merged_df["sr_item_qty"] - merged_df["wr_item_qty"]) / ((merged_df["sr_item_qty"] + merged_df["wr_item_qty"]) / 2) ).abs() <= 0.1 merged_df = merged_df[merged_df["tolerance_flag"] == True].reset_index(drop=True) merged_df = merged_df[["pr_item_sk", "pr_review_content", "pr_review_sk"]] merged_df["pr_review_content"] = merged_df.pr_review_content.str.lower() merged_df["pr_review_content"] = merged_df.pr_review_content.str.replace( [".", "?", "!"], [eol_char], regex=False ) sentences = merged_df.map_partitions(create_sentences_from_reviews) # need the global position in the sentence tokenized df sentences["x"] = 1 sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum() del sentences["x"] word_df = sentences.map_partitions( create_words_from_sentences, global_position_column="sentence_tokenized_global_pos", ) # This file comes from the official TPCx-BB kit # We extracted it from bigbenchqueriesmr.jar sentiment_dir = "/".join(config["data_dir"].split("/")[:-3] + ["sentiment_files"]) with open(f"{sentiment_dir}/negativeSentiment.txt") as fh: negativeSentiment = list(map(str.strip, fh.readlines())) # dedupe for one extra record in the source file negativeSentiment = list(set(negativeSentiment)) sent_df = cudf.DataFrame({"word": negativeSentiment}) sent_df["sentiment"] = "NEG" sent_df = dask_cudf.from_cudf(sent_df, npartitions=1) word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word") 
merged_df["pr_review_sk"] = merged_df["pr_review_sk"].astype("int32") temp = word_sentence_sentiment.merge( sentences, how="left", left_on="sentence_idx_global_pos", right_on="sentence_tokenized_global_pos", ) temp = temp[["review_idx_global_pos", "word", "sentiment", "sentence"]] merged_df = merged_df[["pr_item_sk", "pr_review_sk"]] final = temp.merge( merged_df, how="inner", left_on="review_idx_global_pos", right_on="pr_review_sk" ) final = final.rename( columns={ "pr_item_sk": "item_sk", "sentence": "review_sentence", "word": "sentiment_word", } ) keepcols = ["item_sk", "review_sentence", "sentiment", "sentiment_word"] final = final[keepcols] final = final.persist() final = final.sort_values(by=keepcols) wait(final) return final
def _kneighbors(self, X, k):
    """
    Internal function to query the kNN model.
    :param X:
    :param k:
    :return:
    """
    client = default_client()
    if k is None:
        k = self.n_neighbors

    # Break apart Dask.array/dataframe into chunks/parts
    data_parts = X.to_delayed()
    parts = list(map(delayed, data_parts))
    parts = client.compute(parts)  # Start computation in the background
    yield wait(parts)
    for part in parts:
        if part.status == 'error':
            yield part  # trigger error locally

    # A dict in the form of { part_key: part }
    key_to_part_dict = dict([(str(part.key), part) for part in parts])

    who_has = yield client.who_has(parts)
    worker_parts = {}
    for key, workers in who_has.items():
        worker = parse_host_port(first(workers))
        if worker not in worker_parts:
            worker_parts[worker] = []
        worker_parts[worker].append(key_to_part_dict[key])

    """
    Create IP Handles on each worker hosting input data
    """
    # Format of input_devarrays = ([(X, y)..], dev)
    input_devarrays = [(worker, client.submit(input_to_device_arrays, part,
                                              {"k": k}, workers=[worker]))
                       for worker, part in worker_parts.items()]
    yield wait(input_devarrays)

    """
    Gather IPC handles for each worker and call _fit() on each worker
    containing data.
    """
    exec_node, model = self.model

    # Need to fetch coefficient parts on worker
    on_worker = list(filter(lambda x: x[0] == exec_node, input_devarrays))
    not_on_worker = list(filter(lambda x: x[0] != exec_node, input_devarrays))

    ipc_handles = [client.submit(get_input_ipc_handles, future,
                                 workers=[a_worker])
                   for a_worker, future in not_on_worker]
    raw_arrays = [future for a_worker, future in on_worker]

    # IPC Handles are loaded in separate threads on worker so they can be
    # used to make calls through cython
    run = client.submit(_kneighbors_on_worker, (ipc_handles, raw_arrays),
                        model, {"k": k}, workers=[exec_node])
    yield wait(run)

    dfs = [client.submit(build_dask_dfs, f, {"k": k}, workers=[worker])
           for worker, f in input_devarrays]
    yield wait(dfs)

    # Raise (not return) so the tornado-style generator coroutine hands back
    # the list of futures rather than the gen.Return object itself.
    raise gen.Return(dfs)
def bfs(graph, start, return_distances=False): """ Find the distances and predecessors for a breadth first traversal of a graph. The input graph must contain edge list as dask-cudf dataframe with one partition per GPU. Parameters ---------- graph : cugraph.DiGraph cuGraph graph descriptor, should contain the connectivity information as dask cudf edge list dataframe(edge weights are not used for this algorithm). Undirected Graph not currently supported. start : Integer Specify starting vertex for breadth-first search; this function iterates over edges in the component reachable from this node. return_distances : bool, optional, default=False Indicates if distances should be returned Returns ------- df : cudf.DataFrame df['vertex'][i] gives the vertex id of the i'th vertex df['distance'][i] gives the path distance for the i'th vertex from the starting vertex (Only if return_distances is True) df['predecessor'][i] gives for the i'th vertex the vertex it was reached from in the traversal Examples -------- >>> import cugraph.dask as dcg >>> Comms.initialize() >>> chunksize = dcg.get_chunksize(input_data_path) >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, delimiter=' ', names=['src', 'dst', 'value'], dtype=['int32', 'int32', 'float32']) >>> dg = cugraph.DiGraph() >>> dg.from_dask_cudf_edgelist(ddf) >>> df = dcg.bfs(dg, 0) >>> Comms.destroy() """ client = default_client() if (graph.local_data is not None and graph.local_data['by'] == 'src'): data = graph.local_data['data'] else: data = get_local_data(graph, by='src') if graph.renumbered: start = graph.lookup_internal_vertex_id(cudf.Series([start])).compute() start = start.iloc[0] result = dict([(data.worker_info[wf[0]]["rank"], client.submit(call_bfs, Comms.get_session_id(), wf[1], data.local_data, start, return_distances, workers=[wf[0]])) for idx, wf in enumerate(data.worker_to_parts.items())]) wait(result) df = result[0].result() if graph.renumbered: df = graph.unrenumber(df, 'vertex').compute() df = graph.unrenumber(df, 'predecessor').compute() df["predecessor"].fillna(-1, inplace=True) return df
def _fit(self, X, _transform=False): """ Fit the model with X. Parameters ---------- X : dask cuDF input """ n_cols = X.shape[1] data = DistributedDataHandler.create(data=X, client=self.client) self.datatype = data.datatype if "svd_solver" in self.kwargs \ and self.kwargs["svd_solver"] == "tsqr": comms = Comms(comms_p2p=True) else: comms = Comms(comms_p2p=False) comms.init(workers=data.workers) data.calculate_parts_to_sizes(comms) worker_info = comms.worker_info(comms.worker_addresses) parts_to_sizes, _ = parts_to_ranks(self.client, worker_info, data.gpu_futures) total_rows = data.total_rows models = dict([(data.worker_info[wf[0]]["rank"], self.client.submit(self._create_model, comms.sessionId, self._model_func, self.datatype, **self.kwargs, pure=False, workers=[wf[0]])) for idx, wf in enumerate(data.worker_to_parts.items())]) pca_fit = dict([ (wf[0], self.client.submit(DecompositionSyncFitMixin._func_fit, models[data.worker_info[wf[0]]["rank"]], wf[1], total_rows, n_cols, parts_to_sizes, data.worker_info[wf[0]]["rank"], _transform, pure=False, workers=[wf[0]])) for idx, wf in enumerate(data.worker_to_parts.items()) ]) wait(list(pca_fit.values())) raise_exception_from_futures(list(pca_fit.values())) comms.destroy() self._set_internal_model(list(models.values())[0]) if _transform: out_futures = flatten_grouped_results(self.client, data.gpu_futures, pca_fit) return to_output(out_futures, self.datatype) return self
def fit(self, X, y): """ Fit the input data with a Random Forest regression model IMPORTANT: X is expected to be partitioned with at least one partition on each Dask worker being used by the forest (self.workers). When persisting data, you can use cuml.dask.common.utils.persist_across_workers to simplify this:: X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers) y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers) X_dask_cudf, y_dask_cudf = persist_across_workers(dask_client, [X_dask_cudf, y_dask_cudf]) (this is equivalent to calling `persist` with the data and workers):: X_dask_cudf, y_dask_cudf = dask_client.persist([X_dask_cudf, y_dask_cudf], workers={ X_dask_cudf=workers, y_dask_cudf=workers }) Parameters ---------- X : dask_cudf.Dataframe Dense matrix (floats or doubles) of shape (n_samples, n_features). Features of training examples. y : dask_cudf.Dataframe Dense matrix (floats or doubles) of shape (n_samples, 1) Labels of training examples. y must be partitioned the same way as X """ c = default_client() X_futures = workers_to_parts(c.sync(extract_ddf_partitions, X)) y_futures = workers_to_parts(c.sync(extract_ddf_partitions, y)) X_partition_workers = [w for w, xc in X_futures.items()] y_partition_workers = [w for w, xc in y_futures.items()] if set(X_partition_workers) != set(self.workers) or \ set(y_partition_workers) != set(self.workers): raise ValueError(""" X is not partitioned on the same workers expected by RF\n X workers: %s\n y workers: %s\n RF workers: %s """ % (str(X_partition_workers), str(y_partition_workers), str(self.workers))) futures = list() for w, xc in X_futures.items(): futures.append( c.submit( RandomForestRegressor._fit, self.rfs[w], xc, y_futures[w], random.random(), workers=[w], )) wait(futures) raise_exception_from_futures(futures) return self
def __init__(self, n_estimators=10, max_depth=-1, max_features="auto", n_bins=8, split_algo=1, split_criterion=2, bootstrap=True, bootstrap_features=False, verbose=False, min_rows_per_node=2, rows_sample=1.0, max_leaves=-1, n_streams=4, accuracy_metric="mse", min_samples_leaf=None, min_weight_fraction_leaf=None, n_jobs=None, max_leaf_nodes=None, min_impurity_decrease=None, min_impurity_split=None, oob_score=None, random_state=None, warm_start=None, class_weight=None, quantile_per_tree=False, criterion=None, workers=None): unsupported_sklearn_params = { "criterion": criterion, "min_samples_leaf": min_samples_leaf, "min_weight_fraction_leaf": min_weight_fraction_leaf, "max_leaf_nodes": max_leaf_nodes, "min_impurity_decrease": min_impurity_decrease, "min_impurity_split": min_impurity_split, "oob_score": oob_score, "n_jobs": n_jobs, "random_state": random_state, "warm_start": warm_start, "class_weight": class_weight, } for key, vals in unsupported_sklearn_params.items(): if vals is not None: raise TypeError( " The Scikit-learn variable ", key, " is not supported in cuML," " please read the cuML documentation for" " more information", ) self.n_estimators = n_estimators self.n_estimators_per_worker = list() c = default_client() if workers is None: workers = c.has_what().keys() self.workers = workers n_workers = len(workers) if n_estimators < n_workers: raise ValueError( "n_estimators cannot be lower than number of dask workers.") n_est_per_worker = math.floor(n_estimators / n_workers) for i in range(n_workers): self.n_estimators_per_worker.append(n_est_per_worker) remaining_est = n_estimators - (n_est_per_worker * n_workers) for i in range(remaining_est): self.n_estimators_per_worker[i] = ( self.n_estimators_per_worker[i] + 1) seeds = list() seeds.append(0) for i in range(1, len(self.n_estimators_per_worker)): sd = self.n_estimators_per_worker[i - 1] + seeds[i - 1] seeds.append(sd) key = str(uuid1()) self.rfs = { worker: c.submit( RandomForestRegressor._func_build_rf, self.n_estimators_per_worker[n], max_depth, n_streams, max_features, n_bins, split_algo, split_criterion, bootstrap, bootstrap_features, verbose, min_rows_per_node, rows_sample, max_leaves, accuracy_metric, quantile_per_tree, seeds[n], key="%s-%s" % (key, n), workers=[worker], ) for n, worker in enumerate(workers) } rfs_wait = list() for r in self.rfs.values(): rfs_wait.append(r) wait(rfs_wait) raise_exception_from_futures(rfs_wait)
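# Worked example (hypothetical numbers, mirroring the splitting logic above):
# 25 estimators over 4 workers -> floor(25 / 4) = 6 each, with the remainder
# handed out one-by-one, giving [7, 6, 6, 6]; each worker's seed is then the
# running total of estimators assigned before it.
import math

n_estimators, n_workers = 25, 4
per_worker = [math.floor(n_estimators / n_workers)] * n_workers
for i in range(n_estimators - sum(per_worker)):
    per_worker[i] += 1
seeds = [0]
for i in range(1, n_workers):
    seeds.append(seeds[i - 1] + per_worker[i - 1])
print(per_worker, seeds)  # [7, 6, 6, 6] [0, 7, 13, 19]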
def __init__(self, dask_client, op_constructor, op_args, chunks, **kwargs): """ Dask Operator constructor :param dask_client: [no default] - DaskClient; client object to use when submitting tasks (see dask_util module) :param op_constructor: [no default] - pointer to function or list of pointers to functions; Pointer to constructor(s) :param op_args: [no default] - list; List containing lists of arguments to run the constructor. It can instantiate the same operator on multiple workers or different ones if requested by passing a list of list of arguments (e.g., [(arg1,arg2,arg3,...)]) If op_kind = blocky the order is column wise :param chunks: [no default] - list; List defining how many operators wants to instantiated. Note, the list must contain the same number of elements as the number of Dask workers present in the DaskClient. :param op_kind: [diag] - string; Mode to run the Dask Operator, diag = block diagonal operator blocky = blocky opearator (note: len(op_args) must be equal to np.sum(chunks)**2) :param setbackground_func_name: [None] - string; Name of the function to set the model point on which the Jacobian is computed. See NonLinearOperator in operator module. :param spread_op: [None] - DaskSpreadOp; Spreading operator to distribute a model vector to the set_background functions :param set_aux_name: [None] - string; Name of the function to set the auxiliary vector. Useful for VpOperator. :param spread_op_aux: [None] - DaskSpreadOp; Spreading operator to distribute an auxiliary vector to the set_aux functions """ # Client to submit tasks if not isinstance(dask_client, DaskClient): raise TypeError("Passed client is not a Dask Client object!") if not isinstance(op_args, list): raise TypeError("Passed operator arguments not a list!") self.dask_client = dask_client self.client = self.dask_client.getClient() wrkIds = self.dask_client.getWorkerIds() N_wrk = self.dask_client.getNworkers() # Check if number of provided chunks is the same as workers if len(chunks) != N_wrk: raise ValueError( "Number of provide chunks (%s) different than the number of workers (%s)" % (len(chunks), N_wrk)) # Check whether it is a blocky or block diagonal Dask operator self.op_kind = kwargs.get("op_kind", "diag") if self.op_kind not in "diag blocky": raise ValueError("Unknown op_kind provided (%s)" % self.op_kind) # Check if many arguments are passed to construct different operators N_args = len(op_args) N_ops = int(np.sum(chunks)) if self.op_kind == "diag" else int( np.sum(chunks))**2 if N_args > 1: if N_args != N_ops: raise ValueError( "Number of lists of arguments (%s) different than the number of requested operators (%s)" % (N_args, N_ops)) else: if N_ops > 1: op_args = [op_args for ii in range(N_ops)] # Instantiation of the operators on each worker self.dask_ops = [] self.dask_ops_adj = [] # Check if a list of constructors has been passed if isinstance(op_constructor, list): opt_list = op_constructor else: opt_list = [op_constructor] * N_ops self.n_col = 1 if self.op_kind == "diag" else int(np.sum(chunks)) # Creating list of adjoint operators if self.n_col > 1: opt_list_adj = opt_list.copy() op_args_adj = op_args.copy() # Creating adjoint operators for iwrk, wrkId in enumerate(wrkIds): for iop in range(chunks[iwrk]): for i_col in range(self.n_col): self.dask_ops_adj.append( self.client.submit(call_constructor, opt_list_adj.pop(0), op_args_adj.pop(0), workers=[wrkId], pure=False)) # Creating forward operators for i_col in range(self.n_col): for iwrk, wrkId in enumerate(wrkIds): for iop in 
range(chunks[iwrk]): self.dask_ops.append( self.client.submit(call_constructor, opt_list.pop(0), op_args.pop(0), workers=[wrkId], pure=False)) daskD.wait(self.dask_ops) # Checking for errors during operators construction for idx, fut in enumerate(self.dask_ops): if fut.status == 'error': print("Error for dask operator %s" % idx) print(fut.result()) # Creating domain and range of the Dask operator dom_vecs = [] # List of remote domain vectors rng_vecs = [] # List of remote range vectors op_list = self.dask_ops if self.n_col > 1: # Dealing with a blocky operator op_list = np.diag( np.asarray(self.dask_ops).reshape((self.n_col, self.n_col)).T) for op in op_list: dom_vecs.append(self.client.submit(call_getDomain, op, pure=False)) rng_vecs.append(self.client.submit(call_getRange, op, pure=False)) daskD.wait(dom_vecs + rng_vecs) _check_dask_error(dom_vecs + rng_vecs) self.domain = DaskVector(self.dask_client, dask_vectors=dom_vecs) self.range = DaskVector(self.dask_client, dask_vectors=rng_vecs) # Set background function name "necessary for non-linear operator Jacobian" self.set_background_name = kwargs.get("setbackground_func_name", None) if self.set_background_name: if self.op_kind != "diag": raise ValueError( "Set background not currently supported for blocky operators" ) if not isinstance(self.set_background_name, list): self.set_background_name = [self.set_background_name] * len( self.dask_ops) # Creating a spreading operator useful for self.Sprd = kwargs.get("spread_op", None) if self.Sprd: if not isinstance(self.Sprd, DaskSpread): raise TypeError( "Provided spread_op not a DaskSpreadOp class!") self.model_tmp = self.Sprd.getRange().clone() # Set aux function name "necessary for VP operator" self.set_aux_name = kwargs.get("set_aux_name", None) if self.set_aux_name: if self.op_kind != "diag": raise ValueError( "set_aux_name not currently supported for blocky operators" ) if not isinstance(self.set_aux_name, list): self.set_aux_name = [self.set_aux_name] * len(self.dask_ops) # Creating a spreading operator useful self.SprdAux = kwargs.get("spread_op_aux", None) if self.SprdAux: if not isinstance(self.SprdAux, DaskSpread): raise TypeError( "Provided spread_op_aux not a DaskSpreadOp class!") self.tmp_aux = self.SprdAux.getRange().clone() return
def _train(client, params, data, labels, sample_weight, dmatrix_kwargs={}, **kwargs): """ Asynchronous version of train See Also -------- train """ # Break apart Dask.array/dataframe into chunks/parts data_parts = data.to_delayed() label_parts = labels.to_delayed() if isinstance(data_parts, np.ndarray): assert data_parts.shape[1] == 1 data_parts = data_parts.flatten().tolist() if isinstance(label_parts, np.ndarray): assert label_parts.ndim == 1 or label_parts.shape[1] == 1 label_parts = label_parts.flatten().tolist() if sample_weight is not None: sample_weight_parts = sample_weight.to_delayed() if isinstance(sample_weight_parts, np.ndarray): assert sample_weight_parts.ndim == 1 or sample_weight_parts.shape[ 1] == 1 sample_weight_parts = sample_weight_parts.flatten().tolist() # Arrange parts into pairs. This enforces co-locality parts = list( map(delayed, zip(data_parts, label_parts, sample_weight_parts))) else: # Arrange parts into pairs. This enforces co-locality parts = list(map(delayed, zip(data_parts, label_parts))) parts = client.compute(parts) # Start computation in the background yield wait(parts) for part in parts: if part.status == 'error': yield part # trigger error locally # Because XGBoost-python doesn't yet allow iterative training, we need to # find the locations of all chunks and map them to particular Dask workers key_to_part_dict = dict([(part.key, part) for part in parts]) who_has = yield client.scheduler.who_has(keys=[part.key for part in parts]) worker_map = defaultdict(list) for key, workers in who_has.items(): worker_map[first(workers)].append(key_to_part_dict[key]) ncores = yield client.scheduler.ncores() # Number of cores per worker # Start the XGBoost tracker on the Dask scheduler host, port = parse_host_port(client.scheduler.address) env = yield client._run_on_scheduler(start_tracker, host.strip('/:'), len(worker_map)) # Tell each worker to train on the chunks/parts that it has locally futures = [ client.submit(train_part, env, assoc(params, 'nthread', ncores[worker]), list_of_parts, workers=worker, dmatrix_kwargs=dmatrix_kwargs, **kwargs) for worker, list_of_parts in worker_map.items() ] # Get the results, only one will be non-None results = yield client._gather(futures) result = [v for v in results if v][0] num_class = params.get("num_class") if num_class: result.set_attr(num_class=str(num_class)) raise gen.Return(result)
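# Tiny self-contained sketch of the "zip the per-chunk delayed objects" idea
# used above: wrapping each (data chunk, label chunk) pair in a single delayed
# task means both pieces of a pair end up on the same worker when computed.
import dask.array as da
from dask import delayed

data = da.ones((8, 3), chunks=(4, 3))
labels = da.ones(8, chunks=4)
pairs = [delayed(p) for p in zip(data.to_delayed().flatten().tolist(),
                                 labels.to_delayed().tolist())]
print(len(pairs))  # 2 -- one task per (data chunk, label chunk) pair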
def main(client, settings): worldSize = MPI.COMM_WORLD.Get_size() - 2 global start # Setup with h5py.File(settings['databasePath'], 'r') as h5pyFile: database = SVDatabase(h5pyFile, settings['refStruct'], args.names) wait(database.load(h5pyFile)) names = list(database.attrs['structNames']) random.shuffle(names) splits = np.array_split(names, worldSize) from svreg.database import worker_load futures = client.map( worker_load, [settings['databasePath']] * worldSize, splits, [database.attrs['svNames']] * worldSize, [database.attrs['elements']] * worldSize, [settings['allSums']] * worldSize, ) client.gather(client.compute(futures)) evaluator = SVEvaluator(database, settings) regressor = SVRegressor(settings, database) archive = Archive(os.path.join(settings['outputPath'], 'archive')) costFxn = buildCostFunction(settings, len(database.attrs['natoms']), sum(database.attrs['natoms'].values())) # Begin symbolic regression if args.trees is not None: with open(args.trees, 'r') as f: treeNames = [s.strip() for s in f.readlines()] regressor.trees = [ MCTree.from_str(t, database.attrs['elements'], regressor.svNodePool) for t in treeNames ] regressor.initializeTrees(elements=database.attrs['elements']) regressor.initializeOptimizers() print("Currently optimizing:") for pidx, t in enumerate(regressor.trees): print(pidx, t) print() print() N = settings['optimizerPopSize'] rawPopulations = None errors = None costs = None population = Population(settings, regressor.svNodePool, database.attrs['elements']) numCompletedTrees = 0 maxNumTrees = settings['numRegressorSteps'] * settings['numberOfTrees'] start = time.time() fxnEvals = 1 while numCompletedTrees < maxNumTrees: # Remove any converged trees, update population, and print new results staleIndices, messages = regressor.checkStale() populationChanged = False # A tree has finished optimizing for staleIdx, staleMessage in zip(staleIndices, messages): candidate = regressor.trees[staleIdx] opt = regressor.optimizers[staleIdx] candidate.cost = opt.result.fbest # TODO: this might not agree perfectly with opt.result.xbest candidateParamsIdx = np.argmin(costs[staleIdx]) # candidate.cost = costs[staleIdx][candidateParamsIdx] err = errors[staleIdx][candidateParamsIdx] print() print() print("Completed tree {}:".format(staleIdx)) print("\t", candidate.cost, candidate) print("Stopping criterion:", staleMessage) numCompletedTrees += 1 # Log completed tree archive.update(candidate, candidate.cost, err, opt.result.xbest, opt) archive.log() # Randomly insert into current population inserted = population.attemptInsert(candidate) if inserted: populationChanged = True # Replace completed tree with new tree # Make sure new tree isn't already in archive or active population currentRegNames = [md5Hash(t) for t in regressor.trees] newTree, parent1, parent2 = population.newIndividual() generatedNew = False while not generatedNew: inArchive = False inReg = False for t in regressor.trees: if newTree == t: inReg = True for tname in archive: t = archive[tname].tree if newTree == t: inArchive = True if inArchive: print("Already in archive:", newTree) elif inReg: print("Already being optimized:", newTree) else: generatedNew = True if not generatedNew: newTree, parent1, parent2 = population.newIndividual() print("New tree:") print('\t', parent1) print('\t+') print('\t', parent2) print('\t=') print('\t', newTree) # Insert new tree into list of trees being optimized argsCopy = deepcopy(regressor.optimizerArgs) path = os.path.join(settings['outputPath'], 'outcmaes', '{}/') d = 
{'verb_filenameprefix': path.format(md5Hash(newTree))} d.update(regressor.optimizerArgs[-1]) argsCopy[-1] = d newOpt = regressor.optimizer(newTree.populate(N=1)[0], *argsCopy) regressor.trees[staleIdx] = newTree regressor.optimizers[staleIdx] = newOpt if staleIndices: if populationChanged: # Print current population if it was updated print() print() print("Current population:") popCosts = [t.cost for t in population] argsort = np.argsort(popCosts) for idx in argsort: print(population[idx].cost, population[idx]) print() else: print() print() print("No new fitted trees were added to the population.") print() if staleIndices: print() print("Currently optimizing:") for pidx, t in enumerate(regressor.trees): print(pidx, t) print() print() # Continue optimization of currently active trees populationDict, rawPopulations = regressor.generatePopulationDict(N) graph, keys = evaluator.evaluate(regressor.trees, populationDict, N, worldSize, settings['allSums'], useGPU=settings['useGPU']) perWorkerResults = client.get(graph, keys, direct=True) #, resources={'GPU': 1}) perStructResults, perStructNames = zip(*perWorkerResults) perStructResults = list( itertools.chain.from_iterable(perStructResults)) perStructNames = list(itertools.chain.from_iterable(perStructNames)) perStructResults = [ x for _, x in sorted(zip(perStructNames, perStructResults)) ] energies = {struct: [] for struct in database.attrs['structNames']} forces = {struct: [] for struct in database.attrs['structNames']} counter = 0 for struct in database.attrs['structNames']: res = perStructResults[counter] energies[struct] = [s[0] for s in res] forces[struct] = [s[1] for s in res] counter += 1 # Save the (per-struct) errors and the single-value costs errors = computeErrors(settings['refStruct'], energies, forces, database) costs = costFxn(errors) # Add ridge regression penalty penalties = np.array([ np.linalg.norm(pop, axis=1) * settings['ridgePenalty'] for pop in rawPopulations ]) # Update optimizers regressor.updateOptimizers(rawPopulations, costs, penalties) printTreeCosts(fxnEvals, [opt.result.fbest for opt in regressor.optimizers], penalties, start) fxnEvals += 1 print('Done')
def launch_python_post(): curDir = os.path.dirname(os.path.abspath(__file__)) logger = PyPostTools.pyPostLogger() logger.write("Initializing WRF Python Post-Processing Program") #Step 1: Load program settings logger.write(" 1. Application Initalization") logger.write(" - Loading control file, python_post_control.txt") _pySet = PyPostSettings.PyPostSettings() logger.write(" - Success!") logger.write(" - Testing Environmental Variables") try: dask_nodes = os.environ["PYTHON_POST_NODES"] dask_threads = os.environ["PYTHON_POST_THREADS"] postDir = os.environ["PYTHON_POST_DIR"] targetDir = os.environ["PYTHON_POST_TARG_DIR"] except KeyError: logger.write( "***FAIL*** KeyError encountered while trying to access important environmental variables, abort." ) sys.exit("") logger.write(" - Success!") logger.write(" - Initializing Dask (" + str(dask_nodes) + " Nodes Requested), Collecting routines needed") _routines = Routines.Routines() # Start Dask Tasks #cLoop = IOLoop.current() #t = Thread(target = cLoop.start, daemon = True) #t.start() logger.write(" - Async IO Loop initialized...") async def f(port): s = Scheduler(port=scheduler_port) s = await s await s.finished() return 1 asyncio.gather(f(scheduler_port)) #asyncio.get_event_loop().run_until_complete(f(scheduler_port)) logger.write(" - Dask Scheduler initialized (Port " + str(scheduler_port) + ")...") dask_client = Client("tcp://" + socket.gethostname() + ":" + str(scheduler_port)) logger.write(" - Dask Client initialized...") logger.write(" - Writing Dask Worker Job Files...") with PyPostTools.cd(targetDir): writeFile1 = PyPostTools.write_job_file(socket.gethostname(), scheduler_port, project_name="Nowcast", queue="default", nodes=dask_nodes, wall_time=60, nProcs=1) writeFile2 = PyPostTools.write_worker_file(socket.gethostname(), scheduler_port, nProcs=1) if (writeFile1 == False or writeFile2 == False): dask_client.close() logger.write( " - Failed to write job files, are you missing an important parameter?" ) sys.exit("") return else: logger.write( " - Dask Worker Job File Written, Submitting to Queue.") PyPostTools.popen("chmod +x launch-worker.sh") PyPostTools.popen("chmod +x dask-worker.job") PyPostTools.popen("qsub dask-worker.job") # Wait here for workers. logger.write(" -> Worker Job submitted to queue, waiting for workers...") while len(dask_client.scheduler_info()['workers']) < int(dask_nodes): time.sleep(2) logger.write(" -> Workers are now connected.") #logger.write(" - Adding local packages to dask workers") #dask_client.upload_file("PyPostTools.py") #dask_client.upload_file("ArrayTools.py") #dask_client.upload_file("Calculation.py") #dask_client.upload_file("ColorMaps.py") #dask_client.upload_file("Conversions.py") #dask_client.upload_file("Plotting.py") #dask_client.upload_file("PyPostSettings.py") #dask_client.upload_file("Routines.py") logger.write(" - Success!") logger.write(" 1. Done.") logger.write(" 2. Start Post-Processing Calculations") calculation_future = start_calculations(dask_client, _routines, dask_threads) if (calculation_future != None): wait(calculation_future) result_calc = dask_client.gather(calculation_future)[0] if (result_calc != 0): logger.write( "***FAIL*** An error occured in calculations method, check worker logs for more info." ) logger.close() sys.exit("") logger.write(" 2. Done.") logger.write(" 3. 
Generating Figures") logger.write(" - Collecting files from target directory (" + targetDir + ").") fList3 = sorted(glob.glob(targetDir + "WRFPRS_F*")) logger.write(" - " + str(len(fList3)) + " files have been found.") logger.write(" -> Pushing run_plotting_routines() to dask.") fullDict = _pySet.get_full_dict() plotting_future = start_plotting(dask_client, fullDict, dask_threads) wait(plotting_future) result_plot = dask_client.gather(plotting_future)[0] if (result_plot != 0): logger.write( "***FAIL*** An error occured in plotting method, check worker logs for more info." ) logger.close() sys.exit("") logger.write(" 3. Done.") logger.write(" 4. Final Steps") logger.write(" 4. Done, Closing Dask Client.") dask_client.retire_workers(workers=dask_client.scheduler_info()['workers'], close=True) dask_client.close() logger.write("All Steps Completed.") logger.write("***SUCCESS*** Program execution complete.") logger.close()
def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-5, nstart=None): """ Find the PageRank values for each vertex in a graph using multiple GPUs. cuGraph computes an approximation of the Pagerank using the power method. The input graph must contain edge list as dask-cudf dataframe with one partition per GPU. Parameters ---------- graph : cugraph.DiGraph cuGraph graph descriptor, should contain the connectivity information as dask cudf edge list dataframe(edge weights are not used for this algorithm). Undirected Graph not currently supported. alpha : float The damping factor alpha represents the probability to follow an outgoing edge, standard value is 0.85. Thus, 1.0-alpha is the probability to “teleport” to a random vertex. Alpha should be greater than 0.0 and strictly lower than 1.0. personalization : cudf.Dataframe GPU Dataframe containing the personalization information. Currently not supported. personalization['vertex'] : cudf.Series Subset of vertices of graph for personalization personalization['values'] : cudf.Series Personalization values for vertices max_iter : int The maximum number of iterations before an answer is returned. If this value is lower or equal to 0 cuGraph will use the default value, which is 30. tolerance : float Set the tolerance the approximation, this parameter should be a small magnitude value. The lower the tolerance the better the approximation. If this value is 0.0f, cuGraph will use the default value which is 1.0E-5. Setting too small a tolerance can lead to non-convergence due to numerical roundoff. Usually values between 0.01 and 0.00001 are acceptable. nstart : not supported initial guess for pagerank Returns ------- PageRank : dask_cudf.DataFrame GPU data frame containing two dask_cudf.Series of size V: the vertex identifiers and the corresponding PageRank values. ddf['vertex'] : dask_cudf.Series Contains the vertex identifiers ddf['pagerank'] : dask_cudf.Series Contains the PageRank score Examples -------- >>> import cugraph.dask as dcg >>> Comms.initialize(p2p=True) >>> chunksize = dcg.get_chunksize(input_data_path) >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, delimiter=' ', names=['src', 'dst', 'value'], dtype=['int32', 'int32', 'float32']) >>> dg = cugraph.DiGraph() >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst', edge_attr='value') >>> pr = dcg.pagerank(dg) >>> Comms.destroy() """ from cugraph.structure.graph import null_check if personalization is not None: raise Exception("Personalization not supported") nstart = None client = default_client() input_graph.compute_renumber_edge_list(transposed=True) (ddf, num_verts, partition_row_size, partition_col_size, vertex_partition_offsets) = shuffle(input_graph, transposed=True) num_edges = len(ddf) data = get_distributed_data(ddf) if personalization is not None: null_check(personalization["vertex"]) null_check(personalization["values"]) if input_graph.renumbered is True: personalization = input_graph.add_internal_vertex_id( personalization, "vertex", "vertex").compute() result = [ client.submit(call_pagerank, Comms.get_session_id(), wf[1], num_verts, num_edges, vertex_partition_offsets, alpha, max_iter, tol, personalization, nstart, workers=[wf[0]]) for idx, wf in enumerate(data.worker_to_parts.items()) ] wait(result) ddf = dask_cudf.from_delayed(result) if input_graph.renumbered: return input_graph.unrenumber(ddf, 'vertex') return ddf
def test_end_to_end(): cluster = LocalCUDACluster(threads_per_worker=1) client = Client(cluster) # NOTE: The LocalCUDACluster needs to be started before any imports that # could potentially create a CUDA context. import dask_cudf import cudf import numpy as np from dask_cuml.neighbors import NearestNeighbors as cumlKNN def create_df(f, m, n): X = np.random.rand(m, n) ret = cudf.DataFrame( [(i, X[:, i].astype(np.float32)) for i in range(n)], index=cudf.dataframe.RangeIndex(f * m, f * m + m, 1)) return ret def get_meta(df): ret = df.iloc[:0] return ret # Per gpu/worker train_m = 500 train_n = 25 search_m = 10 search_k = 15 workers = client.has_what().keys() # Create dfs on each worker (gpu) dfs = [ client.submit(create_df, n, train_m, train_n, workers=[worker]) for worker, n in list(zip(workers, list(range(len(workers))))) ] # Wait for completion wait(dfs) meta = client.submit(get_meta, dfs[0]).result() X_df = dask_cudf.from_delayed(dfs, meta=meta) X_pd = X_df.compute().to_pandas() cumlNN = cumlKNN() cumlNN.fit(X_df) sklNN = NearestNeighbors(metric="sqeuclidean") sklNN.fit(X_pd) cuml_D, cuml_I = cumlNN.kneighbors(X_df[0:search_m - 1], search_k) sk_D, sk_I = sklNN.kneighbors(X_pd[0:search_m], search_k) cuml_I_nd = np.array(cuml_I.compute().as_gpu_matrix(), dtype=sk_I.dtype) cuml_D_nd = np.array(cuml_D.compute().as_gpu_matrix(), dtype=sk_D.dtype) print(str(cuml_D_nd.dtype)) print(str(sk_D.dtype)) assert np.array_equal(cuml_I_nd, sk_I) assert np.allclose(cuml_D_nd, sk_D, atol=1e-5) cluster.close()
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True): """ Compute the modularity optimizing partition of the input graph using the Louvain method on multiple GPUs Examples -------- >>> import cugraph.dask as dcg >>> Comms.initialize() >>> chunksize = dcg.get_chunksize(input_data_path) >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize, delimiter=' ', names=['src', 'dst', 'value'], dtype=['int32', 'int32', 'float32']) >>> dg = cugraph.Graph() >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst', edge_attr='value') >>> parts, modularity_score = dcg.louvain(dg) """ # FIXME: finish docstring: describe parameters, etc. # FIXME: import here to prevent circular import: cugraph->louvain # wrapper->cugraph/structure->cugraph/dask->dask/louvain->cugraph/structure # from cugraph.structure.graph import Graph # FIXME: dask methods to populate graphs from edgelists are only present on # DiGraph classes. Disable the Graph check for now and assume inputs are # symmetric DiGraphs. # if type(graph) is not Graph: # raise Exception("input graph must be undirected") client = default_client() # Calling renumbering results in data that is sorted by degree input_graph.compute_renumber_edge_list(transposed=False) sorted_by_degree = True (ddf, num_verts, partition_row_size, partition_col_size, vertex_partition_offsets) = shuffle(input_graph, transposed=False) num_edges = len(ddf) data = get_distributed_data(ddf) result = dict([(data.worker_info[wf[0]]["rank"], client.submit(call_louvain, Comms.get_session_id(), wf[1], num_verts, num_edges, partition_row_size, partition_col_size, vertex_partition_offsets, sorted_by_degree, max_iter, resolution, workers=[wf[0]])) for idx, wf in enumerate(data.worker_to_parts.items())]) wait(result) (parts, modularity_score) = result[0].result() if input_graph.renumbered: # MG renumbering is lazy, but it's safe to assume it's been called at # this point if renumbered=True parts = input_graph.unrenumber(parts, "vertex") return parts, modularity_score
    return dask_xgboost.predict(client, bst, X)


# Create a context
from dask_sql import Context, run_server

c = Context()
c.register_function(predict_price, "predict_price",
                    [("total_amount", np.float64),
                     ("trip_distance", np.float64),
                     ("passenger_count", np.float64)],
                    np.float64)

# Load the data from S3
df = dd.read_csv("s3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv",
                 dtype={
                     "payment_type": "UInt8",
                     "VendorID": "UInt8",
                     "passenger_count": "UInt8",
                     "RatecodeID": "UInt8",
                 },
                 storage_options={"anon": True}).persist()
wait(df)

c.create_table("nyc-taxi", df)
c.sql("SELECT 1 + 1").compute()

# Finally, spin up the dask-sql server
run_server(context=c, client=client)
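# Hypothetical follow-up (not in the original script): before spinning up the
# server, the registered scalar function could be exercised directly through
# the same Context, e.g.
#
#   preds = c.sql("""
#       SELECT predict_price(total_amount, trip_distance, passenger_count) AS predicted
#       FROM "nyc-taxi"
#       LIMIT 10
#   """).compute()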
def test_score(nrows, ncols, nclusters, n_parts, input_type, cluster): client = None try: client = Client(cluster) from cuml.dask.cluster import KMeans as cumlKMeans from cuml.dask.datasets import make_blobs X, y = make_blobs(n_samples=int(nrows), n_features=ncols, centers=nclusters, n_parts=n_parts, cluster_std=0.01, shuffle=False, random_state=10) wait(X) if input_type == "dataframe": X_train = to_dask_cudf(X) y_train = to_dask_cudf(y) y = y_train elif input_type == "array": X_train, y_train = X, y cumlModel = cumlKMeans(init="k-means||", n_clusters=nclusters, random_state=10) cumlModel.fit(X_train) actual_score = cumlModel.score(X_train) predictions = cumlModel.predict(X_train).compute() if input_type == "dataframe": X = cp.array(X_train.compute().as_gpu_matrix()) predictions = cp.array(predictions) centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix()) elif input_type == "array": X = X_train.compute() centers = cumlModel.cluster_centers_ expected_score = 0 for idx, label in enumerate(predictions): x = X[idx] y = centers[label] dist = cp.sqrt(cp.sum((x - y)**2)) expected_score += dist**2 assert actual_score + SCORE_EPS \ >= (-1 * expected_score) \ >= actual_score - SCORE_EPS finally: client.close()
def main(client, config): store_sales_df, store_returns_df = benchmark( read_tables, config=config, compute_result=config["get_read_time"], dask_profile=config["dask_profile"], ) n_workers = len(client.scheduler_info()["workers"]) ### going via repartition for split_out drop duplicates ### see issue: https://github.com/rapidsai/tpcx-bb-internal/issues/492 unique_sales = store_sales_df[[ "ss_ticket_number", "ss_customer_sk" ]].map_partitions(lambda df: df.drop_duplicates()) unique_sales = unique_sales.shuffle(on=["ss_customer_sk"]) unique_sales = unique_sales.map_partitions(lambda df: df.drop_duplicates()) unique_sales = unique_sales.persist() wait(unique_sales) orders_count = (unique_sales.groupby(by="ss_customer_sk").agg({ "ss_ticket_number": "count" }).reset_index()) orders_df = (store_sales_df.groupby(by="ss_customer_sk").agg({ "ss_item_sk": "count", "ss_net_paid": "sum" }).reset_index()) ### free up memory no longer needed del store_sales_df orders_df = orders_df.merge(orders_count, how="inner", on="ss_customer_sk") orders_df = orders_df.rename( columns={ "ss_customer_sk": "user_sk", "ss_ticket_number": "orders_count", "ss_item_sk": "orders_items", "ss_net_paid": "orders_money", }) orders_df = orders_df.persist() wait(orders_df) del unique_sales returns_count = (store_returns_df[[ "sr_ticket_number", "sr_customer_sk" ]].drop_duplicates(split_out=n_workers).groupby(by="sr_customer_sk").agg({ "sr_ticket_number": "count" }).reset_index()) returns_df = (store_returns_df.groupby(by="sr_customer_sk").agg({ "sr_item_sk": "count", "sr_return_amt": "sum" }).reset_index()) ### free up memory no longer needed del store_returns_df returns_df = returns_df.merge(returns_count, how="inner", on="sr_customer_sk") returns_df = returns_df.rename( columns={ "sr_customer_sk": "user_sk", "sr_ticket_number": "returns_count", "sr_item_sk": "returns_items", "sr_return_amt": "returns_money", }) returns_df = returns_df.persist() wait(returns_df) final_df = orders_df.merge(returns_df, how="left", on="user_sk") final_df["orderRatio"] = (final_df["returns_count"] / final_df["orders_count"]).round(7) final_df["itemsRatio"] = (final_df["returns_items"] / final_df["orders_items"]).round(7) final_df["monetaryRatio"] = (final_df["returns_money"] / final_df["orders_money"]).round(7) ratio_columns = ["orderRatio", "itemsRatio", "monetaryRatio"] final_df = final_df.map_partitions(remove_inf_and_nulls, column_names=ratio_columns, value=0.0) final_df = final_df.rename(columns={"returns_count": "frequency"}) keep_cols = [ "user_sk", "orderRatio", "itemsRatio", "monetaryRatio", "frequency" ] final_df = final_df[keep_cols] final_df = final_df.fillna(0) final_df = final_df.repartition(npartitions=1).persist() wait(final_df) final_df = final_df.sort_values(["user_sk"]).reset_index(drop=True) final_df = final_df.persist() wait(final_df) feature_cols = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"] results_dict = get_clusters(client=client, ml_input_df=final_df, feature_cols=feature_cols) return results_dict
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, cluster):
    client = None
    try:
        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          random_state=10)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)
        cumlLabels = cumlModel.predict(X_train, delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            parts_len = n_parts
        else:
            parts_len = n_workers

        if input_type == "dataframe":
            assert cumlLabels.npartitions == parts_len
            cumlPred = cp.array(cumlLabels.compute().to_pandas().values)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            assert len(cumlLabels.chunks[0]) == parts_len
            cumlPred = cp.array(cumlLabels.compute())
            labels = cp.squeeze(y_train.compute())

        assert cumlPred.shape[0] == nrows
        assert cp.max(cumlPred) == nclusters - 1
        assert cp.min(cumlPred) == 0

        score = adjusted_rand_score(labels, cumlPred)
        print(str(score))
        assert 1.0 == score

    finally:
        # Guard against Client(cluster) having failed before assignment,
        # which would otherwise raise AttributeError on None here.
        if client is not None:
            client.close()
def generate_training(sensor_path, ground_truth_path, size=11, chunk_size=500, classes=21, savedir=".", use_dask=False, client=None): """Yield one instance of data with one hot labels Args: chunk_size: number of images per tfrecord size: N x N image size savedir: directory to save tfrecords use_dask: optional dask client to parallelize computation Returns: filename: tfrecords path """ #turn ground truth into a dataframe of coords results = get_coordinates(ground_truth_path) print("There are {} label pixels in the labeled ground truth".format( results.shape[0])) #Remove unclassified pixels? results = results[~(results.label == 0)] #Create chunks to write based on a spatial block results["chunk"] = np.arange(len(results)) // chunk_size basename = os.path.splitext(os.path.basename(sensor_path))[0] filenames = [] if use_dask: if client is None: raise ValueError( "use_dask is {} but no client specified".format(use_dask)) for g, df in results.groupby("chunk"): coordinates = zip(df.easting, df.northing) filename = "{}/{}_{}.tfrecord".format(savedir, basename, g) #Submit to dask client fn = client.submit(_record_wrapper_, labels=df.label.values, sensor_path=sensor_path, coordinates=coordinates, size=size, classes=classes, filename=filename, train=True) filenames.append(fn) wait(filenames) filenames = [x.result() for x in filenames] else: for g, df in results.groupby("chunk"): filename = "{}/{}_{}.tfrecord".format(savedir, basename, g) coordinates = zip(df.easting, df.northing) #Write record fn = _record_wrapper_(labels=df.label.values, sensor_path=sensor_path, coordinates=coordinates, size=size, classes=classes, filename=filename, train=True) filenames.append(fn) return filenames
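# A minimal usage sketch for generate_training with Dask enabled; the file
# paths are placeholders and _record_wrapper_/get_coordinates are assumed to
# be provided by the surrounding module (they are not defined in this snippet).
from dask.distributed import Client

if __name__ == "__main__":
    client = Client()  # local scheduler; swap in a real cluster as needed
    records = generate_training(sensor_path="sensor.tif",        # placeholder
                                ground_truth_path="labels.shp",  # placeholder
                                size=11,
                                chunk_size=500,
                                savedir="records",
                                use_dask=True,
                                client=client)
    print("Wrote {} tfrecord files".format(len(records)))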
def main(client, config): ( store_sales_df, date_dim_df, web_sales_df, store_retuns_df, store_table_df, item_table_df, ) = benchmark( read_tables, config=config, compute_result=config["get_read_time"], dask_profile=config["dask_profile"], ) # SELECT sr_item_sk, sr_customer_sk, sr_ticket_number, sr_return_quantity # FROM # store_returns sr, # date_dim d2 # WHERE d2.d_year = ${hiveconf:q21_year} # AND d2.d_moy BETWEEN ${hiveconf:q21_month} AND ${hiveconf:q21_month} + 6 --which were returned in the next six months # AND sr.sr_returned_date_sk = d2.d_date_sk d2 = date_dim_df.query( f"d_year == {q21_year} and d_moy >= {q21_month} and d_moy <= {q21_month+6}", meta=date_dim_df._meta, ).reset_index(drop=True) part_sr = store_retuns_df.merge(d2, left_on="sr_returned_date_sk", right_on="d_date_sk", how="inner") cols_2_keep = [ "sr_item_sk", "sr_customer_sk", "sr_ticket_number", "sr_return_quantity", ] part_sr = part_sr[cols_2_keep] part_sr = part_sr.persist() wait(part_sr) # SELECT # ws_item_sk, ws_bill_customer_sk, ws_quantity # FROM # web_sales ws, # date_dim d3 # WHERE d3.d_year BETWEEN ${hiveconf:q21_year} AND ${hiveconf:q21_year} + 2 -- in the following three years (re-purchased by the returning customer afterwards through # the web sales channel) # AND ws.ws_sold_date_sk = d3.d_date_sk # ) part_ws d3 = date_dim_df.query( f"d_year >= {q21_year} and d_year <= {q21_year + 2}", meta=date_dim_df._meta) part_ws = web_sales_df.merge(d3, left_on="ws_sold_date_sk", right_on="d_date_sk", how="inner") cols_2_keep = ["ws_item_sk", "ws_bill_customer_sk", "ws_quantity"] part_ws = part_ws[cols_2_keep] part_ws = part_ws.persist() wait(part_ws) # part_ws ON ( # part_sr.sr_item_sk = part_ws.ws_item_sk # AND part_sr.sr_customer_sk = part_ws.ws_bill_customer_sk part_ws_part_sr_m = hash_merge( lhs=part_sr, rhs=part_ws, left_on=["sr_item_sk", "sr_customer_sk"], right_on=["ws_item_sk", "ws_bill_customer_sk"], how="inner", ) cols_2_keep = [ "sr_item_sk", "sr_customer_sk", "sr_ticket_number", "sr_return_quantity", "ws_quantity", ] part_ws_part_sr_m = part_ws_part_sr_m[cols_2_keep] part_ws_part_sr_m = part_ws_part_sr_m.persist() wait(part_ws_part_sr_m) del part_sr, part_ws # SELECT ss_item_sk, ss_store_sk, ss_customer_sk, ss_ticket_number, ss_quantity # FROM # store_sales ss, # date_dim d1 # WHERE d1.d_year = ${hiveconf:q21_year} # AND d1.d_moy = ${hiveconf:q21_month} # AND ss.ss_sold_date_sk = d1.d_date_sk # ) part_ss d1 = date_dim_df.query(f"d_year == {q21_year} and d_moy == {q21_month} ", meta=date_dim_df._meta) part_ss = store_sales_df.merge(d1, left_on="ss_sold_date_sk", right_on="d_date_sk", how="inner") cols_2_keep = [ "ss_item_sk", "ss_store_sk", "ss_customer_sk", "ss_ticket_number", "ss_quantity", ] part_ss = part_ss[cols_2_keep] # part_ss ON ( # part_ss.ss_ticket_number = part_sr.sr_ticket_number # AND part_ss.ss_item_sk = part_sr.sr_item_sk # AND part_ss.ss_customer_sk = part_sr.sr_customer_sk part_ws_part_sr_m_part_ss_join_df = hash_merge( lhs=part_ss, rhs=part_ws_part_sr_m, left_on=["ss_ticket_number", "ss_item_sk", "ss_customer_sk"], right_on=["sr_ticket_number", "sr_item_sk", "sr_customer_sk"], how="inner", ) cols_2_keep = [ "ss_store_sk", "ss_quantity", "sr_return_quantity", "ws_quantity", "ss_item_sk", ] part_ws_part_sr_m_part_ss_join_df = part_ws_part_sr_m_part_ss_join_df[ cols_2_keep] # INNER JOIN store part_s ON ( # part_s.s_store_sk = part_ss.ss_store_sk # ) part_ws_part_sr_m_part_ss_part_s_join_df = store_table_df.merge( part_ws_part_sr_m_part_ss_join_df, left_on="s_store_sk", 
right_on="ss_store_sk", how="inner", ) cols_2_keep = [ "s_store_name", "sr_return_quantity", "ss_quantity", "ws_quantity", "s_store_id", "ss_item_sk", ] part_ws_part_sr_m_part_ss_part_s_join_df = part_ws_part_sr_m_part_ss_part_s_join_df[ cols_2_keep] # INNER JOIN item part_i ON ( # part_i.i_item_sk = part_ss.ss_item_sk # ) final_df = item_table_df.merge( part_ws_part_sr_m_part_ss_part_s_join_df, left_on="i_item_sk", right_on="ss_item_sk", how="inner", ) # GROUP BY # part_i.i_item_id, # part_i.i_item_desc, # part_s.s_store_id, # part_s.s_store_name # ORDER BY # part_i.i_item_id, # part_i.i_item_desc, # part_s.s_store_id, # part_s.s_store_name cols_2_keep = [ "i_item_id", "i_item_desc", "s_store_name", "ss_quantity", "sr_return_quantity", "ws_quantity", "s_store_id", ] grouped_df = final_df[cols_2_keep] agg_df = grouped_df.groupby( by=["i_item_id", "i_item_desc", "s_store_id", "s_store_name"]).agg({ "ss_quantity": "sum", "sr_return_quantity": "sum", "ws_quantity": "sum" }) agg_df = agg_df.repartition(npartitions=1).persist() sorted_agg_df = agg_df.reset_index().map_partitions( lambda df: df.sort_values( by=["i_item_id", "i_item_desc", "s_store_id", "s_store_name"])) sorted_agg_df = sorted_agg_df.head(q21_limit) sorted_agg_df = sorted_agg_df.rename( columns={ "ss_quantity": "store_sales_quantity", "sr_return_quantity": "store_returns_quantity", "ws_quantity": "web_sales_quantity", }) sorted_agg_df["i_item_desc"] = sorted_agg_df["i_item_desc"].str.strip() return sorted_agg_df
d.insertDataValue("name", dims, np.arange(lx * ly * lz)) volume = lx * ly * lz print("Dataset volume is {} ({} GByte)".format(volume, (volume * 8) / 2**30)) test = from_dataset(d, slice_dim=Dimension.Z) test = client.persist(test) test = test.concatenate(Dimension.Y, test) test = test.concatenate(Dimension.Y, test) test = test.concatenate(Dimension.Y, test) test = test.concatenate(Dimension.Y, test) test = test.concatenate(Dimension.Y, test) test = client.persist(test) #test = test.persist() wait(test) start_time = timeit.default_timer() sliced = test.slice(Dimension.Y, 6) sliced = sliced.compute() print(timeit.default_timer() - start_time) start_time = timeit.default_timer() sliced = test.slice(Dimension.X, 7) sliced = sliced.compute() print(timeit.default_timer() - start_time) start_time = timeit.default_timer() sliced = test.slice(Dimension.X, 8) sliced = sliced.compute() print(timeit.default_timer() - start_time)
def dump_write(render_folder_name, full_volume_shape, dtype, color_channel_count, output_file_name, tile_hash, leaf_level_count, tile_level_count, compression_method, compression_options, output_file_type, do_use_simple_for_loop=False): # dumps volumetric data into h5/n5/zarr #self.inputLoc = inputloc tile_shape = (full_volume_shape / (2**tile_level_count)).astype(int) leaf_shape = (full_volume_shape / (2**leaf_level_count)).astype(int) # check if dataset name is provided splitted_name = output_file_name.split(':') if len(splitted_name) == 1: output_file_name = splitted_name[0] dataset_name = "volume" elif len(splitted_name) ==2: output_file_name = splitted_name[0] dataset_name = splitted_name[1] else: raise ValueError('output file name has more than one ":"', output_file_name) #self.setting = setting #self.tilelist = tilelist tile_id_list = list(tile_hash.keys()) leaf_ids_per_tile_list = list(tile_hash.values()) # # Unpack the settings # volSize = tuple(map(int,setting['volSize'])) # tileSize = setting['tileSize'] # #volReference = setting['volReference'] # depthFull = setting['depthFull'] # depthBase = setting['depthBase'] # leafSize = setting['leaf_shape'] # dtype = setting['dtype'] # chunkSize = tuple(map(int,setting['chunkSize'])) # compression_method = setting['compression'] # comp_opts = setting['compression_opts'] chunk_shape = tile_shape full_volume_shape_including_color_channel = np.append(full_volume_shape, color_channel_count) # append color channel chunk_shape_including_color_channel = np.append(chunk_shape, color_channel_count) full_volume_shape_with_color_channels_as_tuple = tuple(map(int, full_volume_shape_including_color_channel)) chunk_shape_with_color_as_tuple = tuple(map(int, chunk_shape_including_color_channel)) if output_file_type=='h5': # write into h5 with h5py.File(output_file_name, "w") as f: # dset_swc = f.create_dataset("reconstruction", (xyz_shifted.shape[0], 7), dtype='f') # for iter, xyz_ in enumerate(xyz_shifted): # dset_swc[iter, :] = np.array( # [edges[iter, 0].__int__(), 1, xyz_[0], xyz_[1], xyz_[2], 1.0, edges[iter, 1].__int__()]) dataset = f.create_dataset(dataset_name, full_volume_shape_with_color_channels_as_tuple, dtype=dtype, chunks=chunk_shape_with_color_as_tuple, compression=compression_method, compression_opts=compression_options) # crop chuncks from a tile read in tilelist for iter, tile_id in enumerate(tile_id_list): print('{} : {} out of {}'.format(tile_id, iter+1, len(tile_id_list))) leaf_id_within_tile = tile_hash[tile_id] dump_single_tile_id(tile_id, leaf_id_within_tile, render_folder_name, tile_shape, leaf_shape, chunk_shape_with_color_as_tuple, dtype, dataset, is_dataset_transposed=False) elif output_file_type=='n5' or output_file_type=='zarr': # write into z5 or n5 if do_use_simple_for_loop: use_zarr_format = (output_file_type == 'zarr') with z5py.File(output_file_name, 'a', use_zarr_format=use_zarr_format) as f: # require_dataset seems to choke on the compression_options {level: 9}, so this is a workaround g = f.require_group('/') try: dataset = g[dataset_name] except KeyError: dataset = f.create_dataset(dataset_name, shape=tuple(reversed(full_volume_shape_with_color_channels_as_tuple)), dtype=dtype, chunks=tuple(reversed(chunk_shape_with_color_as_tuple)), compression=compression_method, **compression_options) for tile_id in tqdm.tqdm(tile_id_list): leaf_ids_within_tile = tile_hash[tile_id] dump_single_tile_id(tile_id, leaf_ids_within_tile, render_folder_name, tile_shape, leaf_shape, chunk_shape_with_color_as_tuple, dtype, dataset, 
is_dataset_transposed=True) else: username = getpass.getuser() scratch_folder_path = '/scratch/%s' % username with LSFCluster(cores=2, memory='30 GB', local_dir=scratch_folder_path, projectstr='mouselight', queue='normal', extralist='-o /dev/null -e /dev/null') as cluster: cluster.adapt(minimum=1, maximum=1000) #cluster = LocalCluster(n_workers=4, threads_per_worker=1) #cluster.scale(200) with Client(cluster) as client: use_zarr_format = (output_file_type=='zarr') with z5py.File(output_file_name, 'a', use_zarr_format=use_zarr_format) as f: # require_dataset seems to choke on the compression_options {level: 9}, so this is a workaround g = f.require_group('/') try: dataset = g[dataset_name] except KeyError: dataset = f.create_dataset(dataset_name, shape=tuple(reversed(full_volume_shape_with_color_channels_as_tuple)), dtype=dtype, chunks=tuple(reversed(chunk_shape_with_color_as_tuple)), compression=compression_method, **compression_options) two_arg_dump_single_tile_id = \ partial(dump_single_tile_id, rendered_folder_path=render_folder_name, tile_shape=tile_shape, leaf_shape=leaf_shape, chunk_shape_with_color_as_tuple=chunk_shape_with_color_as_tuple, dtype=dtype, dataset=dataset, is_dataset_transposed=True) #with Pool(16) as pool : # foo = list(tqdm.tqdm(pool.imap(f, tile_id_list), total=len(tile_id_list))) # for tile_id in tqdm.tqdm(tile_id_list): # leaf_id_within_tile = tile_hash[tile_id] # f(tile_id, leaf_id_within_tile) print('About to process %d tiles' % len(tile_id_list)) futures = client.map(two_arg_dump_single_tile_id, tile_id_list, leaf_ids_per_tile_list, retries=2) progress(futures, notebook=False) # need notebook=False when running in Spyder wait(futures) # just to make sure... print('') print('All Dask jobs have exited') print('') print('futures:') print(futures)
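# A rough usage sketch for dump_write, with every argument value made up for
# illustration (the real tile_hash comes from the surrounding rendering
# pipeline and maps a tile id to the leaf ids it contains). Note the optional
# ":dataset" suffix on output_file_name, which is parsed at the top of
# dump_write.
import numpy as np

if __name__ == "__main__":
    tile_hash = {"/1/5/7": [0, 1, 2, 3]}             # hypothetical tile -> leaf ids
    dump_write(render_folder_name="/data/rendered",  # hypothetical path
               full_volume_shape=np.array([2048, 2048, 1024]),
               dtype="uint16",
               color_channel_count=2,
               output_file_name="mouselight.n5:volume",
               tile_hash=tile_hash,
               leaf_level_count=4,
               tile_level_count=2,
               compression_method="gzip",
               compression_options={"level": 9},
               output_file_type="n5",
               do_use_simple_for_loop=True)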
def polish(client, settings): worldSize = MPI.COMM_WORLD.Get_size() - 2 # Setup with h5py.File(settings['databasePath'], 'r') as h5pyFile: database = SVDatabase(h5pyFile, settings['refStruct'], args.names) wait(database.load(h5pyFile)) names = list(database.attrs['structNames']) random.shuffle(names) splits = np.array_split(names, worldSize) from svreg.database import worker_load futures = client.map( worker_load, [settings['databasePath']] * worldSize, splits, [database.attrs['svNames']] * worldSize, [database.attrs['elements']] * worldSize, [settings['allSums']] * worldSize, ) client.gather(client.compute(futures)) evaluator = SVEvaluator(database, settings) regressor = SVRegressor(settings, database) costFxn = buildCostFunction(settings, len(database.attrs['natoms']), sum(database.attrs['natoms'].values())) if args.trees is not None: with open(args.trees, 'r') as f: treeNames = [s.strip() for s in f.readlines()] regressor.trees = [ MCTree.from_str(t, database.attrs['elements'], regressor.svNodePool) for t in treeNames ] else: from svreg.nodes import FunctionNode tree = MCTree(['Al']) from copy import deepcopy treeAl = SVTree() treeAl.nodes = [ FunctionNode('add'), FunctionNode('global'), deepcopy(regressor.svNodePool[1]), FunctionNode('add'), deepcopy(regressor.svNodePool[0]), # FunctionNode('softplus'), # FunctionNode('add'), # deepcopy(regressor.svNodePool[0]), # FunctionNode('softplus'), FunctionNode('add'), deepcopy(regressor.svNodePool[0]), deepcopy(regressor.svNodePool[1]), ] tree.chemistryTrees['Al'] = treeAl tree.updateSVNodes() regressor.trees = [tree] # tree = MCTree.from_file( # '/home/jvita/scripts/svreg/results/alznmg/al_lnames_conv/e799d5bc09fd37dba4e05f45a6c00e57/tree_file.pot', # database.attrs['elements'], # regressor.svNodePool # ) # tree.updateSVNodes() # regressor.trees = [tree] regressor.initializeOptimizers() savePath = os.path.join(settings['outputPath'], 'polished') if not os.path.isdir(savePath): os.mkdir(savePath) for tree in regressor.trees: print(tree) N = settings['optimizerPopSize'] from svreg.archive import Entry entries = {md5Hash(t): Entry(t, savePath) for t in regressor.trees} import pickle optStart = time.time() for optStep in range(1, settings['maxNumOptimizerSteps'] + 1): staleIndices, messages = regressor.checkStale() for staleIdx, staleMessage in zip(staleIndices, messages): print('Completed tree {}:'.format(staleIdx)) print("\t", regressor.optimizers[staleIdx].result.fbest, regressor.trees[staleIdx]) print("Stopping criterion:", staleMessage) del regressor.trees[staleIdx] del regressor.optimizers[staleIdx] populationDict, rawPopulations = regressor.generatePopulationDict(N) graph, keys = evaluator.evaluate(regressor.trees, populationDict, N, worldSize, settings['allSums'], useGPU=settings['useGPU']) perWorkerResults = client.get(graph, keys, direct=True) #, resources={'GPU': 1}) perStructResults, perStructNames = zip(*perWorkerResults) perStructResults = list( itertools.chain.from_iterable(perStructResults)) perStructNames = list(itertools.chain.from_iterable(perStructNames)) perStructResults = [ x for _, x in sorted(zip(perStructNames, perStructResults)) ] energies = {struct: [] for struct in database.attrs['structNames']} forces = {struct: [] for struct in database.attrs['structNames']} counter = 0 for struct in database.attrs['structNames']: res = perStructResults[counter] energies[struct] = [s[0] for s in res] forces[struct] = [s[1] for s in res] counter += 1 # Save the (per-struct) errors and the single-value costs errors = 
computeErrors(settings['refStruct'], energies, forces, database) costs = costFxn(errors) # Add ridge regression penalty penalties = np.array([ np.linalg.norm(pop, axis=1) * settings['ridgePenalty'] for pop in rawPopulations ]) # Update optimizers regressor.updateOptimizers(rawPopulations, costs, penalties) printTreeCosts(optStep, [opt.result.fbest for opt in regressor.optimizers], penalties, optStart) for treeNum, tree in enumerate(regressor.trees): opt = regressor.optimizers[treeNum] treeName = md5Hash(tree) entry = entries[treeName] bestIdx = np.argmin(costs[0]) entry.bestIdx = bestIdx entry.cost = costs[0][bestIdx] entry.bestParams = rawPopulations[0][bestIdx] entry.bestErrors = errors[0][bestIdx] bestEng = {} bestFcs = {} for s in energies: bestEng[s] = energies[s][0][bestIdx] bestFcs[s] = forces[s][0][bestIdx] pickle.dump( entry, open(os.path.join(savePath, treeName, 'entry.pkl'), 'wb')) pickle.dump( opt, open(os.path.join(savePath, treeName, 'opt.pkl'), 'wb')) pickle.dump( bestEng, open(os.path.join(savePath, treeName, 'energies.pkl'), 'wb')) pickle.dump( bestFcs, open(os.path.join(savePath, treeName, 'forces.pkl'), 'wb')) pickle.dump( tree, open(os.path.join(savePath, treeName, 'tree.pkl'), 'wb'))
def test_ranker(output, client, listen_port, group): if output == 'dataframe-with-categorical': X, y, w, g, dX, dy, dw, dg = _create_ranking_data( output=output, group=group, n_features=1, n_informative=1 ) else: X, y, w, g, dX, dy, dw, dg = _create_ranking_data( output=output, group=group, ) # rebalance small dask.Array dataset for better performance. if output == 'array': dX = dX.persist() dy = dy.persist() dw = dw.persist() dg = dg.persist() _ = wait([dX, dy, dw, dg]) client.rebalance() # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210. params = { "random_state": 42, "n_estimators": 50, "num_leaves": 20, "min_child_samples": 1 } dask_ranker = lgb.DaskLGBMRanker( client=client, time_out=5, local_listen_port=listen_port, tree_learner_type='data_parallel', **params ) dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg) rnkvec_dask = dask_ranker.predict(dX) rnkvec_dask = rnkvec_dask.compute() p1_pred_leaf = dask_ranker.predict(dX, pred_leaf=True) rnkvec_dask_local = dask_ranker.to_local().predict(X) local_ranker = lgb.LGBMRanker(**params) local_ranker.fit(X, y, sample_weight=w, group=g) rnkvec_local = local_ranker.predict(X) # distributed ranker should be able to rank decently well and should # have high rank correlation with scores from serial ranker. dcor = spearmanr(rnkvec_dask, y).correlation assert dcor > 0.6 assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8 assert_eq(rnkvec_dask, rnkvec_dask_local) # pref_leaf values should have the right shape # and values that look like valid tree nodes pred_leaf_vals = p1_pred_leaf.compute() assert pred_leaf_vals.shape == ( X.shape[0], dask_ranker.booster_.num_trees() ) assert np.max(pred_leaf_vals) <= params['num_leaves'] assert np.min(pred_leaf_vals) >= 0 assert len(np.unique(pred_leaf_vals)) <= params['num_leaves'] # be sure LightGBM actually used at least one categorical column, # and that it was correctly treated as a categorical feature if output == 'dataframe-with-categorical': cat_cols = [ col for col in dX.columns if dX.dtypes[col].name == 'category' ] tree_df = dask_ranker.booster_.trees_to_dataframe() node_uses_cat_col = tree_df['split_feature'].isin(cat_cols) assert node_uses_cat_col.sum() > 0 assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' client.close(timeout=CLIENT_CLOSE_TIMEOUT)
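# `output`, `client`, `listen_port` and `group` above are pytest fixtures and
# parameters in the original LightGBM Dask test module. A minimal sketch of
# invoking the test by hand, assuming that module's helpers
# (_create_ranking_data, CLIENT_CLOSE_TIMEOUT, assert_eq, etc.) are available;
# the port value is illustrative and `group=None` stands in for one of the
# parametrized group-size settings.
from dask.distributed import Client, LocalCluster

if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            test_ranker(output="array", client=client,
                        listen_port=12400, group=None)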