# Imports needed by the helpers below. The dask/toolz imports are standard;
# the dask_cudf aliases are RAPIDS-specific and their exact paths may vary
# by release. `Comms` is the RAPIDS comms handle providing get_workers();
# its import path differs across cugraph/cuml releases, so it is not pinned
# here.
import collections
from collections import OrderedDict

import numpy as np
from dask.array.core import Array as daskArray
from dask.distributed import default_client, futures_of, wait
from dask_cudf.core import DataFrame as daskDataFrame
from dask_cudf.core import Series as daskSeries
from toolz import first


async def _extract_partitions(dask_obj, client=None, batch_enabled=False):
    client = default_client() if client is None else client
    worker_list = Comms.get_workers()
    # dask.dataframe or dask.array
    if isinstance(dask_obj, (daskDataFrame, daskArray, daskSeries)):
        # parts = persist_distributed_data(dask_obj, client)
        # FIXME: persist data to the same worker when batch_enabled=True
        if batch_enabled:
            persisted = client.persist(dask_obj, workers=worker_list[0])
        else:
            # Pin partition p to worker w, one partition per worker
            persisted = [
                client.persist(dask_obj.get_partition(p), workers=w)
                for p, w in enumerate(worker_list)
            ]
        parts = futures_of(persisted)
    # iterable of dask collections (need to colocate them)
    elif isinstance(dask_obj, collections.abc.Sequence):
        # NOTE: We colocate (X, y) here by zipping delayed
        # n partitions of them as (X1, y1), (X2, y2)...
        # and asking client to compute a single future for
        # each tuple in the list
        dela = [np.asarray(d.to_delayed()) for d in dask_obj]

        # TODO: ravel() is causing strange behavior w/ delayed Arrays which
        # are not yet backed by futures. Need to investigate this behavior.
        # ref: https://github.com/rapidsai/cuml/issues/2045
        raveled = [d.flatten() for d in dela]
        parts = client.compute([p for p in zip(*raveled)])

    await wait(parts)
    key_to_part = [(str(part.key), part) for part in parts]
    who_has = await client.who_has(parts)
    return [(first(who_has[key]), part) for key, part in key_to_part]
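
# A minimal usage sketch for _extract_partitions (an assumption, not part
# of the original module): since the function is a coroutine, a synchronous
# caller can drive it through Client.sync. `ddf` stands for any dask_cudf
# DataFrame, and Comms is assumed to be initialized.
def _example_extract_partitions(ddf, client):
    # Blocks until every partition is persisted, then returns a list of
    # (worker_address, partition_future) pairs
    worker_part_pairs = client.sync(_extract_partitions, ddf, client)
    for worker, part in worker_part_pairs:
        print(worker, part.key)
    return worker_part_pairs
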
def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
    """
    Take a COO stored in a distributed DataFrame, plus the column names of
    the source and destination columns, and create a new DataFrame using
    the same column names that symmetrizes the graph so that all edges
    appear in both directions.

    Note that if other columns exist in the DataFrame (e.g. edge weights)
    the other columns will also be replicated. That is, if (u, v, data)
    represents the source value (u), destination value (v) and some set of
    other columns (data) in the input data, then the output data will
    contain both (u, v, data) and (v, u, data) with matching data.

    If (u, v, data1) and (v, u, data2) exist in the input data where
    data1 != data2, then this code will arbitrarily pick the smaller data
    element to keep; if this is not desired then the caller should correct
    the data prior to calling symmetrize.

    Parameters
    ----------
    df : dask_cudf.DataFrame
        Input DataFrame containing the COO. Columns should contain source
        ids, destination ids and any properties associated with the edges.
    src_name : string
        Name of the column in the DataFrame containing the source ids.
    dst_name : string
        Name of the column in the DataFrame containing the destination ids.
    weight_name : string, optional (default=None)
        Name of the column in the DataFrame containing the weights.

    Examples
    --------
    >>> # ddf is a dask_cudf.DataFrame with 'src' and 'dst' columns
    >>> sym_ddf = symmetrize_ddf(ddf, 'src', 'dst')
    """
    if weight_name:
        ddf2 = df[[dst_name, src_name, weight_name]]
        ddf2.columns = [src_name, dst_name, weight_name]
    else:
        ddf2 = df[[dst_name, src_name]]
        ddf2.columns = [src_name, dst_name]

    worker_list = Comms.get_workers()
    num_workers = len(worker_list)
    ddf = df.append(ddf2).reset_index(drop=True)
    result = ddf.shuffle(on=[src_name, dst_name],
                         ignore_index=True,
                         npartitions=num_workers)
    result = result.map_partitions(
        lambda x: x.groupby(by=[src_name, dst_name], as_index=False)
        .min()
        .reset_index(drop=True))
    return result
def persist_distributed_data(dask_df, client):
    client = default_client() if client is None else client
    worker_addresses = Comms.get_workers()
    _keys = dask_df.__dask_keys__()
    # Map each partition's dask key to a single worker address so that
    # client.persist pins partition i to worker i. This assumes the
    # DataFrame has no more partitions than there are workers.
    worker_dict = {}
    for i, key in enumerate(_keys):
        worker_dict[str(key)] = (worker_addresses[i],)
    persisted = client.persist(dask_df, workers=worker_dict)
    parts = futures_of(persisted)
    return parts
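
# Hedged sketch (not from the original module) showing the contract
# persist_distributed_data relies on: client.persist accepts a dict mapping
# str(dask key) -> tuple of worker addresses, pinning each partition in
# place. Assumes ddf.npartitions does not exceed the number of comms
# workers.
def _example_persist(ddf, client):
    parts = persist_distributed_data(ddf, client)
    # One future per partition, each resident on the worker it was pinned to
    assert len(parts) == ddf.npartitions
    return parts
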
def _workers_to_parts(futures):
    """
    Builds an ordered dict mapping each worker to its list of parts.

    :param futures: list of (worker, part) tuples
    :return: OrderedDict mapping worker -> list of part futures;
        NOTE: workers that received no parts keep a value of None
    """
    w_to_p_map = OrderedDict.fromkeys(Comms.get_workers())
    for w, p in futures:
        if w_to_p_map[w] is None:
            w_to_p_map[w] = []
        w_to_p_map[w].append(p)
    return w_to_p_map
def _workers_to_parts(futures):
    """
    Builds an ordered dict mapping each worker to its list of parts,
    dropping workers that received no parts.

    :param futures: list of (worker, part) tuples
    :return: OrderedDict mapping worker -> non-empty list of part futures
    """
    w_to_p_map = OrderedDict.fromkeys(Comms.get_workers())
    for w, p in futures:
        if w_to_p_map[w] is None:
            w_to_p_map[w] = []
        w_to_p_map[w].append(p)
    # Remove workers that ended up with no parts
    keys_to_delete = [w for (w, p) in w_to_p_map.items() if p is None]
    for k in keys_to_delete:
        del w_to_p_map[k]
    return w_to_p_map
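
# Hedged sketch tying the helpers above together (the driver name is an
# assumption): extract (worker, part) pairs, then group them per worker so
# downstream per-worker tasks can be submitted with worker affinity.
def _example_group_parts_by_worker(ddf, client):
    worker_part_pairs = client.sync(_extract_partitions, ddf, client)
    w_to_p = _workers_to_parts(worker_part_pairs)
    for worker, part_list in w_to_p.items():
        print(worker, "holds", len(part_list), "partition(s)")
    return w_to_p
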
def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
    """
    Take a COO stored in a distributed DataFrame, plus the column names of
    the source and destination columns, and create a new DataFrame using
    the same column names that symmetrizes the graph so that all edges
    appear in both directions.

    Note that if other columns exist in the DataFrame (e.g. edge weights)
    the other columns will also be replicated. That is, if (u, v, data)
    represents the source value (u), destination value (v) and some set of
    other columns (data) in the input data, then the output data will
    contain both (u, v, data) and (v, u, data) with matching data.

    If (u, v, data1) and (v, u, data2) exist in the input data where
    data1 != data2, then this code will arbitrarily pick the smaller data
    element to keep; if this is not desired then the caller should correct
    the data prior to calling symmetrize.

    Parameters
    ----------
    df : dask_cudf.DataFrame
        Input DataFrame containing the COO. Columns should contain source
        ids, destination ids and any properties associated with the edges.
    src_name : string
        Name of the column in the DataFrame containing the source ids.
    dst_name : string
        Name of the column in the DataFrame containing the destination ids.
    weight_name : string, optional (default=None)
        Name of the column in the DataFrame containing the weights.

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # from cugraph.structure.symmetrize import symmetrize_ddf
    >>> # Init a DASK Cluster
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets / 'karate.csv')
    >>> # ddf = dask_cudf.read_csv(datasets/'karate.csv', chunksize=chunksize,
    >>> #                          delimiter=' ',
    >>> #                          names=['src', 'dst', 'weight'],
    >>> #                          dtype=['int32', 'int32', 'float32'])
    >>> # sym_ddf = symmetrize_ddf(ddf, "src", "dst", "weight")
    """
    # FIXME: fix and uncomment the (currently broken) example above
    if weight_name:
        ddf2 = df[[dst_name, src_name, weight_name]]
        ddf2.columns = [src_name, dst_name, weight_name]
    else:
        ddf2 = df[[dst_name, src_name]]
        ddf2.columns = [src_name, dst_name]

    worker_list = Comms.get_workers()
    num_workers = len(worker_list)
    ddf = df.append(ddf2).reset_index(drop=True)
    result = ddf.shuffle(on=[src_name, dst_name],
                         ignore_index=True,
                         npartitions=num_workers)
    result = result.map_partitions(
        lambda x: x.groupby(by=[src_name, dst_name], as_index=False)
        .min()
        .reset_index(drop=True))
    return result
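
# A hedged, runnable adaptation of the broken docstring example above.
# The CSV path and column names are assumptions; it requires a running
# dask cluster, an initialized Comms, and dask_cudf installed.
def _example_symmetrize(csv_path):
    import dask_cudf
    ddf = dask_cudf.read_csv(csv_path,
                             delimiter=' ',
                             names=['src', 'dst', 'weight'],
                             dtype=['int32', 'int32', 'float32'])
    sym_ddf = symmetrize_ddf(ddf, 'src', 'dst', 'weight')
    # Each undirected edge now appears as both (u, v) and (v, u); when the
    # two directions carried different weights, the smaller one was kept
    return sym_ddf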