def test_concat_two_empty_series(ignore_index, axis):
    s1 = gd.Series()
    s2 = gd.Series()
    ps1 = s1.to_pandas()
    ps2 = s2.to_pandas()
    got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
    expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)

    assert_eq(got, expect)


@pytest.mark.parametrize(
    "df1,df2",
    [
        (
            gd.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}),
            gd.DataFrame({"k1": [1, 0], "k2": [3, 2], "v2": [6, 7]}),
        ),
        (
            gd.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}),
            gd.DataFrame({"k1": [0, 1], "k2": [3, 2], "v2": [6, 7]}),
        ),
    ],
)
def test_concat_dataframe_with_multiIndex(df1, df2):
    gdf1 = df1
    gdf1 = gdf1.set_index(["k1", "k2"])
    gdf2 = df2
    gdf2 = gdf2.set_index(["k1", "k2"])
def degrees(self, vertex_subset=None): """ Compute vertex in-degree and out-degree. By default, this method computes vertex degrees for the entire set of vertices. If vertex_subset is provided, this method optionally filters out all but those listed in vertex_subset. Parameters ---------- vertex_subset : cudf.Series or iterable container, optional A container of vertices for displaying corresponding degree. If not set, degrees are computed for the entire set of vertices. Returns ------- df : cudf.DataFrame df['vertex'] : cudf.Series The vertex IDs (will be identical to vertex_subset if specified). df['in_degree'] : cudf.Series The in-degree of the vertex. df['out_degree'] : cudf.Series The out-degree of the vertex. Examples -------- >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', >>> dtype=['int32', 'int32', 'float32'], header=None) >>> sources = cudf.Series(M['0']) >>> destinations = cudf.Series(M['1']) >>> G = cugraph.Graph() >>> G.add_edge_list(sources, destinations, None) >>> df = G.degrees([0,9,12]) """ vertex_col, in_degree_col, out_degree_col = graph_new_wrapper._degrees( self) df = cudf.DataFrame() if vertex_subset is None: if self.renumbered is True: df['vertex'] = self.edgelist.renumber_map[vertex_col] else: df['vertex'] = vertex_col df['in_degree'] = in_degree_col df['out_degree'] = out_degree_col else: df['vertex'] = cudf.Series( np.asarray(vertex_subset, dtype=np.int32)) if self.renumbered is True: renumber_series = cudf.Series(self.edgelist.renumber_map.index, index=self.edgelist.renumber_map) vertices_renumbered = renumber_series.loc[vertex_subset] df['in_degree'] = cudf.Series( np.asarray([in_degree_col[i] for i in vertices_renumbered], dtype=np.int32)) df['out_degree'] = cudf.Series( np.asarray( [out_degree_col[i] for i in vertices_renumbered], dtype=np.int32)) else: df['in_degree'] = cudf.Series( np.asarray([in_degree_col[i] for i in vertex_subset], dtype=np.int32)) df['out_degree'] = cudf.Series( np.asarray([out_degree_col[i] for i in vertex_subset], dtype=np.int32)) return df
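# Hedged usage sketch for the degrees() method above. The graph-construction
# call is an assumption: newer cugraph releases use from_cudf_edgelist()
# rather than the add_edge_list() shown in the docstring, so adjust to the
# installed version.
import cudf
import cugraph

edges = cudf.DataFrame({"src": [0, 1, 2, 2], "dst": [1, 2, 0, 3]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst")

# Degrees for every vertex, then only for a subset of interest.
all_degrees = G.degrees()
subset_degrees = G.degrees(vertex_subset=[0, 2])
print(subset_degrees)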
def test_gpu_file_iterator_ds(df, dataset, batch, engine):
    df_itr = cudf.DataFrame()
    for data_gd in dataset.to_iter(columns=mycols_csv):
        df_itr = cudf.concat([df_itr, data_gd], axis=0) if df_itr else data_gd

    assert_eq(df_itr.reset_index(drop=True), df.reset_index(drop=True))
def test_mh_support(tmpdir, batch_size): data = { "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]], "Reviewers": [ ["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"], ], "Engaging User": ["User_B", "User_B", "User_A", "User_D"], "Embedding": [ [0.1, 0.2, 0.3], [0.3, 0.4, 0.5], [0.6, 0.7, 0.8], [0.8, 0.4, 0.2], ], "Post": [1, 2, 3, 4], } df = cudf.DataFrame(data) cat_names = ["Authors", "Reviewers", "Engaging User"] cont_names = ["Embedding"] label_name = ["Post"] processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) processor.add_preprocess(ops.HashBucket(num_buckets=10)) processor.finalize() data_itr = tf_dataloader.KerasSequenceLoader( nvt.Dataset(df), cat_names=cat_names, cont_names=cont_names, label_names=label_name, batch_size=batch_size, shuffle=False, ) data_itr.map(processor) idx = 0 for X, y in data_itr: assert len(X) == 7 n_samples = y.shape[0] for mh_name in ["Authors", "Reviewers", "Embedding"]: for postfix in ["__nnzs", "__values"]: assert (mh_name + postfix) in X array = X[mh_name + postfix].numpy()[:, 0] if postfix == "__nnzs": if mh_name == "Embedding": assert (array == 3).all() else: lens = [ len(x) for x in data[mh_name][idx * batch_size:idx * batch_size + n_samples] ] assert (array == np.array(lens)).all() else: if mh_name == "Embedding": assert len(array) == (n_samples * 3) else: assert len(array) == sum(lens) idx += 1 assert idx == (3 // batch_size + 1)
def _encode( name, storage_name, path, gdf, cat_cache, na_sentinel=-1, freq_threshold=0, search_sorted=False, buckets=None, encode_type="joint", cat_names=None, ): if isinstance(buckets, int): buckets = {name: buckets for name in cat_names} value = None selection_l = name if isinstance(name, list) else [name] selection_r = name if isinstance(name, list) else [storage_name] list_col = _is_list_col(selection_l, gdf) if path: if cat_cache is not None: cat_cache = (cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")) if len(gdf): with get_worker_cache("cats") as cache: value = fetch_table_data(cache, path, columns=selection_r, cache=cat_cache, cats_only=True) else: value = cudf.io.read_parquet(path, index=False, columns=selection_r) value.index.name = "labels" value.reset_index(drop=False, inplace=True) if value is None: value = cudf.DataFrame() for c in selection_r: typ = gdf[selection_l[0]].dtype if len( selection_l) == 1 else gdf[c].dtype value[c] = cudf.Series([None], dtype=typ) value.index.name = "labels" value.reset_index(drop=False, inplace=True) if not search_sorted: if list_col: codes = cudf.DataFrame( {selection_l[0]: gdf[selection_l[0]].list.leaves}) codes["order"] = cp.arange(len(codes)) else: codes = cudf.DataFrame({"order": cp.arange(len(gdf))}, index=gdf.index) for c in selection_l: codes[c] = gdf[c].copy() if buckets and storage_name in buckets: na_sentinel = _hash_bucket(gdf, buckets, selection_l, encode_type=encode_type) # apply frequency hashing if freq_threshold and buckets and storage_name in buckets: merged_df = codes.merge(value, left_on=selection_l, right_on=selection_r, how="left").sort_values("order") merged_df.reset_index(drop=True, inplace=True) max_id = merged_df["labels"].max() merged_df["labels"].fillna(cudf.Series(na_sentinel + max_id + 1), inplace=True) labels = merged_df["labels"].values # only do hashing elif buckets and storage_name in buckets: labels = na_sentinel # no hashing else: na_sentinel = 0 labels = codes.merge(value, left_on=selection_l, right_on=selection_r, how="left").sort_values("order")["labels"] labels.fillna(na_sentinel, inplace=True) labels = labels.values else: # Use `searchsorted` if we are using a "full" encoding if list_col: labels = value[selection_r].searchsorted( gdf[selection_l[0]].list.leaves, side="left", na_position="first") else: labels = value[selection_r].searchsorted(gdf[selection_l], side="left", na_position="first") labels[labels >= len(value[selection_r])] = na_sentinel if list_col: labels = _encode_list_column(gdf[selection_l[0]], labels) return labels
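# Minimal sketch of the merge-based encoding used in the non-search_sorted
# branch of _encode() above: the unique category values get a "labels" index,
# the observed column is left-merged against that table, and unseen values
# fall back to a sentinel. Column names here ("Author") are illustrative only.
import cudf
import cupy as cp

gdf = cudf.DataFrame({"Author": ["A", "B", "A", "Z"]})

value = cudf.DataFrame({"Author": ["A", "B", "C"]})
value.index.name = "labels"
value.reset_index(drop=False, inplace=True)

codes = cudf.DataFrame({"Author": gdf["Author"].copy(),
                        "order": cp.arange(len(gdf))})
labels = codes.merge(value, on="Author",
                     how="left").sort_values("order")["labels"]
labels = labels.fillna(0).values  # 0 plays the role of na_sentinel here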
def _stratify_split(X, y, n_train, n_test, x_numba, y_numba, random_state): """ Function to perform a stratified split based on y lables. Based on scikit-learn stratified split implementation. Parameters ---------- X, y: Shuffled input data and labels n_train: Number of samples in train set n_test: number of samples in test set x_numba: Determines whether the data should be converted to numba y_numba: Determines whether the labales should be converted to numba Returns ------- X_train, X_test: Data X divided into train and test sets y_train, y_test: Labels divided into train and test sets """ x_cudf = False y_cudf = False if isinstance(X, cudf.DataFrame): x_cudf = True elif hasattr(X, "__cuda_array_interface__"): X = cp.asarray(X) x_order = _strides_to_order(X.__cuda_array_interface__['strides'], cp.dtype(X.dtype)) if isinstance(y, cudf.Series): y_cudf = True elif hasattr(y, "__cuda_array_interface__"): y = cp.asarray(y) y_order = _strides_to_order(y.__cuda_array_interface__['strides'], cp.dtype(y.dtype)) elif isinstance(y, cudf.DataFrame): y_cudf = True # ensuring it has just one column if y.shape[1] != 1: raise ValueError('Expected one label, but found y' 'with shape = %d' % (y.shape)) classes, y_indices = cp.unique(y.values if y_cudf else y, return_inverse=True) n_classes = classes.shape[0] class_counts = cp.bincount(y_indices) if n_train < n_classes: raise ValueError('The train_size = %d should be greater or ' 'equal to the number of classes = %d' % (n_train, n_classes)) if n_test < n_classes: raise ValueError('The test_size = %d should be greater or ' 'equal to the number of classes = %d' % (n_test, n_classes)) class_indices = cp.array_split(cp.argsort(y_indices), n_classes) X_train = None # random_state won't be None or int, that's handled earlier if isinstance(random_state, np.random.RandomState): random_state = cp.random.RandomState(seed=random_state.get_state()[1]) # Break ties n_i = _approximate_mode(class_counts, n_train, random_state) class_counts_remaining = class_counts - n_i t_i = _approximate_mode(class_counts_remaining, n_test, random_state) for i in range(n_classes): permutation = random_state.permutation(class_counts[i].item()) perm_indices_class_i = class_indices[i].take(permutation) if hasattr(X, "__cuda_array_interface__") or \ isinstance(X, cupyx.scipy.sparse.csr_matrix): X_train_i = cp.array(X[perm_indices_class_i[:n_i[i]]], order=x_order) X_test_i = cp.array(X[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]], order=x_order) y_train_i = cp.array(y[perm_indices_class_i[:n_i[i]]], order=y_order) y_test_i = cp.array(y[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]], order=y_order) if X_train is None: X_train = cp.array(X_train_i, order=x_order) y_train = cp.array(y_train_i, order=y_order) X_test = cp.array(X_test_i, order=x_order) y_test = cp.array(y_test_i, order=y_order) else: X_train = cp.concatenate([X_train, X_train_i], axis=0) X_test = cp.concatenate([X_test, X_test_i], axis=0) y_train = cp.concatenate([y_train, y_train_i], axis=0) y_test = cp.concatenate([y_test, y_test_i], axis=0) elif x_cudf: X_train_i = X.iloc[perm_indices_class_i[:n_i[i]]] X_test_i = X.iloc[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]] y_train_i = y.iloc[perm_indices_class_i[:n_i[i]]] y_test_i = y.iloc[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]] if X_train is None: X_train = X_train_i y_train = y_train_i X_test = X_test_i y_test = y_test_i else: X_train = cudf.concat([X_train, X_train_i], ignore_index=False) X_test = cudf.concat([X_test, X_test_i], ignore_index=False) y_train = 
cudf.concat([y_train, y_train_i], ignore_index=False) y_test = cudf.concat([y_test, y_test_i], ignore_index=False) if x_numba: X_train = cuda.as_cuda_array(X_train) X_test = cuda.as_cuda_array(X_test) elif x_cudf: X_train = cudf.DataFrame(X_train) X_test = cudf.DataFrame(X_test) if y_numba: y_train = cuda.as_cuda_array(y_train) y_test = cuda.as_cuda_array(y_test) elif y_cudf: y_train = cudf.DataFrame(y_train) y_test = cudf.DataFrame(y_test) return X_train, X_test, y_train, y_test
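# Small sketch of the per-class bookkeeping that _stratify_split() above
# relies on: cp.unique(..., return_inverse=True) maps labels to class ids,
# cp.bincount gives per-class counts, and argsort + array_split yields the
# per-class row-index chunks (here with balanced classes so the chunks line
# up exactly). Values below are made up for illustration.
import cupy as cp

y = cp.array([0, 1, 2, 0, 1, 2])
classes, y_indices = cp.unique(y, return_inverse=True)
class_counts = cp.bincount(y_indices)                      # [2, 2, 2]
class_indices = cp.array_split(cp.argsort(y_indices), len(classes))
print(classes, class_counts)
print([idx.tolist() for idx in class_indices])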
def find_and_replace(self, to_replace, replacement, all_nan): """ Return col with *to_replace* replaced with *replacement*. """ # create a dataframe containing the pre-replacement categories # and a copy of them to work with. The index of this dataframe # represents the original ints that map to the categories old_cats = cudf.DataFrame() old_cats["cats"] = column.as_column(self.dtype.categories) new_cats = old_cats.copy(deep=True) # Create a column with the appropriate labels replaced old_cats["cats_replace"] = old_cats["cats"].replace( to_replace, replacement ) # Construct the new categorical labels # If a category is being replaced by an existing one, we # want to map it to None. If it's totally new, we want to # map it to the new label it is to be replaced by dtype_replace = cudf.Series(replacement) dtype_replace[dtype_replace.isin(old_cats["cats"])] = None new_cats["cats"] = new_cats["cats"].replace(to_replace, dtype_replace) # anything we mapped to None, we want to now filter out since # those categories don't exist anymore # Resetting the index creates a column 'index' that associates # the original integers to the new labels bmask = new_cats["cats"]._column.notna() new_cats = cudf.DataFrame( {"cats": new_cats["cats"]._column.apply_boolean_mask(bmask)} ).reset_index() # old_cats contains replaced categories and the ints that # previously mapped to those categories and the index of # new_cats is a RangeIndex that contains the new ints catmap = old_cats.merge( new_cats, left_on="cats_replace", right_on="cats", how="inner" ) # The index of this frame is now the old ints, but the column # named 'index', which came from the filtered categories, # contains the new ints that we need to map to to_replace_col = column.as_column(catmap.index).astype( self.cat().codes.dtype ) replacement_col = catmap["index"]._column.astype( self.cat().codes.dtype ) replaced = column.as_column(self.cat().codes) output = libcudf.replace.replace( replaced, to_replace_col, replacement_col ) return column.build_categorical_column( categories=new_cats["cats"], codes=column.as_column(output.base_data, dtype=output.dtype), mask=output.base_mask, offset=output.offset, size=output.size, ordered=self.dtype.ordered, )
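# Stand-alone sketch of the remapping idea behind find_and_replace() above:
# the old categories (whose integer position is the old code) are merged
# against the filtered new categories (whose RangeIndex is the new code),
# yielding an old-code -> new-code table. Values are illustrative.
import cudf

old_cats = cudf.DataFrame({"cats": ["a", "b", "c"]})
old_cats["cats_replace"] = old_cats["cats"].replace("b", "c")  # b -> c

new_cats = cudf.DataFrame({"cats": ["a", "c"]}).reset_index()  # surviving cats
catmap = old_cats.merge(new_cats, left_on="cats_replace", right_on="cats",
                        how="inner")
# catmap.index holds the old codes, catmap["index"] holds the new codes
print(catmap)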
def _query6(self): self._loadTables('query6') self.rideReqTable = self.rideReqTable[ self.rideReqTable['rideReq.time'] < self.rideReqTable.shape[0] / 10] rideReqIndex = self._createIndex( self.rideReqTable, 'rideReq.start', ) driverStatusIndex = self._createIndex( self.driverStatusTable, 'drvStat.pos', ) locationPolygon = self._createBox( self.locationTable, 'loc.bounds', ) trainX = {} for i in range(10): trainX['c{}'.format(i)] = np.random.rand(1000) trainX = cudf.DataFrame(trainX) trainY = np.random.choice([0.0, 1.0], size=1000) trainY = cudf.Series(trainY) linReg = cuml.LinearRegression() linReg.fit(trainX, trainY) startTime = time.time() (joinRideReq, numbaTime0) = self._spatialJoinDist( self.rideReqTable, self.locationTable, 'rideReq.start', 'loc.bounds', rideReqIndex, locationPolygon, 0.0) joinRideReq['count'] = 0 reqGroup = joinRideReq.groupby(['loc.locationId'], ).agg({ 'count': 'count', }).reset_index() (joinDriver, numbaTime1) = self._spatialJoinDist( self.driverStatusTable, self.locationTable, 'drvStat.pos', 'loc.bounds', driverStatusIndex, locationPolygon, 0.0) joinDriver['count'] = 0 driverGroup = joinDriver.groupby(['loc.locationId'], ).agg({ 'count': 'count', }).reset_index() join0 = reqGroup.merge(driverGroup, on='loc.locationId') join1 = join0.merge(self.locationTable, on='loc.locationId') featureName = [ 'loc.c0', 'loc.c1', 'loc.c2', 'loc.c3', 'loc.c4', 'loc.c5', 'loc.c6', 'loc.c7', 'loc.c8', 'loc.c9', ] join1['infer'] = linReg.predict(join1[featureName]) endTime = time.time() join1.to_csv( 'query6_gpu.csv', index=False, ) return endTime - startTime - numbaTime0 - numbaTime1
async def test_ucx_localcluster(): async with LocalCUDACluster( protocol="ucx", dashboard_address=None, n_workers=2, threads_per_worker=1, processes=True, asynchronous=True, enable_tcp_over_ucx=enable_tcp_over_ucx, enable_nvlink=enable_nvlink, enable_infiniband=enable_infiniband, ) as cluster: async with Client(cluster, asynchronous=True) as client: """ Next, simply call list using an asynchronous Dask client. The callback function is pushed to the workers and invoked when a message is received with a BlazingMessage """ try: ips_ports = await listen_async(callback=mock_msg_callback, client=client) print(str(ips_ports)) "<<<<<<<<<< Begin Test Logic >>>>>>>>>>>>" assert len(ips_ports) == len( client.scheduler_info()["workers"]) for k, v in ips_ports.items(): assert v is not None import numpy meta = {"worker_ids": tuple(ips_ports.keys())} data = cudf.DataFrame({ "%s" % x: cudf.Series(np.arange(37000)) for x in range(50) }) """ Loop through each of the workers, sending a test BlazingMessage to all other workers. """ for dask_addr, blazing_addr in ips_ports.items(): msg = BlazingMessage(meta, data) for n in range(1): async def send(msg): await UCX.get().send(msg) await client.run(send, msg, workers=[dask_addr], wait=True) """ Gather messages received on each worker for validation """ received = await client.run( lambda: get_worker()._test_msgs_received, wait=True) assert len(received) == len(ips_ports) for worker_addr, msgs in received.items(): for msg in msgs: cudf_test.assert_eq(msg.data, data) assert msg.metadata == meta assert len(msgs) == len(ips_ports) finally: print("Cleaning up") await cleanup(client)
def symmetrize_df(df, src_name, dst_name, multi=False, symmetrize=True): """ Take a COO stored in a DataFrame, along with the column names of the source and destination columns and create a new data frame using the same column names that symmetrize the graph so that all edges appear in both directions. Note that if other columns exist in the data frame (e.g. edge weights) the other columns will also be replicated. That is, if (u,v,data) represents the source value (u), destination value (v) and some set of other columns (data) in the input data, then the output data will contain both (u,v,data) and (v,u,data) with matching data. If (u,v,data1) and (v,u,data2) exist in the input data where data1 != data2 then this code will arbitrarily pick the smaller data element to keep, if this is not desired then the caller should should correct the data prior to calling symmetrize. Parameters ---------- df : cudf.DataFrame Input data frame containing COO. Columns should contain source ids, destination ids and any properties associated with the edges. src_name : string Name of the column in the data frame containing the source ids dst_name : string Name of the column in the data frame containing the destination ids multi : bool Set to True if graph is a Multi(Di)Graph. This allows multiple edges instead of dropping them. symmetrize : bool Default is True to perform symmetrization. If False only duplicate edges are dropped. Examples -------- >>> import cugraph.dask as dcg >>> Comms.initialize() >>> chunksize = dcg.get_chunksize(input_data_path) >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, delimiter=' ', names=['src', 'dst', 'weight'], dtype=['int32', 'int32', 'float32']) >>> sym_ddf = cugraph.symmetrize_ddf(ddf, "src", "dst", "weight") >>> Comms.destroy() """ # # Now append the columns. We add sources to the end of destinations, # and destinations to the end of sources. Otherwise we append a # column onto itself. # if symmetrize: gdf = cudf.DataFrame() for idx, name in enumerate(df.columns): if name == src_name: gdf[src_name] = df[src_name].append(df[dst_name], ignore_index=True) elif name == dst_name: gdf[dst_name] = df[dst_name].append(df[src_name], ignore_index=True) else: gdf[name] = df[name].append(df[name], ignore_index=True) else: gdf = df if multi: return gdf else: return gdf.groupby(by=[src_name, dst_name], as_index=False).min()
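# Hedged example of the symmetrize-then-dedup pattern implemented by
# symmetrize_df() above, written with cudf.concat instead of Series.append
# (append is deprecated in newer cudf releases).
import cudf

df = cudf.DataFrame({"src": [0, 1], "dst": [1, 2], "weight": [1.0, 2.0]})
gdf = cudf.DataFrame({
    "src": cudf.concat([df["src"], df["dst"]], ignore_index=True),
    "dst": cudf.concat([df["dst"], df["src"]], ignore_index=True),
    "weight": cudf.concat([df["weight"], df["weight"]], ignore_index=True),
})
sym = gdf.groupby(by=["src", "dst"], as_index=False).min()
print(sym)  # each edge now appears in both directions, duplicates collapsed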
def convert_output_to_cudf(input_G_or_matrix, cugraph_result): """ Convert cugraph_result to a cudf DataFrame. The conversion is based on the type of input_G_or_matrix, since different input types result in different cugraph_result types (see cugraph_input_output_map). """ input_type = type(input_G_or_matrix) expected_return_type = cuGraph_input_output_map[type(input_G_or_matrix)] assert type(cugraph_result) is expected_return_type if expected_return_type is cudf.DataFrame: return cugraph_result elif expected_return_type is pd.DataFrame: return cudf.from_pandas(cugraph_result) # A CuPy/SciPy input means the return value will be a 2-tuple of: # distance: cupy.ndarray # ndarray of shortest distances between source and vertex. # predecessor: cupy.ndarray # ndarray of predecessors of a vertex on the path from source, which # can be used to reconstruct the shortest paths. # or a 3-tuple of the above 2 plus # sp_counter: cupy.ndarray # for the i'th position in the array, the number of shortest paths # leading to the vertex at position i in the (input) vertex array. elif expected_return_type is tuple: if input_type in cupy_types: assert type(cugraph_result[0]) is cp.ndarray assert type(cugraph_result[1]) is cp.ndarray if len(cugraph_result) == 3: assert type(cugraph_result[2]) is cp.ndarray else: assert type(cugraph_result[0]) is np.ndarray assert type(cugraph_result[1]) is np.ndarray if len(cugraph_result) == 3: assert type(cugraph_result[2]) is np.ndarray # Get unique verts from input since they are not incuded in output if type(input_G_or_matrix) in [ cp_csr_matrix, cp_csc_matrix, sp_csr_matrix, sp_csc_matrix ]: coo = input_G_or_matrix.tocoo(copy=False) else: coo = input_G_or_matrix verts = sorted( set([n.item() for n in coo.col] + [n.item() for n in coo.row])) dists = [n.item() for n in cugraph_result[0]] preds = [n.item() for n in cugraph_result[1]] assert len(verts) == len(dists) == len(preds) d = {"vertex": verts, "distance": dists, "predecessor": preds} if len(cugraph_result) == 3: counters = [n.item() for n in cugraph_result[2]] assert len(counters) == len(verts) d.update({"sp_counter": counters}) return cudf.DataFrame(d) else: raise RuntimeError(f"unsupported return type: {expected_return_type}")
def symmetrize(source_col, dest_col, value_col=None, multi=False, symmetrize=True): """ Take a COO set of source destination pairs along with associated values stored in a single GPU or distributed create a new COO set of source destination pairs along with values where all edges exist in both directions. Return from this call will be a COO stored as two cudf Series or dask_cudf.Series -the symmetrized source column and the symmetrized dest column, along with an optional cudf Series containing the associated values (only if the values are passed in). Parameters ---------- source_col : cudf.Series or dask_cudf.Series This cudf.Series wraps a gdf_column of size E (E: number of edges). The gdf column contains the source index for each edge. Source indices must be an integer type. dest_col : cudf.Series or dask_cudf.Series This cudf.Series wraps a gdf_column of size E (E: number of edges). The gdf column contains the destination index for each edge. Destination indices must be an integer type. value_col : cudf.Series or dask_cudf.Series (optional) This cudf.Series wraps a gdf_column of size E (E: number of edges). The gdf column contains values associated with this edge. For this function the values can be any type, they are not examined, just copied. Examples -------- >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', >>> dtype=['int32', 'int32', 'float32'], header=None) >>> sources = cudf.Series(M['0']) >>> destinations = cudf.Series(M['1']) >>> values = cudf.Series(M['2']) >>> src, dst, val = cugraph.symmetrize(sources, destinations, values) """ input_df = None weight_name = None if type(source_col) is dask_cudf.Series: # FIXME convoluted way of just wrapping dask cudf Series in a ddf input_df = source_col.to_frame() input_df = input_df.rename(columns={source_col.name: "source"}) input_df["destination"] = dest_col else: input_df = cudf.DataFrame({ "source": source_col, "destination": dest_col }) csg.null_check(source_col) csg.null_check(dest_col) if value_col is not None: if isinstance(value_col, cudf.Series): weight_name = "value" input_df.insert(len(input_df.columns), "value", value_col) elif isinstance(value_col, cudf.DataFrame): input_df = cudf.concat([input_df, value_col], axis=1) output_df = None if type(source_col) is dask_cudf.Series: output_df = symmetrize_ddf(input_df, "source", "destination", weight_name).persist() else: output_df = symmetrize_df(input_df, "source", "destination", multi, symmetrize) if value_col is not None: if isinstance(value_col, cudf.Series): return ( output_df["source"], output_df["destination"], output_df["value"], ) elif isinstance(value_col, cudf.DataFrame): return ( output_df["source"], output_df["destination"], output_df[value_col.columns], ) return output_df["source"], output_df["destination"]
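# Usage sketch for the symmetrize() wrapper above, on the single-GPU
# (cudf.Series) path with an optional value column.
import cudf
import cugraph

sources = cudf.Series([0, 1, 2])
destinations = cudf.Series([1, 2, 0])
values = cudf.Series([1.0, 2.0, 3.0])

src, dst, val = cugraph.symmetrize(sources, destinations, values)
print(len(src))  # every edge is now present in both directions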
async def test_cuda_backend(): import cupy import cudf params, teardown_params = await CudaStorage.setup() storage = CudaStorage(**params) assert storage.level == StorageLevel.GPU data1 = cupy.asarray(np.random.rand(10, 10)) put_info1 = await storage.put(data1) get_data1 = await storage.get(put_info1.object_id) cupy.testing.assert_array_equal(data1, get_data1) info1 = await storage.object_info(put_info1.object_id) assert info1.size == put_info1.size await storage.delete(put_info1.object_id) data2 = cudf.DataFrame( pd.DataFrame( { 'col1': np.arange(10), 'col2': [f'str{i}' for i in range(10)], 'col3': np.random.rand(10) }, )) put_info2 = await storage.put(data2) get_data2 = await storage.get(put_info2.object_id) cudf.testing.assert_frame_equal(data2, get_data2) info2 = await storage.object_info(put_info2.object_id) assert info2.size == put_info2.size await CudaStorage.teardown(**teardown_params) # test writer and reader t = np.random.random(10) buffers = await AioSerializer(t).run() size = sum(getattr(buf, 'nbytes', len(buf)) for buf in buffers) async with await storage.open_writer(size=size) as writer: for buf in buffers: await writer.write(buf) async with await storage.open_reader(writer.object_id) as reader: content = await reader.read() b = content.to_host_array().tobytes() t2 = await AioDeserializer(io.BytesIO(b)).run() np.testing.assert_array_equal(t, t2) # write cupy array t = cupy.random.random((10, )) headers, buffers = serialize(t) async with await storage.open_writer(size=len(b)) as writer: for buffer in buffers: await writer.write(buffer.data) async with await storage.open_reader(writer.object_id) as reader: b2 = await reader.read() t2 = deserialize(headers, [b2]) cupy.testing.assert_array_equal(t, t2) await CudaStorage.teardown(**teardown_params)
def test_categorify_freq_limit(tmpdir, freq_limit, buckets, search_sort): df = cudf.DataFrame({ "Author": [ "User_A", "User_E", "User_B", "User_C", "User_A", "User_E", "User_B", "User_C", "User_B", "User_C", ], "Engaging User": [ "User_B", "User_B", "User_A", "User_D", "User_B", "User_c", "User_A", "User_D", "User_D", "User_D", ], }) isfreqthr = (isinstance(freq_limit, int) and freq_limit > 0) or (isinstance(freq_limit, dict)) if (not search_sort and isfreqthr) or (search_sort and not isfreqthr): cat_names = ["Author", "Engaging User"] cats = cat_names >> ops.Categorify( freq_threshold=freq_limit, out_path=str(tmpdir), search_sorted=search_sort, num_buckets=buckets, ) workflow = nvt.Workflow(cats) df_out = workflow.fit_transform( nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous") if freq_limit and not buckets: # Column combinations are encoded if isinstance(freq_limit, dict): assert df_out["Author"].max() == 2 assert df_out["Engaging User"].max() == 1 else: assert len(df["Author"].unique()) == df_out["Author"].max() assert len(df["Engaging User"].unique() ) == df_out["Engaging User"].max() elif not freq_limit and buckets: if isinstance(buckets, dict): assert df_out["Author"].max() <= 9 assert df_out["Engaging User"].max() <= 19 else: assert df_out["Author"].max() <= 9 assert df_out["Engaging User"].max() <= 9 elif freq_limit and buckets: if isinstance(buckets, dict) and isinstance(buckets, dict): assert ( df_out["Author"].max() <= (df["Author"].hash_values() % buckets["Author"]).max() + 2 + 1) assert (df_out["Engaging User"].max() <= (df["Engaging User"].hash_values() % buckets["Engaging User"]).max() + 1 + 1)
def _transform_df(df, sample_key_cols, common_key_cols, common_cols, drop_cols): """ Inputs ------ df: pd.DataFrame A pandas datafarme read from a vcf file using variantworks.io.vcfio.VCFReader sample_key_cols: list List of `sample_variant` columns in the df common_key_cols: list List of common_variants columns across all samples at a location drop_cols : list Columns to drop Returns ------- A cuDF dataframe modified to """ sample_key_cols = list(set(sample_key_cols) - set(drop_cols)) common_key_cols = list(set(common_key_cols) - set(drop_cols)) common_cols = list(set(common_cols) - set(drop_cols)) df2 = df.drop(columns=drop_cols) df2 = df2[sample_key_cols].transpose() df2.reset_index(inplace=True) pid_attr_split = df2["index"].str.split("_", expand=True) pid_attr_split.columns = ["sample", "key"] pid_attr_split["key"] = "call_" + pid_attr_split["key"] df2 = pd.concat([df2, pid_attr_split], axis=1) df2.drop(columns="index", axis=1, inplace=True) temp = pd.DataFrame(pid_attr_split["sample"].unique()) unique_samples = len(temp) temp.columns = ["sample"] temp = temp.loc[temp.index.repeat(len(common_key_cols))] temp = temp.reset_index(drop=True) temp2 = df[common_key_cols].transpose().astype("float64") temp2["key"] = temp2.index temp2 = pd.concat([temp2] * unique_samples, axis=0) temp2 = temp2.reset_index(drop=True) temp = pd.concat([temp2, temp], axis=1) del temp2 df2 = pd.concat([df2, temp], axis=0) del temp res_df = pd.melt( df2, id_vars=["sample", "key"], value_vars=df2.columns[:-2], var_name="location", ) del df2 gdf1 = cudf.DataFrame(res_df) gdf2 = cudf.DataFrame(df[common_cols]) gdf1 = gdf1.merge(gdf2, how="left", left_on="location", right_index=True) del gdf2 gdf1 = gdf1.astype({"ref": "int8", "alt": "int8"}) gdf1 = gdf1[["chrom", "start_pos", "ref", "alt", "sample", "key", "value"]] gdf1 = gdf1.pivot( index=["chrom", "start_pos", "ref", "alt", "sample"], columns=["key"], values=["value"], ).reset_index() col_list = [i[1] if i[0] == "value" else i[0] for i in list(gdf1.columns)] gdf1.columns = col_list gdf1.rename(columns={"start_pos": "pos"}, inplace=True) return gdf1
def main(client, config): import dask_cudf import cudf item_df = benchmark( read_tables, config=config, compute_result=config["get_read_time"], dask_profile=config["dask_profile"], ) wcs_tstamp_min = get_wcs_minima(config) item_df["i_item_sk"] = item_df["i_item_sk"].astype("int32") item_df["i_category_id"] = item_df["i_category_id"].astype("int8") # we eventually will only care about these categories, so we can filter now item_df_filtered = item_df.loc[ item_df.i_category_id.isin(q03_purchased_item_category_IN) ].reset_index(drop=True) # The main idea is that we don't fuse a filtration task with reading task yet # this causes more memory pressures as we try to read the whole thing ( and spill that) # at once and then do filtration . web_clickstream_flist = glob.glob(os.path.join(config["data_dir"], "web_clickstreams/*.parquet")) task_ls = [ delayed(pre_repartition_task)(fn, item_df.to_delayed()[0], wcs_tstamp_min) for fn in web_clickstream_flist ] meta_d = { "wcs_user_sk": np.ones(1, dtype=np.int32), "tstamp": np.ones(1, dtype=np.int32), "wcs_item_sk": np.ones(1, dtype=np.int32), "wcs_sales_sk": np.ones(1, dtype=np.int32), "i_category_id": np.ones(1, dtype=np.int8), } meta_df = cudf.DataFrame(meta_d) merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df) merged_df = merged_df.shuffle(on="wcs_user_sk") meta_d = { "i_item_sk": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype), "cnt": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype), } meta_df = cudf.DataFrame(meta_d) grouped_df = merged_df.map_partitions( reduction_function, item_df_filtered.to_delayed()[0], meta=meta_df ) ### todo: check if this has any impact on stability grouped_df = grouped_df.persist(priority=10000) ### todo: remove this later after more testing wait(grouped_df) print("---" * 20) print("grouping complete ={}".format(len(grouped_df))) grouped_df = grouped_df.groupby(["i_item_sk"]).sum(split_every=2).reset_index() grouped_df.columns = ["i_item_sk", "cnt"] result_df = grouped_df.map_partitions( lambda df: df.sort_values(by=["cnt"], ascending=False) ) result_df.columns = ["lastviewed_item", "cnt"] result_df["purchased_item"] = q03_purchased_item_IN cols_order = ["purchased_item", "lastviewed_item", "cnt"] result_df = result_df[cols_order] result_df = result_df.persist() ### todo: remove this later after more testing wait(result_df) print(len(result_df)) result_df = result_df.head(q03_limit) print("result complete") print("---" * 20) return result_df
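# Minimal, hedged illustration of the from_delayed + meta pattern used in
# main() above: each delayed task returns a cudf.DataFrame, and a one-row
# meta frame (built from np.ones with the right dtypes) tells dask_cudf the
# schema without computing anything. Column names here are illustrative.
import cudf
import dask_cudf
import numpy as np
from dask import delayed


def make_part(i):
    return cudf.DataFrame({"wcs_user_sk": np.arange(3, dtype=np.int32) + i,
                           "tstamp": np.zeros(3, dtype=np.int32)})


meta_df = cudf.DataFrame({"wcs_user_sk": np.ones(1, dtype=np.int32),
                          "tstamp": np.ones(1, dtype=np.int32)})
ddf = dask_cudf.from_delayed([delayed(make_part)(i) for i in range(2)],
                             meta=meta_df)
print(ddf.compute())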
def load_vcf(vcf_file, info_keys=[], format_keys=[]): """Function to load VCF into gwas dataframe.""" # Load VCF file using pysam reader = pysam.VariantFile(vcf_file) if "*" in info_keys: header_dict = dict(reader.header.info) new_keys = [] for k in header_dict.keys(): new_keys.append(k) info_keys = new_keys if "*" in format_keys: header_dict = dict(reader.header.formats) new_keys = [] for k in header_dict.keys(): new_keys.append(k) format_keys = new_keys print(info_keys) info_keys = set(info_keys) print(format_keys) format_keys = set(format_keys) df_dict = defaultdict(list) for record in reader: if len(record.alts) != 1: continue if record.ref not in nucleotide_dict or record.alts[0] not in nucleotide_dict: continue # Run through all variants and all their keys in format for sample in record.samples: format_dict = dict(record.samples[sample]) for key, value in format_dict.items(): if key not in format_keys: continue # _add_basic_component(record, sample, df_dict) if key == "GT": if None in list(value): value = -1 else: value = sum(list(value)) _add_key_value(record, sample, f"call_{key}", value, df_dict) # Run through all variants and all their info keys info_dict = dict(record.info) for key, value in info_dict.items(): if key not in info_keys: continue # _add_basic_component(record, sample, df_dict) _add_key_value(record, sample, key, value, df_dict) df = pd.DataFrame.from_dict(df_dict) df, feature_mapping = _create_numerical_features(df) df = df.pivot_table( index=["chrom", "pos", "ref", "alt", "sample", "quality", "feature_id"], columns="key", values="value", ).reset_index() cuda_df = cudf.DataFrame(df) return cuda_df, feature_mapping
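# Tiny sketch of the final reshaping step in load_vcf() above: a long
# (key, value) table is pivoted to one row per variant/sample and then
# handed to cuDF. The records below are made up; real data comes from pysam.
import pandas as pd
import cudf

long_df = pd.DataFrame({
    "chrom": ["1", "1", "1", "1"],
    "pos": [100, 100, 100, 100],
    "sample": ["s1", "s1", "s2", "s2"],
    "key": ["call_GT", "call_DP", "call_GT", "call_DP"],
    "value": [1, 30, 2, 25],
})
wide = long_df.pivot_table(index=["chrom", "pos", "sample"],
                           columns="key", values="value").reset_index()
gdf = cudf.from_pandas(wide)
print(gdf)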
print(len(test))
if len(test) > 3:
    COMPUTE_CV = False
else:
    print('this submission notebook will compute CV score, but commit notebook will not')

train = pd.read_csv('./shopee-product-matching/train.csv')
tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
train['target'] = train.label_group.map(tmp)
print('train shape is', train.shape)
train.head()

if COMPUTE_CV:
    test = pd.read_csv('./shopee-product-matching/train.csv')
    test_gf = cudf.DataFrame(test)
    print('Using train as test to compute CV (since commit notebook). Shape is', test_gf.shape)
else:
    test = pd.read_csv('./shopee-product-matching/test.csv')
    test_gf = cudf.read_csv('./shopee-product-matching/test.csv')
    print('Test shape is', test_gf.shape)
test_gf.head()


def getMetric(col):
    def f1score(row):
        n = len(np.intersect1d(row.target, row[col]))
        return 2 * n / (len(row.target) + len(row[col]))
    return f1score
def strong_connected_component(source, destination):
    """
    Generate the strongly connected components using the FW-BW-TRIM approach,
    but skipping the trimming step.

    Parameters
    ----------
    source : cudf.Series
        A cudf series that contains the source side of an edge list
    destination : cudf.Series
        A cudf series that contains the destination side of an edge list

    Returns
    -------
    cdf : cudf.DataFrame - a dataframe for components
        df['vertex'] - the vertex ID
        df['id'] - the component ID
    sdf : cudf.DataFrame - a dataframe with single vertex components
        df['vertex'] - the vertex ID
    count - int - the number of components found

    Examples
    --------
    >>> # M = read_mtx_file(graph_file)
    >>> # sources = cudf.Series(M.row)
    >>> # destinations = cudf.Series(M.col)
    >>> # components, single_components, count =
    >>> #     cugraph.strong_connected_component(source, destination)
    """
    # FIXME: Uncomment out the above example

    max_value = np.iinfo(np.int32).max  # NOQA

    # create the FW and BW graphs - this version does not modify the graphs
    G_fw = cugraph.Graph()
    G_bw = cugraph.Graph()

    G_fw.add_edge_list(source, destination)
    G_bw.add_edge_list(destination, source)

    # get a list of vertices and sort the list on out_degree
    d = G_fw.degrees()
    d = d.sort_values(by='out_degree', ascending=False)

    num_verts = len(d)

    # create space for the answers
    components = [None] * num_verts
    single_components = [None] * num_verts

    # Counts - aka array indices
    count = 0
    single_count = 0

    # remove vertices that cannot be in a component
    bad = d.query('in_degree == 0 or out_degree == 0')

    if len(bad):
        bad = bad.drop(['in_degree', 'out_degree'])

        single_components[single_count] = bad
        single_count = single_count + 1
        d = _filter_list(d, bad)

    # ----- Start processing -----
    while len(d) > 0:
        v = d['vertex'][0]

        # compute the forward BFS
        bfs_fw = cugraph.bfs(G_fw, v)
        bfs_fw = bfs_fw.query("distance != @max_value")

        # Now backwards
        bfs_bw = cugraph.bfs(G_bw, v)
        bfs_bw = bfs_bw.query("distance != @max_value")

        # intersection
        common = bfs_fw.merge(bfs_bw, on='vertex', how='inner')

        if len(common) > 1:
            common['id'] = v
            components[count] = common
            d = _filter_list(d, common)
            count = count + 1
        else:
            # v is an isolated vertex
            vdf = cudf.DataFrame()
            vdf['vertex'] = v
            single_components[single_count] = vdf
            single_count = single_count + 1
            d = d.iloc[1:]

    # end of loop until vertex queue is empty
    comp = _compress_array(components, count)
    sing = _compress_array(single_components, single_count)

    return comp, sing, count
def concat(objs, axis=0, ignore_index=False, sort=None): """Concatenate DataFrames, Series, or Indices row-wise. Parameters ---------- objs : list of DataFrame, Series, or Index axis : {0/'index', 1/'columns'}, default 0 The axis to concatenate along. ignore_index : bool, default False Set True to ignore the index of the *objs* and provide a default range index instead. sort : bool, default False Sort non-concatenation axis if it is not already aligned. Returns ------- A new object of like type with rows from each object in ``objs``. Examples -------- Combine two ``Series``. >>> import cudf >>> s1 = cudf.Series(['a', 'b']) >>> s2 = cudf.Series(['c', 'd']) >>> s1 0 a 1 b dtype: object >>> s2 0 c 1 d dtype: object >>> cudf.concat([s1, s2]) 0 a 1 b 0 c 1 d dtype: object Clear the existing index and reset it in the result by setting the ``ignore_index`` option to ``True``. >>> cudf.concat([s1, s2], ignore_index=True) 0 a 1 b 2 c 3 d dtype: object Combine two DataFrame objects with identical columns. >>> df1 = cudf.DataFrame([['a', 1], ['b', 2]], ... columns=['letter', 'number']) >>> df1 letter number 0 a 1 1 b 2 >>> df2 = cudf.DataFrame([['c', 3], ['d', 4]], ... columns=['letter', 'number']) >>> df2 letter number 0 c 3 1 d 4 >>> cudf.concat([df1, df2]) letter number 0 a 1 1 b 2 0 c 3 1 d 4 Combine DataFrame objects with overlapping columns and return everything. Columns outside the intersection will be filled with ``null`` values. >>> df3 = cudf.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], ... columns=['letter', 'number', 'animal']) >>> df3 letter number animal 0 c 3 cat 1 d 4 dog >>> cudf.concat([df1, df3], sort=False) letter number animal 0 a 1 None 1 b 2 None 0 c 3 cat 1 d 4 dog Combine ``DataFrame`` objects horizontally along the x axis by passing in ``axis=1``. >>> df4 = cudf.DataFrame([['bird', 'polly'], ['monkey', 'george']], ... columns=['animal', 'name']) >>> df4 animal name 0 bird polly 1 monkey george >>> cudf.concat([df1, df4], axis=1) letter number animal name 0 a 1 bird polly 1 b 2 monkey george """ if not objs: raise ValueError("No objects to concatenate") objs = [obj for obj in objs if obj is not None] # Return for single object if len(objs) == 1: if ignore_index: result = cudf.DataFrame( data=objs[0]._data.copy(deep=True), index=cudf.RangeIndex(len(objs[0])), ) else: result = objs[0].copy() return result if len(objs) == 0: raise ValueError("All objects passed were None") # Retrieve the base types of `objs`. 
In order to support sub-types # and object wrappers, we use `isinstance()` instead of comparing # types directly typs = set() for o in objs: if isinstance(o, cudf.MultiIndex): typs.add(cudf.MultiIndex) if issubclass(type(o), Index): typs.add(type(o)) elif isinstance(o, DataFrame): typs.add(DataFrame) elif isinstance(o, Series): typs.add(Series) else: raise ValueError(f"cannot concatenate object of type {type(o)}") allowed_typs = {Series, DataFrame} param_axis = _axis_map.get(axis, None) if param_axis is None: raise ValueError( '`axis` must be 0 / "index" or 1 / "columns", got: {0}'.format( param_axis ) ) else: axis = param_axis # when axis is 1 (column) we can concat with Series and Dataframes if axis == 1: assert typs.issubset(allowed_typs) df = DataFrame() _normalize_series_and_dataframe(objs, axis=axis) objs, match_index = _align_objs(objs) for idx, o in enumerate(objs): if not ignore_index and idx == 0: df.index = o.index for col in o._data.names: if col in df._data: raise NotImplementedError( "A Column with duplicate name found: {0}, cuDF\ doesn't support having multiple columns with\ same names yet.".format( col ) ) df[col] = o._data[col] result_columns = objs[0].columns for o in objs[1:]: result_columns = result_columns.append(o.columns) df.columns = result_columns.unique() if ignore_index: df.index = None return df elif not match_index: return df.sort_index() else: return df typ = list(typs)[0] if len(typs) > 1: if allowed_typs == typs: # This block of code will run when `objs` has # both Series & DataFrame kind of inputs. _normalize_series_and_dataframe(objs, axis=axis) typ = DataFrame else: raise ValueError( "`concat` cannot concatenate objects of " "types: %r." % sorted([t.__name__ for t in typs]) ) if typ is DataFrame: objs = [obj for obj in objs if obj.shape != (0, 0)] if len(objs) == 0: # If objs is empty, that indicates all of # objs are empty dataframes. return cudf.DataFrame() elif len(objs) == 1: if ignore_index: result = cudf.DataFrame( data=objs[0]._data.copy(deep=True), index=cudf.RangeIndex(len(objs[0])), ) else: result = objs[0].copy() return result else: return DataFrame._concat( objs, axis=axis, ignore_index=ignore_index, sort=sort ) elif typ is Series: return Series._concat( objs, axis=axis, index=None if ignore_index else True ) elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, Index): return Index._concat(objs) else: raise ValueError(f"cannot concatenate object of type {typ}")
def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match): # Setting a seed that triggers max amount of comm in the two-GPU case. cupy.random.seed(17561648246761420848) chunk_type = chunk_type or "build" frac_match = frac_match or 1.0 if chunk_type == "build": # Build dataframe # # "key" column is a unique sample within [0, local_size * num_chunks) # # "shuffle" column is a random selection of partitions (used for shuffle) # # "payload" column is a random permutation of the chunk_size start = local_size * i_chunk stop = start + local_size parts_array = cupy.arange(num_chunks, dtype="int64") suffle_array = cupy.repeat(parts_array, math.ceil(local_size / num_chunks)) df = cudf.DataFrame( { "key": cupy.arange(start, stop=stop, dtype="int64"), "shuffle": cupy.random.permutation(suffle_array)[:local_size], "payload": cupy.random.permutation( cupy.arange(local_size, dtype="int64") ), } ) else: # Other dataframe # # "key" column matches values from the build dataframe # for a fraction (`frac_match`) of the entries. The matching # entries are perfectly balanced across each partition of the # "base" dataframe. # # "payload" column is a random permutation of the chunk_size # Step 1. Choose values that DO match sub_local_size = local_size // num_chunks sub_local_size_use = max(int(sub_local_size * frac_match), 1) arrays = [] for i in range(num_chunks): bgn = (local_size * i) + (sub_local_size * i_chunk) end = bgn + sub_local_size ar = cupy.arange(bgn, stop=end, dtype="int64") arrays.append(cupy.random.permutation(ar)[:sub_local_size_use]) key_array_match = cupy.concatenate(tuple(arrays), axis=0) # Step 2. Add values that DON'T match missing_size = local_size - key_array_match.shape[0] start = local_size * num_chunks + local_size * i_chunk stop = start + missing_size key_array_no_match = cupy.arange(start, stop=stop, dtype="int64") # Step 3. Combine and create the final dataframe chunk (dask_cudf partition) key_array_combine = cupy.concatenate( (key_array_match, key_array_no_match), axis=0 ) df = cudf.DataFrame( { "key": cupy.random.permutation(key_array_combine), "payload": cupy.random.permutation( cupy.arange(local_size, dtype="int64") ), } ) return df
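# Usage sketch: build one "build" chunk and one "other" chunk with the
# generate_chunk() helper above and inspect how many keys overlap for the
# requested match fraction. The argument values are illustrative.
build_df = generate_chunk(0, local_size=1000, num_chunks=2,
                          chunk_type="build", frac_match=1.0)
other_df = generate_chunk(0, local_size=1000, num_chunks=2,
                          chunk_type="other", frac_match=0.5)
matched = other_df.merge(build_df, on="key", how="inner")
print(len(build_df), len(other_df), len(matched))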
def parseHiveMetadataFor(curr_table, file_subset, partitions): metadata = {} names = [] n_cols = len(curr_table.input.columns) dtypes = curr_table.input.dtypes columns = curr_table.input.columns n_files = len(file_subset) col_indexes = {} for index in range(n_cols): col_name = columns[index] names.append('min_' + str(index) + '_' + col_name) names.append('max_' + str(index) + '_' + col_name) col_indexes[col_name] = index names.append('file_handle_index') names.append('row_group_index') minmax_metadata_table = [[] for _ in range(2 * n_cols + 2)] table_partition = {} for file_index, partition_name in enumerate(partitions): curr_table = partitions[partition_name] for col_name, col_value_id in curr_table: table_partition.setdefault(col_name, []).append(col_value_id) minmax_metadata_table[len(minmax_metadata_table) - 2].append(file_index) minmax_metadata_table[len(minmax_metadata_table) - 1].append(0) for index in range(n_cols): col_name = columns[index] if col_name in table_partition: col_value_ids = table_partition[col_name] index = col_indexes[col_name] minmax_metadata_table[2 * index] = col_value_ids minmax_metadata_table[2 * index + 1] = col_value_ids else: if dtypes[col_name] == np.object or dtypes[col_name] == np.dtype( 'datetime64[ms]') or dtypes[col_name] == np.datetime64: return cudf.DataFrame({}) minmax_metadata_table[2 * index] = [np.iinfo(dtypes[col_name]).min ] * n_files minmax_metadata_table[2 * index + 1] = [np.iinfo(dtypes[col_name]).max ] * n_files series = [] for index in range(n_cols): col_name = columns[index] col1 = pd.Series(minmax_metadata_table[2 * index], dtype=dtypes[col_name], name=names[2 * index]) col2 = pd.Series(minmax_metadata_table[2 * index + 1], dtype=dtypes[col_name], name=names[2 * index + 1]) series.append(col1) series.append(col2) index = n_cols col1 = pd.Series(minmax_metadata_table[2 * index], dtype=dtypes[col_name], name=names[2 * index]) col2 = pd.Series(minmax_metadata_table[2 * index + 1], dtype=dtypes[col_name], name=names[2 * index + 1]) series.append(col1) series.append(col2) frame = OrderedDict(((key, value) for (key, value) in zip(names, series))) metadata = cudf.DataFrame(frame) return metadata
def _top_level_groupby(gdf, cat_col_groups, tree_width, cont_cols, agg_list, on_host, concat_groups, name_sep): sum_sq = "std" in agg_list or "var" in agg_list calculate_min = "min" in agg_list calculate_max = "max" in agg_list # Top-level operation for category-based groupby aggregations output = {} k = 0 for i, cat_col_group in enumerate(cat_col_groups): if isinstance(cat_col_group, tuple): cat_col_group = list(cat_col_group) if isinstance(cat_col_group, str): cat_col_group = [cat_col_group] cat_col_group_str = _make_name(*cat_col_group, sep=name_sep) if concat_groups and len(cat_col_group) > 1: # Concatenate columns and replace cat_col_group # with the single name df_gb = cudf.DataFrame() ignore_index = True df_gb[cat_col_group_str] = _concat( [gdf[col] for col in cat_col_group], ignore_index) cat_col_group = [cat_col_group_str] else: # Compile aggregation dictionary and add "squared-sum" # column(s) (necessary when `cont_cols` is non-empty) df_gb = gdf[cat_col_group + cont_cols].copy(deep=False) agg_dict = {} agg_dict[cat_col_group[0]] = ["count"] for col in cont_cols: agg_dict[col] = ["sum"] if sum_sq: name = _make_name(col, "pow2", sep=name_sep) df_gb[name] = df_gb[col].pow(2) agg_dict[name] = ["sum"] if calculate_min: agg_dict[col].append("min") if calculate_max: agg_dict[col].append("max") # Perform groupby and flatten column index # (flattening provides better cudf support) if _is_list_col(cat_col_group, df_gb): # handle list columns by encoding the list values df_gb = cudf.DataFrame( {cat_col_group[0]: df_gb[cat_col_group[0]].list.leaves}) gb = df_gb.groupby(cat_col_group, dropna=False).agg(agg_dict) gb.columns = [ _make_name(*(tuple(cat_col_group) + name[1:]), sep=name_sep) if name[0] == cat_col_group[0] else _make_name( *(tuple(cat_col_group) + name), sep=name_sep) for name in gb.columns.to_flat_index() ] gb.reset_index(inplace=True, drop=False) del df_gb # Split the result by the hash value of the categorical column for j, split in enumerate( gb.partition_by_hash(cat_col_group, tree_width[cat_col_group_str], keep_index=False)): if on_host: output[k] = split.to_arrow(preserve_index=False) else: output[k] = split k += 1 del gb return output
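# Small sketch of the groupby/agg + column-flattening pattern used in
# _top_level_groupby() above; column names and the "_" separator stand in
# for _make_name() and are illustrative only.
import cudf

df_gb = cudf.DataFrame({"Author": ["a", "b", "a"], "x": [1.0, 2.0, 3.0]})
gb = df_gb.groupby(["Author"], dropna=False).agg(
    {"Author": ["count"], "x": ["sum", "min", "max"]})
# flatten the MultiIndex columns the way _make_name() would
gb.columns = ["_".join(name) for name in gb.columns.to_flat_index()]
gb.reset_index(inplace=True, drop=False)
print(gb)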
def empty_dataframe():
    import cudf

    return cudf.DataFrame({"a": [1.0], "b": [1.0]}).head(0)
def view_edge_list(self): """ Display the edge list. Compute it if needed. NOTE: If the graph is of type Graph() then the displayed undirected edges are the same as displayed by networkx Graph(), but the direction could be different i.e. an edge displayed by cugraph as (src, dst) could be displayed as (dst, src) by networkx. cugraph.Graph stores symmetrized edgelist internally. For displaying undirected edgelist for a Graph the upper trianglar matrix of the symmetrized edgelist is returned. networkx.Graph renumbers the input and stores the upper triangle of this renumbered input. Since the internal renumbering of networx and cugraph is different, the upper triangular matrix of networkx renumbered input may not be the same as cugraph's upper trianglar matrix of the symmetrized edgelist. Hence the displayed source and destination pairs in both will represent the same edge but node values could be swapped. Returns ------- edgelist_df : cudf.DataFrame This cudf.DataFrame wraps source, destination and weight gdf_column of size E (E: number of edges) The 'src' column contains the source index for each edge. Source indices are in the range [0, V) (V: number of vertices). The 'dst' column contains the destination index for each edge. Destination indices are in the range [0, V) (V: number of vertices). For weighted graphs, dataframe contains 'weight' column containing the weight value for each edge. """ if self.edgelist is None: graph_wrapper.view_edge_list(self) if type(self) is Graph: edgelist_df = self.edgelist.edgelist_df[self.edgelist.edgelist_df[ 'src'] <= self.edgelist.edgelist_df['dst']].\ reset_index(drop=True) self.edge_count = len(edgelist_df) else: edgelist_df = self.edgelist.edgelist_df if self.renumbered: if isinstance(self.edgelist.renumber_map, cudf.DataFrame): df = cudf.DataFrame() ncols = len(edgelist_df.columns) - 2 unrnb_df_ = edgelist_df.merge(self.edgelist.renumber_map, left_on='src', right_on='id', how='left').drop(['id', 'src']) unrnb_df = unrnb_df_.merge(self.edgelist.renumber_map, left_on='dst', right_on='id', how='left').drop(['id', 'dst']) cols = unrnb_df.columns.to_list() df = unrnb_df[cols[ncols:] + cols[0:ncols]] else: df = cudf.DataFrame() for c in edgelist_df.columns: if c in ['src', 'dst']: df[c] = self.edgelist.renumber_map[edgelist_df[c]].\ reset_index(drop=True) else: df[c] = edgelist_df[c] return df else: return edgelist_df
def compare(src1, dst1, val1, src2, dst2, val2): # # We will do comparison computations by using dataframe # merge functions (essentially doing fast joins). We # start by making two data frames # df1 = cudf.DataFrame() df1["src1"] = src1 df1["dst1"] = dst1 if val1 is not None: df1["val1"] = val1 df2 = cudf.DataFrame() df2["src2"] = src2 df2["dst2"] = dst2 if val2 is not None: df2["val2"] = val2 # # Check to see if all pairs in the original data frame # still exist in the new data frame. If we join (merge) # the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i]) # then we should get exactly the same number of entries in # the data frame if we did not lose any data. # join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"]) assert len(df1) == len(join) if val1 is not None: # # Check the values. In this join, if val1 and val2 are # the same then we are good. If they are different then # we need to check if the value is selected from the opposite # direction, so we'll merge with the edges reversed and # check to make sure that the values all match # diffs = join.query("val1 != val2") diffs_check = diffs.merge(df1, left_on=["src1", "dst1"], right_on=["dst1", "src1"]) query = diffs_check.query("val1_y != val2") if len(query) > 0: print("differences: ") print(query) assert 0 == len(query) # # Now check the symmetrized edges are present. If the original # data contains (u,v) we want to make sure that (v,u) is present # in the new data frame. # # We can accomplish this by doing the join (merge) where # (src1[i] = dst2[i]) and (dst1[i] = src2[i]), and verifying # that we get exactly the same number of entries in the data frame. # join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["dst2", "src2"]) assert len(df1) == len(join) if val1 is not None: # # Check the values. In this join, if val1 and val2 are # the same then we are good. If they are different then # we need to check if the value is selected from the opposite # direction, so we'll merge with the edges reversed and # check to make sure that the values all match # diffs = join.query("val1 != val2") diffs_check = diffs.merge(df1, left_on=["src2", "dst2"], right_on=["src1", "dst1"]) query = diffs_check.query("val1_y != val2") if len(query) > 0: print("differences: ") print(query) assert 0 == len(query) # # Finally, let's check (in both directions) backwards. # We want to make sure that no edges were created in # the symmetrize logic that didn't already exist in one # direction or the other. This is a bit more complicated. # # The complication here is that the original data could, # for some edge (u,v) ALREADY contain the edge (v,u). The # symmetrized graph will not duplicate any edges, so the edge # (u,v) will only be present once. So we can't simply check # counts of df2 joined with df1. # # join1 will contain the join (merge) of df2 to df1 in the # forward direction # join2 will contain the join (merge) of df2 to df1 in the # reverse direction # # Finally, we'll do an outer join of join1 and join2, which # will combine any (u,v)/(v,u) pairs that might exist into # a joined row while keeping any (u,v) pairs that don't exist # in both data frames as single rows. This gives us a data frame # with the same number of rows as the symmetrized data. # join1 = df2.merge(df1, left_on=["src2", "dst2"], right_on=["src1", "dst1"]) join2 = df2.merge(df1, left_on=["src2", "dst2"], right_on=["dst1", "src1"]) joinM = join1.merge(join2, how="outer", on=["src2", "dst2"]) assert len(df2) == len(joinM)
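# Tiny illustration of the join-count check used throughout compare() above:
# if every (src, dst) pair of df1 also appears in df2, the inner merge has
# exactly len(df1) rows.
import cudf

df1 = cudf.DataFrame({"src1": [0, 1], "dst1": [1, 2]})
df2 = cudf.DataFrame({"src2": [0, 1, 1, 2], "dst2": [1, 2, 0, 1]})
join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"])
assert len(join) == len(df1)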
def test_mh_model_support(tmpdir): df = cudf.DataFrame({ "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]], "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]], "Engaging User": ["User_B", "User_B", "User_A", "User_D"], "Null User": ["User_B", "User_B", "User_A", "User_D"], "Post": [1, 2, 3, 4], "Cont1": [0.3, 0.4, 0.5, 0.6], "Cont2": [0.3, 0.4, 0.5, 0.6], "Cat1": ["A", "B", "A", "C"], }) cat_names = ["Cat1", "Null User", "Authors", "Reviewers"] # , "Engaging User"] cont_names = ["Cont1", "Cont2"] label_name = ["Post"] out_path = os.path.join(tmpdir, "train/") os.mkdir(out_path) cats = cat_names >> ops.Categorify() conts = cont_names >> ops.Normalize() processor = nvt.Workflow(cats + conts + label_name) df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute() data_itr = torch_dataloader.TorchAsyncItr( nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=2, ) emb_sizes = nvt.ops.get_embedding_sizes(processor) EMBEDDING_DROPOUT_RATE = 0.04 DROPOUT_RATES = [0.001, 0.01] HIDDEN_DIMS = [1000, 500] LEARNING_RATE = 0.001 model = Model( embedding_table_shapes=emb_sizes, num_continuous=len(cont_names), emb_dropout=EMBEDDING_DROPOUT_RATE, layer_hidden_dims=HIDDEN_DIMS, layer_dropout_rates=DROPOUT_RATES, ).cuda() optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) def rmspe_func(y_pred, y): "Return y_pred and y to non-log space and compute RMSPE" y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1 pct_var = (y_pred - y) / y return (pct_var**2).mean().pow(0.5) train_loss, y_pred, y = process_epoch( data_itr, model, train=True, optimizer=optimizer, # transform=batch_transform, amp=False, ) train_rmspe = None train_rmspe = rmspe_func(y_pred, y) assert train_rmspe is not None assert len(y_pred) > 0 assert len(y) > 0
def shortest_path_length(G, source, target=None):
    """
    Compute the distance from a source vertex to one or all vertices in the
    graph. Uses Single Source Shortest Path (SSSP).

    Parameters
    ----------
    graph : cuGraph.Graph, NetworkX.Graph, or CuPy sparse COO matrix
        cuGraph graph descriptor with connectivity information. Edge weights,
        if present, should be single or double precision floating point
        values.
    source : Dependent on graph type. Index of the source vertex.
        If graph is an instance of cuGraph.Graph or CuPy sparse COO matrix:
            int
        If graph is an instance of a NetworkX.Graph:
            str
    target : Dependent on graph type. Vertex to find distance to.
        If graph is an instance of cuGraph.Graph or CuPy sparse COO matrix:
            int
        If graph is an instance of a NetworkX.Graph:
            str

    Returns
    -------
    Return value type is based on the input type.

    If target is None, returns:
        cudf.DataFrame
            df['vertex']
                vertex id
            df['distance']
                gives the path distance from the starting vertex

    If target is not None, returns:
        Distance from source to target vertex.
    """
    # verify target is in graph before traversing
    if target is not None:
        if not hasattr(G, "has_node"):
            # G is a cupy coo_matrix. Extract maximum possible vertex value
            as_matrix = G.toarray()
            if target < 0 or target >= max(as_matrix.shape[0],
                                           as_matrix.shape[1]):
                raise ValueError("Graph does not contain target vertex")
        elif not G.has_node(target):
            # G is an instance of cugraph or networkx graph
            raise ValueError("Graph does not contain target vertex")

    df = sssp(G, source)

    if isinstance(df, tuple):
        # cupy path, df is tuple of (distance, predecessor)
        if target:
            return df[0][target - 1]
        results = cudf.DataFrame()
        results["vertex"] = range(df[0].shape[0])
        results["distance"] = df[0]
        return results
    else:
        # cugraph and networkx path
        if target:
            target_distance = df.loc[df["vertex"] == target]
            return target_distance.iloc[0]["distance"]
        results = cudf.DataFrame()
        results["vertex"] = df["vertex"]
        results["distance"] = df["distance"]
        return results
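# Hedged usage sketch for shortest_path_length() above. Graph construction
# uses from_cudf_edgelist with an edge weight column; adjust to the cugraph
# version in use.
import cudf
import cugraph

edges = cudf.DataFrame({"src": [0, 0, 1], "dst": [1, 2, 2],
                        "wgt": [1.0, 4.0, 1.0]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst", edge_attr="wgt")

all_dists = shortest_path_length(G, source=0)            # cudf.DataFrame
dist_0_2 = shortest_path_length(G, source=0, target=2)   # scalar distance
print(dist_0_2)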
def _get_column_selection(self, arg): return cudf.DataFrame(self._df._get_columns_by_index(arg))
def np_to_cudf(X):
    df = cudf.DataFrame()
    for i in range(X.shape[1]):
        df['fea%d' % i] = cuda.to_device(np.ascontiguousarray(X[:, i]))
    return df
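# Usage sketch for np_to_cudf() above; newer cudf versions can also build the
# frame directly with cudf.DataFrame(X), but the column-by-column
# numba-device-array path shown above works on older releases too.
import numpy as np

X = np.random.rand(5, 3).astype(np.float32)
df = np_to_cudf(X)
print(df.columns)  # fea0, fea1, fea2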