async def exchange_and_concat_bins(rank, eps, bins, timings=None):
    # Keep our own bin locally; exchange the rest with the peer endpoints.
    ret = [bins[rank]]
    if timings is not None:
        t1 = clock()
    await asyncio.gather(recv_bins(eps, ret), send_bins(eps, bins))
    if timings is not None:
        t2 = clock()
        # Record elapsed time and the total size of the bins sent to peers.
        timings.append(
            (t2 - t1, sum(sys.getsizeof(b) for i, b in enumerate(bins) if i != rank))
        )
    return cudf.concat(ret)
def test_concat_columns(axis):
    pdf1 = pd.DataFrame(np.random.randint(10, size=(5, 3)), columns=[1, 2, 3])
    pdf2 = pd.DataFrame(np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7])
    gdf1 = gd.from_pandas(pdf1)
    gdf2 = gd.from_pandas(pdf2)

    expect = pd.concat([pdf1, pdf2], axis=axis)
    got = gd.concat([gdf1, gdf2], axis=axis)
    assert_eq(expect, got)
def read_dists(dist_files: Generator, pcs: cudf.DataFrame, ndvi) -> pd.DataFrame:
    # One distance column per file, named after the file's middle token.
    dfs = [
        cudf.read_csv(file)
        .drop_duplicates("postcode")
        .set_index("postcode")
        .rename(columns={"distance": re.split(r"_|\.", file.name)[1]})
        for file in dist_files
    ]
    dfs = cudf.concat(dfs, axis=1).reset_index().pipe(fix_postcodes)
    return (
        dfs.set_index("postcode")
        .join(ndvi)
        .join(pcs)
        .reset_index()
        .groupby("lsoa11")
        .median()
    )
def _calc_bc_subset_fixed(
    G, Gnx, normalized, weight, endpoints, k, seed, result_dtype
):
    assert isinstance(k, int), (
        "This test is meant for verifying coherence "
        "when k is given as an int"
    )
    # In the fixed set we compare cu_bc against itself, as we random.seed(seed)
    # on the same seed and then sample on the number of vertices themselves
    if seed is None:
        seed = 123  # random.seed(None) uses time, but we want the same sources
    random.seed(seed)  # It will be called again in cugraph's call
    sources = random.sample(range(G.number_of_vertices()), k)

    if G.renumbered:
        sources_df = cudf.DataFrame({'src': sources})
        sources = G.unrenumber(sources_df, 'src')['src'].to_pandas().tolist()

    # The first call is going to proceed to the random sampling in the same
    # fashion as the lines above
    df = cugraph.betweenness_centrality(
        G,
        k=k,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        seed=seed,
        result_dtype=result_dtype,
    )
    sorted_df = (
        df.sort_values("vertex")
        .rename(columns={"betweenness_centrality": "cu_bc"}, copy=False)
        .reset_index(drop=True)
    )

    # The second call is going to process sources that were already sampled.
    # We set seed to None, as combining k: int with a non-None seed should not
    # be normal behavior
    df2 = cugraph.betweenness_centrality(
        G,
        k=sources,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        seed=None,
        result_dtype=result_dtype,
    )
    sorted_df2 = (
        df2.sort_values("vertex")
        .rename(columns={"betweenness_centrality": "ref_bc"}, copy=False)
        .reset_index(drop=True)
    )

    merged_sorted_df = cudf.concat(
        [sorted_df, sorted_df2["ref_bc"]], axis=1, sort=False
    )
    return merged_sorted_df
def concatenate(objs, axis=0):
    if isinstance(objs[0], (DataFrame, Series)):
        if len(objs) == 1:
            return objs[0]
        else:
            return cudf.concat(objs)
    elif isinstance(objs[0], cp.ndarray):
        return cp.concatenate(objs, axis=axis)
    elif isinstance(objs[0], np.ndarray):
        return np.concatenate(objs, axis=axis)
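# A minimal usage sketch for the dispatcher above, assuming cudf, cupy, and
# numpy are importable and a GPU is present; the inputs are illustrative and
# not part of the original source.
import numpy as np
import cupy as cp
import cudf
from cudf import DataFrame, Series

print(concatenate([Series([1, 2]), Series([3, 4])]))      # cudf.concat path
print(concatenate([cp.arange(3), cp.arange(3)]))          # cp.concatenate path
print(concatenate([np.arange(3), np.arange(3)], axis=0))  # np.concatenate path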
def test_concat(gdf, gddf, series):
    if series:
        gdf = gdf.x
        gddf = gddf.x
    a = (
        cudf.concat([gdf, gdf + 1, gdf + 2])
        .sort_values("x")
        .reset_index(drop=True)
    )
    b = (
        dd.concat([gddf, gddf + 1, gddf + 2], interleave_partitions=True)
        .compute()
        .sort_values("x")
        .reset_index(drop=True)
    )
    dd.assert_eq(a, b)
def create_labels(self, size, labs_rep):
    df = cudf.DataFrame()
    for col in labs_rep:
        dist = col.distro or self.dist
        ser = dist.create_col(
            size, dtype=col.dtype, min_val=0, max_val=col.cardinality
        ).ceil()
        ser.name = col.name
        df = cudf.concat([df, ser], axis=1)
    return df
def concat_cudf(
    dfs,
    axis=0,
    join="outer",
    uniform=False,
    filter_warning=True,
    sort=None,
    ignore_index=False,
):
    assert join == "outer"
    return cudf.concat(dfs, axis=axis, ignore_index=ignore_index)
def test_pandas_concat_compatibility_axis1_overlap(index, names, data):
    s1 = gd.Series(data[0], index=[0, 1, 2])
    s2 = gd.Series(data[1], index=index)
    if names:
        s1.name = names[0]
        s2.name = names[1]
    ps1 = s1.to_pandas()
    ps2 = s2.to_pandas()

    got = gd.concat([s1, s2], axis=1)
    expect = pd.concat([ps1, ps2], axis=1)
    assert_eq(got, expect)
def fix_binary_predict_proba_result(proba):
    if proba.ndim == 1:
        if CumlToolBox.is_cupy_array(proba):
            proba = cupy.vstack([1 - proba, proba]).T
        else:
            proba = cudf.Series(proba)
            proba = cudf.concat([1 - proba, proba], axis=1)
    elif proba.shape[1] == 1:
        proba = cupy.hstack([1 - proba, proba])
    return proba
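# Hedged sketch of the 1-D branch above: a vector of positive-class
# probabilities is expanded to the usual (n_samples, 2) layout, where column 0
# holds P(class == 0). Only the cupy path is shown, since CumlToolBox is not
# defined here; the inputs are assumed.
import cupy

p = cupy.array([0.2, 0.9, 0.5])      # P(class == 1) for three samples
stacked = cupy.vstack([1 - p, p]).T  # shape (3, 2); column 0 is P(class == 0)
assert stacked.shape == (3, 2)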
def test_concat_misordered_columns():
    df, df2, gdf, gdf2 = make_frames(False)
    gdf2 = gdf2[["z", "x", "y"]]
    df2 = df2[["z", "x", "y"]]

    res = gd.concat([gdf, gdf2]).to_pandas()
    sol = pd.concat([df, df2], sort=False)

    pd.util.testing.assert_frame_equal(
        res, sol, check_names=False, check_categorical=False
    )
def _spatialJoinDist(self, ldf, rdf, lName, rName, lTree, polygon, dist):
    (polyOffset, ringOffset, xPoints, yPoints) = polygon
    (points, tree) = lTree
    boundingBox = cuspatial.polygon_bounding_boxes(
        polyOffset, ringOffset, xPoints, yPoints
    )
    joinFilter = cuspatial.join_quadtree_and_bounding_boxes(
        tree, boundingBox, 0.0, 1.0, 0.0, 1.0, 1.0, 15
    )
    joinPolygon = cuspatial.quadtree_point_in_polygon(
        joinFilter,
        tree,
        points,
        ldf[lName + 'X'],
        ldf[lName + 'Y'],
        polyOffset,
        ringOffset,
        xPoints,
        yPoints,
    )
    # https://github.com/rapidsai/cuspatial/issues/284
    lGather = ldf.take(
        points.take(joinPolygon['point_index'])
    ).reset_index(drop=True)
    rGather = rdf.take(joinPolygon['polygon_index']).reset_index(drop=True)
    dfConcat = cudf.concat([lGather, rGather], axis=1)
    dfConcat['distPred'] = False

    @cuda.jit
    def distPredFunc(lX, lY, rX, rY, out, dist):
        # One thread per row: keep rows whose squared distance is below
        # dist squared (avoids a sqrt per row).
        i = cuda.grid(1)
        if i < lX.shape[0]:
            dX = lX[i] - rX[i]
            dY = lY[i] - rY[i]
            dSquare = (dX * dX) + (dY * dY)
            out[i] = dSquare < (dist * dist)

    numbaTime = 0.0
    if dist > 0.0:
        startTime = time.time()
        distPredFunc.forall(dfConcat.shape[0])(
            dfConcat[lName + 'X'],
            dfConcat[lName + 'Y'],
            dfConcat[rName + 'X'],
            dfConcat[rName + 'Y'],
            dfConcat['distPred'],
            dist,
        )
        endTime = time.time()
        numbaTime = endTime - startTime
        dfConcat = dfConcat[dfConcat['distPred']]
    return (dfConcat, numbaTime)
def _to_df(self, X, extracted, columns):
    dfs = [cudf.DataFrame(arr, index=None) for arr in extracted]
    # Assign contiguous c{i} names per block using the running column count.
    for df, pos in zip(dfs, np.cumsum([d.shape[1] for d in dfs])):
        df.reset_index(drop=True, inplace=True)
        df.columns = [f'c{i}' for i in range(pos - df.shape[1], pos)]
    df_out = (
        cudf.concat(dfs, axis=1, ignore_index=True) if len(dfs) > 1 else dfs[0]
    )
    if len(X) == len(df_out):
        df_out.index = X.index
    df_out.columns = columns
    return df_out
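# A small standalone check of the cumsum-based naming used above, with assumed
# block widths (not taken from the original source):
import numpy as np

widths = [2, 3]           # two extracted blocks: 2 columns, then 3 columns
ends = np.cumsum(widths)  # running end positions: [2, 5]
names = [
    [f"c{i}" for i in range(end - width, end)]
    for width, end in zip(widths, ends)
]
assert names == [["c0", "c1"], ["c2", "c3", "c4"]]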
def test_fill_missing(tmpdir, datasets, engine="parquet"):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    columns = mycols_pq if engine == "parquet" else mycols_csv
    df = cudf.concat([cudf.read_parquet(path) for path in paths])
    data_itr = nvtabular.io.GPUDatasetIterator(
        paths, columns=columns, use_row_groups=True, names=allcols_csv
    )
    op = nvt.ops.FillMissing(42)
    cont_names = ["x", "y"]
    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names

    transformed = cudf.concat(
        [op.apply_op(df, columns_ctx, "continuous") for df in data_itr]
    )
    assert_eq(transformed[cont_names], df[cont_names].fillna(42))
def test_has_node(graph_file):
    gc.collect()

    cu_M = utils.read_csv_file(graph_file)
    nodes = cudf.concat([cu_M['0'], cu_M['1']]).unique()

    # cugraph add_edge_list
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1')

    for n in nodes:
        assert G.has_node(n)
def test_powerlaw(num_rows, distro):
    cats = list(json_sample["cats"].keys())[1:]
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    df_pw = cudf.DataFrame()
    for x in range(10):
        df_pw_1 = df_gen.create_df(num_rows, cols)
        df_pw = cudf.concat([df_pw, df_pw_1], axis=0)
    sts, ps = df_gen.verify_df(df_pw[cats])
    assert all(s > 0.9 for s in sts)
def test_has_node(graph_file):
    gc.collect()

    cu_M = utils.read_csv_file(graph_file)
    nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique()

    # cugraph add_edge_list
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1")

    for n in nodes.values_host:
        assert G.has_node(n)
def concat(df_list):
    if len(df_list) == 0:
        return None
    else:
        typ = str(type(df_list[0]))
        if "cudf" in typ:
            # delay import of cudf to handle CPU only tests
            import cudf

            return cudf.concat(df_list)
        else:
            return pandas.concat(df_list)
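# Illustrative call of the CPU/GPU-agnostic concat above, using hypothetical
# pandas inputs (cudf inputs would take the cudf path instead).
import pandas

part1 = pandas.DataFrame({"x": [1, 2]})
part2 = pandas.DataFrame({"x": [3, 4]})
combined = concat([part1, part2])  # pandas path: a 4-row DataFrame
assert len(combined) == 4
assert concat([]) is None          # an empty list short-circuits to None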
def test_concat(index):
    df, df2, gdf, gdf2 = make_frames(index)

    # Make empty frame
    gdf_empty1 = gdf2[:0]
    assert len(gdf_empty1) == 0
    df_empty1 = gdf_empty1.to_pandas()

    # DataFrame
    res = gd.concat([gdf, gdf2, gdf, gdf_empty1]).to_pandas()
    sol = pd.concat([df, df2, df, df_empty1])
    pd.util.testing.assert_frame_equal(res, sol, check_names=False)

    # Series
    for c in [i for i in ('x', 'y', 'z') if i != index]:
        res = gd.concat([gdf[c], gdf2[c], gdf[c]]).to_pandas()
        sol = pd.concat([df[c], df2[c], df[c]])
        pd.util.testing.assert_series_equal(res, sol, check_names=False)

    # Index
    res = gd.concat([gdf.index, gdf2.index]).to_pandas()
    sol = df.index.append(df2.index)
    pd.util.testing.assert_index_equal(res, sol, check_names=False)
def test_gpu_file_iterator_parquet(datasets, batch):
    paths = glob.glob(str(datasets["parquet"]) + "/*.parquet")
    df_expect = cudf.read_parquet(paths[0], columns=mycols_pq)
    df_itr = cudf.DataFrame()
    data_itr = nvtabular.io.GPUFileIterator(
        paths[0], batch_size=batch, gpu_memory_frac=0.01, columns=mycols_pq
    )
    for data_gd in data_itr:
        df_itr = cudf.concat([df_itr, data_gd], axis=0) if df_itr else data_gd

    assert_eq(df_itr.reset_index(drop=True), df_expect.reset_index(drop=True))
def add_diff_user1(train, valid, col):
    gf1 = cudf.from_pandas(train[[col, "b_user_id", "tweet_id"]]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[[col, "b_user_id", "tweet_id"]]).reset_index(drop=True)
    gf1["idx"] = gf1.index
    gf2["idx"] = gf2.index
    gf = cudf.concat([gf1, gf2], axis=0)

    # Find each user's most frequent value of `col` (seen more than 3 times).
    gf_lang = gf[["b_user_id", col, "tweet_id"]]  # .drop_duplicates()
    gf_lang = gf_lang[gf_lang[col] != 0]
    gf_lang = gf_lang.groupby(["b_user_id", col]).count()
    gf_lang = gf_lang[gf_lang > 3].reset_index()
    gf_lang = gf_lang.sort_values(["b_user_id", "tweet_id"], ascending=False)
    gf_lang["b_user_id_shifted"] = gf_lang["b_user_id"].shift(1)
    gf_lang = gf_lang[gf_lang["b_user_id_shifted"] != gf_lang["b_user_id"]]
    gf_lang.columns = ["b_user_id_lang", "top_" + col, "drop1", "drop2"]

    gf1 = gf1.merge(
        gf_lang[["b_user_id_lang", "top_" + col, "drop1", "drop2"]],
        how="left",
        left_on="b_user_id",
        right_on="b_user_id_lang",
    )
    gf2 = gf2.merge(
        gf_lang[["b_user_id_lang", "top_" + col, "drop1", "drop2"]],
        how="left",
        left_on="b_user_id",
        right_on="b_user_id_lang",
    )
    gf1 = gf1.sort_values("idx")
    gf2 = gf2.sort_values("idx")

    # Flag whether each row matches its user's top value, differs from it,
    # or has no top value at all.
    gf1["same_" + col] = gf1[col] == gf1["top_" + col]
    gf1["diff_" + col] = gf1[col] != gf1["top_" + col]
    gf1["nan_" + col] = 0
    gf1.loc[gf1["top_" + col].isna(), "same_" + col] = 0
    gf1.loc[gf1["top_" + col].isna(), "diff_" + col] = 0
    gf1.loc[gf1["top_" + col].isna(), "nan_" + col] = 1

    gf2["same_" + col] = gf2[col] == gf2["top_" + col]
    gf2["diff_" + col] = gf2[col] != gf2["top_" + col]
    gf2["nan_" + col] = 0
    gf2.loc[gf2["top_" + col].isna(), "same_" + col] = 0
    gf2.loc[gf2["top_" + col].isna(), "diff_" + col] = 0
    gf2.loc[gf2["top_" + col].isna(), "nan_" + col] = 1

    train["same_" + col] = gf1["same_" + col].fillna(0).astype("int32").to_array()
    train["diff_" + col] = gf1["diff_" + col].fillna(0).astype("int32").to_array()
    train["nan_" + col] = gf1["nan_" + col].fillna(0).astype("int32").to_array()
    valid["same_" + col] = gf2["same_" + col].fillna(0).astype("int32").to_array()
    valid["diff_" + col] = gf2["diff_" + col].fillna(0).astype("int32").to_array()
    valid["nan_" + col] = gf2["nan_" + col].fillna(0).astype("int32").to_array()
def _create_entity_nodes(
    events,
    columns,
    dropna=True,
    categorical_metadata=False,
    categories=dict(),
    DELIM="::",
    NODEID="node_id",
    CATEGORY="category",
    NODETYPE="node_type",
):
    # Seed the list with an empty frame that pins column order and dtypes.
    nodes = [
        cudf.DataFrame(dict(
            [
                (NODEID, cudf.core.column.column_empty(0, "str")),
                (CATEGORY, cudf.core.column.column_empty(
                    0, "str" if not categorical_metadata else _empty_cat_dt())),
                (NODETYPE, cudf.core.column.column_empty(
                    0, "str" if not categorical_metadata else _empty_cat_dt())),
            ]
            + [
                (key, cudf.core.column.column_empty(0, col.dtype))
                for key, col in events[columns].iteritems()
            ]
        ))
    ]
    # One frame of unique values per entity column.
    for key, col in events[columns].iteritems():
        cat = categories.get(key, key)
        col = col.unique()
        col = col.nans_to_nulls().dropna() if dropna else col
        if len(col) == 0:
            continue
        df = cudf.DataFrame({
            key: cudf.core.column.as_column(col),
            NODEID: _prepend_str(col, cat + DELIM),
            CATEGORY: cat if not categorical_metadata
            else _str_scalar_to_category(len(col), cat),
            NODETYPE: key if not categorical_metadata
            else _str_scalar_to_category(len(col), key),
        })
        df.reset_index(drop=True, inplace=True)
        nodes.append(df)

    nodes = cudf.concat(nodes)
    nodes = nodes.drop_duplicates(subset=[NODEID])
    nodes = nodes[[NODEID, NODETYPE, CATEGORY] + list(columns)]
    nodes.reset_index(drop=True, inplace=True)

    return nodes
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(
        ops.GroupBy(cont_names=cont_names, stats=["count", "sum", "std"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"],
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
def test_concat_series_dataframe_input(objs):
    pd_objs = objs
    gd_objs = [gd.from_pandas(obj) for obj in objs]

    expected = pd.concat(pd_objs)
    actual = gd.concat(gd_objs)

    assert_eq(
        expected.fillna(-1),
        actual.fillna(-1),
        check_dtype=False,
        check_index_type=False,
    )
def take(indices, depends):
    first = min(indices)
    last = max(indices)
    others = []
    for d in depends:
        # TODO: this can be replaced with searchsorted
        # Normalize to index data in range before selection.
        firstindex = d.index[0]
        lastindex = d.index[-1]
        s = max(first, firstindex)
        e = min(last, lastindex)
        others.append(d.loc[s:e])
    return gd.concat(others)
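# Sketch of how `take` clips a request against index-aligned partitions; the
# partitions below are assumed inputs, not from the original source.
import cudf as gd

part_a = gd.DataFrame({"v": [0, 1, 2]}, index=[0, 1, 2])
part_b = gd.DataFrame({"v": [3, 4, 5]}, index=[3, 4, 5])
# Rows 1..4: .loc[1:2] of part_a plus .loc[3:4] of part_b (loc is inclusive).
out = take([1, 4], [part_a, part_b])
assert len(out) == 4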
def test_ddf_dataset_itr(tmpdir, datasets, inp_format):
    paths = glob.glob(str(datasets["parquet"]) + "/*." + "parquet".split("-")[0])
    ddf1 = dask_cudf.read_parquet(paths)[mycols_pq]
    df1 = ddf1.compute()
    if inp_format == "dask":
        ds = nvtabular.io.Dataset(ddf1.to_dask_dataframe())
    elif inp_format == "dask_cudf":
        ds = nvtabular.io.Dataset(ddf1)
    elif inp_format == "cudf":
        ds = nvtabular.io.Dataset(df1)
    elif inp_format == "pandas":
        ds = nvtabular.io.Dataset(df1.to_pandas())
    assert_eq(df1, cudf.concat(list(ds.to_iter(columns=mycols_pq))))
def test_fill_missing(tmpdir, df, dataset, engine):
    op = nvt.ops.FillMissing(42)
    cont_names = ["x", "y"]
    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names

    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        df[col].iloc[idx] = None

    transformed = cudf.concat([op.apply_op(df, columns_ctx, "continuous")])
    assert_eq(transformed[cont_names], df[cont_names].fillna(42))
def test_string_concat():
    data1 = ["a", "b", "c", "d", "e"]
    data2 = ["f", "g", "h", "i", "j"]

    ps1 = pd.Series(data1)
    ps2 = pd.Series(data2)
    gs1 = Series(data1)
    gs2 = Series(data2)

    expect = pd.concat([ps1, ps2])
    got = concat([gs1, gs2])

    assert_eq(expect, got)
def _compress_array(a, length):
    # Concatenate the first `length` entries of `a`; an empty DataFrame is
    # returned when there is nothing to compress.
    tmp = cudf.DataFrame()
    if length > 0:
        tmp_a = [a[i] for i in range(length)]
        tmp = cudf.concat(tmp_a)
    return tmp
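# Minimal illustration of _compress_array with assumed inputs: `a` is any
# indexable collection of cudf objects, and `length` bounds how many are kept.
import cudf

frames = [cudf.DataFrame({"x": [i]}) for i in range(5)]
merged = _compress_array(frames, 3)  # concatenates frames[0], frames[1], frames[2]
assert len(merged) == 3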
def create_labels(self, size, labs_rep):
    df = cudf.DataFrame()
    for col in labs_rep:
        dist = col.distro or self.dist
        ser = dist.create_col(
            size, dtype=col.dtype, min_val=1, max_val=col.cardinality
        ).ceil()
        # bring back down to correct representation because of ceil call
        ser = ser - 1
        ser.name = col.name
        df = cudf.concat([df, ser], axis=1)
    return df