Example #1
async def exchange_and_concat_bins(rank, eps, bins, timings=None):
    ret = [bins[rank]]
    if timings is not None:
        t1 = clock()
    await asyncio.gather(recv_bins(eps, ret), send_bins(eps, bins))
    if timings is not None:
        t2 = clock()
        timings.append(
            (t2 - t1, sum([sys.getsizeof(b) for i, b in enumerate(bins) if i != rank]))
        )
    return cudf.concat(ret)
Example #2
def test_concat_columns(axis):
    pdf1 = pd.DataFrame(np.random.randint(10, size=(5, 3)), columns=[1, 2, 3])
    pdf2 = pd.DataFrame(np.random.randint(10, size=(5, 4)),
                        columns=[4, 5, 6, 7])
    gdf1 = gd.from_pandas(pdf1)
    gdf2 = gd.from_pandas(pdf2)

    expect = pd.concat([pdf1, pdf2], axis=axis)
    got = gd.concat([gdf1, gdf2], axis=axis)

    assert_eq(expect, got)
Example #3
def read_dists(dist_files: Generator, pcs: cudf.DataFrame,
               ndvi) -> cudf.DataFrame:
    # Read each distance file, keep one row per postcode, and name the
    # distance column after the token between "_" and "." in the file name.
    dfs = [
        cudf.read_csv(file)
        .drop_duplicates("postcode")
        .set_index("postcode")
        .rename(columns={"distance": re.split(r"_|\.", file.name)[1]})
        for file in dist_files
    ]
    dfs = cudf.concat(dfs, axis=1).reset_index().pipe(fix_postcodes)

    return (dfs.set_index("postcode")
            .join(ndvi)
            .join(pcs)
            .reset_index()
            .groupby("lsoa11")
            .median())
Example #4
def _calc_bc_subset_fixed(
    G, Gnx, normalized, weight, endpoints, k, seed, result_dtype
):
    assert isinstance(k, int), (
        "This test is meant for verifying coherence "
        "when k is given as an int"
    )
    # In the fixed-set case we compare cu_bc against itself: both calls seed
    # random with the same value and sample from the same vertex range, so
    # they should pick the same sources.
    if seed is None:
        seed = 123  # random.seed(None) uses time, but we want same sources
    random.seed(seed)  # It will be called again in cugraph's call
    sources = random.sample(range(G.number_of_vertices()), k)

    if G.renumbered:
        sources_df = cudf.DataFrame({'src': sources})
        sources = G.unrenumber(sources_df, 'src')['src'].to_pandas().tolist()

    # The first call is going to proceed to the random sampling in the same
    # fashion as the lines above
    df = cugraph.betweenness_centrality(
        G,
        k=k,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        seed=seed,
        result_dtype=result_dtype,
    )
    sorted_df = df.sort_values("vertex").rename(
        columns={"betweenness_centrality": "cu_bc"}, copy=False
    ).reset_index(drop=True)

    # The second call processes the sources that were already sampled.
    # seed is set to None because a seed only matters when k is an int;
    # passing an explicit source list together with a seed is not the
    # normal usage.
    df2 = cugraph.betweenness_centrality(
        G,
        k=sources,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        seed=None,
        result_dtype=result_dtype,
    )
    sorted_df2 = df2.sort_values("vertex").rename(
        columns={"betweenness_centrality": "ref_bc"}, copy=False
    ).reset_index(drop=True)

    merged_sorted_df = cudf.concat(
        [sorted_df, sorted_df2["ref_bc"]], axis=1, sort=False
    )

    return merged_sorted_df
Example #5
def concatenate(objs, axis=0):
    if isinstance(objs[0], (DataFrame, Series)):
        if len(objs) == 1:
            return objs[0]
        else:
            return cudf.concat(objs)
    elif isinstance(objs[0], cp.ndarray):
        return cp.concatenate(objs, axis=axis)
    elif isinstance(objs[0], np.ndarray):
        return np.concatenate(objs, axis=axis)
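A minimal usage sketch for the dispatcher above (an assumption for illustration: DataFrame/Series refer to the cuDF types and cp is CuPy, as the function's isinstance checks imply; the inputs here are made up):

import cudf
import cupy as cp

# Hypothetical inputs; any homogeneous list of cuDF, CuPy, or NumPy objects works.
gdfs = [cudf.DataFrame({"a": [1, 2]}), cudf.DataFrame({"a": [3, 4]})]
arrs = [cp.arange(3), cp.arange(3, 6)]

print(concatenate(gdfs))          # cuDF path: cudf.concat
print(concatenate(arrs, axis=0))  # CuPy path: cp.concatenate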
Example #6
def test_concat(gdf, gddf, series):
    if series:
        gdf = gdf.x
        gddf = gddf.x
    a = (
        cudf.concat([gdf, gdf + 1, gdf + 2])
        .sort_values("x")
        .reset_index(drop=True)
    )
    b = (
        dd.concat([gddf, gddf + 1, gddf + 2], interleave_partitions=True)
        .compute()
        .sort_values("x")
        .reset_index(drop=True)
    )
    dd.assert_eq(a, b)
Example #7
    def create_labels(self, size, labs_rep):
        df = cudf.DataFrame()
        for col in labs_rep:
            dist = col.distro or self.dist
            ser = dist.create_col(size,
                                  dtype=col.dtype,
                                  min_val=0,
                                  max_val=col.cardinality).ceil()
            ser.name = col.name
            df = cudf.concat([df, ser], axis=1)
        return df
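One design note on the loop above: growing df with cudf.concat([df, ser], axis=1) inside the loop rebuilds the frame on every iteration. A sketch of the same method that collects the series and concatenates once (an assumption: dist.create_col returns a cudf.Series, as the code above implies):

    def create_labels(self, size, labs_rep):
        sers = []
        for col in labs_rep:
            dist = col.distro or self.dist
            ser = dist.create_col(size,
                                  dtype=col.dtype,
                                  min_val=0,
                                  max_val=col.cardinality).ceil()
            ser.name = col.name
            sers.append(ser)
        # Single concat over all generated columns instead of one per iteration.
        return cudf.concat(sers, axis=1)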
Example #8
def concat_cudf(
    dfs,
    axis=0,
    join="outer",
    uniform=False,
    filter_warning=True,
    sort=None,
    ignore_index=False,
):
    assert join == "outer"
    return cudf.concat(dfs, axis=axis, ignore_index=ignore_index)
Example #9
def test_pandas_concat_compatibility_axis1_overlap(index, names, data):
    s1 = gd.Series(data[0], index=[0, 1, 2])
    s2 = gd.Series(data[1], index=index)
    if names:
        s1.name = names[0]
        s2.name = names[1]
    ps1 = s1.to_pandas()
    ps2 = s2.to_pandas()
    got = gd.concat([s1, s2], axis=1)
    expect = pd.concat([ps1, ps2], axis=1)
    assert_eq(got, expect)
Example #10
    def fix_binary_predict_proba_result(proba):
        if proba.ndim == 1:
            if CumlToolBox.is_cupy_array(proba):
                proba = cupy.vstack([1 - proba, proba]).T
            else:
                proba = cudf.Series(proba)
                proba = cudf.concat([1 - proba, proba], axis=1)
        elif proba.shape[1] == 1:
            proba = cupy.hstack([1 - proba, proba])

        return proba
Example #11
def test_concat_misordered_columns():
    df, df2, gdf, gdf2 = make_frames(False)
    gdf2 = gdf2[["z", "x", "y"]]
    df2 = df2[["z", "x", "y"]]

    res = gd.concat([gdf, gdf2]).to_pandas()
    sol = pd.concat([df, df2], sort=False)

    pd.util.testing.assert_frame_equal(res,
                                       sol,
                                       check_names=False,
                                       check_categorical=False)
Example #12
    def _spatialJoinDist(self, ldf, rdf, lName, rName, lTree, polygon, dist):
        (polyOffset, ringOffset, xPoints, yPoints) = polygon
        (points, tree) = lTree

        boundingBox = cuspatial.polygon_bounding_boxes(polyOffset, ringOffset,
                                                       xPoints, yPoints)

        joinFilter = cuspatial.join_quadtree_and_bounding_boxes(
            tree, boundingBox, 0.0, 1.0, 0.0, 1.0, 1.0, 15)

        joinPolygon = cuspatial.quadtree_point_in_polygon(
            joinFilter,
            tree,
            points,
            ldf[lName + 'X'],
            ldf[lName + 'Y'],
            polyOffset,
            ringOffset,
            xPoints,
            yPoints,
        )

        # https://github.com/rapidsai/cuspatial/issues/284
        lGather = ldf.take(points.take(
            joinPolygon['point_index'])).reset_index(drop=True)
        rGather = rdf.take(joinPolygon['polygon_index']).reset_index(drop=True)

        dfConcat = cudf.concat([lGather, rGather], axis=1)
        dfConcat['distPred'] = False

        @cuda.jit
        def distPredFunc(lX, lY, rX, rY, out, dist):
            i = cuda.grid(1)
            if i < lX.shape[0]:
                dX = lX[i] - rX[i]
                dY = lY[i] - rY[i]
                dSquare = (dX * dX) + (dY * dY)
                out[i] = dSquare < (dist * dist)

        numbaTime = 0.0
        if dist > 0.0:
            startTime = time.time()
            distPredFunc.forall(dfConcat.shape[0])(dfConcat[lName + 'X'],
                                                   dfConcat[lName + 'Y'],
                                                   dfConcat[rName + 'X'],
                                                   dfConcat[rName + 'Y'],
                                                   dfConcat['distPred'], dist)
            endTime = time.time()
            numbaTime = endTime - startTime

            dfConcat = dfConcat[dfConcat['distPred']]

        return (dfConcat, numbaTime)
Example #13
    def _to_df(self, X, extracted, columns):
        dfs = [cudf.DataFrame(arr, index=None) for arr in extracted]
        for df, pos in zip(dfs, np.cumsum([d.shape[1] for d in dfs])):
            df.reset_index(drop=True, inplace=True)
            df.columns = [f'c{i}' for i in range(pos - df.shape[1], pos)]
        df_out = cudf.concat(dfs, axis=1,
                             ignore_index=True) if len(dfs) > 1 else dfs[0]
        if len(X) == len(df_out):
            df_out.index = X.index
        df_out.columns = columns

        return df_out
Example #14
def test_fill_missing(tmpdir, datasets, engine="parquet"):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    columns = mycols_pq if engine == "parquet" else mycols_csv

    df = cudf.concat([cudf.read_parquet(path) for path in paths])

    data_itr = nvtabular.io.GPUDatasetIterator(paths,
                                               columns=columns,
                                               use_row_groups=True,
                                               names=allcols_csv)

    op = nvt.ops.FillMissing(42)

    cont_names = ["x", "y"]
    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names

    transformed = cudf.concat(
        [op.apply_op(df, columns_ctx, "continuous") for df in data_itr])
    assert_eq(transformed[cont_names], df[cont_names].fillna(42))
Example #15
def test_has_node(graph_file):
    gc.collect()

    cu_M = utils.read_csv_file(graph_file)
    nodes = cudf.concat([cu_M['0'], cu_M['1']]).unique()

    # cugraph add_edge_list
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1')

    for n in nodes:
        assert G.has_node(n)
Example #16
def test_powerlaw(num_rows, distro):
    cats = list(json_sample["cats"].keys())[1:]

    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    df_pw = cudf.DataFrame()
    for x in range(10):
        df_pw_1 = df_gen.create_df(num_rows, cols)
        df_pw = cudf.concat([df_pw, df_pw_1], axis=0)
    sts, ps = df_gen.verify_df(df_pw[cats])
    assert all(s > 0.9 for s in sts)
Example #17
def test_has_node(graph_file):
    gc.collect()

    cu_M = utils.read_csv_file(graph_file)
    nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique()

    # cugraph add_edge_list
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1")

    for n in nodes.values_host:
        assert G.has_node(n)
Example #18
def concat(df_list):
    if len(df_list) == 0:
        return None
    else:
        typ = str(type(df_list[0]))
        if "cudf" in typ:
            # delay import of cudf to handle CPU only tests
            import cudf

            return cudf.concat(df_list)
        else:
            return pandas.concat(df_list)
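A short usage sketch of the dispatcher above (hypothetical call sites; it returns None for an empty list, uses cudf.concat when the first frame is a cuDF object, and falls back to pandas.concat otherwise):

import pandas

pdfs = [pandas.DataFrame({"a": [1]}), pandas.DataFrame({"a": [2]})]
print(concat(pdfs))   # pandas path
print(concat([]))     # empty input returns None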
Example #19
def test_concat(index):
    df, df2, gdf, gdf2 = make_frames(index)
    # Make empty frame
    gdf_empty1 = gdf2[:0]
    assert len(gdf_empty1) == 0
    df_empty1 = gdf_empty1.to_pandas()
    # DataFrame
    res = gd.concat([gdf, gdf2, gdf, gdf_empty1]).to_pandas()
    sol = pd.concat([df, df2, df, df_empty1])
    pd.util.testing.assert_frame_equal(res, sol, check_names=False)

    # Series
    for c in [i for i in ('x', 'y', 'z') if i != index]:
        res = gd.concat([gdf[c], gdf2[c], gdf[c]]).to_pandas()
        sol = pd.concat([df[c], df2[c], df[c]])
        pd.util.testing.assert_series_equal(res, sol, check_names=False)

    # Index
    res = gd.concat([gdf.index, gdf2.index]).to_pandas()
    sol = df.index.append(df2.index)
    pd.util.testing.assert_index_equal(res, sol, check_names=False)
Example #20
def test_gpu_file_iterator_parquet(datasets, batch):
    paths = glob.glob(str(datasets["parquet"]) + "/*.parquet")
    df_expect = cudf.read_parquet(paths[0], columns=mycols_pq)
    df_itr = cudf.DataFrame()
    data_itr = nvtabular.io.GPUFileIterator(paths[0],
                                            batch_size=batch,
                                            gpu_memory_frac=0.01,
                                            columns=mycols_pq)
    for data_gd in data_itr:
        df_itr = cudf.concat([df_itr, data_gd], axis=0) if len(df_itr) else data_gd

    assert_eq(df_itr.reset_index(drop=True), df_expect.reset_index(drop=True))
Example #21
def add_diff_user1(train, valid, col):
    gf1 = cudf.from_pandas(train[[col, "b_user_id", "tweet_id"]]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[[col, "b_user_id", "tweet_id"]]).reset_index(drop=True)
    gf1["idx"] = gf1.index
    gf2["idx"] = gf2.index

    gf = cudf.concat([gf1, gf2], axis=0)
    gf_lang = gf[["b_user_id", col, "tweet_id"]]  # .drop_duplicates()
    gf_lang = gf_lang[gf_lang[col] != 0]
    gf_lang = gf_lang.groupby(["b_user_id", col]).count()
    gf_lang = gf_lang[gf_lang > 3].reset_index()
    gf_lang = gf_lang.sort_values(["b_user_id", "tweet_id"], ascending=False)
    gf_lang["b_user_id_shifted"] = gf_lang["b_user_id"].shift(1)
    gf_lang = gf_lang[gf_lang["b_user_id_shifted"] != gf_lang["b_user_id"]]
    gf_lang.columns = ["b_user_id_lang", "top_" + col, "drop1", "drop2"]
    gf1 = gf1.merge(
        gf_lang[["b_user_id_lang", "top_" + col, "drop1", "drop2"]],
        how="left",
        left_on="b_user_id",
        right_on="b_user_id_lang",
    )
    gf2 = gf2.merge(
        gf_lang[["b_user_id_lang", "top_" + col, "drop1", "drop2"]],
        how="left",
        left_on="b_user_id",
        right_on="b_user_id_lang",
    )

    gf1 = gf1.sort_values("idx")
    gf2 = gf2.sort_values("idx")

    gf1["same_" + col] = gf1[col] == gf1["top_" + col]
    gf1["diff_" + col] = gf1[col] != gf1["top_" + col]
    gf1["nan_" + col] = 0
    gf1.loc[gf1["top_" + col].isna(), "same_" + col] = 0
    gf1.loc[gf1["top_" + col].isna(), "diff_" + col] = 0
    gf1.loc[gf1["top_" + col].isna(), "nan_" + col] = 1

    gf2["same_" + col] = gf2[col] == gf2["top_" + col]
    gf2["diff_" + col] = gf2[col] != gf2["top_" + col]
    gf2["nan_" + col] = 0
    gf2.loc[gf2["top_" + col].isna(), "same_" + col] = 0
    gf2.loc[gf2["top_" + col].isna(), "diff_" + col] = 0
    gf2.loc[gf2["top_" + col].isna(), "nan_" + col] = 1

    train["same_" + col] = gf1["same_" + col].fillna(0).astype("int32").to_array()
    train["diff_" + col] = gf1["diff_" + col].fillna(0).astype("int32").to_array()
    train["nan_" + col] = gf1["nan_" + col].fillna(0).astype("int32").to_array()

    valid["same_" + col] = gf2["same_" + col].fillna(0).astype("int32").to_array()
    valid["diff_" + col] = gf2["diff_" + col].fillna(0).astype("int32").to_array()
    valid["nan_" + col] = gf2["nan_" + col].fillna(0).astype("int32").to_array()
Example #22
def _create_entity_nodes(
    events,
    columns,
    dropna=True,
    categorical_metadata=False,
    categories=dict(),
    DELIM="::",
    NODEID="node_id",
    CATEGORY="category",
    NODETYPE="node_type",
):
    # Seed the node list with an empty frame that carries the full output schema.
    nodes = [
        cudf.DataFrame(dict(
            [
                (NODEID, cudf.core.column.column_empty(0, "str")),
                (CATEGORY, cudf.core.column.column_empty(
                    0, "str" if not categorical_metadata else _empty_cat_dt())),
                (NODETYPE, cudf.core.column.column_empty(
                    0, "str" if not categorical_metadata else _empty_cat_dt())),
            ] + [
                (key, cudf.core.column.column_empty(0, col.dtype))
                for key, col in events[columns].iteritems()
            ]
        ))
    ]

    for key, col in events[columns].iteritems():
        cat = categories.get(key, key)
        col = col.unique()
        col = col.nans_to_nulls().dropna() if dropna else col
        if len(col) == 0:
            continue
        df = cudf.DataFrame({
            key: cudf.core.column.as_column(col),
            NODEID: _prepend_str(col, cat + DELIM),
            CATEGORY: (cat if not categorical_metadata
                       else _str_scalar_to_category(len(col), cat)),
            NODETYPE: (key if not categorical_metadata
                       else _str_scalar_to_category(len(col), key)),
        })
        df.reset_index(drop=True, inplace=True)
        nodes.append(df)

    nodes = cudf.concat(nodes)
    nodes = nodes.drop_duplicates(subset=[NODEID])
    nodes = nodes[[NODEID, NODETYPE, CATEGORY] + list(columns)]
    nodes.reset_index(drop=True, inplace=True)
    return nodes
Example #23
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )

    processor.add_preprocess(
        ops.GroupBy(cont_names=cont_names, stats=["count", "sum", "std"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"],
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
Example #24
def test_concat_series_dataframe_input(objs):
    pd_objs = objs
    gd_objs = [gd.from_pandas(obj) for obj in objs]

    expected = pd.concat(pd_objs)
    actual = gd.concat(gd_objs)

    assert_eq(
        expected.fillna(-1),
        actual.fillna(-1),
        check_dtype=False,
        check_index_type=False,
    )
Example #25
    def take(indices, depends):
        first = min(indices)
        last = max(indices)
        others = []
        for d in depends:
            # TODO: this can be replaced with searchsorted
            # Normalize to index data in range before selection.
            firstindex = d.index[0]
            lastindex = d.index[-1]
            s = max(first, firstindex)
            e = min(last, lastindex)
            others.append(d.loc[s:e])
        return gd.concat(others)
Example #26
def test_ddf_dataset_itr(tmpdir, datasets, inp_format):
    paths = glob.glob(str(datasets["parquet"]) + "/*." + "parquet".split("-")[0])
    ddf1 = dask_cudf.read_parquet(paths)[mycols_pq]
    df1 = ddf1.compute()
    if inp_format == "dask":
        ds = nvtabular.io.Dataset(ddf1.to_dask_dataframe())
    elif inp_format == "dask_cudf":
        ds = nvtabular.io.Dataset(ddf1)
    elif inp_format == "cudf":
        ds = nvtabular.io.Dataset(df1)
    elif inp_format == "pandas":
        ds = nvtabular.io.Dataset(df1.to_pandas())
    assert_eq(df1, cudf.concat(list(ds.to_iter(columns=mycols_pq))))
Example #27
def test_fill_missing(tmpdir, df, dataset, engine):
    op = nvt.ops.FillMissing(42)

    cont_names = ["x", "y"]
    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names
    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        df[col].iloc[idx] = None

    transformed = cudf.concat([op.apply_op(df, columns_ctx, "continuous")])
    assert_eq(transformed[cont_names], df[cont_names].fillna(42))
Example #28
def test_string_concat():
    data1 = ["a", "b", "c", "d", "e"]
    data2 = ["f", "g", "h", "i", "j"]

    ps1 = pd.Series(data1)
    ps2 = pd.Series(data2)
    gs1 = Series(data1)
    gs2 = Series(data2)

    expect = pd.concat([ps1, ps2])
    got = concat([gs1, gs2])

    assert_eq(expect, got)
Example #29
def _compress_array(a, length):

    tmp = cudf.DataFrame()

    if length > 0:
        tmp_a = [None] * length

        for i in range(length):
            tmp_a[i] = a[i]

        tmp = cudf.concat(tmp_a)

    return tmp
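The copy loop above only gathers the first `length` items, so it is equivalent to a slice; a sketch of the same helper written that way (assuming `a` supports slicing and holds objects that cudf.concat accepts):

def _compress_array(a, length):
    if length > 0:
        # Concatenate the first `length` frames in a single call.
        return cudf.concat(list(a[:length]))
    return cudf.DataFrame()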
Example #30
    def create_labels(self, size, labs_rep):
        df = cudf.DataFrame()
        for col in labs_rep:
            dist = col.distro or self.dist
            ser = dist.create_col(size,
                                  dtype=col.dtype,
                                  min_val=1,
                                  max_val=col.cardinality).ceil()
            # bring back down to correct representation because of ceil call
            ser = ser - 1
            ser.name = col.name
            df = cudf.concat([df, ser], axis=1)
        return df