def test_extract_subgraph_specific_query(property_graph_instance):
    """
    Extract only the transactions for merchant_id 4 that happened after time
    1639085000, using card_num as the edge weight, and verify the result is a
    directed graph holding a single edge (2 vertices, 1 edge).
    """
    pG = property_graph_instance

    expected_edges = cudf.DataFrame({
        "src": [89216],
        "dst": [4],
        "weights": [8832],
    })

    edge_query = ("(_TYPE_=='transactions') & "
                  "(merchant_id==4) & "
                  "(time>1639085000)")
    G = pG.extract_subgraph(
        selection=pG.select_edges(edge_query),
        create_using=DiGraph_inst,
        edge_weight_property="card_num",
    )

    # Undo the renumbering on each endpoint column in turn (src, then dst).
    actual_edges = G.edgelist.edgelist_df
    for endpoint in ("src", "dst"):
        actual_edges = G.unrenumber(actual_edges, endpoint,
                                    preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edges, actual_edges, check_like=True)
def test_enable_batch_edgelist_replication(graph_file, directed, dask_client):
    """
    Enable batch mode on a graph built from ``graph_file`` and check that
    every replicated edge list is equal to the graph's own edge list.

    ``dask_client`` is not used directly in the body; presumably the fixture
    provides the Dask cluster the replication relies on.
    """
    gc.collect()

    G = utils.generate_cugraph_graph_from_file(graph_file, directed)
    G.enable_batch()

    reference_edges = G.edgelist.edgelist_df
    # Each entry is a future; resolve it and compare against the reference.
    for edgelist_future in G.batch_edgelists.values():
        assert_frame_equal(reference_edges, edgelist_future.result())
def test_basic_assert_frame_equal(
    rdtype,
    rname,
    index,
    check_exact,
    check_dtype,
    check_names,
    check_like,
    mismatch,
):
    """
    Check that ``assert_frame_equal`` behaves like
    ``pd.testing.assert_frame_equal`` for the same pair of frames.

    A left frame with columns "a", "b", "c" and a right frame built from
    ``rdtype``/``rname``/``index`` are compared with pandas first. If pandas
    raises, the comparison of the cudf copies must raise the same exception
    type; otherwise it must pass.

    Parameters
    ----------
    rdtype, rname : iterables
        Dtypes and column names (paired with ``zip``) of the right frame.
    index
        Index used to build the right frame.
    check_exact, check_dtype, check_names, check_like
        Forwarded unchanged to both comparison functions.
    mismatch : bool
        When true, column "c" of the left frame holds ``[1, 2, 3]`` instead
        of the shared data ``[1, 2, 1]``.
    """
    data = [1, 2, 1]

    p_left = pd.DataFrame(index=[1, 2, 3])
    p_left["a"] = np.array(data, dtype="int8")
    p_left["b"] = np.array(data, dtype="int16")
    p_left["c"] = np.array([1, 2, 3] if mismatch else data, dtype="int64")

    p_right = pd.DataFrame(index=index)
    for dtype, name in zip(rdtype, rname):
        p_right[name] = np.array(data, dtype=dtype)

    left = cudf.from_pandas(p_left)
    right = cudf.from_pandas(p_right)

    # Build the options once so both comparisons get the same keywords.
    check_kwargs = dict(
        check_exact=check_exact,
        check_dtype=check_dtype,
        check_names=check_names,
        check_like=check_like,
    )

    # Use pandas as the oracle. Catch Exception rather than BaseException so
    # that KeyboardInterrupt/SystemExit still abort the run instead of being
    # recorded as the "expected" error type.
    expected_error = None
    try:
        pd.testing.assert_frame_equal(p_left, p_right, **check_kwargs)
    except Exception as err:
        expected_error = type(err)

    if expected_error is None:
        assert_frame_equal(left, right, **check_kwargs)
    else:
        with pytest.raises(expected_error):
            assert_frame_equal(left, right, **check_kwargs)
def test_replicate_cudf_dataframe_no_weights(input_data_path, dask_client):
    """
    Read a space-delimited two-column (src, dst) edge list, replicate it with
    ``replication.replicate_cudf_dataframe`` and check that every replica is
    equal to the original DataFrame.

    ``dask_client`` is not used directly in the body; presumably the fixture
    provides the Dask cluster the replication relies on.
    """
    gc.collect()

    edge_columns = ["src", "dst"]
    source_df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=edge_columns,
        dtype=["int32"] * len(edge_columns),
    )

    # Each entry is a future; resolve it and compare against the source.
    replicas = replication.replicate_cudf_dataframe(source_df)
    for replica_future in replicas.values():
        assert_frame_equal(source_df, replica_future.result())
def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
    """
    Read the same ORC file with pyarrow and with ``cudf.read_orc`` and check
    that both produce equal frames (categorical checks disabled).

    The test is skipped when pyarrow cannot open the file.
    """
    orc_path = datadir / inputfile

    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as e:
        pytest.skip(".orc file is not found: %s" % e)
    else:
        expected_pdf = reference_file.read(columns=columns).to_pandas()
        actual_gdf = cudf.read_orc(
            orc_path,
            engine=engine,
            columns=columns,
            use_index=use_index,
        )

        assert_frame_equal(
            cudf.from_pandas(expected_pdf),
            actual_gdf,
            check_categorical=False,
        )
def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
    """
    Write a reference ORC file back out with ``to_orc`` using the given
    ``statistics`` setting and check that the data read back is unchanged.

    The test is skipped when the reference file cannot be opened because of
    an ``ArrowIOError``; any other error opening it is re-raised.
    """
    reference_file = "TestOrcFile.demo-12-zlib.orc"
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("gdf.orc")

    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except Exception as excpr:
        if type(excpr).__name__ == "ArrowIOError":
            pytest.skip(".orc file is not found")
        # Anything else is a genuine failure. Swallowing it would leave
        # ``orcfile`` unbound and hide the real cause behind an
        # UnboundLocalError on the next statement.
        raise

    expect = cudf.from_pandas(orcfile.read().to_pandas())
    expect.to_orc(gdf_fname.strpath, statistics=stats_freq)
    got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas())

    assert_frame_equal(expect, got)
def test_orc_writer(datadir, tmpdir, reference_file, columns, compression):
    """
    Write the selected columns of a reference ORC file back out with
    ``to_orc`` using the given compression and check that the data read back
    is unchanged.

    The test is skipped when the reference file cannot be opened because of
    an ``ArrowIOError``; any other error opening it is re-raised.
    """
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("gdf.orc")

    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except Exception as excpr:
        if type(excpr).__name__ == "ArrowIOError":
            pytest.skip(".orc file is not found")
        # Anything else is a genuine failure. Swallowing it would leave
        # ``orcfile`` unbound and hide the real cause behind an
        # UnboundLocalError on the next statement.
        raise

    expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas())
    expect.to_orc(gdf_fname.strpath, compression=compression)
    got = cudf.from_pandas(
        pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
    )

    assert_frame_equal(expect, got)
def test_extract_subgraph_edge_prop_condition_only(property_graph_instance):
    """
    Select edges by type alone ("transactions") and verify that the extracted
    directed graph has exactly the (src, dst) pairs listed in the
    transactions rows of dataset1.
    """
    pG = property_graph_instance

    G = pG.extract_subgraph(
        selection=pG.select_edges("_TYPE_=='transactions'"),
        create_using=DiGraph_inst,
    )

    # The last item of the dataset entry holds the DataFrame rows; the first
    # two fields of each row are the edge endpoints.
    transaction_rows = dataset1["transactions"][-1]
    srcs, dsts = zip(*((row[0], row[1]) for row in transaction_rows))
    expected_edges = cudf.DataFrame({"src": srcs, "dst": dsts}).sort_values(
        by="src", ignore_index=True
    )

    # Undo the renumbering on each endpoint column in turn (src, then dst).
    actual_edges = G.edgelist.edgelist_df
    for endpoint in ("src", "dst"):
        actual_edges = G.unrenumber(actual_edges, endpoint,
                                    preserve_order=True)
    # Sort both frames the same way so row order does not affect the check.
    actual_edges = actual_edges.sort_values(by="src", ignore_index=True)

    assert G.is_directed()
    assert_frame_equal(expected_edges, actual_edges, check_like=True)
def test_extract_subgraph_graph_without_vert_props():
    """
    Verify that extract_subgraph works on a PropertyGraph that was populated
    with edge data only (no vertex properties were ever added).
    """
    from cugraph.experimental import PropertyGraph

    # (type name, dataset entry, vertex id columns) for each edge table,
    # added in this order.
    edge_tables = [
        ("transactions", dataset1["transactions"],
         ("user_id", "merchant_id")),
        ("relationships", dataset1["relationships"],
         ("user_id_1", "user_id_2")),
    ]

    pG = PropertyGraph()
    for type_name, table, id_columns in edge_tables:
        # Item 0 of a dataset entry is passed as the columns, item 1 as the
        # row data.
        edge_df = cudf.DataFrame(columns=table[0], data=table[1])
        pG.add_edge_data(edge_df,
                         type_name=type_name,
                         vertex_id_columns=id_columns,
                         property_columns=None)

    selection = pG.select_edges("_SRC_ == 89216")
    G = pG.extract_subgraph(selection=selection,
                            create_using=DiGraph_inst,
                            edge_weight_property="relationship_type",
                            default_edge_weight=0)

    expected_edges = cudf.DataFrame({
        "src": [89216, 89216, 89216],
        "dst": [4, 89021, 32431],
        "weights": [0, 9, 9],
    })

    # Undo the renumbering on each endpoint column in turn (src, then dst).
    actual_edges = G.edgelist.edgelist_df
    for endpoint in ("src", "dst"):
        actual_edges = G.unrenumber(actual_edges, endpoint,
                                    preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edges, actual_edges, check_like=True)
def test_select_vertices_from_previous_selection(property_graph_instance):
    """
    Verify that a vertex selection can be narrowed with
    from_previous_selection so that only vertices matching both type
    conditions remain (the intersection of type A and type B).
    """
    pG = property_graph_instance

    # Referrals between taxpayers who are also users: expected to be 1 edge.
    taxpayers = pG.select_vertices("_TYPE_ == 'taxpayers'")
    selection = pG.select_vertices("_TYPE_ == 'users'",
                                   from_previous_selection=taxpayers)
    referral_edges = pG.select_edges("_TYPE_ == 'referrals'")
    selection += referral_edges

    G = pG.extract_subgraph(create_using=DiGraph_inst, selection=selection)

    expected_edges = cudf.DataFrame({"src": [89021], "dst": [78634]})

    # Undo the renumbering on each endpoint column in turn (src, then dst).
    actual_edges = G.edgelist.edgelist_df
    for endpoint in ("src", "dst"):
        actual_edges = G.unrenumber(actual_edges, endpoint,
                                    preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edges, actual_edges, check_like=True)
def test_extract_subgraph_vertex_edge_prop_condition(property_graph_instance):
    """
    Combine a vertex selection (two user_location values) with an edge
    selection (type "referrals") and verify the extracted directed graph
    holds the single expected edge, weighted by the "stars" property.
    """
    pG = property_graph_instance

    vertex_query = ("(user_location==47906) | "
                    "(user_location==78750)")
    selection = pG.select_vertices(vertex_query)
    referral_edges = pG.select_edges("_TYPE_=='referrals'")
    selection += referral_edges

    G = pG.extract_subgraph(
        selection=selection,
        create_using=DiGraph_inst,
        edge_weight_property="stars",
    )

    expected_edges = cudf.DataFrame({
        "src": [78634],
        "dst": [32431],
        "weights": [4],
    })

    # Undo the renumbering on each endpoint column in turn (src, then dst).
    actual_edges = G.edgelist.edgelist_df
    for endpoint in ("src", "dst"):
        actual_edges = G.unrenumber(actual_edges, endpoint,
                                    preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edges, actual_edges, check_like=True)
def test_extract_subgraph_vertex_prop_condition_only(property_graph_instance):
    """
    Select vertices only (taxpayers with amount < 100) and verify the
    extracted directed graph holds the single expected edge, weighted by the
    "stars" property.
    """
    pG = property_graph_instance

    vertex_query = "(_TYPE_=='taxpayers') & (amount<100)"
    G = pG.extract_subgraph(
        selection=pG.select_vertices(vertex_query),
        create_using=DiGraph_inst,
        edge_weight_property="stars",
    )

    expected_edges = cudf.DataFrame({
        "src": [89021],
        "dst": [78634],
        "weights": [4],
    })

    # Undo the renumbering on each endpoint column in turn (src, then dst).
    actual_edges = G.edgelist.edgelist_df
    for endpoint in ("src", "dst"):
        actual_edges = G.unrenumber(actual_edges, endpoint,
                                    preserve_order=True)

    assert G.is_directed()
    # check_like=True makes the comparison ignore column/index ordering.
    assert_frame_equal(expected_edges, actual_edges, check_like=True)
def test_chunked_orc_writer(datadir, tmpdir, reference_file, columns,
                            compression):
    """
    Write the same table twice through ``ORCWriter`` and check that the file
    read back equals the reference data concatenated with itself.

    The test is skipped when the reference file cannot be opened because of
    an ``ArrowIOError``; any other error opening it is re-raised.
    """
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("chunked_gdf.orc")

    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except Exception as excpr:
        if type(excpr).__name__ == "ArrowIOError":
            pytest.skip(".orc file is not found")
        # Anything else is a genuine failure. Swallowing it would leave
        # ``orcfile`` unbound and hide the real cause behind an
        # UnboundLocalError on the next statement.
        raise

    pdf = orcfile.read(columns=columns).to_pandas()
    gdf = cudf.from_pandas(pdf)
    # Two chunks are written, so the expected result is the table twice over
    # with a fresh index.
    expect = pd.concat([pdf, pdf]).reset_index(drop=True)

    writer = ORCWriter(gdf_fname, compression=compression)
    writer.write_table(gdf)
    writer.write_table(gdf)
    writer.close()

    got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
    assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got))
def test_extract_subgraph_default_edge_weight(property_graph_instance):
    """
    Verify that edges lacking the property used for weights receive the
    default_edge_weight value in the extracted graph.
    """
    pG = property_graph_instance

    G = pG.extract_subgraph(
        create_using=DiGraph_inst,
        selection=pG.select_edges("_TYPE_=='transactions'"),
        edge_weight_property="volume",
        default_edge_weight=99,
    )

    # The last item of the dataset entry holds the DataFrame rows; the first
    # three fields of each row are read as (src, dst, weight).
    transaction_rows = dataset1["transactions"][-1]
    srcs, dsts, weights = zip(
        *((row[0], row[1], row[2]) for row in transaction_rows)
    )

    # Substitute the expected default for the first missing (None) weight.
    missing_at = weights.index(None)
    weights = weights[:missing_at] + (99.,) + weights[missing_at + 1:]

    expected_edges = cudf.DataFrame({
        "src": srcs,
        "dst": dsts,
        "weights": weights,
    }).sort_values(by="src", ignore_index=True)

    # Undo the renumbering on each endpoint column in turn (src, then dst).
    actual_edges = G.edgelist.edgelist_df
    for endpoint in ("src", "dst"):
        actual_edges = G.unrenumber(actual_edges, endpoint,
                                    preserve_order=True)
    # Sort both frames the same way so row order does not affect the check.
    actual_edges = actual_edges.sort_values(by="src", ignore_index=True)

    assert G.is_directed()
    assert_frame_equal(expected_edges, actual_edges, check_like=True)
def test_edge_props_to_graph(property_graph_instance):
    """
    Run a more complex query directly against the property DataFrames, then
    pass the resulting edge properties to edge_props_to_graph() and verify
    the graph it builds.
    """
    pG = property_graph_instance

    # Goal: referrals between taxpayers who are also users (expected: 1).

    def has_both_types(group):
        # True when the group contains a "taxpayers" row and a "users" row.
        is_taxpayer = (group["_TYPE_"] == "taxpayers").any()
        return is_taxpayer and (group["_TYPE_"] == "users").any()

    # Per-vertex flag, then keep only the ids of vertices flagged True.
    per_vertex = pG._vertex_prop_dataframe.groupby("_VERTEX_")\
                                          .apply(has_both_types)
    dual_type_verts = per_vertex[per_vertex].keys()

    # Restrict to "referrals" edges whose endpoints are both in that set.
    all_edge_props = pG._edge_prop_dataframe
    referral_rows = all_edge_props[all_edge_props["_TYPE_"] == "referrals"]
    src_ok = referral_rows["_SRC_"].isin(dual_type_verts)
    dst_ok = referral_rows["_DST_"].isin(dual_type_verts)
    both_ok = src_ok & dst_ok
    edge_props = all_edge_props.loc[both_ok.index[both_ok]]

    G = pG.edge_props_to_graph(edge_props, create_using=DiGraph_inst)

    expected_edges = cudf.DataFrame({"src": [89021], "dst": [78634]})

    # Undo the renumbering on each endpoint column in turn (src, then dst).
    actual_edges = G.edgelist.edgelist_df
    for endpoint in ("src", "dst"):
        actual_edges = G.unrenumber(actual_edges, endpoint,
                                    preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edges, actual_edges, check_like=True)