def test_add_edge_data_prop_columns(df_type):
    """
    Load the "transactions" table with add_edge_data() while requesting only
    a subset of its columns as properties, then check the resulting vertex
    count, edge count, and edge property names.
    """
    from cugraph.experimental import PropertyGraph

    # Each dataset1 entry is indexed as [0] -> column names, [1] -> row data.
    table = dataset1["transactions"]
    edge_df = df_type(columns=table[0], data=table[1])
    wanted_props = ["card_num", "card_type"]

    graph = PropertyGraph()
    graph.add_edge_data(
        edge_df,
        type_name="transactions",
        vertex_id_columns=("user_id", "merchant_id"),
        property_columns=wanted_props,
    )

    assert graph.num_vertices == 7
    assert graph.num_edges == 4
    # Only the explicitly requested columns should be exposed as properties.
    assert sorted(graph.edge_property_names) == sorted(wanted_props)
def test_add_vertex_data_prop_columns(df_type):
    """
    Load the "merchants" table with add_vertex_data() while requesting only
    a subset of its columns as properties, then check the resulting vertex
    count, edge count, and vertex property names.
    """
    from cugraph.experimental import PropertyGraph

    # Each dataset1 entry is indexed as [0] -> column names, [1] -> row data.
    table = dataset1["merchants"]
    vertex_df = df_type(columns=table[0], data=table[1])
    wanted_props = ["merchant_name", "merchant_sales", "merchant_location"]

    graph = PropertyGraph()
    graph.add_vertex_data(
        vertex_df,
        type_name="merchants",
        vertex_id_column="merchant_id",
        property_columns=wanted_props,
    )

    assert graph.num_vertices == 5
    # No edge data was added, so the graph must not report any edges.
    assert graph.num_edges == 0
    assert sorted(graph.vertex_property_names) == sorted(wanted_props)
def test_add_vertex_data(df_type):
    """
    Load the "merchants" table with add_vertex_data() using
    property_columns=None, then check the vertex count, edge count, and that
    the vertex property names match the table's column names.
    """
    from cugraph.experimental import PropertyGraph

    # Each dataset1 entry is indexed as [0] -> column names, [1] -> row data.
    table = dataset1["merchants"]
    vertex_df = df_type(columns=table[0], data=table[1])

    graph = PropertyGraph()
    graph.add_vertex_data(
        vertex_df,
        type_name="merchants",
        vertex_id_column="merchant_id",
        property_columns=None,
    )

    assert graph.num_vertices == 5
    assert graph.num_edges == 0

    # The expectation is the full list of column names from the table.
    all_columns = table[0].copy()
    assert sorted(graph.vertex_property_names) == sorted(all_columns)
def test_null_data(df_type):
    """
    A freshly constructed PropertyGraph with no data added must report zero
    vertices, zero edges, and no vertex property names.
    """
    from cugraph.experimental import PropertyGraph

    # df_type is accepted but not used by this test.
    empty_graph = PropertyGraph()

    assert empty_graph.num_vertices == 0
    assert empty_graph.num_edges == 0
    assert sorted(empty_graph.vertex_property_names) == []
def read_reddit(raw_path, self_loop=False):
    """
    Load the Reddit dataset from ``raw_path`` into a CuGraphStorage.

    Reads ``reddit_graph.npz`` (sparse adjacency) and ``reddit_data.npz``
    (node features, labels and node types) from ``raw_path``, loads them into
    a PropertyGraph and wraps that graph in a CuGraphStorage.

    Parameters
    ----------
    raw_path : str
        Directory containing ``reddit_graph.npz`` and ``reddit_data.npz``.
    self_loop : bool, optional
        Currently unused by this function; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(gstore, labels, train_mask, val_mask, test_mask)`` where the masks
        are boolean arrays selecting nodes whose ``node_types`` value is 1, 2
        and 3 respectively.
    """
    # The adjacency is read through its .row/.col/.data attributes, i.e. it
    # is expected to be stored in COO format.
    coo_adj = sp.load_npz(os.path.join(raw_path, "reddit_graph.npz"))
    edgelist = cudf.DataFrame()
    edgelist['src'] = cudf.Series(coo_adj.row)
    edgelist['dst'] = cudf.Series(coo_adj.col)
    edgelist['wt'] = cudf.Series(coo_adj.data)

    # features, labels and node types; the context manager closes the
    # underlying .npz archive once the arrays have been read, instead of
    # leaving the file handle open for the life of the process.
    with np.load(os.path.join(raw_path, "reddit_data.npz")) as reddit_data:
        features = reddit_data["feature"]
        labels = reddit_data["label"]
        node_types = reddit_data["node_types"]

    cu_features = cudf.DataFrame(features)
    # Synthetic vertex-id column: one consecutive id per feature row.
    cu_features['name'] = np.arange(cu_features.shape[0])

    # train/val/test indices
    train_mask = (node_types == 1)
    val_mask = (node_types == 2)
    test_mask = (node_types == 3)

    # add features to nodes and edges
    pg = PropertyGraph()
    pg.add_edge_data(edgelist, vertex_col_names=("src", "dst"))
    pg.add_vertex_data(cu_features, vertex_col_name="name")
    # NOTE(review): reaches into a private PropertyGraph attribute to drop the
    # helper id column from the stored vertex properties.
    pg._vertex_prop_dataframe.drop(columns=['name'], inplace=True)

    gstore = CuGraphStorage(pg)
    return gstore, labels, train_mask, val_mask, test_mask
def read_cora(graph_path, feat_path, self_loop=False):
    """
    Load the Cora dataset into a CuGraphStorage and build random splits.

    Parameters
    ----------
    graph_path : str
        Tab-separated, header-less edge list file.
    feat_path : str
        Tab-separated, header-less node content file; column '1434' is
        treated as the label column.
    self_loop : bool, optional
        Currently unused by this function.

    Returns
    -------
    tuple
        ``(gstore, labels, idx_train, idx_val, idx_test)``. After shuffling,
        the first 1000 indices form the training split, the next 500 the
        validation split, and the remainder the test split.
    """
    edge_table = cudf.read_csv(graph_path, sep='\t', header=None)
    node_table = cudf.read_csv(feat_path, sep='\t', header=None)

    # Pull the label column out of the node table so that only features
    # remain in it.
    label_column = '1434'
    labels = node_table[label_column]
    node_table.drop(columns=label_column, inplace=True)

    # Every edge gets a constant weight.
    edge_table['weight'] = 1.0

    # Attach edge and node data to the graph.
    graph = PropertyGraph()
    graph.add_edge_data(edge_table, vertex_col_names=("0", "1"))
    graph.add_vertex_data(node_table, vertex_col_name="0")
    # NOTE(review): private PropertyGraph attributes are modified directly to
    # remove the id columns from the stored property tables.
    graph._vertex_prop_dataframe.drop(columns=['0'], inplace=True)
    graph._edge_prop_dataframe.drop(columns=['0', '1'], inplace=True)
    gstore = CuGraphStorage(graph)

    # Shuffle node positions in place, then cut at 1000 and 1500.
    shuffled = np.arange(len(labels))
    random.shuffle(shuffled)
    idx_train, idx_val, idx_test = np.split(shuffled, [1000, 1500])

    return gstore, labels, idx_train, idx_val, idx_test
def test_extract_subgraph_graph_without_vert_props():
    """
    A PropertyGraph holding only edge data (no vertex properties) must still
    support extract_subgraph().
    """
    from cugraph.experimental import PropertyGraph

    transactions = dataset1["transactions"]
    relationships = dataset1["relationships"]

    # (table, type name, (source id column, destination id column))
    edge_tables = (
        (transactions, "transactions", ("user_id", "merchant_id")),
        (relationships, "relationships", ("user_id_1", "user_id_2")),
    )

    graph = PropertyGraph()
    for table, type_name, id_columns in edge_tables:
        graph.add_edge_data(
            cudf.DataFrame(columns=table[0], data=table[1]),
            type_name=type_name,
            vertex_id_columns=id_columns,
            property_columns=None,
        )

    # Keep only the edges whose source vertex is 89216.
    selection = graph.select_edges("_SRC_ == 89216")
    subgraph = graph.extract_subgraph(
        selection=selection,
        create_using=DiGraph_inst,
        edge_weight_property="relationship_type",
        default_edge_weight=0,
    )

    expected = cudf.DataFrame({
        "src": [89216, 89216, 89216],
        "dst": [4, 89021, 32431],
        "weights": [0, 9, 9]
    })

    # Map the internal vertex ids back to the original ids, one column at a
    # time, before comparing.
    actual = subgraph.unrenumber(subgraph.edgelist.edgelist_df, "src",
                                 preserve_order=True)
    actual = subgraph.unrenumber(actual, "dst", preserve_order=True)

    assert subgraph.is_directed()
    assert_frame_equal(expected, actual, check_like=True)
def create_pg():
    """
    Build and return a PropertyGraph populated with all vertex and edge
    tables from dataset1, using cudf.DataFrame for every table.
    """
    dataframe_type = cudf.DataFrame

    (merchants,
     users,
     taxpayers,
     transactions,
     relationships,
     referrals) = dataset1.values()

    # Vertex and edge data is added as one or more DataFrames; either a Pandas
    # DataFrame to keep data on the CPU, a cuDF DataFrame to keep data on GPU,
    # or a dask_cudf DataFrame to keep data on distributed GPUs.
    # property_columns=None (the default) means all columns except
    # vertex_col_name will be used as properties for the vertices/edges.

    # (table, type name, vertex id column)
    vertex_tables = (
        (merchants, "merchants", "merchant_id"),
        (users, "users", "user_id"),
        (taxpayers, "taxpayers", "payer_id"),
    )
    # (table, type name, (source id column, destination id column))
    edge_tables = (
        (transactions, "transactions", ("user_id", "merchant_id")),
        (relationships, "relationships", ("user_id_1", "user_id_2")),
        (referrals, "referrals", ("user_id_1", "user_id_2")),
    )

    graph = PropertyGraph()

    for table, type_name, id_column in vertex_tables:
        graph.add_vertex_data(
            dataframe_type(columns=table[0], data=table[1]),
            type_name=type_name,
            vertex_col_name=id_column,
            property_columns=None,
        )

    for table, type_name, id_columns in edge_tables:
        graph.add_edge_data(
            dataframe_type(columns=table[0], data=table[1]),
            type_name=type_name,
            vertex_col_names=id_columns,
            property_columns=None,
        )

    return graph
def test_different_vertex_edge_input_dataframe_types():
    """
    Once a PropertyGraph has received data of one DataFrame type, adding data
    of a different DataFrame type must raise TypeError.
    """
    gpu_df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    cpu_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    from cugraph.experimental import PropertyGraph

    def add_vertices(graph, frame):
        graph.add_vertex_data(frame, type_name="foo", vertex_id_column="a")

    def add_edges(graph, frame):
        graph.add_edge_data(frame,
                            type_name="bar",
                            vertex_id_columns=("a", "b"))

    # Each scenario: (first call, its DataFrame, second call, its DataFrame).
    # The second call always uses the other DataFrame type and must fail.
    scenarios = (
        # vertices first, then edges
        (add_vertices, gpu_df, add_edges, cpu_df),
        (add_vertices, cpu_df, add_edges, gpu_df),
        # different order: edges first, then vertices
        (add_edges, gpu_df, add_vertices, cpu_df),
        # same API call, different types
        (add_vertices, gpu_df, add_vertices, cpu_df),
        (add_edges, gpu_df, add_edges, cpu_df),
    )

    for first_add, first_frame, second_add, second_frame in scenarios:
        # A fresh graph per scenario so earlier data cannot leak in.
        graph = PropertyGraph()
        first_add(graph, first_frame)
        with pytest.raises(TypeError):
            second_add(graph, second_frame)
def test_add_edge_data_bad_args():
    """
    Call add_edge_data() with a series of invalid arguments and check that
    each one raises the expected exception type.
    """
    from cugraph.experimental import PropertyGraph

    table = dataset1["transactions"]
    edge_df = cudf.DataFrame(columns=table[0], data=table[1])

    # Known-good keyword arguments; each case overrides exactly one of them
    # (or replaces the DataFrame itself).
    valid_kwargs = {
        "type_name": "transactions",
        "vertex_id_columns": ("user_id", "merchant_id"),
        "property_columns": None,
    }

    # (expected exception, dataframe argument, keyword overrides)
    bad_calls = (
        # dataframe is not a DataFrame
        (TypeError, 42, {}),
        # type_name is not a string
        (TypeError, edge_df, {"type_name": 42}),
        # one vertex id column does not exist
        (ValueError, edge_df,
         {"vertex_id_columns": ("user_id", "bad_column")}),
        # one property column does not exist
        (ValueError, edge_df,
         {"property_columns": ["bad_column_name", "time"]}),
        # property_columns is a bare string rather than a list
        (TypeError, edge_df, {"property_columns": "time"}),
    )

    # The same graph instance is reused for every call.
    graph = PropertyGraph()
    for expected_error, frame, overrides in bad_calls:
        with pytest.raises(expected_error):
            graph.add_edge_data(frame, **{**valid_kwargs, **overrides})
def test_add_vertex_data_bad_args():
    """
    Call add_vertex_data() with a series of invalid arguments and check that
    each one raises the expected exception type.
    """
    from cugraph.experimental import PropertyGraph

    table = dataset1["merchants"]
    vertex_df = cudf.DataFrame(columns=table[0], data=table[1])

    # Known-good keyword arguments; each case overrides exactly one of them
    # (or replaces the DataFrame itself).
    valid_kwargs = {
        "type_name": "merchants",
        "vertex_id_column": "merchant_id",
        "property_columns": None,
    }

    # (expected exception, dataframe argument, keyword overrides)
    bad_calls = (
        # dataframe is not a DataFrame
        (TypeError, 42, {}),
        # type_name is not a string
        (TypeError, vertex_df, {"type_name": 42}),
        # vertex id column does not exist
        (ValueError, vertex_df, {"vertex_id_column": "bad_column_name"}),
        # one property column does not exist
        (ValueError, vertex_df,
         {"property_columns": ["bad_column_name", "merchant_name"]}),
        # property_columns is a bare string rather than a list
        (TypeError, vertex_df, {"property_columns": "merchant_name"}),
    )

    # The same graph instance is reused for every call.
    graph = PropertyGraph()
    for expected_error, frame, overrides in bad_calls:
        with pytest.raises(expected_error):
            graph.add_vertex_data(frame, **{**valid_kwargs, **overrides})
def property_graph_instance(request):
    """
    Build and return a PropertyGraph populated with all vertex and edge
    tables from dataset1.

    The DataFrame class used to wrap each table is taken from
    ``request.param[0]``; presumably this function is registered as a
    parametrized pytest fixture (the decorator is not visible here).
    """
    dataframe_type = request.param[0]

    from cugraph.experimental import PropertyGraph

    (merchants,
     users,
     taxpayers,
     transactions,
     relationships,
     referrals) = dataset1.values()

    # Vertex and edge data is added as one or more DataFrames; either a Pandas
    # DataFrame to keep data on the CPU, a cuDF DataFrame to keep data on GPU,
    # or a dask_cudf DataFrame to keep data on distributed GPUs.
    # property_columns=None (the default) means all columns except
    # vertex_id_column will be used as properties for the vertices/edges.

    # (table, type name, vertex id column)
    vertex_tables = (
        (merchants, "merchants", "merchant_id"),
        (users, "users", "user_id"),
        (taxpayers, "taxpayers", "payer_id"),
    )
    # (table, type name, (source id column, destination id column))
    edge_tables = (
        (transactions, "transactions", ("user_id", "merchant_id")),
        (relationships, "relationships", ("user_id_1", "user_id_2")),
        (referrals, "referrals", ("user_id_1", "user_id_2")),
    )

    graph = PropertyGraph()

    for table, type_name, id_column in vertex_tables:
        graph.add_vertex_data(
            dataframe_type(columns=table[0], data=table[1]),
            type_name=type_name,
            vertex_id_column=id_column,
            property_columns=None,
        )

    for table, type_name, id_columns in edge_tables:
        graph.add_edge_data(
            dataframe_type(columns=table[0], data=table[1]),
            type_name=type_name,
            vertex_id_columns=id_columns,
            property_columns=None,
        )

    return graph