def test_split_even(): prepare_dist() g = create_random_graph(10000) num_parts = 4 num_hops = 2 partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph', num_hops=num_hops, part_method='metis') node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30 edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30 selected_nodes = np.nonzero(node_mask)[0] selected_edges = np.nonzero(edge_mask)[0] all_nodes1 = [] all_nodes2 = [] all_edges1 = [] all_edges2 = [] for i in range(num_parts): dgl.distributed.set_num_client(num_parts) part_g, node_feats, edge_feats, gpb = load_partition('/tmp/dist_graph/dist_graph_test.json', i) local_nids = F.nonzero_1d(part_g.ndata['inner_node']) local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids) nodes = node_split(node_mask, gpb, i, force_even=True) all_nodes1.append(nodes) subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(local_nids)) print('part {} get {} nodes and {} are in the partition'.format(i, len(nodes), len(subset))) dgl.distributed.set_num_client(num_parts * 2) nodes1 = node_split(node_mask, gpb, i * 2, force_even=True) nodes2 = node_split(node_mask, gpb, i * 2 + 1, force_even=True) nodes3 = F.cat([nodes1, nodes2], 0) all_nodes2.append(nodes3) subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(nodes3)) print('intersection has', len(subset)) dgl.distributed.set_num_client(num_parts) local_eids = F.nonzero_1d(part_g.edata['inner_edge']) local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids) edges = edge_split(edge_mask, gpb, i, force_even=True) all_edges1.append(edges) subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(local_eids)) print('part {} get {} edges and {} are in the partition'.format(i, len(edges), len(subset))) dgl.distributed.set_num_client(num_parts * 2) edges1 = edge_split(edge_mask, gpb, i * 2, force_even=True) edges2 = edge_split(edge_mask, gpb, i * 2 + 1, force_even=True) edges3 = F.cat([edges1, edges2], 0) all_edges2.append(edges3) subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(edges3)) print('intersection has', len(subset)) all_nodes1 = F.cat(all_nodes1, 0) all_edges1 = F.cat(all_edges1, 0) all_nodes2 = F.cat(all_nodes2, 0) all_edges2 = F.cat(all_edges2, 0) all_nodes = np.nonzero(node_mask)[0] all_edges = np.nonzero(edge_mask)[0] assert np.all(all_nodes == F.asnumpy(all_nodes1)) assert np.all(all_edges == F.asnumpy(all_edges1)) assert np.all(all_nodes == F.asnumpy(all_nodes2)) assert np.all(all_edges == F.asnumpy(all_edges2))
def test_split(): #prepare_dist() g = create_random_graph(10000) num_parts = 4 num_hops = 2 partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph', num_hops=num_hops, part_method='metis') node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30 edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30 selected_nodes = np.nonzero(node_mask)[0] selected_edges = np.nonzero(edge_mask)[0] # The code now collects the roles of all client processes and use the information # to determine how to split the workloads. Here is to simulate the multi-client # use case. def set_roles(num_clients): dgl.distributed.role.CUR_ROLE = 'default' dgl.distributed.role.GLOBAL_RANK = {i:i for i in range(num_clients)} dgl.distributed.role.PER_ROLE_RANK['default'] = {i:i for i in range(num_clients)} for i in range(num_parts): set_roles(num_parts) part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition('/tmp/dist_graph/dist_graph_test.json', i) local_nids = F.nonzero_1d(part_g.ndata['inner_node']) local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids) nodes1 = np.intersect1d(selected_nodes, F.asnumpy(local_nids)) nodes2 = node_split(node_mask, gpb, rank=i, force_even=False) assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes2))) local_nids = F.asnumpy(local_nids) for n in nodes1: assert n in local_nids set_roles(num_parts * 2) nodes3 = node_split(node_mask, gpb, rank=i * 2, force_even=False) nodes4 = node_split(node_mask, gpb, rank=i * 2 + 1, force_even=False) nodes5 = F.cat([nodes3, nodes4], 0) assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes5))) set_roles(num_parts) local_eids = F.nonzero_1d(part_g.edata['inner_edge']) local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids) edges1 = np.intersect1d(selected_edges, F.asnumpy(local_eids)) edges2 = edge_split(edge_mask, gpb, rank=i, force_even=False) assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges2))) local_eids = F.asnumpy(local_eids) for e in edges1: assert e in local_eids set_roles(num_parts * 2) edges3 = edge_split(edge_mask, gpb, rank=i * 2, force_even=False) edges4 = edge_split(edge_mask, gpb, rank=i * 2 + 1, force_even=False) edges5 = F.cat([edges3, edges4], 0) assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges5)))
def test_split(): prepare_dist() g = create_random_graph(10000) num_parts = 4 num_hops = 2 partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph', num_hops=num_hops, part_method='metis') node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30 edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30 selected_nodes = np.nonzero(node_mask)[0] selected_edges = np.nonzero(edge_mask)[0] for i in range(num_parts): dgl.distributed.set_num_client(num_parts) part_g, node_feats, edge_feats, gpb, _ = load_partition( '/tmp/dist_graph/dist_graph_test.json', i) local_nids = F.nonzero_1d(part_g.ndata['inner_node']) local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids) nodes1 = np.intersect1d(selected_nodes, F.asnumpy(local_nids)) nodes2 = node_split(node_mask, gpb, i, force_even=False) assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes2))) local_nids = F.asnumpy(local_nids) for n in nodes1: assert n in local_nids dgl.distributed.set_num_client(num_parts * 2) nodes3 = node_split(node_mask, gpb, i * 2, force_even=False) nodes4 = node_split(node_mask, gpb, i * 2 + 1, force_even=False) nodes5 = F.cat([nodes3, nodes4], 0) assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes5))) dgl.distributed.set_num_client(num_parts) local_eids = F.nonzero_1d(part_g.edata['inner_edge']) local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids) edges1 = np.intersect1d(selected_edges, F.asnumpy(local_eids)) edges2 = edge_split(edge_mask, gpb, i, force_even=False) assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges2))) local_eids = F.asnumpy(local_eids) for e in edges1: assert e in local_eids dgl.distributed.set_num_client(num_parts * 2) edges3 = edge_split(edge_mask, gpb, i * 2, force_even=False) edges4 = edge_split(edge_mask, gpb, i * 2 + 1, force_even=False) edges5 = F.cat([edges3, edges4], 0) assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges5)))
def check_rpc_bipartite_etype_sampling_empty(tmpdir, num_server): """sample on bipartite via sample_etype_neighbors() which yields empty sample results""" generate_ip_config("rpc_ip_config.txt", num_server, num_server) g = create_random_bipartite() num_parts = num_server num_hops = 1 orig_nids, _ = partition_graph(g, 'test_sampling', num_parts, tmpdir, num_hops=num_hops, part_method='metis', reshuffle=True, return_mapping=True) pserver_list = [] ctx = mp.get_context('spawn') for i in range(num_server): p = ctx.Process(target=start_server, args=( i, tmpdir, num_server > 1, 'test_sampling')) p.start() time.sleep(1) pserver_list.append(p) deg = get_degrees(g, orig_nids['game'], 'game') empty_nids = F.nonzero_1d(deg == 0) block, gpb = start_bipartite_etype_sample_client(0, tmpdir, num_server > 1, nodes={'game': empty_nids, 'user': [1]}) print("Done sampling") for p in pserver_list: p.join() assert block is not None assert block.number_of_edges() == 0 assert len(block.etypes) == len(g.etypes)
def check_rpc_hetero_etype_sampling_empty_shuffle(tmpdir, num_server): generate_ip_config("rpc_ip_config.txt", num_server, num_server) g = create_random_hetero(dense=True, empty=True) num_parts = num_server num_hops = 1 orig_nids, _ = partition_graph(g, 'test_sampling', num_parts, tmpdir, num_hops=num_hops, part_method='metis', reshuffle=True, return_mapping=True) pserver_list = [] ctx = mp.get_context('spawn') for i in range(num_server): p = ctx.Process(target=start_server, args=(i, tmpdir, num_server > 1, 'test_sampling')) p.start() time.sleep(1) pserver_list.append(p) fanout = 3 deg = get_degrees(g, orig_nids['n3'], 'n3') empty_nids = F.nonzero_1d(deg == 0) block, gpb = start_hetero_etype_sample_client(0, tmpdir, num_server > 1, fanout, nodes={'n3': empty_nids}) print("Done sampling") for p in pserver_list: p.join() assert block.number_of_edges() == 0 assert len(block.etypes) == len(g.etypes)
def test_split(): prepare_dist() g = create_random_graph(10000) num_parts = 4 num_hops = 2 partition_graph(g, 'test', num_parts, '/tmp', num_hops=num_hops, part_method='metis') node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30 edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30 selected_nodes = np.nonzero(node_mask)[0] selected_edges = np.nonzero(edge_mask)[0] for i in range(num_parts): part_g, node_feats, edge_feats, meta = load_partition( '/tmp/test.json', i) num_nodes, num_edges, node_map, edge_map, num_partitions = meta gpb = GraphPartitionBook(part_id=i, num_parts=num_partitions, node_map=node_map, edge_map=edge_map, part_graph=part_g) local_nids = F.nonzero_1d(part_g.ndata['local_node']) local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids) nodes1 = np.intersect1d(selected_nodes, F.asnumpy(local_nids)) nodes2 = node_split(node_mask, gpb, i) assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes2))) local_nids = F.asnumpy(local_nids) for n in nodes1: assert n in local_nids local_eids = F.nonzero_1d(part_g.edata['local_edge']) local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids) edges1 = np.intersect1d(selected_edges, F.asnumpy(local_eids)) edges2 = edge_split(edge_mask, gpb, i) assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges2))) local_eids = F.asnumpy(local_eids) for e in edges1: assert e in local_eids
def tensor_topo_traverse(): n = g.number_of_nodes() mask = F.copy_to(F.ones((n, 1)), F.cpu()) degree = F.spmm(adjmat, mask) while F.reduce_sum(mask) != 0.: v = F.astype((degree == 0.), F.float32) v = v * mask mask = mask - v frontier = F.copy_to(F.nonzero_1d(F.squeeze(v, 1)), F.cpu()) yield frontier degree -= F.spmm(adjmat, v)
def test_split_even(): g = create_random_graph(10000) num_parts = 4 num_hops = 2 partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph', num_hops=num_hops, part_method='metis') node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30 edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30 selected_nodes = np.nonzero(node_mask)[0] selected_edges = np.nonzero(edge_mask)[0] all_nodes1 = [] all_nodes2 = [] all_edges1 = [] all_edges2 = [] # The code now collects the roles of all client processes and use the information # to determine how to split the workloads. Here is to simulate the multi-client # use case. def set_roles(num_clients): dgl.distributed.role.CUR_ROLE = 'default' dgl.distributed.role.GLOBAL_RANK = {i: i for i in range(num_clients)} dgl.distributed.role.PER_ROLE_RANK['default'] = { i: i for i in range(num_clients) } for i in range(num_parts): set_roles(num_parts) part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( '/tmp/dist_graph/dist_graph_test.json', i) local_nids = F.nonzero_1d(part_g.ndata['inner_node']) local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids) nodes = node_split(node_mask, gpb, rank=i, force_even=True) all_nodes1.append(nodes) subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(local_nids)) print('part {} get {} nodes and {} are in the partition'.format( i, len(nodes), len(subset))) set_roles(num_parts * 2) nodes1 = node_split(node_mask, gpb, rank=i * 2, force_even=True) nodes2 = node_split(node_mask, gpb, rank=i * 2 + 1, force_even=True) nodes3, _ = F.sort_1d(F.cat([nodes1, nodes2], 0)) all_nodes2.append(nodes3) subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(nodes3)) print('intersection has', len(subset)) set_roles(num_parts) local_eids = F.nonzero_1d(part_g.edata['inner_edge']) local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids) edges = edge_split(edge_mask, gpb, rank=i, force_even=True) all_edges1.append(edges) subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(local_eids)) print('part {} get {} edges and {} are in the partition'.format( i, len(edges), len(subset))) set_roles(num_parts * 2) edges1 = edge_split(edge_mask, gpb, rank=i * 2, force_even=True) edges2 = edge_split(edge_mask, gpb, rank=i * 2 + 1, force_even=True) edges3, _ = F.sort_1d(F.cat([edges1, edges2], 0)) all_edges2.append(edges3) subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(edges3)) print('intersection has', len(subset)) all_nodes1 = F.cat(all_nodes1, 0) all_edges1 = F.cat(all_edges1, 0) all_nodes2 = F.cat(all_nodes2, 0) all_edges2 = F.cat(all_edges2, 0) all_nodes = np.nonzero(node_mask)[0] all_edges = np.nonzero(edge_mask)[0] assert np.all(all_nodes == F.asnumpy(all_nodes1)) assert np.all(all_edges == F.asnumpy(all_edges1)) assert np.all(all_nodes == F.asnumpy(all_nodes2)) assert np.all(all_edges == F.asnumpy(all_edges2))
def check_partition(part_method, reshuffle): g = create_random_graph(10000) g.ndata['labels'] = F.arange(0, g.number_of_nodes()) g.ndata['feats'] = F.tensor(np.random.randn(g.number_of_nodes(), 10)) g.edata['feats'] = F.tensor(np.random.randn(g.number_of_edges(), 10)) g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) num_parts = 4 num_hops = 2 partition_graph(g, 'test', num_parts, '/tmp/partition', num_hops=num_hops, part_method=part_method, reshuffle=reshuffle) part_sizes = [] for i in range(num_parts): part_g, node_feats, edge_feats, gpb, _ = load_partition('/tmp/partition/test.json', i) # Check the metadata assert gpb._num_nodes() == g.number_of_nodes() assert gpb._num_edges() == g.number_of_edges() assert gpb.num_partitions() == num_parts gpb_meta = gpb.metadata() assert len(gpb_meta) == num_parts assert len(gpb.partid2nids(i)) == gpb_meta[i]['num_nodes'] assert len(gpb.partid2eids(i)) == gpb_meta[i]['num_edges'] part_sizes.append((gpb_meta[i]['num_nodes'], gpb_meta[i]['num_edges'])) local_nid = gpb.nid2localnid(F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']), i) assert F.dtype(local_nid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_nid) == np.arange(0, len(local_nid))) local_eid = gpb.eid2localeid(F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']), i) assert F.dtype(local_eid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_eid) == np.arange(0, len(local_eid))) # Check the node map. local_nodes = F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']) llocal_nodes = F.nonzero_1d(part_g.ndata['inner_node']) local_nodes1 = gpb.partid2nids(i) assert F.dtype(local_nodes1) in (F.int32, F.int64) assert np.all(np.sort(F.asnumpy(local_nodes)) == np.sort(F.asnumpy(local_nodes1))) # Check the edge map. local_edges = F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']) local_edges1 = gpb.partid2eids(i) assert F.dtype(local_edges1) in (F.int32, F.int64) assert np.all(np.sort(F.asnumpy(local_edges)) == np.sort(F.asnumpy(local_edges1))) if reshuffle: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata['orig_id']) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata['orig_id']) # when we read node data from the original global graph, we should use orig_id. local_nodes = F.boolean_mask(part_g.ndata['orig_id'], part_g.ndata['inner_node']) local_edges = F.boolean_mask(part_g.edata['orig_id'], part_g.edata['inner_edge']) else: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata[dgl.NID]) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata[dgl.NID]) part_g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) part_g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) assert F.allclose(F.gather_row(g.ndata['h'], local_nodes), F.gather_row(part_g.ndata['h'], llocal_nodes)) assert F.allclose(F.gather_row(g.ndata['eh'], local_nodes), F.gather_row(part_g.ndata['eh'], llocal_nodes)) for name in ['labels', 'feats']: assert name in node_feats assert node_feats[name].shape[0] == len(local_nodes) assert np.all(F.asnumpy(g.ndata[name])[F.asnumpy(local_nodes)] == F.asnumpy(node_feats[name])) for name in ['feats']: assert name in edge_feats assert edge_feats[name].shape[0] == len(local_edges) assert np.all(F.asnumpy(g.edata[name])[F.asnumpy(local_edges)] == F.asnumpy(edge_feats[name])) if reshuffle: node_map = [] edge_map = [] for i, (num_nodes, num_edges) in enumerate(part_sizes): node_map.append(np.ones(num_nodes) * i) edge_map.append(np.ones(num_edges) * i) node_map = np.concatenate(node_map) edge_map = np.concatenate(edge_map) nid2pid = gpb.nid2partid(F.arange(0, len(node_map))) assert F.dtype(nid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(nid2pid) == node_map) eid2pid = gpb.eid2partid(F.arange(0, len(edge_map))) assert F.dtype(eid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(eid2pid) == edge_map)
def check_partition(g, part_method, reshuffle): g.ndata['labels'] = F.arange(0, g.number_of_nodes()) g.ndata['feats'] = F.tensor(np.random.randn(g.number_of_nodes(), 10), F.float32) g.edata['feats'] = F.tensor(np.random.randn(g.number_of_edges(), 10), F.float32) g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) num_parts = 4 num_hops = 2 orig_nids, orig_eids = partition_graph(g, 'test', num_parts, '/tmp/partition', num_hops=num_hops, part_method=part_method, reshuffle=reshuffle, return_mapping=True) part_sizes = [] shuffled_labels = [] shuffled_edata = [] for i in range(num_parts): part_g, node_feats, edge_feats, gpb, _, ntypes, etypes = load_partition( '/tmp/partition/test.json', i) # Check the metadata assert gpb._num_nodes() == g.number_of_nodes() assert gpb._num_edges() == g.number_of_edges() assert gpb.num_partitions() == num_parts gpb_meta = gpb.metadata() assert len(gpb_meta) == num_parts assert len(gpb.partid2nids(i)) == gpb_meta[i]['num_nodes'] assert len(gpb.partid2eids(i)) == gpb_meta[i]['num_edges'] part_sizes.append((gpb_meta[i]['num_nodes'], gpb_meta[i]['num_edges'])) nid = F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']) local_nid = gpb.nid2localnid(nid, i) assert F.dtype(local_nid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_nid) == np.arange(0, len(local_nid))) eid = F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']) local_eid = gpb.eid2localeid(eid, i) assert F.dtype(local_eid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_eid) == np.arange(0, len(local_eid))) # Check the node map. local_nodes = F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']) llocal_nodes = F.nonzero_1d(part_g.ndata['inner_node']) local_nodes1 = gpb.partid2nids(i) assert F.dtype(local_nodes1) in (F.int32, F.int64) assert np.all( np.sort(F.asnumpy(local_nodes)) == np.sort(F.asnumpy( local_nodes1))) assert np.all(F.asnumpy(llocal_nodes) == np.arange(len(llocal_nodes))) # Check the edge map. local_edges = F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']) llocal_edges = F.nonzero_1d(part_g.edata['inner_edge']) local_edges1 = gpb.partid2eids(i) assert F.dtype(local_edges1) in (F.int32, F.int64) assert np.all( np.sort(F.asnumpy(local_edges)) == np.sort(F.asnumpy( local_edges1))) assert np.all(F.asnumpy(llocal_edges) == np.arange(len(llocal_edges))) # Verify the mapping between the reshuffled IDs and the original IDs. part_src_ids, part_dst_ids = part_g.edges() part_src_ids = F.gather_row(part_g.ndata[dgl.NID], part_src_ids) part_dst_ids = F.gather_row(part_g.ndata[dgl.NID], part_dst_ids) part_eids = part_g.edata[dgl.EID] orig_src_ids = F.gather_row(orig_nids, part_src_ids) orig_dst_ids = F.gather_row(orig_nids, part_dst_ids) orig_eids1 = F.gather_row(orig_eids, part_eids) orig_eids2 = g.edge_ids(orig_src_ids, orig_dst_ids) assert F.shape(orig_eids1)[0] == F.shape(orig_eids2)[0] assert np.all(F.asnumpy(orig_eids1) == F.asnumpy(orig_eids2)) if reshuffle: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata['orig_id']) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata['orig_id']) # when we read node data from the original global graph, we should use orig_id. local_nodes = F.boolean_mask(part_g.ndata['orig_id'], part_g.ndata['inner_node']) local_edges = F.boolean_mask(part_g.edata['orig_id'], part_g.edata['inner_edge']) else: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata[dgl.NID]) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata[dgl.NID]) part_g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) part_g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) assert F.allclose(F.gather_row(g.ndata['h'], local_nodes), F.gather_row(part_g.ndata['h'], llocal_nodes)) assert F.allclose(F.gather_row(g.ndata['eh'], local_nodes), F.gather_row(part_g.ndata['eh'], llocal_nodes)) for name in ['labels', 'feats']: assert '_N/' + name in node_feats assert node_feats['_N/' + name].shape[0] == len(local_nodes) true_feats = F.gather_row(g.ndata[name], local_nodes) ndata = F.gather_row(node_feats['_N/' + name], local_nid) assert np.all(F.asnumpy(true_feats) == F.asnumpy(ndata)) for name in ['feats']: assert '_E/' + name in edge_feats assert edge_feats['_E/' + name].shape[0] == len(local_edges) true_feats = F.gather_row(g.edata[name], local_edges) edata = F.gather_row(edge_feats['_E/' + name], local_eid) assert np.all(F.asnumpy(true_feats) == F.asnumpy(edata)) # This only works if node/edge IDs are shuffled. if reshuffle: shuffled_labels.append(node_feats['_N/labels']) shuffled_edata.append(edge_feats['_E/feats']) # Verify that we can reconstruct node/edge data for original IDs. if reshuffle: shuffled_labels = F.asnumpy(F.cat(shuffled_labels, 0)) shuffled_edata = F.asnumpy(F.cat(shuffled_edata, 0)) orig_labels = np.zeros(shuffled_labels.shape, dtype=shuffled_labels.dtype) orig_edata = np.zeros(shuffled_edata.shape, dtype=shuffled_edata.dtype) orig_labels[F.asnumpy(orig_nids)] = shuffled_labels orig_edata[F.asnumpy(orig_eids)] = shuffled_edata assert np.all(orig_labels == F.asnumpy(g.ndata['labels'])) assert np.all(orig_edata == F.asnumpy(g.edata['feats'])) if reshuffle: node_map = [] edge_map = [] for i, (num_nodes, num_edges) in enumerate(part_sizes): node_map.append(np.ones(num_nodes) * i) edge_map.append(np.ones(num_edges) * i) node_map = np.concatenate(node_map) edge_map = np.concatenate(edge_map) nid2pid = gpb.nid2partid(F.arange(0, len(node_map))) assert F.dtype(nid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(nid2pid) == node_map) eid2pid = gpb.eid2partid(F.arange(0, len(edge_map))) assert F.dtype(eid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(eid2pid) == edge_map)
def check_rpc_bipartite_etype_sampling_shuffle(tmpdir, num_server): """sample on bipartite via sample_etype_neighbors() which yields non-empty sample results""" generate_ip_config("rpc_ip_config.txt", num_server, num_server) g = create_random_bipartite() num_parts = num_server num_hops = 1 orig_nids, _ = partition_graph(g, 'test_sampling', num_parts, tmpdir, num_hops=num_hops, part_method='metis', reshuffle=True, return_mapping=True) pserver_list = [] ctx = mp.get_context('spawn') for i in range(num_server): p = ctx.Process(target=start_server, args=(i, tmpdir, num_server > 1, 'test_sampling')) p.start() time.sleep(1) pserver_list.append(p) fanout = 3 deg = get_degrees(g, orig_nids['game'], 'game') nids = F.nonzero_1d(deg > 0) block, gpb = start_bipartite_etype_sample_client(0, tmpdir, num_server > 1, fanout, nodes={ 'game': nids, 'user': [0] }) print("Done sampling") for p in pserver_list: p.join() orig_nid_map = { ntype: F.zeros((g.number_of_nodes(ntype), ), dtype=F.int64) for ntype in g.ntypes } orig_eid_map = { etype: F.zeros((g.number_of_edges(etype), ), dtype=F.int64) for etype in g.etypes } for i in range(num_server): part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i) ntype_ids, type_nids = gpb.map_to_per_ntype(part.ndata[dgl.NID]) for ntype_id, ntype in enumerate(g.ntypes): idx = ntype_ids == ntype_id F.scatter_row_inplace(orig_nid_map[ntype], F.boolean_mask(type_nids, idx), F.boolean_mask(part.ndata['orig_id'], idx)) etype_ids, type_eids = gpb.map_to_per_etype(part.edata[dgl.EID]) for etype_id, etype in enumerate(g.etypes): idx = etype_ids == etype_id F.scatter_row_inplace(orig_eid_map[etype], F.boolean_mask(type_eids, idx), F.boolean_mask(part.edata['orig_id'], idx)) for src_type, etype, dst_type in block.canonical_etypes: src, dst = block.edges(etype=etype) # These are global Ids after shuffling. shuffled_src = F.gather_row(block.srcnodes[src_type].data[dgl.NID], src) shuffled_dst = F.gather_row(block.dstnodes[dst_type].data[dgl.NID], dst) shuffled_eid = block.edges[etype].data[dgl.EID] orig_src = F.asnumpy(F.gather_row(orig_nid_map[src_type], shuffled_src)) orig_dst = F.asnumpy(F.gather_row(orig_nid_map[dst_type], shuffled_dst)) orig_eid = F.asnumpy(F.gather_row(orig_eid_map[etype], shuffled_eid)) # Check the node Ids and edge Ids. orig_src1, orig_dst1 = g.find_edges(orig_eid, etype=etype) assert np.all(F.asnumpy(orig_src1) == orig_src) assert np.all(F.asnumpy(orig_dst1) == orig_dst)
def test_sample_neighbors_etype_homogeneous(format_, direction, replace): num_nodes = 100 rare_cnt = 4 g = create_etype_test_graph(100, 30, rare_cnt) h_g = dgl.to_homogeneous(g) seed_ntype = g.get_ntype_id("u") seeds = F.nonzero_1d(h_g.ndata[dgl.NTYPE] == seed_ntype) fanouts = F.tensor([6, 5, 4, 3, 2], dtype=F.int64) def check_num(h_g, all_src, all_dst, subg, replace, fanouts, direction): src, dst = subg.edges() num_etypes = F.asnumpy(h_g.edata[dgl.ETYPE]).max() etype_array = F.asnumpy(subg.edata[dgl.ETYPE]) src = F.asnumpy(src) dst = F.asnumpy(dst) fanouts = F.asnumpy(fanouts) all_etype_array = F.asnumpy(h_g.edata[dgl.ETYPE]) all_src = F.asnumpy(all_src) all_dst = F.asnumpy(all_dst) src_per_etype = [] dst_per_etype = [] for etype in range(num_etypes): src_per_etype.append(src[etype_array == etype]) dst_per_etype.append(dst[etype_array == etype]) if replace: if direction == 'in': in_degree_per_etype = [np.bincount(d) for d in dst_per_etype] for in_degree, fanout in zip(in_degree_per_etype, fanouts): assert np.all(in_degree == fanout) else: out_degree_per_etype = [np.bincount(s) for s in src_per_etype] for out_degree, fanout in zip(out_degree_per_etype, fanouts): assert np.all(out_degree == fanout) else: if direction == 'in': for v in set(dst): u = src[dst == v] et = etype_array[dst == v] all_u = all_src[all_dst == v] all_et = all_etype_array[all_dst == v] for etype in set(et): u_etype = set(u[et == etype]) all_u_etype = set(all_u[all_et == etype]) assert (len(u_etype) == fanouts[etype]) or (u_etype == all_u_etype) else: for u in set(src): v = dst[src == u] et = etype_array[src == u] all_v = all_dst[all_src == u] all_et = all_etype_array[all_src == u] for etype in set(et): v_etype = set(v[et == etype]) all_v_etype = set(all_v[all_et == etype]) assert (len(v_etype) == fanouts[etype]) or (v_etype == all_v_etype) all_src, all_dst = h_g.edges() h_g = h_g.formats(format_) if (direction, format_) in [('in', 'csr'), ('out', 'csc')]: h_g = h_g.formats(['csc', 'csr', 'coo']) for _ in range(5): subg = dgl.sampling.sample_etype_neighbors( h_g, seeds, dgl.ETYPE, fanouts, replace=replace, edge_dir=direction) check_num(h_g, all_src, all_dst, subg, replace, fanouts, direction)